tesseract  5.0.0
unicodetext.cc
Go to the documentation of this file.
1 
17 #include "include_gunit.h"
18 #include "util/utf8/unicodetext.h"
19 
20 #include <string.h> // for memcpy, NULL, memcmp, etc
21 #include <algorithm> // for max
22 
23 //#include "base/logging.h" // for operator<<, CHECK, etc
24 //#include "base/stringprintf.h" // for StringPrintf, StringAppendF
25 //#include "strings/stringpiece.h" // for StringPiece, etc
26 
27 #include "third_party/utf/utf.h" // for isvalidcharntorune, etc
28 #include "util/utf8/unilib.h" // for IsInterchangeValid, etc
29 #include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
30 
// Returns the number of code points encoded in the byte range [start, end).
// Every byte that is not a UTF-8 continuation (trail) byte starts a new code
// point, so counting those bytes counts the characters.
static int CodepointDistance(const char *start, const char *end) {
  int count = 0;
  const char *cursor = start;
  while (cursor < end) {
    // Trail bytes are 0x80..0xBF, which read as values below -0x40 when the
    // byte is interpreted as signed.
    const signed char value = *reinterpret_cast<const signed char *>(cursor);
    if (value >= -0x40) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
39 
40 static int CodepointCount(const char *utf8, int len) {
41  return CodepointDistance(utf8, utf8 + len);
42 }
43 
46  return CodepointDistance(first.it_, last.it_);
47 }
48 
49 // ---------- Utility ----------
50 
51 static int ConvertToInterchangeValid(char *start, int len) {
52  // This routine is called only when we've discovered that a UTF-8 buffer
53  // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
54  // was not interchange valid. This indicates a bug in the caller, and
55  // a LOG(WARNING) is done in that case.
56  // This is similar to CoerceToInterchangeValid, but it replaces each
57  // structurally valid byte with a space, and each non-interchange
58  // character with a space, even when that character requires more
59  // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
60  // structurally valid UTF8, but U+FDD0 is not an interchange-valid
61  // code point. The result should contain one space, not three.
62  //
63  // Since the conversion never needs to write more data than it
64  // reads, it is safe to change the buffer in place. It returns the
65  // number of bytes written.
66  char *const in = start;
67  char *out = start;
68  char *const end = start + len;
69  while (start < end) {
70  int good = UniLib::SpanInterchangeValid(start, end - start);
71  if (good > 0) {
72  if (out != start) {
73  memmove(out, start, good);
74  }
75  out += good;
76  start += good;
77  if (start == end) {
78  break;
79  }
80  }
81  // Is the current string invalid UTF8 or just non-interchange UTF8?
82  char32 rune;
83  int n;
84  if (isvalidcharntorune(start, end - start, &rune, &n)) {
85  // structurally valid UTF8, but not interchange valid
86  start += n; // Skip over the whole character.
87  } else { // bad UTF8
88  start += 1; // Skip over just one byte
89  }
90  *out++ = ' ';
91  }
92  return out - in;
93 }
94 
95 // *************** Data representation **********
96 
97 // Note: the copy constructor is undefined.
98 
99 // After reserve(), resize(), or clear(), we're an owner, not an alias.
100 
101 void UnicodeText::Repr::reserve(int new_capacity) {
102  // If there's already enough capacity, and we're an owner, do nothing.
103  if (capacity_ >= new_capacity && ours_)
104  return;
105 
106  // Otherwise, allocate a new buffer.
107  capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
108  char *new_data = new char[capacity_];
109 
110  // If there is an old buffer, copy it into the new buffer.
111  if (data_) {
112  memcpy(new_data, data_, size_);
113  if (ours_)
114  delete[] data_; // If we owned the old buffer, free it.
115  }
116  data_ = new_data;
117  ours_ = true; // We own the new buffer.
118  // size_ is unchanged.
119 }
120 
121 void UnicodeText::Repr::resize(int new_size) {
122  if (new_size == 0) {
123  clear();
124  } else {
125  if (!ours_ || new_size > capacity_)
126  reserve(new_size);
127  // Clear the memory in the expanded part.
128  if (size_ < new_size)
129  memset(data_ + size_, 0, new_size - size_);
130  size_ = new_size;
131  ours_ = true;
132  }
133 }
134 
135 // This implementation of clear() deallocates the buffer if we're an owner.
136 // That's not strictly necessary; we could just set size_ to 0.
137 void UnicodeText::Repr::clear() {
138  if (ours_)
139  delete[] data_;
140  data_ = nullptr;
141  size_ = capacity_ = 0;
142  ours_ = true;
143 }
144 
145 void UnicodeText::Repr::Copy(const char *data, int size) {
146  resize(size);
147  memcpy(data_, data, size);
148 }
149 
150 void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {
151  if (data == data_)
152  return; // We already own this memory. (Weird case.)
153  if (ours_ && data_)
154  delete[] data_; // If we owned the old buffer, free it.
155  data_ = data;
156  size_ = size;
157  capacity_ = capacity;
158  ours_ = true;
159 }
160 
161 void UnicodeText::Repr::PointTo(const char *data, int size) {
162  if (ours_ && data_)
163  delete[] data_; // If we owned the old buffer, free it.
164  data_ = const_cast<char *>(data);
165  size_ = size;
166  capacity_ = size;
167  ours_ = false;
168 }
169 
170 void UnicodeText::Repr::append(const char *bytes, int byte_length) {
171  reserve(size_ + byte_length);
172  memcpy(data_ + size_, bytes, byte_length);
173  size_ += byte_length;
174 }
175 
176 #ifdef INCLUDE_TENSORFLOW
177 string UnicodeText::Repr::DebugString() const {
178  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
179  capacity_, ours_ ? "Owned" : "Alias");
180 }
181 #endif
182 
183 // *************** UnicodeText ******************
184 
185 // ----- Constructors -----
186 
187 // Default constructor
189 
190 // Copy constructor
192  Copy(src);
193 }
194 
195 // Substring constructor
198  CHECK(first <= last) << " Incompatible iterators";
199  repr_.append(first.it_, last.it_ - first.it_);
200 }
201 
203  CHECK(first <= last) << " Incompatible iterators";
204  return string(first.it_, last.it_ - first.it_);
205 }
206 
207 // ----- Copy -----
208 
210  if (this != &src) {
211  Copy(src);
212  }
213  return *this;
214 }
215 
217  repr_.Copy(src.repr_.data_, src.repr_.size_);
218  return *this;
219 }
220 
221 UnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) {
222  repr_.Copy(buffer, byte_length);
223  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
224  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
225  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
226  }
227  return *this;
228 }
229 
230 UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {
231  repr_.Copy(buffer, byte_length);
232  return *this;
233 }
234 
235 // ----- TakeOwnershipOf -----
236 
237 UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {
238  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
239  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
240  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
241  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
242  }
243  return *this;
244 }
245 
246 UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,
247  int byte_capacity) {
248  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
249  return *this;
250 }
251 
252 // ----- PointTo -----
253 
254 UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
255  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
256  repr_.PointTo(buffer, byte_length);
257  } else {
258  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
259  repr_.Copy(buffer, byte_length);
260  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
261  }
262  return *this;
263 }
264 
265 UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {
266  repr_.PointTo(buffer, byte_length);
267  return *this;
268 }
269 
271  repr_.PointTo(src.repr_.data_, src.repr_.size_);
272  return *this;
273 }
274 
276  CHECK(first <= last) << " Incompatible iterators";
277  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
278  return *this;
279 }
280 
281 // ----- Append -----
282 
284  repr_.append(u.repr_.data_, u.repr_.size_);
285  return *this;
286 }
287 
289  CHECK(first <= last) << " Incompatible iterators";
290  repr_.append(first.it_, last.it_ - first.it_);
291  return *this;
292 }
293 
294 UnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) {
295  repr_.append(utf8, len);
296  return *this;
297 }
298 
299 // ----- substring searching -----
300 
302  const_iterator start_pos) const {
303  CHECK_GE(start_pos.utf8_data(), utf8_data());
304  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
305  return UnsafeFind(look, start_pos);
306 }
307 
309  return UnsafeFind(look, begin());
310 }
311 
// Intended to locate the first occurrence of `look` in this text at or after
// `start_pos`. No bounds checks are performed on `start_pos` in this
// function.
//
// NOTE: this is currently a stub. The actual search is commented out; the
// function logs "Not implemented" at FATAL severity and, if control continues
// past that statement, returns end() in every configuration.
// NOTE(review): whether LOG(FATAL) terminates the process is not visible
// from this file -- confirm against log.h.
UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
                                                    const_iterator start_pos) const {
  // Due to the magic of the UTF8 encoding, searching for a sequence of
  // letters is equivalent to substring search.
#ifdef INCLUDE_TENSORFLOW
  // Views over the haystack and the needle; only referenced by the disabled
  // search below.
  StringPiece searching(utf8_data(), utf8_length());
  StringPiece look_piece(look.utf8_data(), look.utf8_length());
#endif
  LOG(FATAL) << "Not implemented";
#ifdef INCLUDE_TENSORFLOW
  // StringPiece::size_type found =
  // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
  // With the search disabled, `found` is always npos, so the first return
  // below is always taken.
  StringPiece::size_type found = StringPiece::npos;
  if (found == StringPiece::npos)
    return end();
  return const_iterator(utf8_data() + found);
#else
  return end();
#endif
}
332 
333 #ifdef INCLUDE_TENSORFLOW
// Intended to report whether this text contains U+FFFD (the replacement
// character, "\xEF\xBF\xBD" in UTF-8).
//
// NOTE: this is currently a stub. The search is commented out; the function
// logs "Not implemented" at FATAL severity and, if control continues past
// that statement, returns false.
bool UnicodeText::HasReplacementChar() const {
  // Equivalent to:
  // UnicodeText replacement_char;
  // replacement_char.push_back(0xFFFD);
  // return find(replacement_char) != end();
  // The two views below are only referenced by the disabled search.
  StringPiece searching(utf8_data(), utf8_length());
  StringPiece looking_for("\xEF\xBF\xBD", 3);
  LOG(FATAL) << "Not implemented";
  // return searching.find(looking_for) != StringPiece::npos;
  return false;
}
345 #endif
346 
347 // ----- other methods -----
348 
349 // Clear operator
351  repr_.clear();
352 }
353 
354 // Destructor
356 
358  if (UniLib::IsValidCodepoint(c)) {
359  char buf[UTFmax];
360  int len = runetochar(buf, &c);
361  if (UniLib::IsInterchangeValid(buf, len)) {
362  repr_.append(buf, len);
363  } else {
364  LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
365  repr_.append(" ", 1);
366  }
367  } else {
368  LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
369  repr_.append(" ", 1);
370  }
371 }
372 
373 int UnicodeText::size() const {
374  return CodepointCount(repr_.data_, repr_.size_);
375 }
376 
377 bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
378  if (&lhs == &rhs)
379  return true;
380  if (lhs.repr_.size_ != rhs.repr_.size_)
381  return false;
382  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
383 }
384 
385 #ifdef INCLUDE_TENSORFLOW
386 string UnicodeText::DebugString() const {
387  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
388  repr_.DebugString().c_str());
389 }
390 #endif
391 
392 // ******************* UnicodeText::const_iterator *********************
393 
394 // The implementation of const_iterator would be nicer if it
395 // inherited from boost::iterator_facade
396 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
397 
399 
401 
403  if (&other != this)
404  it_ = other.it_;
405  return *this;
406 }
407 
409  return const_iterator(repr_.data_);
410 }
411 
413  return const_iterator(repr_.data_ + repr_.size_);
414 }
415 
417  return lhs.it_ < rhs.it_;
418 }
419 
421  // (We could call chartorune here, but that does some
422  // error-checking, and we're guaranteed that our data is valid
423  // UTF-8. Also, we expect this routine to be called very often. So
424  // for speed, we do the calculation ourselves.)
425 
426  // Convert from UTF-8
427  unsigned char byte1 = it_[0];
428  if (byte1 < 0x80)
429  return byte1;
430 
431  unsigned char byte2 = it_[1];
432  if (byte1 < 0xE0)
433  return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
434 
435  unsigned char byte3 = it_[2];
436  if (byte1 < 0xF0)
437  return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
438 
439  unsigned char byte4 = it_[3];
440  return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
441 }
442 
444  it_ += UniLib::OneCharLen(it_);
445  return *this;
446 }
447 
449  while (UniLib::IsTrailByte(*--it_))
450  ;
451  return *this;
452 }
453 
454 int UnicodeText::const_iterator::get_utf8(char *utf8_output) const {
455  utf8_output[0] = it_[0];
456  if ((it_[0] & 0xff) < 0x80)
457  return 1;
458  utf8_output[1] = it_[1];
459  if ((it_[0] & 0xff) < 0xE0)
460  return 2;
461  utf8_output[2] = it_[2];
462  if ((it_[0] & 0xff) < 0xF0)
463  return 3;
464  utf8_output[3] = it_[3];
465  return 4;
466 }
467 
469  return string(utf8_data(), utf8_length());
470 }
471 
473  if ((it_[0] & 0xff) < 0x80) {
474  return 1;
475  } else if ((it_[0] & 0xff) < 0xE0) {
476  return 2;
477  } else if ((it_[0] & 0xff) < 0xF0) {
478  return 3;
479  } else {
480  return 4;
481  }
482 }
483 
485  CHECK(p != nullptr);
486  const char *start = utf8_data();
487  int len = utf8_length();
488  const char *end = start + len;
489  CHECK(p >= start);
490  CHECK(p <= end);
491  CHECK(p == end || !UniLib::IsTrailByte(*p));
492  return const_iterator(p);
493 }
494 
495 #ifdef INCLUDE_TENSORFLOW
497  return tensorflow::strings::Printf("{iter %p}", it_);
498 }
499 
500 // *************************** Utilities *************************
501 
502 string CodepointString(const UnicodeText &t) {
503  string s;
504  UnicodeText::const_iterator it = t.begin(), end = t.end();
505  while (it != end)
506  tensorflow::strings::Appendf(&s, "%X ", *it++);
507  return s;
508 }
509 #endif
signed int char32
@ LOG
#define CHECK_GE(test, value)
Definition: include_gunit.h:80
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_LE(test, value)
Definition: include_gunit.h:83
@ FATAL
Definition: log.h:28
@ WARNING
Definition: log.h:28
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:239
@ UTFmax
Definition: utf.h:22
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs)
Definition: unicodetext.cc:416
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
string CodepointString(const UnicodeText &t)
LIST last(LIST var_list)
Definition: oldlist.cpp:153
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
bool IsValidCodepoint(char32 c)
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39
bool IsTrailByte(char x)
int OneCharLen(const char *src)
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
Definition: unicodetext.cc:202
void push_back(char32 codepoint)
Definition: unicodetext.cc:357
const_iterator MakeIterator(const char *p) const
Definition: unicodetext.cc:484
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:221
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
Definition: unicodetext.cc:301
const char * utf8_data() const
Definition: unicodetext.h:305
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216
UnicodeText & PointTo(const UnicodeText &src)
Definition: unicodetext.cc:270
string DebugString() const
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
Definition: unicodetext.h:163
const_iterator end() const
Definition: unicodetext.cc:412
friend class const_iterator
Definition: unicodetext.h:348
UnicodeText & operator=(const UnicodeText &src)
Definition: unicodetext.cc:209
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.cc:237
int utf8_length() const
Definition: unicodetext.h:308
int size() const
Definition: unicodetext.cc:373
bool HasReplacementChar() const
const_iterator begin() const
Definition: unicodetext.cc:408
void clear()
Definition: unicodetext.cc:350
const char * utf8_data() const
Definition: unicodetext.h:244
const_iterator & operator++()
Definition: unicodetext.cc:443
const_iterator & operator--()
Definition: unicodetext.cc:448
int get_utf8(char *buf) const
Definition: unicodetext.cc:454
const_iterator & operator=(const const_iterator &other)
Definition: unicodetext.cc:402
string get_utf8_string() const
Definition: unicodetext.cc:468