tesseract  5.0.0
UnicodeText Class Reference

#include <unicodetext.h>

Classes

class  const_iterator
 
class  const_reverse_iterator
 

Public Types

typedef char32 value_type
 

Public Member Functions

 UnicodeText ()
 
 UnicodeText (const UnicodeText &src)
 
 UnicodeText (const const_iterator &first, const const_iterator &last)
 
UnicodeText & operator= (const UnicodeText &src)
 
UnicodeText & Copy (const UnicodeText &src)
 
UnicodeText & assign (const UnicodeText &src)
 
UnicodeText & PointTo (const UnicodeText &src)
 
UnicodeText & PointTo (const const_iterator &first, const const_iterator &last)
 
 ~UnicodeText ()
 
void clear ()
 
bool empty () const
 
void push_back (char32 codepoint)
 
template<typename ForwardIterator >
UnicodeText & append (ForwardIterator first, const ForwardIterator last)
 
UnicodeText & append (const const_iterator &first, const const_iterator &last)
 
UnicodeText & append (const UnicodeText &source)
 
int size () const
 
const_iterator begin () const
 
const_iterator end () const
 
const_reverse_iterator rbegin () const
 
const_reverse_iterator rend () const
 
const_iterator find (const UnicodeText &look, const_iterator start_pos) const
 
const_iterator find (const UnicodeText &look) const
 
bool HasReplacementChar () const
 
const char * utf8_data () const
 
int utf8_length () const
 
int utf8_capacity () const
 
UnicodeText & CopyUTF8 (const char *utf8_buffer, int byte_length)
 
UnicodeText & TakeOwnershipOfUTF8 (char *utf8_buffer, int byte_length, int byte_capacity)
 
UnicodeText & PointToUTF8 (const char *utf8_buffer, int byte_length)
 
const_iterator MakeIterator (const char *p) const
 
string DebugString () const
 

Static Public Member Functions

static string UTF8Substring (const const_iterator &first, const const_iterator &last)
 

Friends

class const_iterator
 
class UnicodeTextUtils
 
bool operator== (const UnicodeText &lhs, const UnicodeText &rhs)
 
bool operator!= (const UnicodeText &lhs, const UnicodeText &rhs)
 

Detailed Description

Copyright 2010 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Definition at line 116 of file unicodetext.h.

Member Typedef Documentation

◆ value_type

Definition at line 120 of file unicodetext.h.

Constructor & Destructor Documentation

◆ UnicodeText() [1/3]

UnicodeText::UnicodeText ( )

Definition at line 188 of file unicodetext.cc.

188 {}

◆ UnicodeText() [2/3]

UnicodeText::UnicodeText ( const UnicodeText & src)

Definition at line 191 of file unicodetext.cc.

191  {
192  Copy(src);
193 }
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216

◆ UnicodeText() [3/3]

UnicodeText::UnicodeText ( const const_iterator & first,
const const_iterator & last
)

Definition at line 196 of file unicodetext.cc.

197  {
198  CHECK(first <= last) << " Incompatible iterators";
199  repr_.append(first.it_, last.it_ - first.it_);
200 }
#define CHECK(condition)
Definition: include_gunit.h:76
LIST last(LIST var_list)
Definition: oldlist.cpp:153

◆ ~UnicodeText()

UnicodeText::~UnicodeText ( )

Definition at line 355 of file unicodetext.cc.

355 {}

Member Function Documentation

◆ append() [1/3]

UnicodeText & UnicodeText::append ( const const_iterator & first,
const const_iterator & last
)

Definition at line 288 of file unicodetext.cc.

288  {
289  CHECK(first <= last) << " Incompatible iterators";
290  repr_.append(first.it_, last.it_ - first.it_);
291  return *this;
292 }

◆ append() [2/3]

UnicodeText & UnicodeText::append ( const UnicodeText & source)

Definition at line 283 of file unicodetext.cc.

283  {
284  repr_.append(u.repr_.data_, u.repr_.size_);
285  return *this;
286 }

◆ append() [3/3]

template<typename ForwardIterator >
UnicodeText& UnicodeText::append ( ForwardIterator  first,
const ForwardIterator  last 
)
inline

Definition at line 163 of file unicodetext.h.

163  {
164  while (first != last) {
165  push_back(*first++);
166  }
167  return *this;
168  }
void push_back(char32 codepoint)
Definition: unicodetext.cc:357

◆ assign()

UnicodeText& UnicodeText::assign ( const UnicodeText & src)
inline

Definition at line 134 of file unicodetext.h.

134  {
135  return Copy(src);
136  }

◆ begin()

UnicodeText::const_iterator UnicodeText::begin ( ) const

Definition at line 408 of file unicodetext.cc.

408  {
409  return const_iterator(repr_.data_);
410 }
friend class const_iterator
Definition: unicodetext.h:348

◆ clear()

void UnicodeText::clear ( )

Definition at line 350 of file unicodetext.cc.

350  {
351  repr_.clear();
352 }

◆ Copy()

UnicodeText & UnicodeText::Copy ( const UnicodeText & src)

Definition at line 216 of file unicodetext.cc.

216  {
217  repr_.Copy(src.repr_.data_, src.repr_.size_);
218  return *this;
219 }

◆ CopyUTF8()

UnicodeText & UnicodeText::CopyUTF8 ( const char *  utf8_buffer,
int  byte_length 
)

Definition at line 221 of file unicodetext.cc.

221  {
222  repr_.Copy(buffer, byte_length);
223  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
224  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
225  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
226  }
227  return *this;
228 }
@ LOG
@ WARNING
Definition: log.h:28
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33

◆ DebugString()

string UnicodeText::DebugString ( ) const

◆ empty()

bool UnicodeText::empty ( ) const
inline

Definition at line 146 of file unicodetext.h.

146  {
147  return repr_.size_ == 0;
148  } // Test if text is empty.

◆ end()

UnicodeText::const_iterator UnicodeText::end ( ) const

Definition at line 412 of file unicodetext.cc.

412  {
413  return const_iterator(repr_.data_ + repr_.size_);
414 }

◆ find() [1/2]

UnicodeText::const_iterator UnicodeText::find ( const UnicodeText & look) const

Definition at line 308 of file unicodetext.cc.

308  {
309  return UnsafeFind(look, begin());
310 }
const_iterator begin() const
Definition: unicodetext.cc:408

◆ find() [2/2]

UnicodeText::const_iterator UnicodeText::find ( const UnicodeText & look,
const_iterator  start_pos 
) const

Definition at line 301 of file unicodetext.cc.

302  {
303  CHECK_GE(start_pos.utf8_data(), utf8_data());
304  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
305  return UnsafeFind(look, start_pos);
306 }
#define CHECK_GE(test, value)
Definition: include_gunit.h:80
#define CHECK_LE(test, value)
Definition: include_gunit.h:83
const char * utf8_data() const
Definition: unicodetext.h:305
int utf8_length() const
Definition: unicodetext.h:308

◆ HasReplacementChar()

bool UnicodeText::HasReplacementChar ( ) const

◆ MakeIterator()

UnicodeText::const_iterator UnicodeText::MakeIterator ( const char *  p) const

Definition at line 484 of file unicodetext.cc.

484  {
485  CHECK(p != nullptr);
486  const char *start = utf8_data();
487  int len = utf8_length();
488  const char *end = start + len;
489  CHECK(p >= start);
490  CHECK(p <= end);
491  CHECK(p == end || !UniLib::IsTrailByte(*p));
492  return const_iterator(p);
493 }
bool IsTrailByte(char x)
const_iterator end() const
Definition: unicodetext.cc:412

◆ operator=()

UnicodeText & UnicodeText::operator= ( const UnicodeText & src)

Definition at line 209 of file unicodetext.cc.

209  {
210  if (this != &src) {
211  Copy(src);
212  }
213  return *this;
214 }

◆ PointTo() [1/2]

UnicodeText & UnicodeText::PointTo ( const const_iterator & first,
const const_iterator & last
)

Definition at line 275 of file unicodetext.cc.

275  {
276  CHECK(first <= last) << " Incompatible iterators";
277  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
278  return *this;
279 }

◆ PointTo() [2/2]

UnicodeText & UnicodeText::PointTo ( const UnicodeText & src)

Definition at line 270 of file unicodetext.cc.

270  {
271  repr_.PointTo(src.repr_.data_, src.repr_.size_);
272  return *this;
273 }

◆ PointToUTF8()

UnicodeText & UnicodeText::PointToUTF8 ( const char *  utf8_buffer,
int  byte_length 
)

Definition at line 254 of file unicodetext.cc.

254  {
255  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
256  repr_.PointTo(buffer, byte_length);
257  } else {
258  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
259  repr_.Copy(buffer, byte_length);
260  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
261  }
262  return *this;
263 }

◆ push_back()

void UnicodeText::push_back ( char32  codepoint)

Definition at line 357 of file unicodetext.cc.

357  {
358  if (UniLib::IsValidCodepoint(c)) {
359  char buf[UTFmax];
360  int len = runetochar(buf, &c);
361  if (UniLib::IsInterchangeValid(buf, len)) {
362  repr_.append(buf, len);
363  } else {
364  LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
365  repr_.append(" ", 1);
366  }
367  } else {
368  LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
369  repr_.append(" ", 1);
370  }
371 }
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244
@ UTFmax
Definition: utf.h:22
bool IsValidCodepoint(char32 c)

◆ rbegin()

const_reverse_iterator UnicodeText::rbegin ( ) const
inline

Definition at line 283 of file unicodetext.h.

283  {
284  return const_reverse_iterator(end());
285  }

◆ rend()

const_reverse_iterator UnicodeText::rend ( ) const
inline

Definition at line 286 of file unicodetext.h.

286  {
287  return const_reverse_iterator(begin());
288  }

◆ size()

int UnicodeText::size ( ) const

Definition at line 373 of file unicodetext.cc.

373  {
374  return CodepointCount(repr_.data_, repr_.size_);
375 }

◆ TakeOwnershipOfUTF8()

UnicodeText & UnicodeText::TakeOwnershipOfUTF8 ( char *  utf8_buffer,
int  byte_length,
int  byte_capacity 
)

Definition at line 237 of file unicodetext.cc.

237  {
238  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
239  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
240  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
241  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
242  }
243  return *this;
244 }

◆ utf8_capacity()

int UnicodeText::utf8_capacity ( ) const
inline

Definition at line 311 of file unicodetext.h.

311  {
312  return repr_.capacity_;
313  }

◆ utf8_data()

const char* UnicodeText::utf8_data ( ) const
inline

Definition at line 305 of file unicodetext.h.

305  {
306  return repr_.data_;
307  }

◆ utf8_length()

int UnicodeText::utf8_length ( ) const
inline

Definition at line 308 of file unicodetext.h.

308  {
309  return repr_.size_;
310  }

◆ UTF8Substring()

string UnicodeText::UTF8Substring ( const const_iterator & first,
const const_iterator & last
)
static

Definition at line 202 of file unicodetext.cc.

202  {
203  CHECK(first <= last) << " Incompatible iterators";
204  return string(first.it_, last.it_ - first.it_);
205 }

Friends And Related Function Documentation

◆ const_iterator

friend class const_iterator
friend

Definition at line 348 of file unicodetext.h.

◆ operator!=

bool operator!= ( const UnicodeText & lhs,
const UnicodeText & rhs
)
friend

Definition at line 397 of file unicodetext.h.

397  {
398  return !(lhs == rhs);
399 }

◆ operator==

bool operator== ( const UnicodeText & lhs,
const UnicodeText & rhs
)
friend

Definition at line 377 of file unicodetext.cc.

377  {
378  if (&lhs == &rhs)
379  return true;
380  if (lhs.repr_.size_ != rhs.repr_.size_)
381  return false;
382  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
383 }

◆ UnicodeTextUtils

friend class UnicodeTextUtils
friend

Definition at line 349 of file unicodetext.h.


The documentation for this class was generated from the following files: