tesseract  5.0.0
tesseract::UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0 , U_RIGHT_TO_LEFT = 1 , U_EUROPEAN_NUMBER = 2 , U_EUROPEAN_NUMBER_SEPARATOR = 3 ,
  U_EUROPEAN_NUMBER_TERMINATOR = 4 , U_ARABIC_NUMBER = 5 , U_COMMON_NUMBER_SEPARATOR = 6 , U_BLOCK_SEPARATOR = 7 ,
  U_SEGMENT_SEPARATOR = 8 , U_WHITE_SPACE_NEUTRAL = 9 , U_OTHER_NEUTRAL = 10 , U_LEFT_TO_RIGHT_EMBEDDING = 11 ,
  U_LEFT_TO_RIGHT_OVERRIDE = 12 , U_RIGHT_TO_LEFT_ARABIC = 13 , U_RIGHT_TO_LEFT_EMBEDDING = 14 , U_RIGHT_TO_LEFT_OVERRIDE = 15 ,
  U_POP_DIRECTIONAL_FORMAT = 16 , U_DIR_NON_SPACING_MARK = 17 , U_BOUNDARY_NEUTRAL = 18 , U_FIRST_STRONG_ISOLATE = 19 ,
  U_LEFT_TO_RIGHT_ISOLATE = 20 , U_RIGHT_TO_LEFT_ISOLATE = 21 , U_POP_DIRECTIONAL_ISOLATE = 22 , U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, unsigned *first_bad_position) const
 
bool encode_string (const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
 
const char * id_to_unichar (UNICHAR_ID id) const
 
const char * id_to_unichar_ext (UNICHAR_ID id) const
 
std::string debug_str (UNICHAR_ID id) const
 
std::string debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style)
 
void unichar_insert (const char *const unichar_repr)
 
void unichar_insert_backwards_compatible (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
size_t size () const
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool save_to_file (tesseract::TFile *file) const
 
bool save_to_string (std::string &str) const
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
void set_normed_ids (UNICHAR_ID unichar_id)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void CopyFrom (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
 
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
 
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
 
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
 
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
 
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
 
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
 
bool IsSpaceDelimited (UNICHAR_ID unichar_id) const
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
bool has_special_codes () const
 
bool AnyRepeatedUnicodes () const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
const std::vector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int thai_sid () const
 
int hangul_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static std::string debug_utf8_str (const char *str)
 
static std::string CleanupString (const char *utf8_str)
 
static std::string CleanupString (const char *utf8_str, size_t length)
 

Static Public Attributes

static const char * kCustomLigatures [][2]
 
static const char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]
 

Detailed Description

Definition at line 164 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_FIRST_STRONG_ISOLATE 
U_LEFT_TO_RIGHT_ISOLATE 
U_RIGHT_TO_LEFT_ISOLATE 
U_POP_DIRECTIONAL_ISOLATE 
U_CHAR_DIRECTION_COUNT 

Definition at line 175 of file unicharset.h.

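175  {
176  U_LEFT_TO_RIGHT = 0,
177  U_RIGHT_TO_LEFT = 1,
178  U_EUROPEAN_NUMBER = 2,
179  U_EUROPEAN_NUMBER_SEPARATOR = 3,
180  U_EUROPEAN_NUMBER_TERMINATOR = 4,
181  U_ARABIC_NUMBER = 5,
182  U_COMMON_NUMBER_SEPARATOR = 6,
183  U_BLOCK_SEPARATOR = 7,
184  U_SEGMENT_SEPARATOR = 8,
185  U_WHITE_SPACE_NEUTRAL = 9,
186  U_OTHER_NEUTRAL = 10,
187  U_LEFT_TO_RIGHT_EMBEDDING = 11,
188  U_LEFT_TO_RIGHT_OVERRIDE = 12,
189  U_RIGHT_TO_LEFT_ARABIC = 13,
190  U_RIGHT_TO_LEFT_EMBEDDING = 14,
191  U_RIGHT_TO_LEFT_OVERRIDE = 15,
192  U_POP_DIRECTIONAL_FORMAT = 16,
193  U_DIR_NON_SPACING_MARK = 17,
194  U_BOUNDARY_NEUTRAL = 18,
195  U_FIRST_STRONG_ISOLATE = 19,
196  U_LEFT_TO_RIGHT_ISOLATE = 20,
197  U_RIGHT_TO_LEFT_ISOLATE = 21,
198  U_POP_DIRECTIONAL_ISOLATE = 22,
199 #ifndef U_HIDE_DEPRECATED_API
200  U_CHAR_DIRECTION_COUNT
201 #endif // U_HIDE_DEPRECATED_API
202  };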

Constructor & Destructor Documentation

◆ UNICHARSET()

tesseract::UNICHARSET::UNICHARSET ( )

Definition at line 170 of file unicharset.cpp.

171  : ids(), script_table(nullptr), script_table_size_used(0) {
172  clear();
173  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
174  unichar_insert(kSpecialUnicharCodes[i]);
175  if (i == UNICHAR_JOINED) {
176  set_isngram(i, true);
177  }
178  }
179 }
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
@ UNICHAR_JOINED
Definition: unicharset.h:37
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:172
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:462

◆ ~UNICHARSET()

tesseract::UNICHARSET::~UNICHARSET ( )

Definition at line 181 of file unicharset.cpp.

181  {
182  clear();
183 }

Member Function Documentation

◆ add_script()

int tesseract::UNICHARSET::add_script ( const char *  script)

Definition at line 1063 of file unicharset.cpp.

1063  {
1064  for (int i = 0; i < script_table_size_used; ++i) {
1065  if (strcmp(script, script_table[i]) == 0) {
1066  return i;
1067  }
1068  }
1069  if (script_table_size_reserved == 0) {
1070  script_table_size_reserved = 8;
1071  script_table = new char *[script_table_size_reserved];
1072  } else if (script_table_size_used >= script_table_size_reserved) {
1073  assert(script_table_size_used == script_table_size_reserved);
1074  script_table_size_reserved += script_table_size_reserved;
1075  char **new_script_table = new char *[script_table_size_reserved];
1076  memcpy(new_script_table, script_table,
1077  script_table_size_used * sizeof(char *));
1078  delete[] script_table;
1079  script_table = new_script_table;
1080  }
1081  script_table[script_table_size_used] = new char[strlen(script) + 1];
1082  strcpy(script_table[script_table_size_used], script);
1083  return script_table_size_used++;
1084 }

◆ AnyRepeatedUnicodes()

bool tesseract::UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 1046 of file unicharset.cpp.

1046  {
1047  int start_id = 0;
1048  if (has_special_codes()) {
1049  start_id = SPECIAL_UNICHAR_CODES_COUNT;
1050  }
1051  for (unsigned id = start_id; id < unichars.size(); ++id) {
1052  // Convert to unicodes.
1053  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1054  for (size_t u = 1; u < unicodes.size(); ++u) {
1055  if (unicodes[u - 1] == unicodes[u]) {
1056  return true;
1057  }
1058  }
1059  }
1060  return false;
1061 }
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:860
bool has_special_codes() const
Definition: unicharset.h:757

◆ AppendOtherUnicharset()

void tesseract::UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 454 of file unicharset.cpp.

454  {
455  int initial_used = unichars.size();
456  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
457  const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
458  const char *utf8 = src.id_to_unichar(ch);
459  int id = unichars.size();
460  if (contains_unichar(utf8)) {
461  id = unichar_to_id(utf8);
462  // Just expand current ranges.
463  unichars[id].properties.ExpandRangesFrom(src_props);
464  } else {
465  unichar_insert_backwards_compatible(utf8);
466  unichars[id].properties.SetRangesEmpty();
467  }
468  }
469  // Set properties, including mirror and other_case, WITHOUT reordering
470  // the unicharset.
471  PartialSetPropertiesFromOther(initial_used, src);
472 }
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:288
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:395

◆ CleanupString() [1/2]

static std::string tesseract::UNICHARSET::CleanupString ( const char *  utf8_str)
inline static

Definition at line 265 of file unicharset.h.

265  {
266  return CleanupString(utf8_str, strlen(utf8_str));
267  }
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265

◆ CleanupString() [2/2]

std::string tesseract::UNICHARSET::CleanupString ( const char *  utf8_str,
size_t  length 
)
static

Definition at line 1158 of file unicharset.cpp.

1158  {
1159  std::string result;
1160  result.reserve(length);
1161  char ch;
1162  while ((ch = *utf8_str) != '\0' && length-- > 0) {
1163  int key_index = 0;
1164  const char *key;
1165  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1166  int match = 0;
1167  while (key[match] != '\0' && key[match] == utf8_str[match]) {
1168  ++match;
1169  }
1170  if (key[match] == '\0') {
1171  utf8_str += match;
1172  break;
1173  }
1174  ++key_index;
1175  }
1176  if (key == nullptr) {
1177  result.push_back(ch);
1178  ++utf8_str;
1179  } else {
1180  result.append(kCleanupMaps[key_index][1]);
1181  }
1182  }
1183  return result;
1184 }

◆ clear()

void tesseract::UNICHARSET::clear ( )
inline

Definition at line 324 of file unicharset.h.

324  {
325  if (script_table != nullptr) {
326  for (int i = 0; i < script_table_size_used; ++i) {
327  delete[] script_table[i];
328  }
329  delete[] script_table;
330  script_table = nullptr;
331  script_table_size_used = 0;
332  }
333  script_table_size_reserved = 0;
334  delete_pointers_in_unichars();
335  unichars.clear();
336  ids.clear();
337  top_bottom_set_ = false;
338  script_has_upper_lower_ = false;
339  script_has_xheight_ = false;
340  old_style_included_ = false;
341  null_sid_ = 0;
342  common_sid_ = 0;
343  latin_sid_ = 0;
344  cyrillic_sid_ = 0;
345  greek_sid_ = 0;
346  han_sid_ = 0;
347  hiragana_sid_ = 0;
348  katakana_sid_ = 0;
349  thai_sid_ = 0;
350  hangul_sid_ = 0;
351  default_sid_ = 0;
352  }
void delete_pointers_in_unichars()
Definition: unicharset.h:316

◆ common_sid()

int tesseract::UNICHARSET::common_sid ( ) const
inline

Definition at line 920 of file unicharset.h.

920  {
921  return common_sid_;
922  }

◆ contains_unichar() [1/2]

bool tesseract::UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 695 of file unicharset.cpp.

695  {
696  std::string cleaned =
697  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
698  return ids.contains(cleaned.data(), cleaned.size());
699 }
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:83

◆ contains_unichar() [2/2]

bool tesseract::UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 701 of file unicharset.cpp.

702  {
703  if (length == 0) {
704  return false;
705  }
706  std::string cleaned(unichar_repr, length);
707  if (!old_style_included_) {
708  cleaned = CleanupString(unichar_repr, length);
709  }
710  return ids.contains(cleaned.data(), cleaned.size());
711 }

◆ contains_unichar_id()

bool tesseract::UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 303 of file unicharset.h.

303  {
304  return static_cast<size_t>(unichar_id) < unichars.size();
305  }

◆ CopyFrom()

void tesseract::UNICHARSET::CopyFrom ( const UNICHARSET &  src)

Definition at line 438 of file unicharset.cpp.

438  {
439  clear();
440  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
441  const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
442  const char *utf8 = src.id_to_unichar(ch);
443  unichar_insert_backwards_compatible(utf8);
444  unichars[ch].properties.ExpandRangesFrom(src_props);
445  }
446  // Set properties, including mirror and other_case, WITHOUT reordering
447  // the unicharset.
448  PartialSetPropertiesFromOther(0, src);
449 }

◆ cyrillic_sid()

int tesseract::UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 926 of file unicharset.h.

926  {
927  return cyrillic_sid_;
928  }

◆ debug_str() [1/2]

std::string tesseract::UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 273 of file unicharset.h.

273  {
274  return debug_str(unichar_to_id(unichar_repr));
275  }
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331

◆ debug_str() [2/2]

std::string tesseract::UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 331 of file unicharset.cpp.

331  {
332  if (id == INVALID_UNICHAR_ID) {
333  return std::string(id_to_unichar(id));
334  }
335  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
336  if (fragment) {
337  return fragment->to_string();
338  }
339  const char *str = id_to_unichar(id);
340  std::string result = debug_utf8_str(str);
341  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
342  if (get_isalpha(id)) {
343  if (get_islower(id)) {
344  result += "a";
345  } else if (get_isupper(id)) {
346  result += "A";
347  } else {
348  result += "x";
349  }
350  }
351  // Append 0 if a digit.
352  if (get_isdigit(id)) {
353  result += "0";
354  }
355  // Append p is a punctuation symbol.
356  if (get_ispunctuation(id)) {
357  result += "p";
358  }
359  return result;
360 }
static std::string debug_utf8_str(const char *str)
Definition: unicharset.cpp:307
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769

◆ debug_utf8_str()

std::string tesseract::UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 307 of file unicharset.cpp.

307  {
308  std::string result = str;
309  result += " [";
310  int step = 1;
311  // Chop into unicodes and code each as hex.
312  for (int i = 0; str[i] != '\0'; i += step) {
313  char hex[sizeof(int) * 2 + 1];
314  step = UNICHAR::utf8_step(str + i);
315  if (step == 0) {
316  step = 1;
317  sprintf(hex, "%x", str[i]);
318  } else {
319  UNICHAR ch(str + i, step);
320  sprintf(hex, "%x", ch.first_uni());
321  }
322  result += hex;
323  result += " ";
324  }
325  result += "]";
326  return result;
327 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143
int step(const char *str) const
Definition: unicharset.cpp:211

◆ default_sid()

int tesseract::UNICHARSET::default_sid ( ) const
inline

Definition at line 947 of file unicharset.h.

947  {
948  return default_sid_;
949  }

◆ delete_pointers_in_unichars()

void tesseract::UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 316 of file unicharset.h.

316  {
317  for (auto &unichar : unichars) {
318  delete unichar.properties.fragment;
319  unichar.properties.fragment = nullptr;
320  }
321  }

◆ encodable_string()

bool tesseract::UNICHARSET::encodable_string ( const char *  str,
unsigned *  first_bad_position 
) const

Definition at line 224 of file unicharset.cpp.

225  {
226  std::vector<UNICHAR_ID> encoding;
227  return encode_string(str, true, &encoding, nullptr, first_bad_position);
228 }
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ encode_string()

bool tesseract::UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
std::vector< UNICHAR_ID > *  encoding,
std::vector< char > *  lengths,
unsigned *  encoded_length 
) const

Definition at line 239 of file unicharset.cpp.

242  {
243  std::vector<UNICHAR_ID> working_encoding;
244  std::vector<char> working_lengths;
245  std::vector<char> best_lengths;
246  encoding->clear(); // Just in case str is empty.
247  auto str_length = strlen(str);
248  unsigned str_pos = 0;
249  bool perfect = true;
250  while (str_pos < str_length) {
251  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
252  &str_pos, encoding, &best_lengths);
253  if (str_pos < str_length) {
254  // This is a non-match. Skip one utf-8 character.
255  perfect = false;
256  if (give_up_on_failure) {
257  break;
258  }
259  int step = UNICHAR::utf8_step(str + str_pos);
260  if (step == 0) {
261  step = 1;
262  }
263  encoding->push_back(INVALID_UNICHAR_ID);
264  best_lengths.push_back(step);
265  str_pos += step;
266  working_encoding = *encoding;
267  working_lengths = best_lengths;
268  }
269  }
270  if (lengths != nullptr) {
271  *lengths = best_lengths;
272  }
273  if (encoded_length != nullptr) {
274  *encoded_length = str_pos;
275  }
276  return perfect;
277 }

◆ eq()

bool tesseract::UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 713 of file unicharset.cpp.

714  {
715  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
716 }

◆ ExpandRangesFromOther()

void tesseract::UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 425 of file unicharset.cpp.

425  {
426  for (unsigned ch = 0; ch < unichars.size(); ++ch) {
427  const char *utf8 = id_to_unichar(ch);
428  UNICHAR_PROPERTIES properties;
429  if (src.GetStrProperties(utf8, &properties)) {
430  // Expand just the ranges from properties.
431  unichars[ch].properties.ExpandRangesFrom(properties);
432  }
433  }
434 }

◆ get_advance_stats()

void tesseract::UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const
inline

Definition at line 647 of file unicharset.h.

648  {
649  if (INVALID_UNICHAR_ID == unichar_id) {
650  *advance = *advance_sd = 0;
651  return;
652  }
653  ASSERT_HOST(contains_unichar_id(unichar_id));
654  *advance = unichars[unichar_id].properties.advance;
655  *advance_sd = unichars[unichar_id].properties.advance_sd;
656  }
#define ASSERT_HOST(x)
Definition: errcode.h:59
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303

◆ get_bearing_stats()

void tesseract::UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const
inline

Definition at line 630 of file unicharset.h.

631  {
632  if (INVALID_UNICHAR_ID == unichar_id) {
633  *bearing = *bearing_sd = 0.0f;
634  return;
635  }
636  ASSERT_HOST(contains_unichar_id(unichar_id));
637  *bearing = unichars[unichar_id].properties.bearing;
638  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
639  }

◆ get_chartype() [1/2]

char tesseract::UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 808 of file unicharset.h.

808  {
809  return get_chartype(unichar_to_id(unichar_repr));
810  }
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:635

◆ get_chartype() [2/2]

char tesseract::UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 635 of file unicharset.cpp.

635  {
636  if (this->get_isupper(id)) {
637  return 'A';
638  }
639  if (this->get_islower(id)) {
640  return 'a';
641  }
642  if (this->get_isalpha(id)) {
643  return 'x';
644  }
645  if (this->get_isdigit(id)) {
646  return '0';
647  }
648  if (this->get_ispunctuation(id)) {
649  return 'p';
650  }
651  return 0;
652 }

◆ get_direction()

Direction tesseract::UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 713 of file unicharset.h.

713  {
714  if (INVALID_UNICHAR_ID == unichar_id) {
715  return UNICHARSET::U_OTHER_NEUTRAL;
716  }
717  ASSERT_HOST(contains_unichar_id(unichar_id));
718  return unichars[unichar_id].properties.direction;
719  }

◆ get_enabled()

bool tesseract::UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 912 of file unicharset.h.

912  {
913  ASSERT_HOST(contains_unichar_id(unichar_id));
914  return unichars[unichar_id].properties.enabled;
915  }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT* tesseract::UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 821 of file unicharset.h.

821  {
822  if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
823  !ids.contains(unichar_repr, false)) {
824  return nullptr;
825  }
826  return get_fragment(unichar_to_id(unichar_repr));
827  }

◆ get_fragment() [2/2]

const CHAR_FRAGMENT* tesseract::UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 769 of file unicharset.h.

769  {
770  if (INVALID_UNICHAR_ID == unichar_id) {
771  return nullptr;
772  }
773  ASSERT_HOST(contains_unichar_id(unichar_id));
774  return unichars[unichar_id].properties.fragment;
775  }

◆ get_isalpha() [1/3]

bool tesseract::UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 778 of file unicharset.h.

778  {
779  return get_isalpha(unichar_to_id(unichar_repr));
780  }

◆ get_isalpha() [2/3]

bool tesseract::UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 831 of file unicharset.h.

831  {
832  return get_isalpha(unichar_to_id(unichar_repr, length));
833  }

◆ get_isalpha() [3/3]

bool tesseract::UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 497 of file unicharset.h.

497  {
498  if (INVALID_UNICHAR_ID == unichar_id) {
499  return false;
500  }
501  ASSERT_HOST(contains_unichar_id(unichar_id));
502  return unichars[unichar_id].properties.isalpha;
503  }

◆ get_isdigit() [1/3]

bool tesseract::UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 793 of file unicharset.h.

793  {
794  return get_isdigit(unichar_to_id(unichar_repr));
795  }

◆ get_isdigit() [2/3]

bool tesseract::UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 849 of file unicharset.h.

849  {
850  return get_isdigit(unichar_to_id(unichar_repr, length));
851  }

◆ get_isdigit() [3/3]

bool tesseract::UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 524 of file unicharset.h.

524  {
525  if (INVALID_UNICHAR_ID == unichar_id) {
526  return false;
527  }
528  ASSERT_HOST(contains_unichar_id(unichar_id));
529  return unichars[unichar_id].properties.isdigit;
530  }

◆ get_islower() [1/3]

bool tesseract::UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 783 of file unicharset.h.

783  {
784  return get_islower(unichar_to_id(unichar_repr));
785  }

◆ get_islower() [2/3]

bool tesseract::UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 837 of file unicharset.h.

837  {
838  return get_islower(unichar_to_id(unichar_repr, length));
839  }

◆ get_islower() [3/3]

bool tesseract::UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 506 of file unicharset.h.

506  {
507  if (INVALID_UNICHAR_ID == unichar_id) {
508  return false;
509  }
510  ASSERT_HOST(contains_unichar_id(unichar_id));
511  return unichars[unichar_id].properties.islower;
512  }

◆ get_isngram()

bool tesseract::UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 542 of file unicharset.h.

542  {
543  if (INVALID_UNICHAR_ID == unichar_id) {
544  return false;
545  }
546  ASSERT_HOST(contains_unichar_id(unichar_id));
547  return unichars[unichar_id].properties.isngram;
548  }

◆ get_isprivate()

bool tesseract::UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 379 of file unicharset.cpp.

379  {
380  UNICHAR uc(id_to_unichar(unichar_id), -1);
381  int uni = uc.first_uni();
382  return (uni >= 0xE000 && uni <= 0xF8FF);
383 }

◆ get_ispunctuation() [1/3]

bool tesseract::UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 798 of file unicharset.h.

798  {
799  return get_ispunctuation(unichar_to_id(unichar_repr));
800  }

◆ get_ispunctuation() [2/3]

bool tesseract::UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 855 of file unicharset.h.

855  {
856  return get_ispunctuation(unichar_to_id(unichar_repr, length));
857  }

◆ get_ispunctuation() [3/3]

bool tesseract::UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 533 of file unicharset.h.

533  {
534  if (INVALID_UNICHAR_ID == unichar_id) {
535  return false;
536  }
537  ASSERT_HOST(contains_unichar_id(unichar_id));
538  return unichars[unichar_id].properties.ispunctuation;
539  }

◆ get_isupper() [1/3]

bool tesseract::UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 788 of file unicharset.h.

788  {
789  return get_isupper(unichar_to_id(unichar_repr));
790  }

◆ get_isupper() [2/3]

bool tesseract::UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 843 of file unicharset.h.

843  {
844  return get_isupper(unichar_to_id(unichar_repr, length));
845  }

◆ get_isupper() [3/3]

bool tesseract::UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 515 of file unicharset.h.

515  {
516  if (INVALID_UNICHAR_ID == unichar_id) {
517  return false;
518  }
519  ASSERT_HOST(contains_unichar_id(unichar_id));
520  return unichars[unichar_id].properties.isupper;
521  }

◆ get_mirror()

UNICHAR_ID tesseract::UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 722 of file unicharset.h.

722  {
723  if (INVALID_UNICHAR_ID == unichar_id) {
724  return INVALID_UNICHAR_ID;
725  }
726  ASSERT_HOST(contains_unichar_id(unichar_id));
727  return unichars[unichar_id].properties.mirror;
728  }

◆ get_normed_unichar()

const char* tesseract::UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 860 of file unicharset.h.

860  {
861  if (unichar_id == UNICHAR_SPACE) {
862  return " ";
863  }
864  return unichars[unichar_id].properties.normed.c_str();
865  }
@ UNICHAR_SPACE
Definition: unicharset.h:36

◆ get_other_case()

UNICHAR_ID tesseract::UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 704 of file unicharset.h.

704  {
705  if (INVALID_UNICHAR_ID == unichar_id) {
706  return INVALID_UNICHAR_ID;
707  }
708  ASSERT_HOST(contains_unichar_id(unichar_id));
709  return unichars[unichar_id].properties.other_case;
710  }

◆ get_properties() [1/2]

unsigned int tesseract::UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 804 of file unicharset.h.

804  {
805  return get_properties(unichar_to_id(unichar_repr));
806  }
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:615

◆ get_properties() [2/2]

unsigned int tesseract::UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 615 of file unicharset.cpp.

615  {
616  unsigned int properties = 0;
617  if (this->get_isalpha(id)) {
618  properties |= ISALPHA_MASK;
619  }
620  if (this->get_islower(id)) {
621  properties |= ISLOWER_MASK;
622  }
623  if (this->get_isupper(id)) {
624  properties |= ISUPPER_MASK;
625  }
626  if (this->get_isdigit(id)) {
627  properties |= ISDIGIT_MASK;
628  }
629  if (this->get_ispunctuation(id)) {
630  properties |= ISPUNCTUATION_MASK;
631  }
632  return properties;
633 }

◆ get_script() [1/3]

int tesseract::UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 815 of file unicharset.h.

815  {
816  return get_script(unichar_to_id(unichar_repr));
817  }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682

◆ get_script() [2/3]

int tesseract::UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 877 of file unicharset.h.

877  {
878  return get_script(unichar_to_id(unichar_repr, length));
879  }

◆ get_script() [3/3]

int tesseract::UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 682 of file unicharset.h.

682  {
683  if (INVALID_UNICHAR_ID == unichar_id) {
684  return null_sid_;
685  }
686  ASSERT_HOST(contains_unichar_id(unichar_id));
687  return unichars[unichar_id].properties.script_id;
688  }

◆ get_script_from_script_id()

const char* tesseract::UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 887 of file unicharset.h.

887  {
888  if (id >= script_table_size_used || id < 0) {
889  return null_script;
890  }
891  return script_table[id];
892  }

◆ get_script_id_from_name()

int tesseract::UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1146 of file unicharset.cpp.

1146  {
1147  for (int i = 0; i < script_table_size_used; ++i) {
1148  if (strcmp(script_name, script_table[i]) == 0) {
1149  return i;
1150  }
1151  }
1152  return 0; // 0 is always the null_script
1153 }

◆ get_script_table_size()

int tesseract::UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 882 of file unicharset.h.

882  {
883  return script_table_size_used;
884  }

◆ get_top_bottom()

void tesseract::UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 586 of file unicharset.h.

587  {
588  if (INVALID_UNICHAR_ID == unichar_id) {
589  *min_bottom = *min_top = 0;
590  *max_bottom = *max_top = 256; // kBlnCellHeight
591  return;
592  }
593  ASSERT_HOST(contains_unichar_id(unichar_id));
594  *min_bottom = unichars[unichar_id].properties.min_bottom;
595  *max_bottom = unichars[unichar_id].properties.max_bottom;
596  *min_top = unichars[unichar_id].properties.min_top;
597  *max_top = unichars[unichar_id].properties.max_top;
598  }

◆ get_width_stats()

void tesseract::UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const
inline

Definition at line 612 of file unicharset.h.

613  {
614  if (INVALID_UNICHAR_ID == unichar_id) {
615  *width = 0.0f;
616  *width_sd = 0.0f;
617  ;
618  return;
619  }
620  ASSERT_HOST(contains_unichar_id(unichar_id));
621  *width = unichars[unichar_id].properties.width;
622  *width_sd = unichars[unichar_id].properties.width_sd;
623  }

◆ greek_sid()

int tesseract::UNICHARSET::greek_sid ( ) const
inline

Definition at line 929 of file unicharset.h.

929  {
930  return greek_sid_;
931  }

◆ han_sid()

int tesseract::UNICHARSET::han_sid ( ) const
inline

Definition at line 932 of file unicharset.h.

932  {
933  return han_sid_;
934  }

◆ hangul_sid()

int tesseract::UNICHARSET::hangul_sid ( ) const
inline

Definition at line 944 of file unicharset.h.

944  {
945  return hangul_sid_;
946  }

◆ has_special_codes()

bool tesseract::UNICHARSET::has_special_codes ( ) const
inline

Definition at line 757 of file unicharset.h.

757  {
758  return get_fragment(UNICHAR_BROKEN) != nullptr &&
759  strcmp(id_to_unichar(UNICHAR_BROKEN),
760  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
761  }
@ UNICHAR_BROKEN
Definition: unicharset.h:38

◆ hiragana_sid()

int tesseract::UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 935 of file unicharset.h.

935  {
936  return hiragana_sid_;
937  }

◆ id_to_unichar()

const char * tesseract::UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 279 of file unicharset.cpp.

279  {
280  if (id == INVALID_UNICHAR_ID) {
281  return INVALID_UNICHAR;
282  }
283  ASSERT_HOST(static_cast<unsigned>(id) < this->size());
284  return unichars[id].representation;
285 }
size_t size() const
Definition: unicharset.h:355

◆ id_to_unichar_ext()

const char * tesseract::UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 287 of file unicharset.cpp.

287  {
288  if (id == INVALID_UNICHAR_ID) {
289  return INVALID_UNICHAR;
290  }
291  ASSERT_HOST(static_cast<unsigned>(id) < this->size());
292  // Resolve from the kCustomLigatures table if this is a private encoding.
293  if (get_isprivate(id)) {
294  const char *ch = id_to_unichar(id);
295  for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
296  if (!strcmp(ch, kCustomLigatures[i][1])) {
297  return kCustomLigatures[i][0];
298  }
299  }
300  }
301  // Otherwise return the stored representation.
302  return unichars[id].representation;
303 }
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:379
static const char * kCustomLigatures[][2]
Definition: unicharset.h:169

◆ is_null_script()

bool tesseract::UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 902 of file unicharset.h.

902  {
903  return script == null_script;
904  }

◆ IsSpaceDelimited()

bool tesseract::UNICHARSET::IsSpaceDelimited ( UNICHAR_ID  unichar_id) const
inline

Definition at line 669 of file unicharset.h.

669  {
670  if (INVALID_UNICHAR_ID == unichar_id) {
671  return true;
672  }
673  int script_id = get_script(unichar_id);
674  return script_id != han_sid_ && script_id != thai_sid_ &&
675  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
676  script_id != katakana_sid_;
677  }

◆ katakana_sid()

int tesseract::UNICHARSET::katakana_sid ( ) const
inline

Definition at line 938 of file unicharset.h.

938  {
939  return katakana_sid_;
940  }

◆ latin_sid()

int tesseract::UNICHARSET::latin_sid ( ) const
inline

Definition at line 923 of file unicharset.h.

923  {
924  return latin_sid_;
925  }

◆ load_from_file() [1/5]

bool tesseract::UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 401 of file unicharset.h.

401  {
402  return load_from_file(filename, false);
403  }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391

◆ load_from_file() [2/5]

bool tesseract::UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 391 of file unicharset.h.

391  {
392  FILE *file = fopen(filename, "rb");
393  if (file == nullptr) {
394  return false;
395  }
396  bool result = load_from_file(file, skip_fragments);
397  fclose(file);
398  return result;
399  }

◆ load_from_file() [3/5]

bool tesseract::UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 408 of file unicharset.h.

408  {
409  return load_from_file(file, false);
410  }

◆ load_from_file() [4/5]

bool tesseract::UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 767 of file unicharset.cpp.

767  {
768  LocalFilePointer lfp(file);
769  using namespace std::placeholders; // for _1, _2
770  std::function<char *(char *, int)> fgets_cb =
771  std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
772  bool success = load_via_fgets(fgets_cb, skip_fragments);
773  return success;
774 }
char * fgets(char *dst, int size)
Definition: unicharset.cpp:759

◆ load_from_file() [5/5]

bool tesseract::UNICHARSET::load_from_file ( tesseract::TFile *  file,
bool  skip_fragments 
)

Definition at line 776 of file unicharset.cpp.

776  {
777  using namespace std::placeholders; // for _1, _2
778  std::function<char *(char *, int)> fgets_cb =
779  std::bind(&tesseract::TFile::FGets, file, _1, _2);
780  bool success = load_via_fgets(fgets_cb, skip_fragments);
781  return success;
782 }
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195

◆ major_right_to_left()

bool tesseract::UNICHARSET::major_right_to_left ( ) const

Definition at line 983 of file unicharset.cpp.

983  {
984  int ltr_count = 0;
985  int rtl_count = 0;
986  for (unsigned id = 0; id < unichars.size(); ++id) {
987  int dir = get_direction(id);
988  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {
989  ltr_count++;
990  }
991  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
992  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
993  dir == UNICHARSET::U_ARABIC_NUMBER) {
994  rtl_count++;
995  }
996  }
997  return rtl_count > ltr_count;
998 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713

◆ normed_ids()

const std::vector<UNICHAR_ID>& tesseract::UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const
inline

Definition at line 869 of file unicharset.h.

869  {
870  return unichars[unichar_id].properties.normed_ids;
871  }

◆ null_sid()

int tesseract::UNICHARSET::null_sid ( ) const
inline

Definition at line 917 of file unicharset.h.

917  {
918  return null_sid_;
919  }

◆ PartialSetPropertiesFromOther()

void tesseract::UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET &  src 
)

Definition at line 395 of file unicharset.cpp.

396  {
397  for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
398  const char *utf8 = id_to_unichar(ch);
399  UNICHAR_PROPERTIES properties;
400  if (src.GetStrProperties(utf8, &properties)) {
401  // Setup the script_id, other_case, and mirror properly.
402  const char *script = src.get_script_from_script_id(properties.script_id);
403  properties.script_id = add_script(script);
404  const char *other_case = src.id_to_unichar(properties.other_case);
405  if (contains_unichar(other_case)) {
406  properties.other_case = unichar_to_id(other_case);
407  } else {
408  properties.other_case = ch;
409  }
410  const char *mirror_str = src.id_to_unichar(properties.mirror);
411  if (contains_unichar(mirror_str)) {
412  properties.mirror = unichar_to_id(mirror_str);
413  } else {
414  properties.mirror = ch;
415  }
416  unichars[ch].properties.CopyFrom(properties);
417  set_normed_ids(ch);
418  }
419  }
420 }
int add_script(const char *script)
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:364

◆ post_load_setup()

void tesseract::UNICHARSET::post_load_setup ( )

Definition at line 912 of file unicharset.cpp.

912  {
913  // Number of alpha chars with the case property minus those without,
914  // in order to determine that half the alpha chars have case.
915  int net_case_alphas = 0;
916  int x_height_alphas = 0;
917  int cap_height_alphas = 0;
918  top_bottom_set_ = false;
919  for (unsigned id = 0; id < unichars.size(); ++id) {
920  int min_bottom = 0;
921  int max_bottom = UINT8_MAX;
922  int min_top = 0;
923  int max_top = UINT8_MAX;
924  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
925  if (min_top > 0) {
926  top_bottom_set_ = true;
927  }
928  if (get_isalpha(id)) {
929  if (get_islower(id) || get_isupper(id)) {
930  ++net_case_alphas;
931  } else {
932  --net_case_alphas;
933  }
934  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
935  ++x_height_alphas;
936  } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
937  ++cap_height_alphas;
938  }
939  }
940  set_normed_ids(id);
941  }
942 
943  script_has_upper_lower_ = net_case_alphas > 0;
944  script_has_xheight_ =
945  script_has_upper_lower_ ||
946  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
947  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
948 
949  null_sid_ = get_script_id_from_name(null_script);
950  ASSERT_HOST(null_sid_ == 0);
951  common_sid_ = get_script_id_from_name("Common");
952  latin_sid_ = get_script_id_from_name("Latin");
953  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
954  greek_sid_ = get_script_id_from_name("Greek");
955  han_sid_ = get_script_id_from_name("Han");
956  hiragana_sid_ = get_script_id_from_name("Hiragana");
957  katakana_sid_ = get_script_id_from_name("Katakana");
958  thai_sid_ = get_script_id_from_name("Thai");
959  hangul_sid_ = get_script_id_from_name("Hangul");
960 
961  // Compute default script. Use the highest-counting alpha script, that is
962  // not the common script, as that still contains some "alphas".
963  int *script_counts = new int[script_table_size_used];
964  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
965  for (unsigned id = 0; id < unichars.size(); ++id) {
966  if (get_isalpha(id)) {
967  ++script_counts[get_script(id)];
968  }
969  }
970  default_sid_ = 0;
971  for (int s = 1; s < script_table_size_used; ++s) {
972  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
973  default_sid_ = s;
974  }
975  }
976  delete[] script_counts;
977 }
const double kMinCapHeightFraction
Definition: unicharset.cpp:58
const double kMinXHeightFraction
Definition: unicharset.cpp:57
int get_script_id_from_name(const char *script_name) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

◆ PropertiesIncomplete()

bool tesseract::UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const
inline

Definition at line 663 of file unicharset.h.

663  {
664  return unichars[unichar_id].properties.AnyRangeEmpty();
665  }

◆ save_to_file() [1/3]

bool tesseract::UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 361 of file unicharset.h.

361  {
362  FILE *file = fopen(filename, "w+b");
363  if (file == nullptr) {
364  return false;
365  }
366  bool result = save_to_file(file);
367  fclose(file);
368  return result;
369  }
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361

◆ save_to_file() [2/3]

bool tesseract::UNICHARSET::save_to_file ( FILE *  file) const
inline

Definition at line 373 of file unicharset.h.

373  {
374  std::string str;
375  return save_to_string(str) &&
376  tesseract::Serialize(file, &str[0], str.length());
377  }
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
bool save_to_string(std::string &str) const
Definition: unicharset.cpp:718

◆ save_to_file() [3/3]

bool tesseract::UNICHARSET::save_to_file ( tesseract::TFile *  file) const
inline

Definition at line 379 of file unicharset.h.

379  {
380  std::string str;
381  return save_to_string(str) && file->Serialize(&str[0], str.length());
382  }

◆ save_to_string()

bool tesseract::UNICHARSET::save_to_string ( std::string &  str) const

Definition at line 718 of file unicharset.cpp.

718  {
719  const int kFileBufSize = 1024;
720  char buffer[kFileBufSize + 1];
721  snprintf(buffer, kFileBufSize, "%zu\n", this->size());
722  str = buffer;
723  for (unsigned id = 0; id < this->size(); ++id) {
724  int min_bottom, max_bottom, min_top, max_top;
725  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
726  float width, width_sd;
727  get_width_stats(id, &width, &width_sd);
728  float bearing, bearing_sd;
729  get_bearing_stats(id, &bearing, &bearing_sd);
730  float advance, advance_sd;
731  get_advance_stats(id, &advance, &advance_sd);
732  unsigned int properties = this->get_properties(id);
733  if (strcmp(this->id_to_unichar(id), " ") == 0) {
734  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
735  this->get_script_from_script_id(this->get_script(id)),
736  this->get_other_case(id));
737  str += buffer;
738  } else {
739  std::ostringstream stream;
740  stream.imbue(std::locale::classic());
741  stream << this->id_to_unichar(id) << ' ' << properties << ' '
742  << min_bottom << ',' << max_bottom << ',' << min_top << ','
743  << max_top << ',' << width << ',' << width_sd << ',' << bearing
744  << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '
745  << this->get_script_from_script_id(this->get_script(id)) << ' '
746  << this->get_other_case(id) << ' ' << this->get_direction(id)
747  << ' ' << this->get_mirror(id) << ' '
748  << this->get_normed_unichar(id) << "\t# "
749  << this->debug_str(id).c_str() << '\n';
750  str += stream.str().c_str();
751  }
752  }
753  return true;
754 }
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:887
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:647
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:630
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:612

◆ script_has_upper_lower()

bool tesseract::UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 952 of file unicharset.h.

952  {
953  return script_has_upper_lower_;
954  }

◆ script_has_xheight()

bool tesseract::UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 959 of file unicharset.h.

959  {
960  return script_has_xheight_;
961  }

◆ set_advance_stats()

void tesseract::UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
)
inline

Definition at line 657 of file unicharset.h.

658  {
659  unichars[unichar_id].properties.advance = advance;
660  unichars[unichar_id].properties.advance_sd = advance_sd;
661  }

◆ set_bearing_stats()

void tesseract::UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
)
inline

Definition at line 640 of file unicharset.h.

641  {
642  unichars[unichar_id].properties.bearing = bearing;
643  unichars[unichar_id].properties.bearing_sd = bearing_sd;
644  }

◆ set_black_and_whitelist()

void tesseract::UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 1004 of file unicharset.cpp.

1006  {
1007  bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1008  // Set everything to default
1009  for (auto &uc : unichars) {
1010  uc.properties.enabled = def_enabled;
1011  }
1012  if (!def_enabled) {
1013  // Enable the whitelist.
1014  std::vector<UNICHAR_ID> encoding;
1015  encode_string(whitelist, false, &encoding, nullptr, nullptr);
1016  for (auto it : encoding) {
1017  if (it != INVALID_UNICHAR_ID) {
1018  unichars[it].properties.enabled = true;
1019  }
1020  }
1021  }
1022  if (blacklist != nullptr && blacklist[0] != '\0') {
1023  // Disable the blacklist.
1024  std::vector<UNICHAR_ID> encoding;
1025  encode_string(blacklist, false, &encoding, nullptr, nullptr);
1026  for (auto it : encoding) {
1027  if (it != INVALID_UNICHAR_ID) {
1028  unichars[it].properties.enabled = false;
1029  }
1030  }
1031  }
1032  if (unblacklist != nullptr && unblacklist[0] != '\0') {
1033  // Re-enable the unblacklist.
1034  std::vector<UNICHAR_ID> encoding;
1035  encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1036  for (auto it : encoding) {
1037  if (it != INVALID_UNICHAR_ID) {
1038  unichars[it].properties.enabled = true;
1039  }
1040  }
1041  }
1042 }

◆ set_direction()

void tesseract::UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 478 of file unicharset.h.

478  {
479  unichars[unichar_id].properties.direction = value;
480  }

◆ set_isalpha()

void tesseract::UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 437 of file unicharset.h.

437  {
438  unichars[unichar_id].properties.isalpha = value;
439  }

◆ set_isdigit()

void tesseract::UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 452 of file unicharset.h.

452  {
453  unichars[unichar_id].properties.isdigit = value;
454  }

◆ set_islower()

void tesseract::UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 442 of file unicharset.h.

442  {
443  unichars[unichar_id].properties.islower = value;
444  }

◆ set_isngram()

void tesseract::UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 462 of file unicharset.h.

462  {
463  unichars[unichar_id].properties.isngram = value;
464  }

◆ set_ispunctuation()

void tesseract::UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 457 of file unicharset.h.

457  {
458  unichars[unichar_id].properties.ispunctuation = value;
459  }

◆ set_isupper()

void tesseract::UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 447 of file unicharset.h.

447  {
448  unichars[unichar_id].properties.isupper = value;
449  }

◆ set_mirror()

void tesseract::UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 483 of file unicharset.h.

483  {
484  unichars[unichar_id].properties.mirror = mirror;
485  }

◆ set_normed()

void tesseract::UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 488 of file unicharset.h.

488  {
489  unichars[unichar_id].properties.normed = normed;
490  unichars[unichar_id].properties.normed_ids.clear();
491  }

◆ set_normed_ids()

void tesseract::UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 364 of file unicharset.cpp.

364  {
365  unichars[unichar_id].properties.normed_ids.clear();
366  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
367  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
368  } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
369  true, &unichars[unichar_id].properties.normed_ids,
370  nullptr, nullptr)) {
371  unichars[unichar_id].properties.normed_ids.clear();
372  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
373  }
374 }

◆ set_other_case()

void tesseract::UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 473 of file unicharset.h.

473  {
474  unichars[unichar_id].properties.other_case = other_case;
475  }

◆ set_ranges_empty()

void tesseract::UNICHARSET::set_ranges_empty ( )

Definition at line 386 of file unicharset.cpp.

386  {
387  for (auto &uc : unichars) {
388  uc.properties.SetRangesEmpty();
389  }
390 }

◆ set_script()

void tesseract::UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 468 of file unicharset.h.

468  {
469  unichars[unichar_id].properties.script_id = add_script(value);
470  }

◆ set_top_bottom()

void tesseract::UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 599 of file unicharset.h.

600  {
601  unichars[unichar_id].properties.min_bottom =
602  ClipToRange<int>(min_bottom, 0, UINT8_MAX);
603  unichars[unichar_id].properties.max_bottom =
604  ClipToRange<int>(max_bottom, 0, UINT8_MAX);
605  unichars[unichar_id].properties.min_top =
606  ClipToRange<int>(min_top, 0, UINT8_MAX);
607  unichars[unichar_id].properties.max_top =
608  ClipToRange<int>(max_top, 0, UINT8_MAX);
609  }

◆ set_width_stats()

void tesseract::UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
)
inline

Definition at line 624 of file unicharset.h.

624  {
625  unichars[unichar_id].properties.width = width;
626  unichars[unichar_id].properties.width_sd = width_sd;
627  }

◆ SetPropertiesFromOther()

void tesseract::UNICHARSET::SetPropertiesFromOther ( const UNICHARSET &  src)
inline

Definition at line 563 of file unicharset.h.

563  {
564  PartialSetPropertiesFromOther(0, src);
565  }

◆ size()

size_t tesseract::UNICHARSET::size ( ) const
inline

Definition at line 355 of file unicharset.h.

355  {
356  return unichars.size();
357  }

◆ SizesDistinct()

bool tesseract::UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 476 of file unicharset.cpp.

476  {
477  int overlap = std::min(unichars[id1].properties.max_top,
478  unichars[id2].properties.max_top) -
479  std::max(unichars[id1].properties.min_top,
480  unichars[id2].properties.min_top);
481  return overlap <= 0;
482 }

◆ step()

int tesseract::UNICHARSET::step ( const char *  str) const

Definition at line 211 of file unicharset.cpp.

211  {
212  std::vector<UNICHAR_ID> encoding;
213  std::vector<char> lengths;
214  encode_string(str, true, &encoding, &lengths, nullptr);
215  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) {
216  return 0;
217  }
218  return lengths[0];
219 }

◆ thai_sid()

int tesseract::UNICHARSET::thai_sid ( ) const
inline

Definition at line 941 of file unicharset.h.

941  {
942  return thai_sid_;
943  }

◆ to_lower()

UNICHAR_ID tesseract::UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 731 of file unicharset.h.

731  {
732  if (INVALID_UNICHAR_ID == unichar_id) {
733  return INVALID_UNICHAR_ID;
734  }
735  ASSERT_HOST(contains_unichar_id(unichar_id));
736  if (unichars[unichar_id].properties.islower) {
737  return unichar_id;
738  }
739  return unichars[unichar_id].properties.other_case;
740  }

◆ to_upper()

UNICHAR_ID tesseract::UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 743 of file unicharset.h.

743  {
744  if (INVALID_UNICHAR_ID == unichar_id) {
745  return INVALID_UNICHAR_ID;
746  }
747  ASSERT_HOST(contains_unichar_id(unichar_id));
748  if (unichars[unichar_id].properties.isupper) {
749  return unichar_id;
750  }
751  return unichars[unichar_id].properties.other_case;
752  }

◆ top_bottom_useful()

bool tesseract::UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 555 of file unicharset.h.

555  {
556  return top_bottom_set_;
557  }

◆ unichar_insert() [1/2]

void tesseract::UNICHARSET::unichar_insert ( const char *const  unichar_repr)
inline

Definition at line 283 of file unicharset.h.

283  {
284  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
285  }

◆ unichar_insert() [2/2]

void tesseract::UNICHARSET::unichar_insert ( const char *const  unichar_repr,
OldUncleanUnichars  old_style 
)

Definition at line 654 of file unicharset.cpp.

655  {
656  if (old_style == OldUncleanUnichars::kTrue) {
657  old_style_included_ = true;
658  }
659  std::string cleaned =
660  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
661  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
662  const char *str = cleaned.c_str();
663  std::vector<int> encoding;
664  if (!old_style_included_ &&
665  encode_string(str, true, &encoding, nullptr, nullptr)) {
666  return;
667  }
668  unichars.emplace_back();
669  auto &u = unichars.back();
670  int index = 0;
671  do {
672  if (index >= UNICHAR_LEN) {
673  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
674  unichar_repr);
675  return;
676  }
677  u.representation[index++] = *str++;
678  } while (*str != '\0');
679  u.representation[index] = '\0';
680  this->set_script(unichars.size() - 1, null_script);
681  // If the given unichar_repr represents a fragmented character, set
682  // fragment property to a pointer to CHAR_FRAGMENT class instance with
683  // information parsed from the unichar representation. Use the script
684  // of the base unichar for the fragmented character if possible.
685  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
686  u.properties.fragment = frag;
687  if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
688  u.properties.script_id = this->get_script(frag->get_unichar());
689  }
690  u.properties.enabled = true;
691  ids.insert(u.representation, unichars.size() - 1);
692  }
693 }
#define UNICHAR_LEN
Definition: unichar.h:33
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468

◆ unichar_insert_backwards_compatible()

void tesseract::UNICHARSET::unichar_insert_backwards_compatible ( const char *const  unichar_repr)
inline

Definition at line 288 of file unicharset.h.

288  {
289  std::string cleaned = CleanupString(unichar_repr);
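290  if (cleaned != unichar_repr) {
291  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
292  } else {
293  auto old_size = size();
294  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
295  if (size() == old_size) {
296  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
297  }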
298  }
299  }

◆ unichar_to_id() [1/2]

UNICHAR_ID tesseract::UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 186 of file unicharset.cpp.

186  {
187  std::string cleaned =
188  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
189  return ids.contains(cleaned.data(), cleaned.size())
190  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
191  : INVALID_UNICHAR_ID;
192 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:36

◆ unichar_to_id() [2/2]

UNICHAR_ID tesseract::UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 194 of file unicharset.cpp.

195  {
196  assert(length > 0 && length <= UNICHAR_LEN);
197  std::string cleaned(unichar_repr, length);
198  if (!old_style_included_) {
199  cleaned = CleanupString(unichar_repr, length);
200  }
201  return ids.contains(cleaned.data(), cleaned.size())
202  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
203  : INVALID_UNICHAR_ID;
204 }

Member Data Documentation

◆ kCustomLigatures

const char * tesseract::UNICHARSET::kCustomLigatures
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{nullptr, nullptr}}

Definition at line 169 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * tesseract::UNICHARSET::kSpecialUnicharCodes
static
Initial value:
= {
" ", "Joined", "|Broken|0|1"}

Definition at line 172 of file unicharset.h.


The documentation for this class was generated from the following files: