tesseract  5.0.0
unicharset.cpp
Go to the documentation of this file.
1 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "unicharset.h"
20 
21 #include "params.h"
22 
23 #include <tesseract/unichar.h>
24 #include "serialis.h"
25 
26 #include <algorithm>
27 #include <cassert>
28 #include <cstdio>
29 #include <cstring>
30 #include <iomanip> // for std::setw
31 #include <locale> // for std::locale::classic
32 #include <sstream> // for std::istringstream, std::ostringstream
33 
34 namespace tesseract {
35 
// Character that delimits the fields of a character fragment's text form.
constexpr char kSeparator = '|';
// Character that flags a 'natural' character fragment in its text form.
constexpr char kNaturalFlag = 'n';

// One bit per boolean character property, used when the properties are
// packed into a single integer.
constexpr int ISALPHA_MASK = 1 << 0;
constexpr int ISLOWER_MASK = 1 << 1;
constexpr int ISUPPER_MASK = 1 << 2;
constexpr int ISDIGIT_MASK = 1 << 3;
constexpr int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate threshold that separates cap-height from x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
constexpr int kMeanlineThreshold = 220;
// With C = number of alpha chars whose tops all exceed kMeanlineThreshold and
// X = number of alpha chars whose tops are all below kMeanlineThreshold, the
// unicharset "has x-height" if X > C * kMinXHeightFraction and
// C > X * kMinCapHeightFraction, or if more than half of the alpha
// characters have upper or lower case.
constexpr double kMinXHeightFraction = 0.25;
constexpr double kMinCapHeightFraction = 0.05;
59 
// Static member of UNICHARSET.
// Table of custom ligatures. Each row holds the plain character sequence
// first and the string for the code point that stands for it second.
// The table ends with a {nullptr, nullptr} row, which acts as the terminator
// for code that walks the table.
const char *UNICHARSET::kCustomLigatures[][2] = {
    {"ct", "\uE003"}, // c + t -> U+E003
    {"ſh", "\uE006"}, // long-s + h -> U+E006
    {"ſi", "\uE007"}, // long-s + i -> U+E007
    {"ſl", "\uE008"}, // long-s + l -> U+E008
    {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
    {nullptr, nullptr}};
68 
// List of mappings to make when ingesting strings from the outside.
// The substitutions clean up text that should exist for rendering of
// synthetic data, but not in the recognition set.
// Each row holds the string to look for first and its replacement second
// (an empty replacement deletes the match). The table ends with a
// {nullptr, nullptr} terminator row.
const char *UNICHARSET::kCleanupMaps[][2] = {
    {"\u0640", ""},   // TATWEEL is deleted.
    {"\ufb01", "fi"}, // fi ligature->fi pair.
    {"\ufb02", "fl"}, // fl ligature->fl pair.
    {nullptr, nullptr}};
77 
78 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
80  " ", "Joined", "|Broken|0|1"};
81 
82 const char *UNICHARSET::null_script = "NULL";
83 
84 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
85  Init();
86 }
87 
88 // Initialize all properties to sensible default values.
89 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
90  isalpha = false;
91  islower = false;
92  isupper = false;
93  isdigit = false;
94  ispunctuation = false;
95  isngram = false;
96  enabled = false;
97  SetRangesOpen();
98  script_id = 0;
99  other_case = 0;
100  mirror = 0;
101  normed = "";
102  direction = UNICHARSET::U_LEFT_TO_RIGHT;
103  fragment = nullptr;
104 }
105 
106 // Sets all ranges wide open. Initialization default in case there are
107 // no useful values available.
108 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
109  min_bottom = 0;
110  max_bottom = UINT8_MAX;
111  min_top = 0;
112  max_top = UINT8_MAX;
113  width = 0.0f;
114  width_sd = 0.0f;
115  bearing = 0.0f;
116  bearing_sd = 0.0f;
117  advance = 0.0f;
118  advance_sd = 0.0f;
119 }
120 
121 // Sets all ranges to empty. Used before expanding with font-based data.
122 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
123  min_bottom = UINT8_MAX;
124  max_bottom = 0;
125  min_top = UINT8_MAX;
126  max_top = 0;
127  width = 0.0f;
128  width_sd = 0.0f;
129  bearing = 0.0f;
130  bearing_sd = 0.0f;
131  advance = 0.0f;
132  advance_sd = 0.0f;
133 }
134 
135 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
136 // is empty.
137 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
138  return width == 0.0f || advance == 0.0f;
139 }
140 
141 // Expands the ranges with the ranges from the src properties.
142 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
143  const UNICHAR_PROPERTIES &src) {
144  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
145  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
146  UpdateRange(src.min_top, &min_top, &max_top);
147  UpdateRange(src.max_top, &min_top, &max_top);
148  if (src.width_sd > width_sd) {
149  width = src.width;
150  width_sd = src.width_sd;
151  }
152  if (src.bearing_sd > bearing_sd) {
153  bearing = src.bearing;
154  bearing_sd = src.bearing_sd;
155  }
156  if (src.advance_sd > advance_sd) {
157  advance = src.advance;
158  advance_sd = src.advance_sd;
159  }
160 }
161 
162 // Copies the properties from src into this.
163 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
164  // Apart from the fragment, everything else can be done with a default copy.
165  CHAR_FRAGMENT *saved_fragment = fragment;
166  *this = src; // Bitwise copy.
167  fragment = saved_fragment;
168 }
169 
171  : ids(), script_table(nullptr), script_table_size_used(0) {
172  clear();
173  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
175  if (i == UNICHAR_JOINED) {
176  set_isngram(i, true);
177  }
178  }
179 }
180 
182  clear();
183 }
184 
186 UNICHARSET::unichar_to_id(const char *const unichar_repr) const {
187  std::string cleaned =
188  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
189  return ids.contains(cleaned.data(), cleaned.size())
190  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
191  : INVALID_UNICHAR_ID;
192 }
193 
194 UNICHAR_ID UNICHARSET::unichar_to_id(const char *const unichar_repr,
195  int length) const {
196  assert(length > 0 && length <= UNICHAR_LEN);
197  std::string cleaned(unichar_repr, length);
198  if (!old_style_included_) {
199  cleaned = CleanupString(unichar_repr, length);
200  }
201  return ids.contains(cleaned.data(), cleaned.size())
202  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
203  : INVALID_UNICHAR_ID;
204 }
205 
206 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
207 // while leaving the rest of the string encodable. Returns 0 if the
208 // beginning of the string is not encodable.
209 // WARNING: this function now encodes the whole string for precision.
210 // Use encode_string in preference to repeatedly calling step.
211 int UNICHARSET::step(const char *str) const {
212  std::vector<UNICHAR_ID> encoding;
213  std::vector<char> lengths;
214  encode_string(str, true, &encoding, &lengths, nullptr);
215  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) {
216  return 0;
217  }
218  return lengths[0];
219 }
220 
221 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
222 // If not encodable, write the first byte offset which cannot be converted
223 // into the second (return) argument.
224 bool UNICHARSET::encodable_string(const char *str,
225  unsigned *first_bad_position) const {
226  std::vector<UNICHAR_ID> encoding;
227  return encode_string(str, true, &encoding, nullptr, first_bad_position);
228 }
229 
230 // Encodes the given UTF-8 string with this UNICHARSET.
231 // Returns true if the encoding succeeds completely, false if there is at
232 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
233 // the rest of the string is still encoded.
234 // If lengths is not nullptr, then it is filled with the corresponding
235 // byte length of each encoded UNICHAR_ID.
236 // WARNING: Caller must guarantee that str has already been cleaned of codes
237 // that do not belong in the unicharset, or encoding may fail.
238 // Use CleanupString to perform the cleaning.
239 bool UNICHARSET::encode_string(const char *str, bool give_up_on_failure,
240  std::vector<UNICHAR_ID> *encoding,
241  std::vector<char> *lengths,
242  unsigned *encoded_length) const {
243  std::vector<UNICHAR_ID> working_encoding;
244  std::vector<char> working_lengths;
245  std::vector<char> best_lengths;
246  encoding->clear(); // Just in case str is empty.
247  auto str_length = strlen(str);
248  unsigned str_pos = 0;
249  bool perfect = true;
250  while (str_pos < str_length) {
251  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
252  &str_pos, encoding, &best_lengths);
253  if (str_pos < str_length) {
254  // This is a non-match. Skip one utf-8 character.
255  perfect = false;
256  if (give_up_on_failure) {
257  break;
258  }
259  int step = UNICHAR::utf8_step(str + str_pos);
260  if (step == 0) {
261  step = 1;
262  }
263  encoding->push_back(INVALID_UNICHAR_ID);
264  best_lengths.push_back(step);
265  str_pos += step;
266  working_encoding = *encoding;
267  working_lengths = best_lengths;
268  }
269  }
270  if (lengths != nullptr) {
271  *lengths = best_lengths;
272  }
273  if (encoded_length != nullptr) {
274  *encoded_length = str_pos;
275  }
276  return perfect;
277 }
278 
279 const char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
280  if (id == INVALID_UNICHAR_ID) {
281  return INVALID_UNICHAR;
282  }
283  ASSERT_HOST(static_cast<unsigned>(id) < this->size());
284  return unichars[id].representation;
285 }
286 
288  if (id == INVALID_UNICHAR_ID) {
289  return INVALID_UNICHAR;
290  }
291  ASSERT_HOST(static_cast<unsigned>(id) < this->size());
292  // Resolve from the kCustomLigatures table if this is a private encoding.
293  if (get_isprivate(id)) {
294  const char *ch = id_to_unichar(id);
295  for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
296  if (!strcmp(ch, kCustomLigatures[i][1])) {
297  return kCustomLigatures[i][0];
298  }
299  }
300  }
301  // Otherwise return the stored representation.
302  return unichars[id].representation;
303 }
304 
305 // Return a string that reformats the utf8 str into the str followed
306 // by its hex unicodes.
307 std::string UNICHARSET::debug_utf8_str(const char *str) {
308  std::string result = str;
309  result += " [";
310  int step = 1;
311  // Chop into unicodes and code each as hex.
312  for (int i = 0; str[i] != '\0'; i += step) {
313  char hex[sizeof(int) * 2 + 1];
314  step = UNICHAR::utf8_step(str + i);
315  if (step == 0) {
316  step = 1;
317  sprintf(hex, "%x", str[i]);
318  } else {
319  UNICHAR ch(str + i, step);
320  sprintf(hex, "%x", ch.first_uni());
321  }
322  result += hex;
323  result += " ";
324  }
325  result += "]";
326  return result;
327 }
328 
329 // Return a string containing debug information on the unichar, including
330 // the id_to_unichar, its hex unicodes and the properties.
331 std::string UNICHARSET::debug_str(UNICHAR_ID id) const {
332  if (id == INVALID_UNICHAR_ID) {
333  return std::string(id_to_unichar(id));
334  }
335  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
336  if (fragment) {
337  return fragment->to_string();
338  }
339  const char *str = id_to_unichar(id);
340  std::string result = debug_utf8_str(str);
341  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
342  if (get_isalpha(id)) {
343  if (get_islower(id)) {
344  result += "a";
345  } else if (get_isupper(id)) {
346  result += "A";
347  } else {
348  result += "x";
349  }
350  }
351  // Append 0 if a digit.
352  if (get_isdigit(id)) {
353  result += "0";
354  }
355  // Append p is a punctuation symbol.
356  if (get_ispunctuation(id)) {
357  result += "p";
358  }
359  return result;
360 }
361 
362 // Sets the normed_ids vector from the normed string. normed_ids is not
363 // stored in the file, and needs to be set when the UNICHARSET is loaded.
365  unichars[unichar_id].properties.normed_ids.clear();
366  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
367  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
368  } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
369  true, &unichars[unichar_id].properties.normed_ids,
370  nullptr, nullptr)) {
371  unichars[unichar_id].properties.normed_ids.clear();
372  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
373  }
374 }
375 
376 // Returns whether the unichar id represents a unicode value in the private use
377 // area. We use this range only internally to represent uncommon ligatures
378 // (eg. 'ct') that do not have regular unicode values.
379 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
380  UNICHAR uc(id_to_unichar(unichar_id), -1);
381  int uni = uc.first_uni();
382  return (uni >= 0xE000 && uni <= 0xF8FF);
383 }
384 
385 // Sets all ranges to empty, so they can be expanded to set the values.
387  for (auto &uc : unichars) {
388  uc.properties.SetRangesEmpty();
389  }
390 }
391 
392 // Sets all the properties for this unicharset given a src unicharset with
393 // everything set. The unicharsets don't have to be the same, and graphemes
394 // are correctly accounted for.
396  const UNICHARSET &src) {
397  for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
398  const char *utf8 = id_to_unichar(ch);
399  UNICHAR_PROPERTIES properties;
400  if (src.GetStrProperties(utf8, &properties)) {
401  // Setup the script_id, other_case, and mirror properly.
402  const char *script = src.get_script_from_script_id(properties.script_id);
403  properties.script_id = add_script(script);
404  const char *other_case = src.id_to_unichar(properties.other_case);
405  if (contains_unichar(other_case)) {
406  properties.other_case = unichar_to_id(other_case);
407  } else {
408  properties.other_case = ch;
409  }
410  const char *mirror_str = src.id_to_unichar(properties.mirror);
411  if (contains_unichar(mirror_str)) {
412  properties.mirror = unichar_to_id(mirror_str);
413  } else {
414  properties.mirror = ch;
415  }
416  unichars[ch].properties.CopyFrom(properties);
417  set_normed_ids(ch);
418  }
419  }
420 }
421 
422 // Expands the tops and bottoms and widths for this unicharset given a
423 // src unicharset with ranges in it. The unicharsets don't have to be the
424 // same, and graphemes are correctly accounted for.
426  for (unsigned ch = 0; ch < unichars.size(); ++ch) {
427  const char *utf8 = id_to_unichar(ch);
428  UNICHAR_PROPERTIES properties;
429  if (src.GetStrProperties(utf8, &properties)) {
430  // Expand just the ranges from properties.
431  unichars[ch].properties.ExpandRangesFrom(properties);
432  }
433  }
434 }
435 
436 // Makes this a copy of src. Clears this completely first, so the automatic
437 // ids will not be present in this if not in src. Does NOT reorder the set!
439  clear();
440  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
441  const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
442  const char *utf8 = src.id_to_unichar(ch);
444  unichars[ch].properties.ExpandRangesFrom(src_props);
445  }
446  // Set properties, including mirror and other_case, WITHOUT reordering
447  // the unicharset.
449 }
450 
451 // For each id in src, if it does not occur in this, add it, as in
452 // SetPropertiesFromOther, otherwise expand the ranges, as in
453 // ExpandRangesFromOther.
455  int initial_used = unichars.size();
456  for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
457  const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
458  const char *utf8 = src.id_to_unichar(ch);
459  int id = unichars.size();
460  if (contains_unichar(utf8)) {
461  id = unichar_to_id(utf8);
462  // Just expand current ranges.
463  unichars[id].properties.ExpandRangesFrom(src_props);
464  } else {
466  unichars[id].properties.SetRangesEmpty();
467  }
468  }
469  // Set properties, including mirror and other_case, WITHOUT reordering
470  // the unicharset.
471  PartialSetPropertiesFromOther(initial_used, src);
472 }
473 
474 // Returns true if the acceptable ranges of the tops of the characters do
475 // not overlap, making their x-height calculations distinct.
477  int overlap = std::min(unichars[id1].properties.max_top,
478  unichars[id2].properties.max_top) -
479  std::max(unichars[id1].properties.min_top,
480  unichars[id2].properties.min_top);
481  return overlap <= 0;
482 }
483 
// Internal recursive version of encode_string above.
// Seeks to encode the given string as a sequence of UNICHAR_IDs such that
// each UNICHAR_ID uses the least possible part of the utf8 str.
// It does this by depth-first tail recursion on increasing length matches
// to the UNICHARSET, saving the first encountered result that encodes the
// maximum total length of str. It stops on a failure to encode to make
// the overall process of encoding a partially failed string more efficient.
// See unicharset.h for definition of the args.
// As used in this function:
//  - str_index is the offset in str at which this level starts matching and
//    str_length is the offset at which str ends.
//  - encoding/lengths hold the candidate built so far; entries pushed here
//    are removed again when a candidate is abandoned.
//  - best_total_length/best_encoding/best_lengths hold the longest result
//    seen so far and are only overwritten by a strictly longer one.
void UNICHARSET::encode_string(const char *str, int str_index, int str_length,
                               std::vector<UNICHAR_ID> *encoding,
                               std::vector<char> *lengths,
                               unsigned *best_total_length,
                               std::vector<UNICHAR_ID> *best_encoding,
                               std::vector<char> *best_lengths) const {
  // Record the current candidate if it covers more of str than any before.
  if (str_index > static_cast<int>(*best_total_length)) {
    // This is the best result so far.
    *best_total_length = str_index;
    *best_encoding = *encoding;
    if (best_lengths != nullptr) {
      *best_lengths = *lengths;
    }
  }
  // The whole string is consumed: nothing left to match at this level.
  if (str_index == str_length) {
    return;
  }
  // Size to truncate back to when a tried length has to be undone.
  int encoding_index = encoding->size();
  // Find the length of the first matching unicharset member.
  int length = ids.minmatch(str + str_index);
  // No match at all, or the shortest match would run past the end of str.
  if (length == 0 || str_index + length > str_length) {
    return;
  }
  // Try successively longer prefixes starting at str_index.
  do {
    if (ids.contains(str + str_index, length)) {
      // Successful encoding so far.
      UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
      encoding->push_back(id);
      lengths->push_back(length);
      encode_string(str, str_index + length, str_length, encoding, lengths,
                    best_total_length, best_encoding, best_lengths);
      if (static_cast<int>(*best_total_length) == str_length) {
        return; // Tail recursion success!
      }
      // Failed with that length, truncate back and try again.
      encoding->resize(encoding_index);
      lengths->resize(encoding_index);
    }
    // Grow the prefix by one UTF-8 character, or by a single byte when the
    // next byte does not start a legal character.
    int step = UNICHAR::utf8_step(str + str_index + length);
    if (step == 0) {
      step = 1;
    }
    length += step;
    // Stop once the prefix is longer than UNICHAR_LEN or runs off the end.
  } while (length <= UNICHAR_LEN && str_index + length <= str_length);
}
537 
538 // Gets the properties for a grapheme string, combining properties for
539 // multiple characters in a meaningful way where possible.
540 // Returns false if no valid match was found in the unicharset.
541 // NOTE that script_id, mirror, and other_case refer to this unicharset on
542 // return and will need translation if the target unicharset is different.
543 bool UNICHARSET::GetStrProperties(const char *utf8_str,
544  UNICHAR_PROPERTIES *props) const {
545  props->Init();
546  props->SetRangesEmpty();
547  int total_unicodes = 0;
548  std::vector<UNICHAR_ID> encoding;
549  if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr)) {
550  return false; // Some part was invalid.
551  }
552  for (auto it : encoding) {
553  int id = it;
554  const UNICHAR_PROPERTIES &src_props = unichars[id].properties;
555  // Logical OR all the bools.
556  if (src_props.isalpha) {
557  props->isalpha = true;
558  }
559  if (src_props.islower) {
560  props->islower = true;
561  }
562  if (src_props.isupper) {
563  props->isupper = true;
564  }
565  if (src_props.isdigit) {
566  props->isdigit = true;
567  }
568  if (src_props.ispunctuation) {
569  props->ispunctuation = true;
570  }
571  if (src_props.isngram) {
572  props->isngram = true;
573  }
574  if (src_props.enabled) {
575  props->enabled = true;
576  }
577  // Min/max the tops/bottoms.
578  UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
579  UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
580  UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
581  UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
582  float bearing = props->advance + src_props.bearing;
583  if (total_unicodes == 0 || bearing < props->bearing) {
584  props->bearing = bearing;
585  props->bearing_sd = props->advance_sd + src_props.bearing_sd;
586  }
587  props->advance += src_props.advance;
588  props->advance_sd += src_props.advance_sd;
589  // With a single width, just use the widths stored in the unicharset.
590  props->width = src_props.width;
591  props->width_sd = src_props.width_sd;
592  // Use the first script id, other_case, mirror, direction.
593  // Note that these will need translation, except direction.
594  if (total_unicodes == 0) {
595  props->script_id = src_props.script_id;
596  props->other_case = src_props.other_case;
597  props->mirror = src_props.mirror;
598  props->direction = src_props.direction;
599  }
600  // The normed string for the compound character is the concatenation of
601  // the normed versions of the individual characters.
602  props->normed += src_props.normed;
603  ++total_unicodes;
604  }
605  if (total_unicodes > 1) {
606  // Estimate the total widths from the advance - bearing.
607  props->width = props->advance - props->bearing;
608  props->width_sd = props->advance_sd + props->bearing_sd;
609  }
610  return total_unicodes > 0;
611 }
612 
613 // TODO(rays) clean-up the order of functions to match unicharset.h.
614 
615 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
616  unsigned int properties = 0;
617  if (this->get_isalpha(id)) {
618  properties |= ISALPHA_MASK;
619  }
620  if (this->get_islower(id)) {
621  properties |= ISLOWER_MASK;
622  }
623  if (this->get_isupper(id)) {
624  properties |= ISUPPER_MASK;
625  }
626  if (this->get_isdigit(id)) {
627  properties |= ISDIGIT_MASK;
628  }
629  if (this->get_ispunctuation(id)) {
630  properties |= ISPUNCTUATION_MASK;
631  }
632  return properties;
633 }
634 
636  if (this->get_isupper(id)) {
637  return 'A';
638  }
639  if (this->get_islower(id)) {
640  return 'a';
641  }
642  if (this->get_isalpha(id)) {
643  return 'x';
644  }
645  if (this->get_isdigit(id)) {
646  return '0';
647  }
648  if (this->get_ispunctuation(id)) {
649  return 'p';
650  }
651  return 0;
652 }
653 
654 void UNICHARSET::unichar_insert(const char *const unichar_repr,
655  OldUncleanUnichars old_style) {
656  if (old_style == OldUncleanUnichars::kTrue) {
657  old_style_included_ = true;
658  }
659  std::string cleaned =
660  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
661  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
662  const char *str = cleaned.c_str();
663  std::vector<int> encoding;
664  if (!old_style_included_ &&
665  encode_string(str, true, &encoding, nullptr, nullptr)) {
666  return;
667  }
668  unichars.emplace_back();
669  auto &u = unichars.back();
670  int index = 0;
671  do {
672  if (index >= UNICHAR_LEN) {
673  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
674  unichar_repr);
675  return;
676  }
677  u.representation[index++] = *str++;
678  } while (*str != '\0');
679  u.representation[index] = '\0';
680  this->set_script(unichars.size() - 1, null_script);
681  // If the given unichar_repr represents a fragmented character, set
682  // fragment property to a pointer to CHAR_FRAGMENT class instance with
683  // information parsed from the unichar representation. Use the script
684  // of the base unichar for the fragmented character if possible.
685  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
686  u.properties.fragment = frag;
687  if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
688  u.properties.script_id = this->get_script(frag->get_unichar());
689  }
690  u.properties.enabled = true;
691  ids.insert(u.representation, unichars.size() - 1);
692  }
693 }
694 
695 bool UNICHARSET::contains_unichar(const char *const unichar_repr) const {
696  std::string cleaned =
697  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
698  return ids.contains(cleaned.data(), cleaned.size());
699 }
700 
701 bool UNICHARSET::contains_unichar(const char *const unichar_repr,
702  int length) const {
703  if (length == 0) {
704  return false;
705  }
706  std::string cleaned(unichar_repr, length);
707  if (!old_style_included_) {
708  cleaned = CleanupString(unichar_repr, length);
709  }
710  return ids.contains(cleaned.data(), cleaned.size());
711 }
712 
713 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
714  const char *const unichar_repr) const {
715  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
716 }
717 
718 bool UNICHARSET::save_to_string(std::string &str) const {
719  const int kFileBufSize = 1024;
720  char buffer[kFileBufSize + 1];
721  snprintf(buffer, kFileBufSize, "%zu\n", this->size());
722  str = buffer;
723  for (unsigned id = 0; id < this->size(); ++id) {
724  int min_bottom, max_bottom, min_top, max_top;
725  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
726  float width, width_sd;
727  get_width_stats(id, &width, &width_sd);
728  float bearing, bearing_sd;
729  get_bearing_stats(id, &bearing, &bearing_sd);
730  float advance, advance_sd;
731  get_advance_stats(id, &advance, &advance_sd);
732  unsigned int properties = this->get_properties(id);
733  if (strcmp(this->id_to_unichar(id), " ") == 0) {
734  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
735  this->get_script_from_script_id(this->get_script(id)),
736  this->get_other_case(id));
737  str += buffer;
738  } else {
739  std::ostringstream stream;
740  stream.imbue(std::locale::classic());
741  stream << this->id_to_unichar(id) << ' ' << properties << ' '
742  << min_bottom << ',' << max_bottom << ',' << min_top << ','
743  << max_top << ',' << width << ',' << width_sd << ',' << bearing
744  << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '
745  << this->get_script_from_script_id(this->get_script(id)) << ' '
746  << this->get_other_case(id) << ' ' << this->get_direction(id)
747  << ' ' << this->get_mirror(id) << ' '
748  << this->get_normed_unichar(id) << "\t# "
749  << this->debug_str(id).c_str() << '\n';
750  str += stream.str().c_str();
751  }
752  }
753  return true;
754 }
755 
757 public:
758  LocalFilePointer(FILE *stream) : fp_(stream) {}
759  char *fgets(char *dst, int size) {
760  return ::fgets(dst, size, fp_);
761  }
762 
763 private:
764  FILE *fp_;
765 };
766 
767 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
768  LocalFilePointer lfp(file);
769  using namespace std::placeholders; // for _1, _2
770  std::function<char *(char *, int)> fgets_cb =
771  std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
772  bool success = load_via_fgets(fgets_cb, skip_fragments);
773  return success;
774 }
775 
776 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
777  using namespace std::placeholders; // for _1, _2
778  std::function<char *(char *, int)> fgets_cb =
779  std::bind(&tesseract::TFile::FGets, file, _1, _2);
780  bool success = load_via_fgets(fgets_cb, skip_fragments);
781  return success;
782 }
783 
784 bool UNICHARSET::load_via_fgets(
785  const std::function<char *(char *, int)> &fgets_cb, bool skip_fragments) {
786  int unicharset_size;
787  char buffer[256];
788 
789  this->clear();
790  if (fgets_cb(buffer, sizeof(buffer)) == nullptr ||
791  sscanf(buffer, "%d", &unicharset_size) != 1) {
792  return false;
793  }
794  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
795  char unichar[256];
796  unsigned int properties;
797  char script[64];
798 
799  strncpy(script, null_script, sizeof(script) - 1);
800  int min_bottom = 0;
801  int max_bottom = UINT8_MAX;
802  int min_top = 0;
803  int max_top = UINT8_MAX;
804  float width = 0.0f;
805  float width_sd = 0.0f;
806  float bearing = 0.0f;
807  float bearing_sd = 0.0f;
808  float advance = 0.0f;
809  float advance_sd = 0.0f;
810  // TODO(eger): check that this default is ok
811  // after enabling BiDi iterator for Arabic.
812  int direction = UNICHARSET::U_LEFT_TO_RIGHT;
813  UNICHAR_ID other_case = unicharset_size;
814  UNICHAR_ID mirror = unicharset_size;
815  if (fgets_cb(buffer, sizeof(buffer)) == nullptr) {
816  return false;
817  }
818  char normed[64];
819  normed[0] = '\0';
820  std::istringstream stream(buffer);
821  stream.imbue(std::locale::classic());
822  // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x
823  // stream.flags(std::ios::hex);
824  stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
825  // stream.flags(std::ios::dec);
826  if (stream.fail()) {
827  fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
828  return false;
829  }
830  auto position = stream.tellg();
831  stream.seekg(position);
832  char c1, c2, c3, c4, c5, c6, c7, c8, c9;
833  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
834  max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
835  bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
836  script >> other_case >> direction >> mirror >> std::setw(63) >> normed;
837  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
838  c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
839  stream.clear();
840  stream.seekg(position);
841  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
842  max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
843  bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
844  script >> other_case >> direction >> mirror;
845  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
846  c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
847  stream.clear();
848  stream.seekg(position);
849  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
850  max_top >> std::setw(63) >> script >> other_case >> direction >>
851  mirror;
852  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
853  stream.clear();
854  stream.seekg(position);
855  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
856  max_top >> std::setw(63) >> script >> other_case;
857  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
858  stream.clear();
859  stream.seekg(position);
860  stream >> std::setw(63) >> script >> other_case;
861  if (stream.fail()) {
862  stream.clear();
863  stream.seekg(position);
864  stream >> std::setw(63) >> script;
865  }
866  }
867  }
868  }
869  }
870 
871  // Skip fragments if needed.
872  CHAR_FRAGMENT *frag = nullptr;
873  if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
874  int num_pieces = frag->get_total();
875  delete frag;
876  // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
877  if (num_pieces > 1) {
878  continue;
879  }
880  }
881  // Insert unichar into unicharset and set its properties.
882  if (strcmp(unichar, "NULL") == 0) {
883  this->unichar_insert(" ");
884  } else {
886  }
887 
888  this->set_isalpha(id, properties & ISALPHA_MASK);
889  this->set_islower(id, properties & ISLOWER_MASK);
890  this->set_isupper(id, properties & ISUPPER_MASK);
891  this->set_isdigit(id, properties & ISDIGIT_MASK);
892  this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
893  this->set_isngram(id, false);
894  this->set_script(id, script);
895  this->unichars[id].properties.enabled = true;
896  this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
897  this->set_width_stats(id, width, width_sd);
898  this->set_bearing_stats(id, bearing, bearing_sd);
899  this->set_advance_stats(id, advance, advance_sd);
900  this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
901  this->set_other_case(id, (other_case < unicharset_size) ? other_case : id);
902  this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
903  this->set_normed(id, normed[0] != '\0' ? normed : unichar);
904  }
905  post_load_setup();
906  return true;
907 }
908 
909 // Sets up internal data after loading the file, based on the char
910 // properties. Called from load_from_file, but also needs to be run
911 // during set_unicharset_properties.
913  // Number of alpha chars with the case property minus those without,
914  // in order to determine that half the alpha chars have case.
915  int net_case_alphas = 0;
916  int x_height_alphas = 0;
917  int cap_height_alphas = 0;
918  top_bottom_set_ = false;
919  for (unsigned id = 0; id < unichars.size(); ++id) {
920  int min_bottom = 0;
921  int max_bottom = UINT8_MAX;
922  int min_top = 0;
923  int max_top = UINT8_MAX;
924  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
925  if (min_top > 0) {
926  top_bottom_set_ = true;
927  }
928  if (get_isalpha(id)) {
929  if (get_islower(id) || get_isupper(id)) {
930  ++net_case_alphas;
931  } else {
932  --net_case_alphas;
933  }
934  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
935  ++x_height_alphas;
936  } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
937  ++cap_height_alphas;
938  }
939  }
940  set_normed_ids(id);
941  }
942 
943  script_has_upper_lower_ = net_case_alphas > 0;
944  script_has_xheight_ =
945  script_has_upper_lower_ ||
946  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
947  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
948 
949  null_sid_ = get_script_id_from_name(null_script);
950  ASSERT_HOST(null_sid_ == 0);
951  common_sid_ = get_script_id_from_name("Common");
952  latin_sid_ = get_script_id_from_name("Latin");
953  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
954  greek_sid_ = get_script_id_from_name("Greek");
955  han_sid_ = get_script_id_from_name("Han");
956  hiragana_sid_ = get_script_id_from_name("Hiragana");
957  katakana_sid_ = get_script_id_from_name("Katakana");
958  thai_sid_ = get_script_id_from_name("Thai");
959  hangul_sid_ = get_script_id_from_name("Hangul");
960 
961  // Compute default script. Use the highest-counting alpha script, that is
962  // not the common script, as that still contains some "alphas".
963  int *script_counts = new int[script_table_size_used];
964  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
965  for (unsigned id = 0; id < unichars.size(); ++id) {
966  if (get_isalpha(id)) {
967  ++script_counts[get_script(id)];
968  }
969  }
970  default_sid_ = 0;
971  for (int s = 1; s < script_table_size_used; ++s) {
972  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
973  default_sid_ = s;
974  }
975  }
976  delete[] script_counts;
977 }
978 
979 // Returns true if right_to_left scripts are significant in the unicharset,
980 // but without being so sensitive that "universal" unicharsets containing
981 // characters from many scripts, like orientation and script detection,
982 // look like they are right_to_left.
984  int ltr_count = 0;
985  int rtl_count = 0;
986  for (unsigned id = 0; id < unichars.size(); ++id) {
987  int dir = get_direction(id);
988  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {
989  ltr_count++;
990  }
991  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
994  rtl_count++;
995  }
996  }
997  return rtl_count > ltr_count;
998 }
999 
1000 // Set a whitelist and/or blacklist of characters to recognize.
1001 // An empty or nullptr whitelist enables everything (minus any blacklist).
1002 // An empty or nullptr blacklist disables nothing.
1003 // An empty or nullptr blacklist has no effect.
1004 void UNICHARSET::set_black_and_whitelist(const char *blacklist,
1005  const char *whitelist,
1006  const char *unblacklist) {
1007  bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1008  // Set everything to default
1009  for (auto &uc : unichars) {
1010  uc.properties.enabled = def_enabled;
1011  }
1012  if (!def_enabled) {
1013  // Enable the whitelist.
1014  std::vector<UNICHAR_ID> encoding;
1015  encode_string(whitelist, false, &encoding, nullptr, nullptr);
1016  for (auto it : encoding) {
1017  if (it != INVALID_UNICHAR_ID) {
1018  unichars[it].properties.enabled = true;
1019  }
1020  }
1021  }
1022  if (blacklist != nullptr && blacklist[0] != '\0') {
1023  // Disable the blacklist.
1024  std::vector<UNICHAR_ID> encoding;
1025  encode_string(blacklist, false, &encoding, nullptr, nullptr);
1026  for (auto it : encoding) {
1027  if (it != INVALID_UNICHAR_ID) {
1028  unichars[it].properties.enabled = false;
1029  }
1030  }
1031  }
1032  if (unblacklist != nullptr && unblacklist[0] != '\0') {
1033  // Re-enable the unblacklist.
1034  std::vector<UNICHAR_ID> encoding;
1035  encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1036  for (auto it : encoding) {
1037  if (it != INVALID_UNICHAR_ID) {
1038  unichars[it].properties.enabled = true;
1039  }
1040  }
1041  }
1042 }
1043 
1044 // Returns true if there are any repeated unicodes in the normalized
1045 // text of any unichar-id in the unicharset.
1047  int start_id = 0;
1048  if (has_special_codes()) {
1049  start_id = SPECIAL_UNICHAR_CODES_COUNT;
1050  }
1051  for (unsigned id = start_id; id < unichars.size(); ++id) {
1052  // Convert to unicodes.
1053  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1054  for (size_t u = 1; u < unicodes.size(); ++u) {
1055  if (unicodes[u - 1] == unicodes[u]) {
1056  return true;
1057  }
1058  }
1059  }
1060  return false;
1061 }
1062 
1063 int UNICHARSET::add_script(const char *script) {
1064  for (int i = 0; i < script_table_size_used; ++i) {
1065  if (strcmp(script, script_table[i]) == 0) {
1066  return i;
1067  }
1068  }
1069  if (script_table_size_reserved == 0) {
1070  script_table_size_reserved = 8;
1071  script_table = new char *[script_table_size_reserved];
1072  } else if (script_table_size_used >= script_table_size_reserved) {
1073  assert(script_table_size_used == script_table_size_reserved);
1074  script_table_size_reserved += script_table_size_reserved;
1075  char **new_script_table = new char *[script_table_size_reserved];
1076  memcpy(new_script_table, script_table,
1077  script_table_size_used * sizeof(char *));
1078  delete[] script_table;
1079  script_table = new_script_table;
1080  }
1081  script_table[script_table_size_used] = new char[strlen(script) + 1];
1082  strcpy(script_table[script_table_size_used], script);
1083  return script_table_size_used++;
1084 }
1085 
1086 // Returns the string that represents a fragment
1087 // with the given unichar, pos and total.
1088 std::string CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1089  bool natural) {
1090  if (total == 1) {
1091  return std::string(unichar);
1092  }
1093  std::string result;
1094  result += kSeparator;
1095  result += unichar;
1096  char buffer[kMaxLen];
1097  snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1098  natural ? kNaturalFlag : kSeparator, total);
1099  result += buffer;
1100  return result;
1101 }
1102 
1104  const char *ptr = string;
1105  int len = strlen(string);
1106  if (len < kMinLen || *ptr != kSeparator) {
1107  return nullptr; // this string can not represent a fragment
1108  }
1109  ptr++; // move to the next character
1110  int step = 0;
1111  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1112  step += UNICHAR::utf8_step(ptr + step);
1113  }
1114  if (step == 0 || step > UNICHAR_LEN) {
1115  return nullptr; // no character for unichar or the character is too long
1116  }
1117  char unichar[UNICHAR_LEN + 1];
1118  strncpy(unichar, ptr, step);
1119  unichar[step] = '\0'; // null terminate unichar
1120  ptr += step; // move to the next fragment separator
1121  int pos = 0;
1122  int total = 0;
1123  bool natural = false;
1124  char *end_ptr = nullptr;
1125  for (int i = 0; i < 2; i++) {
1126  if (ptr > string + len || *ptr != kSeparator) {
1127  if (i == 1 && *ptr == kNaturalFlag) {
1128  natural = true;
1129  } else {
1130  return nullptr; // Failed to parse fragment representation.
1131  }
1132  }
1133  ptr++; // move to the next character
1134  i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1135  : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1136  ptr = end_ptr;
1137  }
1138  if (ptr != string + len) {
1139  return nullptr; // malformed fragment representation
1140  }
1141  auto *fragment = new CHAR_FRAGMENT();
1142  fragment->set_all(unichar, pos, total, natural);
1143  return fragment;
1144 }
1145 
1146 int UNICHARSET::get_script_id_from_name(const char *script_name) const {
1147  for (int i = 0; i < script_table_size_used; ++i) {
1148  if (strcmp(script_name, script_table[i]) == 0) {
1149  return i;
1150  }
1151  }
1152  return 0; // 0 is always the null_script
1153 }
1154 
1155 // Removes/replaces content that belongs in rendered text, but not in the
1156 // unicharset.
1157 /* static */
1158 std::string UNICHARSET::CleanupString(const char *utf8_str, size_t length) {
1159  std::string result;
1160  result.reserve(length);
1161  char ch;
1162  while ((ch = *utf8_str) != '\0' && length-- > 0) {
1163  int key_index = 0;
1164  const char *key;
1165  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1166  int match = 0;
1167  while (key[match] != '\0' && key[match] == utf8_str[match]) {
1168  ++match;
1169  }
1170  if (key[match] == '\0') {
1171  utf8_str += match;
1172  break;
1173  }
1174  ++key_index;
1175  }
1176  if (key == nullptr) {
1177  result.push_back(ch);
1178  ++utf8_str;
1179  } else {
1180  result.append(kCleanupMaps[key_index][1]);
1181  }
1182  }
1183  return result;
1184 }
1185 
1186 } // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:59
OldUncleanUnichars
Definition: unicharset.h:45
int UNICHAR_ID
Definition: unichar.h:36
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
@ UNICHAR_JOINED
Definition: unicharset.h:37
const double kMinCapHeightFraction
Definition: unicharset.cpp:58
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:122
const double kMinXHeightFraction
Definition: unicharset.cpp:57
int first_uni() const
Definition: unichar.cpp:105
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:83
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:36
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:106
char * fgets(char *dst, int size)
Definition: unicharset.cpp:759
LocalFilePointer(FILE *stream)
Definition: unicharset.cpp:758
static CHAR_FRAGMENT * parse_from_string(const char *str)
static std::string to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:76
std::string to_string() const
Definition: unicharset.h:91
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:379
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:483
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:468
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:860
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
static std::string debug_utf8_str(const char *str)
Definition: unicharset.cpp:307
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:887
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:476
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool has_special_codes() const
Definition: unicharset.h:757
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:447
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:488
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:615
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:647
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:624
bool encodable_string(const char *str, unsigned *first_bad_position) const
Definition: unicharset.cpp:224
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:599
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:478
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:425
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:457
int get_script_id_from_name(const char *script_name) const
bool AnyRepeatedUnicodes() const
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
bool major_right_to_left() const
Definition: unicharset.cpp:983
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:438
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:630
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:635
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:437
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722
int step(const char *str) const
Definition: unicharset.cpp:211
int add_script(const char *script)
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:288
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:473
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
static const char * kCustomLigatures[][2]
Definition: unicharset.h:169
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:442
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533
size_t size() const
Definition: unicharset.h:355
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:454
bool save_to_string(std::string &str) const
Definition: unicharset.cpp:718
static const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:172
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:657
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:612
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:452
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:713
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:364
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:395
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:462
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:640
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:287