tesseract  5.0.0
ratngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ratngs.cpp (Formerly ratings.c)
3  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config_auto.h"
21 #endif
22 
23 #include "ratngs.h"
24 
25 #include "blobs.h"
26 #include "matrix.h"
27 #include "normalis.h" // kBlnBaselineOffset.
28 #include "unicharset.h"
29 
30 #include <algorithm>
31 #include <cmath>
32 #include <string>
33 #include <vector>
34 
35 namespace tesseract {
36 
37 const float WERD_CHOICE::kBadRating = 100000.0;
38 // Min offset in baseline-normalized coords to make a character a subscript.
39 const int kMinSubscriptOffset = 20;
40 // Min offset in baseline-normalized coords to make a character a superscript.
41 const int kMinSuperscriptOffset = 20;
42 // Max y of bottom of a drop-cap blob.
43 const int kMaxDropCapBottom = -128;
44 // Max fraction of x-height to use as denominator in measuring x-height overlap.
45 const double kMaxOverlapDenominator = 0.125;
46 // Min fraction of x-height range that should be in agreement for matching
47 // x-heights.
48 const double kMinXHeightMatch = 0.5;
49 // Max tolerance on baseline position as a fraction of x-height for matching
50 // baselines.
51 const double kMaxBaselineDrift = 0.0625;
52 
53 static const char kPermuterTypeNoPerm[] = "None";
54 static const char kPermuterTypePuncPerm[] = "Punctuation";
55 static const char kPermuterTypeTopPerm[] = "Top Choice";
56 static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
57 static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
58 static const char kPermuterTypeNgramPerm[] = "Ngram";
59 static const char kPermuterTypeNumberPerm[] = "Number";
60 static const char kPermuterTypeUserPatPerm[] = "User Pattern";
61 static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
62 static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
63 static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
64 static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
65 static const char kPermuterTypeCompoundPerm[] = "Compound";
66 
67 static const char *const kPermuterTypeNames[] = {
68  kPermuterTypeNoPerm, // 0
69  kPermuterTypePuncPerm, // 1
70  kPermuterTypeTopPerm, // 2
71  kPermuterTypeLowerPerm, // 3
72  kPermuterTypeUpperPerm, // 4
73  kPermuterTypeNgramPerm, // 5
74  kPermuterTypeNumberPerm, // 6
75  kPermuterTypeUserPatPerm, // 7
76  kPermuterTypeSysDawgPerm, // 8
77  kPermuterTypeDocDawgPerm, // 9
78  kPermuterTypeUserDawgPerm, // 10
79  kPermuterTypeFreqDawgPerm, // 11
80  kPermuterTypeCompoundPerm // 12
81 };
82 
88 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
89  float src_rating, // rating
90  float src_cert, // certainty
91  int src_script_id, // script
92  float min_xheight, // min xheight allowed
93  float max_xheight, // max xheight by this char
94  float yshift, // yshift out of position
95  BlobChoiceClassifier c) { // adapted match or other
96  unichar_id_ = src_unichar_id;
97  rating_ = src_rating;
98  certainty_ = src_cert;
99  fontinfo_id_ = -1;
100  fontinfo_id2_ = -1;
101  script_id_ = src_script_id;
102  min_xheight_ = min_xheight;
103  max_xheight_ = max_xheight;
104  yshift_ = yshift;
105  classifier_ = c;
106 }
107 
114  unichar_id_ = other.unichar_id();
115  rating_ = other.rating();
116  certainty_ = other.certainty();
117  fontinfo_id_ = other.fontinfo_id();
118  fontinfo_id2_ = other.fontinfo_id2();
119  script_id_ = other.script_id();
120  matrix_cell_ = other.matrix_cell_;
121  min_xheight_ = other.min_xheight_;
122  max_xheight_ = other.max_xheight_;
123  yshift_ = other.yshift();
124  classifier_ = other.classifier_;
125 #ifndef DISABLED_LEGACY_ENGINE
126  fonts_ = other.fonts_;
127 #endif // ndef DISABLED_LEGACY_ENGINE
128 }
129 
130 // Copy assignment operator.
131 BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) {
132  ELIST_LINK::operator=(other);
133  unichar_id_ = other.unichar_id();
134  rating_ = other.rating();
135  certainty_ = other.certainty();
136  fontinfo_id_ = other.fontinfo_id();
137  fontinfo_id2_ = other.fontinfo_id2();
138  script_id_ = other.script_id();
139  matrix_cell_ = other.matrix_cell_;
140  min_xheight_ = other.min_xheight_;
141  max_xheight_ = other.max_xheight_;
142  yshift_ = other.yshift();
143  classifier_ = other.classifier_;
144 #ifndef DISABLED_LEGACY_ENGINE
145  fonts_ = other.fonts_;
146 #endif // ndef DISABLED_LEGACY_ENGINE
147  return *this;
148 }
149 
150 // Returns true if *this and other agree on the baseline and x-height
151 // to within some tolerance based on a given estimate of the x-height.
152 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const {
153  double baseline_diff = std::fabs(yshift() - other.yshift());
154  if (baseline_diff > kMaxBaselineDrift * x_height) {
155  if (debug) {
156  tprintf("Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_);
157  }
158  return false;
159  }
160  double this_range = max_xheight() - min_xheight();
161  double other_range = other.max_xheight() - other.min_xheight();
162  double denominator =
163  ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height);
164  double overlap =
165  std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight());
166  overlap /= denominator;
167  if (debug) {
168  tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_,
169  other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap);
170  }
171 
172  return overlap >= kMinXHeightMatch;
173 }
174 
175 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
176 // unichar_id, or nullptr if there is no match.
177 BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) {
178  // Find the corresponding best BLOB_CHOICE.
179  BLOB_CHOICE_IT choice_it(bc_list);
180  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
181  BLOB_CHOICE *choice = choice_it.data();
182  if (choice->unichar_id() == char_id) {
183  return choice;
184  }
185  }
186  return nullptr;
187 }
188 
189 const char *WERD_CHOICE::permuter_name(uint8_t permuter) {
190  return kPermuterTypeNames[permuter];
191 }
192 
193 const char *ScriptPosToString(enum ScriptPos script_pos) {
194  switch (script_pos) {
195  case SP_NORMAL:
196  return "NORM";
197  case SP_SUBSCRIPT:
198  return "SUB";
199  case SP_SUPERSCRIPT:
200  return "SUPER";
201  case SP_DROPCAP:
202  return "DROPC";
203  }
204  return "SP_UNKNOWN";
205 }
206 
213 WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset)
214  : unicharset_(&unicharset) {
215  std::vector<UNICHAR_ID> encoding;
216  std::vector<char> lengths;
217  std::string cleaned = unicharset.CleanupString(src_string);
218  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) {
219  lengths.push_back('\0');
220  std::string src_lengths = &lengths[0];
221  this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
222  } else { // There must have been an invalid unichar in the string.
223  this->init(8);
224  this->make_bad();
225  }
226 }
227 
238 void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating,
239  float src_certainty, uint8_t src_permuter) {
240  int src_string_len = strlen(src_string);
241  if (src_string_len == 0) {
242  this->init(8);
243  } else {
244  this->init(src_lengths ? strlen(src_lengths) : src_string_len);
245  length_ = reserved_;
246  int offset = 0;
247  for (unsigned i = 0; i < length_; ++i) {
248  int unichar_length = src_lengths ? src_lengths[i] : 1;
249  unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
250  state_[i] = 1;
251  certainties_[i] = src_certainty;
252  offset += unichar_length;
253  }
254  }
255  adjust_factor_ = 1.0f;
256  rating_ = src_rating;
257  certainty_ = src_certainty;
258  permuter_ = src_permuter;
259  dangerous_ambig_found_ = false;
260 }
261 
265 WERD_CHOICE::~WERD_CHOICE() = default;
266 
267 const char *WERD_CHOICE::permuter_name() const {
268  return kPermuterTypeNames[permuter_];
269 }
270 
271 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
272 // taken from the appropriate cell in the ratings MATRIX.
273 // Borrowed pointer, so do not delete.
274 BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const {
275  MATRIX_COORD coord = MatrixCoord(index);
276  BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
277  if (result == nullptr) {
278  result = new BLOB_CHOICE_LIST;
279  ratings->put(coord.col, coord.row, result);
280  }
281  return result;
282 }
283 
284 // Returns the MATRIX_COORD corresponding to the location in the ratings
285 // MATRIX for the given index into the word.
286 MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const {
287  int col = 0;
288  for (unsigned i = 0; i < index; ++i) {
289  col += state_[i];
290  }
291  int row = col + state_[index] - 1;
292  return MATRIX_COORD(col, row);
293 }
294 
295 // Sets the entries for the given index from the BLOB_CHOICE, assuming
296 // unit fragment lengths, but setting the state for this index to blob_count.
297 void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) {
298  unichar_ids_[index] = blob_choice->unichar_id();
299  script_pos_[index] = tesseract::SP_NORMAL;
300  state_[index] = blob_count;
301  certainties_[index] = blob_choice->certainty();
302 }
303 
310  for (unsigned i = 0; i < length_; ++i) {
311  if (unichar_ids_[i] == unichar_id) {
312  return true;
313  }
314  }
315  return false;
316 }
317 
325 void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) {
326  ASSERT_HOST(start + num <= length_);
327  // Accumulate the states to account for the merged blobs.
328  for (int i = 0; i < num; ++i) {
329  if (start > 0) {
330  state_[start - 1] += state_[start + i];
331  } else if (start + num < length_) {
332  state_[start + num] += state_[start + i];
333  }
334  }
335  for (unsigned i = start; i + num < length_; ++i) {
336  unichar_ids_[i] = unichar_ids_[i + num];
337  script_pos_[i] = script_pos_[i + num];
338  state_[i] = state_[i + num];
339  certainties_[i] = certainties_[i + num];
340  }
341  length_ -= num;
342 }
343 
350  for (unsigned i = 0; i < length_ / 2; ++i) {
351  UNICHAR_ID tmp_id = unichar_ids_[i];
352  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
353  unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
354  }
355  if (length_ % 2 != 0) {
356  unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]);
357  }
358 }
359 
367 void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const {
368  *start = 0;
369  *end = length();
370  while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
371  (*start)++;
372  }
373  while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
374  (*end)--;
375  }
376 }
377 
378 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
379  int end = length();
380  while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
382  end--;
383  }
384  int start = 0;
385  while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&
387  start++;
388  }
389  *pstart = start;
390  *pend = end;
391 }
392 
393 WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const {
394  ASSERT_HOST(start <= length_);
395  ASSERT_HOST(end <= length_);
396  if (end < start) {
397  end = start;
398  }
399  WERD_CHOICE retval(unicharset_, end - start);
400  for (auto i = start; i < end; i++) {
401  retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
402  }
403  return retval;
404 }
405 
412  for (unsigned i = 0; i < length_; ++i) {
413  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
415  return true;
416  }
417  }
418  return false;
419 }
420 
427 void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const {
428  *word_str = "";
429  if (word_lengths_str != nullptr) {
430  *word_lengths_str = "";
431  }
432  for (unsigned i = 0; i < length_; ++i) {
433  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
434  *word_str += ch;
435  if (word_lengths_str != nullptr) {
436  *word_lengths_str += (char)strlen(ch);
437  }
438  }
439 }
440 
447 void WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating,
448  float certainty) {
449  if (length_ == reserved_) {
450  this->double_the_size();
451  }
452  this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty);
453 }
454 
463  ASSERT_HOST(unicharset_ == second.unicharset_);
464  while (reserved_ < length_ + second.length()) {
465  this->double_the_size();
466  }
467  const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
468  for (unsigned i = 0; i < second.length(); ++i) {
469  unichar_ids_[length_ + i] = other_unichar_ids[i];
470  state_[length_ + i] = second.state_[i];
471  certainties_[length_ + i] = second.certainties_[i];
472  script_pos_[length_ + i] = second.BlobPosition(i);
473  }
474  length_ += second.length();
475  if (second.adjust_factor_ > adjust_factor_) {
476  adjust_factor_ = second.adjust_factor_;
477  }
478  rating_ += second.rating(); // add ratings
479  if (second.certainty() < certainty_) { // take min
480  certainty_ = second.certainty();
481  }
482  if (second.dangerous_ambig_found_) {
483  dangerous_ambig_found_ = true;
484  }
485  if (permuter_ == NO_PERM) {
486  permuter_ = second.permuter();
487  } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) {
488  permuter_ = COMPOUND_PERM;
489  }
490  return *this;
491 }
492 
500  while (reserved_ < source.length()) {
501  this->double_the_size();
502  }
503 
504  unicharset_ = source.unicharset_;
505  const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
506  for (unsigned i = 0; i < source.length(); ++i) {
507  unichar_ids_[i] = other_unichar_ids[i];
508  state_[i] = source.state_[i];
509  certainties_[i] = source.certainties_[i];
510  script_pos_[i] = source.BlobPosition(i);
511  }
512  length_ = source.length();
513  adjust_factor_ = source.adjust_factor_;
514  rating_ = source.rating();
515  certainty_ = source.certainty();
516  min_x_height_ = source.min_x_height();
517  max_x_height_ = source.max_x_height();
518  permuter_ = source.permuter();
519  dangerous_ambig_found_ = source.dangerous_ambig_found_;
520  return *this;
521 }
522 
523 // Sets up the script_pos_ member using the blobs_list to get the bln
524 // bounding boxes, *this to get the unichars, and this->unicharset
525 // to get the target positions. If small_caps is true, sub/super are not
526 // considered, but dropcaps are.
527 // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
528 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
529  // Initialize to normal.
530  for (unsigned i = 0; i < length_; ++i) {
531  script_pos_[i] = tesseract::SP_NORMAL;
532  }
533  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
534  return;
535  }
536 
537  unsigned position_counts[4] = {0, 0, 0, 0};
538 
539  int chunk_index = 0;
540  for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
541  TBLOB *tblob = word->blobs[chunk_index];
542  int uni_id = unichar_id(blob_index);
543  TBOX blob_box = tblob->bounding_box();
544  if (!state_.empty()) {
545  for (int i = 1; i < state_[blob_index]; ++i) {
546  ++chunk_index;
547  tblob = word->blobs[chunk_index];
548  blob_box += tblob->bounding_box();
549  }
550  }
551  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id);
552  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
553  script_pos_[blob_index] = tesseract::SP_NORMAL;
554  }
555  position_counts[script_pos_[blob_index]]++;
556  }
557  // If almost everything looks like a superscript or subscript,
558  // we most likely just got the baseline wrong.
559  if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||
560  4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {
561  if (debug >= 2) {
562  tprintf(
563  "Most characters of %s are subscript or superscript.\n"
564  "That seems wrong, so I'll assume we got the baseline wrong\n",
565  unichar_string().c_str());
566  }
567  for (unsigned i = 0; i < length_; i++) {
568  ScriptPos sp = script_pos_[i];
570  ASSERT_HOST(position_counts[sp] > 0);
571  position_counts[sp]--;
572  position_counts[tesseract::SP_NORMAL]++;
573  script_pos_[i] = tesseract::SP_NORMAL;
574  }
575  }
576  }
577 
578  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
579  tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
580  int chunk_index = 0;
581  for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {
582  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
583  TBLOB *tblob = word->blobs[chunk_index];
584  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
585  }
586  chunk_index += state_.empty() ? 1 : state_[blob_index];
587  }
588  }
589 }
590 
591 // Sets all the script_pos_ positions to the given position.
593  for (unsigned i = 0; i < length_; ++i) {
594  script_pos_[i] = position;
595  }
596 }
597 
598 /* static */
599 ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
600  const TBOX &blob_box, UNICHAR_ID unichar_id) {
602  int top = blob_box.top();
603  int bottom = blob_box.bottom();
604  int min_bottom, max_bottom, min_top, max_top;
605  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
606 
607  int sub_thresh_top = min_top - kMinSubscriptOffset;
608  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
609  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
610  if (bottom <= kMaxDropCapBottom) {
611  retval = tesseract::SP_DROPCAP;
612  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
613  retval = tesseract::SP_SUBSCRIPT;
614  } else if (bottom > sup_thresh_bot) {
615  retval = tesseract::SP_SUPERSCRIPT;
616  }
617 
618  if (print_debug) {
619  const char *pos = ScriptPosToString(retval);
620  tprintf(
621  "%s Character %s[bot:%d top: %d] "
622  "bot_range[%d,%d] top_range[%d, %d] "
623  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
624  pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top,
625  max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot);
626  }
627  return retval;
628 }
629 
630 // Returns the script-id (eg Han) of the dominant script in the word.
632  unsigned max_script = unicharset_->get_script_table_size();
633  std::vector<unsigned> sid(max_script);
634  for (unsigned x = 0; x < length_; ++x) {
635  int script_id = unicharset_->get_script(unichar_id(x));
636  sid[script_id]++;
637  }
638  if (unicharset_->han_sid() != unicharset_->null_sid()) {
639  // Add the Hiragana & Katakana counts to Han and zero them out.
640  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
641  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
642  sid[unicharset_->hiragana_sid()] = 0;
643  }
644  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
645  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
646  sid[unicharset_->katakana_sid()] = 0;
647  }
648  }
649  // Note that high script ID overrides lower one on a tie, thus biasing
650  // towards non-Common script (if sorted that way in unicharset file).
651  unsigned max_sid = 0;
652  for (unsigned x = 1; x < max_script; x++) {
653  if (sid[x] >= sid[max_sid]) {
654  max_sid = x;
655  }
656  }
657  if (sid[max_sid] < length_ / 2) {
658  max_sid = unicharset_->null_sid();
659  }
660  return max_sid;
661 }
662 
663 // Fixes the state_ for a chop at the given blob_posiiton.
664 void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
665  int total_chunks = 0;
666  for (unsigned i = 0; i < length_; ++i) {
667  total_chunks += state_[i];
668  if (total_chunks > blob_position) {
669  ++state_[i];
670  return;
671  }
672  }
673 }
674 
675 // Returns the sum of all the state elements, being the total number of blobs.
676 unsigned WERD_CHOICE::TotalOfStates() const {
677  unsigned total_chunks = 0;
678  for (unsigned i = 0; i < length_; ++i) {
679  total_chunks += state_[i];
680  }
681  return total_chunks;
682 }
683 
689 void WERD_CHOICE::print(const char *msg) const {
690  tprintf("%s : ", msg);
691  for (unsigned i = 0; i < length_; ++i) {
692  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
693  }
694  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
695  adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
696  tprintf("pos");
697  for (unsigned i = 0; i < length_; ++i) {
698  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
699  }
700  tprintf("\nstr");
701  for (unsigned i = 0; i < length_; ++i) {
702  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
703  }
704  tprintf("\nstate:");
705  for (unsigned i = 0; i < length_; ++i) {
706  tprintf("\t%d ", state_[i]);
707  }
708  tprintf("\nC");
709  for (unsigned i = 0; i < length_; ++i) {
710  tprintf("\t%.3f", certainties_[i]);
711  }
712  tprintf("\n");
713 }
714 
715 // Prints the segmentation state with an introductory message.
716 void WERD_CHOICE::print_state(const char *msg) const {
717  tprintf("%s", msg);
718  for (unsigned i = 0; i < length_; ++i) {
719  tprintf(" %d", state_[i]);
720  }
721  tprintf("\n");
722 }
723 
724 #ifndef GRAPHICS_DISABLED
725 
726 // Displays the segmentation state of *this (if not the same as the last
727 // one displayed) and waits for a click in the window.
729  // Number of different colors to draw with.
730  const int kNumColors = 6;
731  static ScrollView *segm_window = nullptr;
732  // Check the state against the static prev_drawn_state.
733  static std::vector<int> prev_drawn_state;
734  bool already_done = prev_drawn_state.size() == length_;
735  if (!already_done) {
736  prev_drawn_state.clear();
737  prev_drawn_state.resize(length_);
738  }
739  for (unsigned i = 0; i < length_; ++i) {
740  if (prev_drawn_state[i] != state_[i]) {
741  already_done = false;
742  }
743  prev_drawn_state[i] = state_[i];
744  }
745  if (already_done || word->blobs.empty()) {
746  return;
747  }
748 
749  // Create the window if needed.
750  if (segm_window == nullptr) {
751  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true);
752  } else {
753  segm_window->Clear();
754  }
755 
756  TBOX bbox;
757  int blob_index = 0;
758  for (unsigned c = 0; c < length_; ++c) {
759  auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
760  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
761  TBLOB *blob = word->blobs[blob_index];
762  bbox += blob->bounding_box();
763  blob->plot(segm_window, color, color);
764  }
765  }
766  segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom());
767  segm_window->Update();
768  segm_window->Wait();
769 }
770 
771 #endif // !GRAPHICS_DISABLED
772 
774  const UNICHARSET *uchset = word1.unicharset();
775  if (word2.unicharset() != uchset) {
776  return false;
777  }
778  unsigned w1start, w1end;
779  word1.punct_stripped(&w1start, &w1end);
780  unsigned w2start, w2end;
781  word2.punct_stripped(&w2start, &w2end);
782  if (w1end - w1start != w2end - w2start) {
783  return false;
784  }
785  for (unsigned i = 0; i < w1end - w1start; i++) {
786  if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
787  uchset->to_lower(word2.unichar_id(w2start + i))) {
788  return false;
789  }
790  }
791  return true;
792 }
793 
804 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings,
805  const UNICHARSET &current_unicharset) {
806  if (ratings->empty()) {
807  tprintf("%s:<none>\n", msg);
808  return;
809  }
810  if (*msg != '\0') {
811  tprintf("%s\n", msg);
812  }
813  BLOB_CHOICE_IT c_it;
814  c_it.set_to_list(ratings);
815  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
816  c_it.data()->print(&current_unicharset);
817  if (!c_it.at_last()) {
818  tprintf("\n");
819  }
820  }
821  tprintf("\n");
822  fflush(stdout);
823 }
824 
825 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
const int kMaxDropCapBottom
Definition: ratngs.cpp:43
const double kMinXHeightMatch
Definition: ratngs.cpp:48
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ SP_SUBSCRIPT
Definition: ratngs.h:250
@ SP_DROPCAP
Definition: ratngs.h:250
@ SP_NORMAL
Definition: ratngs.h:250
@ SP_SUPERSCRIPT
Definition: ratngs.h:250
const double kMaxBaselineDrift
Definition: ratngs.cpp:51
const double kMaxOverlapDenominator
Definition: ratngs.cpp:45
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:110
int UNICHAR_ID
Definition: unichar.h:36
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:193
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177
BlobChoiceClassifier
Definition: ratngs.h:48
@ COMPOUND_PERM
Definition: ratngs.h:244
@ NO_PERM
Definition: ratngs.h:232
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:773
const int kMinSubscriptOffset
Definition: ratngs.cpp:39
const int kMinSuperscriptOffset
Definition: ratngs.cpp:41
const int kBlnBaselineOffset
Definition: normalis.h:34
T get(ICOORD pos) const
Definition: matrix.h:268
void put(ICOORD pos, const T &thing)
Definition: matrix.h:260
TBOX bounding_box() const
Definition: blobs.cpp:466
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:509
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
int16_t fontinfo_id2() const
Definition: ratngs.h:93
float certainty() const
Definition: ratngs.h:87
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81
int script_id() const
Definition: ratngs.h:118
float min_xheight() const
Definition: ratngs.h:124
float yshift() const
Definition: ratngs.h:130
float max_xheight() const
Definition: ratngs.h:127
int16_t fontinfo_id() const
Definition: ratngs.h:90
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:152
float rating() const
Definition: ratngs.h:84
void punct_stripped(unsigned *start_core, unsigned *end_core) const
Definition: ratngs.cpp:367
float max_x_height() const
Definition: ratngs.h:320
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:411
unsigned TotalOfStates() const
Definition: ratngs.cpp:676
float certainty() const
Definition: ratngs.h:311
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:499
int GetTopScriptID() const
Definition: ratngs.cpp:631
MATRIX_COORD MatrixCoord(unsigned index) const
Definition: ratngs.cpp:286
void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:297
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:309
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:259
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
Definition: ratngs.cpp:462
const std::vector< UNICHAR_ID > & unichar_ids() const
Definition: ratngs.h:292
WERD_CHOICE shallow_copy(unsigned start, unsigned end) const
Definition: ratngs.cpp:393
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
void print_state(const char *msg) const
Definition: ratngs.cpp:716
uint8_t permuter() const
Definition: ratngs.h:327
static const float kBadRating
Definition: ratngs.h:256
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:415
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:378
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:349
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:424
void init(unsigned reserved)
Definition: ratngs.h:382
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:368
BLOB_CHOICE_LIST * blob_choices(unsigned index, MATRIX *ratings) const
Definition: ratngs.cpp:274
float min_x_height() const
Definition: ratngs.h:317
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
unsigned length() const
Definition: ratngs.h:283
const char * permuter_name() const
Definition: ratngs.cpp:267
void print() const
Definition: ratngs.h:557
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:599
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:728
void SetAllScriptPositions(ScriptPos position)
Definition: ratngs.cpp:592
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:664
float rating() const
Definition: ratngs.h:308
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:528
void remove_unichar_ids(unsigned index, int num)
Definition: ratngs.cpp:325
std::string & unichar_string()
Definition: ratngs.h:515
ScriptPos BlobPosition(unsigned index) const
Definition: ratngs.h:302
TDimension left() const
Definition: rect.h:82
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void operator=(const ELIST_LINK &)
Definition: elst.h:100
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int han_sid() const
Definition: unicharset.h:932
int get_script_table_size() const
Definition: unicharset.h:882
int hiragana_sid() const
Definition: unicharset.h:935
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713
int null_sid() const
Definition: unicharset.h:917
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586
int katakana_sid() const
Definition: unicharset.h:938
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:287
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:731
void void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:759
static void Update()
Definition: scrollview.cpp:713