tesseract  5.0.0
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4  * and an iterator class to iterate over the words.
5  * Main purposes:
6  * Easy way to iterate over the words without a 3-nested loop.
7  * Holds data used during word recognition.
8  * Holds information about alternative spacing paths.
9  * Author: Phil Cheatle
10  *
11  * (C) Copyright 1992, Hewlett-Packard Ltd.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21  *
22  **********************************************************************/
23 
24 #include "pageres.h"
25 
26 #include "blamer.h" // for BlamerBundle
27 #include "blobs.h" // for TWERD, TBLOB
28 #include "boxword.h" // for BoxWord
29 #include "errcode.h" // for ASSERT_HOST
30 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
31 #include "ocrrow.h" // for ROW, ROW_IT
32 #include "pdblock.h" // for PDBLK
33 #include "polyblk.h" // for POLY_BLOCK
34 #include "seam.h" // for SEAM, start_seam_list
35 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
36 #include "tprintf.h" // for tprintf
37 
38 #include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY
39 
40 #include <cassert> // for assert
41 #include <cstdint> // for INT32_MAX
42 #include <cstring> // for strlen
43 
44 struct Pix;
45 
46 namespace tesseract {
47 
48 // Gain factor for computing thresholds that determine the ambiguity of a
49 // word.
50 static const double kStopperAmbiguityThresholdGain = 8.0;
51 // Constant offset for computing thresholds that determine the ambiguity of a
52 // word.
53 static const double kStopperAmbiguityThresholdOffset = 1.5;
54 // Max number of broken pieces to associate.
56 // Max ratio of word box height to line size to allow it to be processed as
57 // a line with other words.
58 const double kMaxWordSizeRatio = 1.25;
59 // Max ratio of line box height to line size to allow a new word to be added.
60 const double kMaxLineSizeRatio = 1.25;
61 // Max ratio of word gap to line size to allow a new word to be added.
62 const double kMaxWordGapRatio = 2.0;
63 
64 // Computes and returns a threshold of certainty difference used to determine
65 // which words to keep, based on the adjustment factors of the two words.
66 // TODO(rays) This is horrible. Replace with an enhanced params training model.
67 static double StopperAmbigThreshold(double f1, double f2) {
68  return (f2 - f1) * kStopperAmbiguityThresholdGain -
69  kStopperAmbiguityThresholdOffset;
70 }
71 
72 /*************************************************************************
73  * PAGE_RES::PAGE_RES
74  *
75  * Constructor for page results
76  *************************************************************************/
77 PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,
78  WERD_CHOICE **prev_word_best_choice_ptr) {
79  Init();
80  BLOCK_IT block_it(the_block_list);
81  BLOCK_RES_IT block_res_it(&block_res_list);
82  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
83  block_res_it.add_to_end(
84  new BLOCK_RES(merge_similar_words, block_it.data()));
85  }
86  prev_word_best_choice = prev_word_best_choice_ptr;
87 }
88 
89 /*************************************************************************
90  * BLOCK_RES::BLOCK_RES
91  *
92  * Constructor for BLOCK results
93  *************************************************************************/
94 
95 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
96  ROW_IT row_it(the_block->row_list());
97  ROW_RES_IT row_res_it(&row_res_list);
98 
99  char_count = 0;
100  rej_count = 0;
101  font_class = -1; // not assigned
102  x_height = -1.0;
103  font_assigned = false;
104  row_count = 0;
105 
106  block = the_block;
107 
108  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
109  row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
110  }
111 }
112 
113 /*************************************************************************
114  * ROW_RES::ROW_RES
115  *
116  * Constructor for ROW results
117  *************************************************************************/
118 
// Builds the row results for the_row: one WERD_RES is created per WERD and
// appended to word_res_list. Whenever the following word is to be joined to
// the current one, an extra "combination" WERD_RES (built from a copy of the
// first word of the run) is appended just before the first part, and every
// later part of the run is merged into it with copy_on().
// With merge_similar_words the join decision is computed here from box sizes
// and gaps and recorded in the next word's W_FUZZY_NON flag; otherwise the
// existing W_FUZZY_NON flag of the next word is used as the decision.
119 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
120  WERD_IT word_it(the_row->word_list());
121  WERD_RES_IT word_res_it(&word_res_list);
122  WERD_RES *combo = nullptr; // current combination of fuzzies
123  WERD *copy_word;
124 
125  char_count = 0;
126  rej_count = 0;
// NOTE(review): the embedded listing numbering skips 127 here, so a statement
// appears to have been lost in extraction - restore it from the real source.
128 
129  row = the_row;
// True when the word handled in the NEXT iteration must join the current run.
130  bool add_next_word = false;
// Bounding box of the run accumulated so far (merge_similar_words mode only).
131  TBOX union_box;
132  float line_height =
133  the_row->x_height() + the_row->ascenders() - the_row->descenders();
134  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
135  auto *word_res = new WERD_RES(word_it.data());
136  word_res->x_height = the_row->x_height();
137  if (add_next_word) {
138  ASSERT_HOST(combo != nullptr);
139  // We are adding this word to the combination.
140  word_res->part_of_combo = true;
141  combo->copy_on(word_res);
142  } else if (merge_similar_words) {
// Start of a potential new run: a word may start one only if it is not a
// repeated-char word and its box is not too tall for the line.
143  union_box = word_res->word->bounding_box();
144  add_next_word = !word_res->word->flag(W_REP_CHAR) &&
145  union_box.height() <= line_height * kMaxWordSizeRatio;
146  word_res->odd_size = !add_next_word;
147  }
// NOTE(review): on the last word this presumably wraps round to the first
// word of the list - confirm against the list iterator implementation.
148  WERD *next_word = word_it.data_relative(1);
149  if (merge_similar_words) {
150  if (add_next_word && !next_word->flag(W_REP_CHAR)) {
151  // Next word will be added on if all of the following are true:
152  // Not a rep char.
153  // Box height small enough.
154  // Union box height small enough.
155  // Horizontal gap small enough.
156  TBOX next_box = next_word->bounding_box();
157  int prev_right = union_box.right();
158  union_box += next_box;
159  if (next_box.height() > line_height * kMaxWordSizeRatio ||
160  union_box.height() > line_height * kMaxLineSizeRatio ||
161  next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
162  add_next_word = false;
163  }
164  }
// Record the decision on the next word. Note that when next_word IS a
// repeated-char word the block above is skipped, so add_next_word keeps
// whatever value it already had.
165  next_word->set_flag(W_FUZZY_NON, add_next_word);
166  } else {
167  add_next_word = next_word->flag(W_FUZZY_NON);
168  }
169  if (add_next_word) {
170  if (combo == nullptr) {
// First word of a new run: create the combination from a copy of this word
// and append it to the list ahead of the word itself.
171  copy_word = new WERD;
172  *copy_word = *(word_it.data()); // deep copy
173  combo = new WERD_RES(copy_word);
174  combo->x_height = the_row->x_height();
175  combo->combination = true;
176  word_res_it.add_to_end(combo);
177  }
178  word_res->part_of_combo = true;
179  } else {
// The run (if any) ends with this word.
180  combo = nullptr;
181  }
182  word_res_it.add_to_end(word_res);
183  }
184 }
185 
187  this->ELIST_LINK::operator=(source);
188  Clear();
189  if (source.combination) {
190  word = new WERD;
191  *word = *(source.word); // deep copy
192  } else {
193  word = source.word; // pt to same word
194  }
195  if (source.bln_boxes != nullptr) {
196  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
197  }
198  if (source.chopped_word != nullptr) {
199  chopped_word = new TWERD(*source.chopped_word);
200  }
201  if (source.rebuild_word != nullptr) {
202  rebuild_word = new TWERD(*source.rebuild_word);
203  }
204  // TODO(rays) Do we ever need to copy the seam_array?
205  blob_row = source.blob_row;
206  denorm = source.denorm;
207  if (source.box_word != nullptr) {
208  box_word = new tesseract::BoxWord(*source.box_word);
209  }
210  best_state = source.best_state;
211  correct_text = source.correct_text;
212  blob_widths = source.blob_widths;
213  blob_gaps = source.blob_gaps;
214  // None of the uses of operator= require the ratings matrix to be copied,
215  // so don't as it would be really slow.
216 
217  // Copy the cooked choices.
218  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
219  WERD_CHOICE_IT wc_dest_it(&best_choices);
220  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
221  const WERD_CHOICE *choice = wc_it.data();
222  wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
223  }
224  if (!wc_dest_it.empty()) {
225  wc_dest_it.move_to_first();
226  best_choice = wc_dest_it.data();
227  } else {
228  best_choice = nullptr;
229  }
230 
231  if (source.raw_choice != nullptr) {
232  raw_choice = new WERD_CHOICE(*source.raw_choice);
233  } else {
234  raw_choice = nullptr;
235  }
236  if (source.ep_choice != nullptr) {
237  ep_choice = new WERD_CHOICE(*source.ep_choice);
238  } else {
239  ep_choice = nullptr;
240  }
241  reject_map = source.reject_map;
242  combination = source.combination;
243  part_of_combo = source.part_of_combo;
244  CopySimpleFields(source);
245  if (source.blamer_bundle != nullptr) {
246  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
247  }
248  return *this;
249 }
250 
251 // Copies basic fields that don't involve pointers that might be useful
252 // to copy when making one WERD_RES from another.
254  tess_failed = source.tess_failed;
255  tess_accepted = source.tess_accepted;
257  done = source.done;
259  small_caps = source.small_caps;
260  odd_size = source.odd_size;
261  fontinfo = source.fontinfo;
262  fontinfo2 = source.fontinfo2;
265  x_height = source.x_height;
266  caps_height = source.caps_height;
268  guessed_x_ht = source.guessed_x_ht;
270  reject_spaces = source.reject_spaces;
271  uch_set = source.uch_set;
272  tesseract = source.tesseract;
273 }
274 
275 // Initializes a blank (default constructed) WERD_RES from one that has
276 // already been recognized.
277 // Use SetupFor*Recognition afterwards to complete the setup and make
278 // it ready for a retry recognition.
280  word = source.word;
281  CopySimpleFields(source);
282  if (source.blamer_bundle != nullptr) {
283  blamer_bundle = new BlamerBundle();
285  }
286 }
287 
288 // Sets up the members used in recognition: bln_boxes, chopped_word,
289 // seam_array, denorm. Returns false if
290 // the word is empty and sets up fake results. If use_body_size is
291 // true and row->body_size is set, then body_size will be used for
292 // blob normalization instead of xheight + ascrise. This flag is for
293 // those languages that are using CJK pitch model and thus it has to
294 // be true if and only if tesseract->textord_use_cjk_fp_model is
295 // true.
296 // If allow_detailed_fx is true, the feature extractor will receive fine
297 // precision outline information, allowing smoother features and better
298 // features on low resolution images.
299 // The norm_mode_hint sets the default mode for normalization in absence
300 // of any of the above flags.
301 // norm_box is used to override the word bounding box to determine the
302 // normalization scale and offset.
303 // Returns false if the word is empty and sets up fake results.
// Prepares this word for recognition; see the comment block above for the
// meaning of the individual flags. Returns false (after SetupFake) when the
// word cannot be recognized, true otherwise.
304 bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,
305  tesseract::Tesseract *tess, Image pix,
306  int norm_mode, const TBOX *norm_box,
307  bool numeric_mode, bool use_body_size,
308  bool allow_detailed_fx, ROW *row,
309  const BLOCK *block) {
// norm_mode arrives as a plain int and is interpreted as an OcrEngineMode.
310  auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);
311  tesseract = tess;
312  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
// Fake results are used when the word has no blobs (except in LSTM-only
// mode, where an empty blob list is accepted) or when the enclosing block
// has a polygon that is not text.
313  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
314  word->cblob_list()->empty()) ||
315  (pb != nullptr && !pb->IsText())) {
316  // Empty words occur when all the blobs have been moved to the rej_blobs
317  // list, which seems to occur frequently in junk.
318  SetupFake(unicharset_in);
319  word->set_flag(W_REP_CHAR, false);
320  return false;
321  }
322  ClearResults();
323  SetupWordScript(unicharset_in);
324  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
// Use the row's body size for normalization only when requested AND the row
// actually has a positive body size; otherwise fall back to x_height.
325  float word_xheight =
326  use_body_size && row != nullptr && row->body_size() > 0.0f
327  ? row->body_size()
328  : x_height;
329  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
330  word_xheight, baseline_shift, numeric_mode,
331  norm_mode_hint, norm_box, &denorm);
332  blob_row = row;
333  SetupBasicsFromChoppedWord(unicharset_in);
// NOTE(review): the embedded listing numbering skips 334 here, so a statement
// appears to have been lost in extraction - restore it from the real source.
335  int num_blobs = chopped_word->NumBlobs();
// NOTE(review): kWordrecMaxNumJoinChunks is not defined in the visible part
// of this listing - presumably the constant that belongs to the orphaned
// "Max number of broken pieces to associate" comment near the top; confirm.
336  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
337  tess_failed = false;
338  return true;
339 }
340 
341 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
342 // accumulators from a made chopped word. We presume the fields are already
343 // empty.
349 }
350 
351 // Sets up the members used in recognition for an empty recognition result:
352 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
// Gives the word a complete but fake recognition result and marks it as
// failed and done. When the word has blobs, each blob gets a box in box_word
// and a default-constructed BLOB_CHOICE; when it has none, a single "bad"
// WERD_CHOICE is logged as the only cooked choice.
353 void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {
354  ClearResults();
355  SetupWordScript(unicharset_in);
356  chopped_word = new TWERD;
357  rebuild_word = new TWERD;
// NOTE(review): the embedded listing numbering skips 358-359 here. box_word
// is dereferenced below without any visible allocation, so the lost lines
// presumably create it - restore them from the real source.
360  int blob_count = word->cblob_list()->length();
361  if (blob_count > 0) {
// Temporary array only; the BLOB_CHOICE objects themselves are handed to
// FakeClassifyWord and just the array is released afterwards.
362  auto **fake_choices = new BLOB_CHOICE *[blob_count];
363  // For non-text blocks, just pass any blobs through to the box_word
364  // and call the word failed with a fake classification.
365  C_BLOB_IT b_it(word->cblob_list());
366  int blob_id = 0;
367  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
368  TBOX box = b_it.data()->bounding_box();
369  box_word->InsertBox(box_word->length(), box);
370  fake_choices[blob_id++] = new BLOB_CHOICE;
371  }
372  FakeClassifyWord(blob_count, fake_choices);
373  delete[] fake_choices;
374  } else {
// Note: this local WERD_CHOICE named "word" shadows the member "word" for
// the rest of this branch.
375  auto *word = new WERD_CHOICE(&unicharset_in);
376  word->make_bad();
// NOTE(review): the embedded listing numbering skips 377 here, so a statement
// appears to have been lost in extraction - restore it from the real source.
378  // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
379  LogNewCookedChoice(1, false, word);
380  }
381  tess_failed = true;
382  done = true;
383 }
384 
386  uch_set = &uch;
387  int script = uch.default_sid();
388  word->set_script_id(script);
390  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
391 }
392 
393 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
395  if (blamer_bundle != nullptr) {
397  }
398 }
399 
400 // Computes the blob_widths and blob_gaps from the chopped_word.
402  blob_widths.clear();
403  blob_gaps.clear();
404  int num_blobs = chopped_word->NumBlobs();
405  for (int b = 0; b < num_blobs; ++b) {
406  TBLOB *blob = chopped_word->blobs[b];
407  TBOX box = blob->bounding_box();
408  blob_widths.push_back(box.width());
409  if (b + 1 < num_blobs) {
410  blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -
411  box.right());
412  }
413  }
414 }
415 
416 // Updates internal data to account for a new SEAM (chop) at the given
417 // blob_number. Fixes the ratings matrix and states in the choices, as well
418 // as the blob widths and gaps.
// Inserts seam into seam_array at position blob_number and, when a ratings
// matrix exists, enlarges it and updates the segmentation state of the raw
// choice and of every cooked choice for the split at blob_number.
419 void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
420  // Insert the seam into the SEAMS array.
421  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
422  seam_array.insert(seam_array.begin() + blob_number, seam);
423  if (ratings != nullptr) {
424  // Expand the ratings matrix.
// The returned matrix replaces the old pointer.
425  ratings = ratings->ConsumeAndMakeBigger(blob_number);
426  // Fix all the segmentation states.
427  if (raw_choice != nullptr) {
428  raw_choice->UpdateStateForSplit(blob_number);
429  }
430  WERD_CHOICE_IT wc_it(&best_choices);
431  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
432  WERD_CHOICE *choice = wc_it.data();
433  choice->UpdateStateForSplit(blob_number);
434  }
// NOTE(review): the embedded listing numbering skips 435 here. The header
// comment promises that blob widths and gaps are fixed too, and nothing
// visible does that, so the lost statement presumably performs that update -
// restore it from the real source.
436  }
437 }
438 
439 // Returns true if all the word choices except the first have adjust_factors
440 // worse than the given threshold.
442  // The choices are not changed by this iteration.
443  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
444  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
445  WERD_CHOICE *choice = wc_it.data();
446  if (choice->adjust_factor() <= threshold) {
447  return false;
448  }
449  }
450  return true;
451 }
452 
453 // Returns true if the current word is ambiguous (by number of answers or
454 // by dangerous ambigs.)
456  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
457 }
458 
459 // Returns true if the ratings matrix size matches the sum of each of the
460 // segmentation states.
462  unsigned ratings_dim = ratings->dimension();
463  if (raw_choice->TotalOfStates() != ratings_dim) {
464  tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
465  raw_choice->TotalOfStates(), ratings_dim);
466  return false;
467  }
468  WERD_CHOICE_IT it(&best_choices);
469  unsigned index = 0;
470  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
471  WERD_CHOICE *choice = it.data();
472  if (choice->TotalOfStates() != ratings_dim) {
473  tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",
474  index, choice->TotalOfStates(), ratings_dim);
475  return false;
476  }
477  }
478  return true;
479 }
480 
481 // Prints a list of words found if debug is true or the word result matches
482 // the word_to_debug.
483 void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {
484  if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' &&
485  best_choice != nullptr &&
486  best_choice->unichar_string() == std::string(word_to_debug))) {
487  if (raw_choice != nullptr) {
488  raw_choice->print("\nBest Raw Choice");
489  }
490 
491  WERD_CHOICE_IT it(&best_choices);
492  int index = 0;
493  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
494  WERD_CHOICE *choice = it.data();
495  std::string label;
496  label += "\nCooked Choice #" + std::to_string(index);
497  choice->print(label.c_str());
498  }
499  }
500 }
501 
502 // Prints the top choice along with the accepted/done flags.
// Prints one line with the accepted/adaptable/done flags, followed by the
// best choice printed under the label msg, or "<Null choice>" when there is
// no best choice.
503 void WERD_RES::DebugTopChoice(const char *msg) const {
504  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
// NOTE(review): the embedded listing numbering skips 505 here. The tprintf
// call above is left unterminated and two of its three %d arguments are
// missing - restore the lost line from the real source.
506  if (best_choice == nullptr) {
507  tprintf("<Null choice>\n");
508  } else {
509  best_choice->print(msg);
510  }
511 }
512 
513 // Removes from best_choices all choices which are not within a reasonable
514 // range of the best choice.
515 // TODO(rays) incorporate the information used here into the params training
516 // re-ranker, in place of this heuristic that is based on the previous
517 // adjustment factor.
518 void WERD_RES::FilterWordChoices(int debug_level) {
519  if (best_choice == nullptr || best_choices.singleton()) {
520  return;
521  }
522 
523  if (debug_level >= 2) {
524  best_choice->print("\nFiltering against best choice");
525  }
526  WERD_CHOICE_IT it(&best_choices);
527  int index = 0;
528  for (it.forward(); !it.at_first(); it.forward(), ++index) {
529  WERD_CHOICE *choice = it.data();
530  float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
531  choice->adjust_factor());
532  // i, j index the blob choice in choice, best_choice.
533  // chunk is an index into the chopped_word blobs (AKA chunks).
534  // Since the two words may use different segmentations of the chunks, we
535  // iterate over the chunks to find out whether a comparable blob
536  // classification is much worse than the best result.
537  unsigned i = 0, j = 0, chunk = 0;
538  // Each iteration of the while deals with 1 chunk. On entry choice_chunk
539  // and best_chunk are the indices of the first chunk in the NEXT blob,
540  // i.e. we don't have to increment i, j while chunk < choice_chunk and
541  // best_chunk respectively.
542  auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
543  while (i < choice->length() && j < best_choice->length()) {
544  if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
545  choice->certainty(i) - best_choice->certainty(j) < threshold) {
546  if (debug_level >= 2) {
547  choice->print("WorstCertaintyDiffWorseThan");
548  tprintf(
549  "i %u j %u Choice->Blob[i].Certainty %.4g"
550  " WorstOtherChoiceCertainty %g Threshold %g\n",
551  i, j, choice->certainty(i), best_choice->certainty(j), threshold);
552  tprintf("Discarding bad choice #%d\n", index);
553  }
554  delete it.extract();
555  break;
556  }
557  ++chunk;
558  // If needed, advance choice_chunk to keep up with chunk.
559  while (choice_chunk < chunk && ++i < choice->length()) {
560  choice_chunk += choice->state(i);
561  }
562  // If needed, advance best_chunk to keep up with chunk.
563  while (best_chunk < chunk && ++j < best_choice->length()) {
564  best_chunk += best_choice->state(j);
565  }
566  }
567  }
568 }
569 
570 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
571  float min_rating, float max_rating,
572  float rating_margin,
573  float *thresholds) {
574  int chunk = 0;
575  int end_chunk = best_choice->state(0);
576  int end_raw_chunk = raw_choice->state(0);
577  int raw_blob = 0;
578  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
579  float avg_rating = 0.0f;
580  int num_error_chunks = 0;
581 
582  // For each chunk in best choice blob i, count non-matching raw results.
583  while (chunk < end_chunk) {
584  if (chunk >= end_raw_chunk) {
585  ++raw_blob;
586  end_raw_chunk += raw_choice->state(raw_blob);
587  }
588  if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
589  avg_rating += raw_choice->certainty(raw_blob);
590  ++num_error_chunks;
591  }
592  ++chunk;
593  }
594 
595  if (num_error_chunks > 0) {
596  avg_rating /= num_error_chunks;
597  *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
598  } else {
599  *thresholds = max_rating;
600  }
601 
602  if (*thresholds > max_rating) {
603  *thresholds = max_rating;
604  }
605  if (*thresholds < min_rating) {
606  *thresholds = min_rating;
607  }
608  }
609 }
610 
611 // Saves a copy of the word_choice if it has the best unadjusted rating.
612 // Returns true if the word_choice was the new best.
614  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
615  delete raw_choice;
616  raw_choice = new WERD_CHOICE(*word_choice);
618  return true;
619  }
620  return false;
621 }
622 
623 // Consumes word_choice by adding it to best_choices, (taking ownership) if
624 // the certainty for word_choice is within some distance of the best choice in
625 // best_choices, or by deleting the word_choice and returning false.
626 // The best_choices list is kept in sorted order by rating. Duplicates are
627 // removed, and the list is kept no longer than max_num_choices in length.
628 // Returns true if the word_choice is still a valid pointer.
// Offers word_choice to the sorted best_choices list. Returns true when the
// list has taken ownership of word_choice, and false when word_choice has
// been deleted (too uncertain, a worse duplicate, or no room left).
// best_choice is updated whenever word_choice becomes the first list entry.
629 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
630  WERD_CHOICE *word_choice) {
631  if (best_choice != nullptr) {
632  // Throw out obviously bad choices to save some work.
633  // TODO(rays) Get rid of this! This piece of code produces different
634  // results according to the order in which words are found, which is an
635  // undesirable behavior. It would be better to keep all the choices and
636  // prune them later when more information is available.
637  float max_certainty_delta = StopperAmbigThreshold(
638  best_choice->adjust_factor(), word_choice->adjust_factor());
// Cap the allowed delta at -kStopperAmbiguityThresholdOffset.
639  if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
640  max_certainty_delta = -kStopperAmbiguityThresholdOffset;
641  }
642  if (word_choice->certainty() - best_choice->certainty() <
643  max_certainty_delta) {
644  if (debug) {
645  std::string bad_string;
646  word_choice->string_and_lengths(&bad_string, nullptr);
647  tprintf(
648  "Discarding choice \"%s\" with an overly low certainty"
649  " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
650  bad_string.c_str(), word_choice->certainty(),
// NOTE(review): the embedded listing numbering skips 651 here. The format
// string has four conversions but only three arguments are visible, so the
// lost line presumably supplies the "best choice certainty" value - restore
// it from the real source.
652  max_certainty_delta + best_choice->certainty());
653  }
654  delete word_choice;
655  return false;
656  }
657  }
658 
659  // Insert in the list in order of increasing rating, but knock out worse
660  // string duplicates.
661  WERD_CHOICE_IT it(&best_choices);
662  const std::string &new_str = word_choice->unichar_string();
663  bool inserted = false;
// Number of entries kept so far, including word_choice once inserted.
664  int num_choices = 0;
665  if (!it.empty()) {
666  do {
667  WERD_CHOICE *choice = it.data();
668  if (choice->rating() > word_choice->rating() && !inserted) {
669  // Time to insert.
670  it.add_before_stay_put(word_choice);
671  inserted = true;
672  if (num_choices == 0) {
673  best_choice = word_choice; // This is the new best.
674  }
675  ++num_choices;
676  }
677  if (choice->unichar_string() == new_str) {
678  if (inserted) {
679  // New is better.
680  delete it.extract();
681  } else {
682  // Old is better.
683  if (debug) {
684  tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
685  new_str.c_str(), word_choice->rating(), choice->rating());
686  }
687  delete word_choice;
688  return false;
689  }
690  } else {
// A different string: keep it only while the list is within the size limit.
691  ++num_choices;
692  if (num_choices > max_num_choices) {
693  delete it.extract();
694  }
695  }
696  it.forward();
697  } while (!it.at_first());
698  }
// Not better than any existing entry: append it if there is still room.
699  if (!inserted && num_choices < max_num_choices) {
700  it.add_to_end(word_choice);
701  inserted = true;
702  if (num_choices == 0) {
703  best_choice = word_choice; // This is the new best.
704  }
705  }
706  if (debug) {
707  if (inserted) {
708  tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
709  } else {
710  tprintf("Poor");
711  }
712  word_choice->print(" Word Choice");
713  }
714  if (!inserted) {
715  delete word_choice;
716  return false;
717  }
718  return true;
719 }
720 
721 // Simple helper moves the ownership of the pointer data from src to dest,
722 // first deleting anything in dest, and nulling out src afterwards.
// Transfers ownership of the object held in *src to *dest: whatever *dest
// owned before is deleted, *dest takes over the pointer, and *src is nulled.
// Moving a slot onto itself is a no-op, so the object it owns survives.
// If the two slots are distinct but already hold the same object, that object
// is kept (not deleted) and simply ends up owned by *dest alone.
template <class T>
static void MovePointerData(T **dest, T **src) {
  if (dest == src) {
    return;  // Self-move: nothing to transfer, and deleting would lose data.
  }
  if (*dest != *src) {
    delete *dest;  // Release the old payload only if it is not the new one.
  }
  *dest = *src;
  *src = nullptr;
}
729 
730 // Prints a brief list of all the best choices.
732  std::string alternates_str;
733  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
734  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
735  if (!it.at_first()) {
736  alternates_str += "\", \"";
737  }
738  alternates_str += it.data()->unichar_string();
739  }
740  tprintf("Alternates for \"%s\": {\"%s\"}\n",
741  best_choice->unichar_string().c_str(), alternates_str.c_str());
742 }
743 
744 // Returns the sum of the widths of the blob between start_blob and last_blob
745 // inclusive.
746 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
747  int result = 0;
748  for (int b = start_blob; b <= last_blob; ++b) {
749  result += blob_widths[b];
750  if (b < last_blob) {
751  result += blob_gaps[b];
752  }
753  }
754  return result;
755 }
756 // Returns the width of a gap between the specified blob and the next one.
757 int WERD_RES::GetBlobsGap(unsigned blob_index) const {
758  if (blob_index >= blob_gaps.size()) {
759  return 0;
760  }
761  return blob_gaps[blob_index];
762 }
763 
764 // Returns the BLOB_CHOICE corresponding to the given index in the
765 // best choice word taken from the appropriate cell in the ratings MATRIX.
766 // Borrowed pointer, so do not delete. May return nullptr if there is no
767 // BLOB_CHOICE matching the unichar_id at the given index.
768 BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
769  if (index >= best_choice->length()) {
770  return nullptr;
771  }
772  BLOB_CHOICE_LIST *choices = GetBlobChoices(index);
773  return FindMatchingChoice(best_choice->unichar_id(index), choices);
774 }
775 
776 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
777 // best choice word taken from the appropriate cell in the ratings MATRIX.
778 // Borrowed pointer, so do not delete.
779 BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const {
780  return best_choice->blob_choices(index, ratings);
781 }
782 
783 // Moves the results fields from word to this. This takes ownership of all
784 // the data, so src can be destructed.
786  denorm = word->denorm;
787  blob_row = word->blob_row;
788  MovePointerData(&chopped_word, &word->chopped_word);
789  MovePointerData(&rebuild_word, &word->rebuild_word);
790  MovePointerData(&box_word, &word->box_word);
791  for (auto data : seam_array) {
792  delete data;
793  }
794  seam_array = word->seam_array;
795  word->seam_array.clear();
796  // TODO: optimize moves.
797  best_state = word->best_state;
798  word->best_state.clear();
799  correct_text = word->correct_text;
800  word->correct_text.clear();
801  blob_widths = word->blob_widths;
802  word->blob_widths.clear();
803  blob_gaps = word->blob_gaps;
804  word->blob_gaps.clear();
805  if (ratings != nullptr) {
807  }
808  MovePointerData(&ratings, &word->ratings);
809  best_choice = word->best_choice;
810  MovePointerData(&raw_choice, &word->raw_choice);
811  best_choices.clear();
812  WERD_CHOICE_IT wc_it(&best_choices);
813  wc_it.add_list_after(&word->best_choices);
814  reject_map = word->reject_map;
815  if (word->blamer_bundle != nullptr) {
816  assert(blamer_bundle != nullptr);
817  blamer_bundle->CopyResults(*(word->blamer_bundle));
818  }
820 }
821 
822 // Replace the best choice and rebuild box word.
823 // choice must be from the current best_choices list.
825  best_choice = choice;
827  SetupBoxWord();
828  // Make up a fake reject map of the right length to keep the
829  // rejection pass happy.
833 }
834 
835 // Builds the rebuild_word and sets the best_state from the chopped_word and
836 // the best_choice->state.
838  ASSERT_HOST(best_choice != nullptr);
839  delete rebuild_word;
840  rebuild_word = new TWERD;
841  if (seam_array.empty()) {
843  }
844  best_state.clear();
845  int start = 0;
846  for (unsigned i = 0; i < best_choice->length(); ++i) {
847  int length = best_choice->state(i);
848  best_state.push_back(length);
849  if (length > 1) {
851  start + length - 1);
852  }
853  TBLOB *blob = chopped_word->blobs[start];
854  rebuild_word->blobs.push_back(new TBLOB(*blob));
855  if (length > 1) {
857  start + length - 1);
858  }
859  start += length;
860  }
861 }
862 
863 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
864 // Also sets up the output box_word.
866  delete rebuild_word;
868  SetupBoxWord();
869  auto word_len = box_word->length();
870  best_state.reserve(word_len);
871  correct_text.reserve(word_len);
872  for (unsigned i = 0; i < word_len; ++i) {
873  best_state.push_back(1);
874  correct_text.emplace_back("");
875  }
876 }
877 
878 // Sets/replaces the box_word with one made from the rebuild_word.
880  delete box_word;
884 }
885 
886 // Sets up the script positions in the output best_choice using the best_choice
887 // to get the unichars, and the unicharset to get the target positions.
890 }
891 // Sets all the blobs in all the words (raw choice and best choices) to be
892 // the given position. (When a sub/superscript is recognized as a separate
893 // word, it falls victim to the rule that a whole word cannot be sub or
894 // superscript, so this function overrides that problem.)
897  WERD_CHOICE_IT wc_it(&best_choices);
898  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
899  wc_it.data()->SetAllScriptPositions(position);
900  }
901 }
902 
903 // Classifies the word with some already-calculated BLOB_CHOICEs.
904 // The choices are an array of blob_count pointers to BLOB_CHOICE,
905 // providing a single classifier result for each blob.
906 // The BLOB_CHOICEs are consumed and the word takes ownership.
907 // The number of blobs in the box_word must match blob_count.
// Installs caller-supplied classifier results: builds a fresh ratings matrix
// of blob_count x 1, puts each choices[c] in its own single-entry list at
// cell (c, c), then resets reject_map and best_state to one entry per blob
// and marks the word done. box_word must already exist and hold exactly
// blob_count boxes.
908 void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
909  // Setup the WERD_RES.
910  ASSERT_HOST(box_word != nullptr);
911  ASSERT_HOST(blob_count == box_word->length());
// NOTE(review): the embedded listing numbering skips 912 here, so a statement
// appears to have been lost in extraction - restore it from the real source.
913  ClearRatings();
914  ratings = new MATRIX(blob_count, 1);
915  for (unsigned c = 0; c < blob_count; ++c) {
// Each blob gets its own list holding just the supplied choice.
916  auto *choice_list = new BLOB_CHOICE_LIST;
917  BLOB_CHOICE_IT choice_it(choice_list);
918  choice_it.add_after_then_move(choices[c]);
919  ratings->put(c, c, choice_list);
920  }
// NOTE(review): the embedded listing numbering skips 921 here. Nothing
// visible turns the ratings into a word choice, so the lost statement
// presumably does that - restore it from the real source.
922  reject_map.initialise(blob_count);
// Every fake blob corresponds to exactly one chunk.
923  best_state.clear();
924  best_state.resize(blob_count, 1);
925  done = true;
926 }
927 
928 // Creates a WERD_CHOICE for the word using the top choices from the leading
929 // diagonal of the ratings matrix.
// NOTE(review): listing line 930 (the function signature, which must declare
// `permuter`) is absent from this extract.
931  int num_blobs = ratings->dimension();
932  auto *word_choice = new WERD_CHOICE(uch_set, num_blobs);
933  word_choice->set_permuter(permuter);
934  for (int b = 0; b < num_blobs; ++b) {
// Fallback used when the diagonal cell is null or empty: a space unichar
// with the bad rating and the lowest possible certainty.
935  UNICHAR_ID unichar_id = UNICHAR_SPACE;
936  // Initialize rating and certainty like in WERD_CHOICE::make_bad().
937  float rating = WERD_CHOICE::kBadRating;
938  float certainty = -FLT_MAX;
939  BLOB_CHOICE_LIST *choices = ratings->get(b, b);
940  if (choices != nullptr && !choices->empty()) {
// Only the entry at the iterator's initial position is read.
941  BLOB_CHOICE_IT bc_it(choices);
942  BLOB_CHOICE *choice = bc_it.data();
943  unichar_id = choice->unichar_id();
944  rating = choice->rating();
945  certainty = choice->certainty();
946  }
947  word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
948  certainty);
949  }
950  LogNewRawChoice(word_choice);
951  // Ownership of word_choice taken by word here.
952  LogNewCookedChoice(1, false, word_choice);
953 }
954 
955 // Copies the best_choice strings to the correct_text for adaption/training.
// NOTE(review): listing line 956 (the function signature) is absent from this
// extract.
957  correct_text.clear();
958  ASSERT_HOST(best_choice != nullptr);
// Append one string per unichar of best_choice, looked up through uch_set.
959  for (unsigned i = 0; i < best_choice->length(); ++i) {
960  UNICHAR_ID choice_id = best_choice->unichar_id(i);
961  const char *blob_choice = uch_set->id_to_unichar(choice_id);
962  correct_text.emplace_back(blob_choice);
963  }
964 }
965 
966 // Merges 2 adjacent blobs in the result if the permanent callback
967 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
968 // callback box_cb is nullptr or returns true, setting the merged blob
969 // result to the class returned from class_cb.
970 // Returns true if anything was merged.
// NOTE(review): listing lines 971 and 985 are absent from this extract: the
// first line of the signature and one statement after `modified = true`
// (presumably the call that performs the actual blob merge).
972  const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
973  const std::function<bool(const TBOX &, const TBOX &)> &box_cb) {
974  ASSERT_HOST(best_choice->empty() || ratings != nullptr);
975  bool modified = false;
// `i + 1 < length()` is re-evaluated every iteration, so the loop follows any
// change in best_choice's length made inside the body.
976  for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
977  UNICHAR_ID new_id =
978  class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
979  if (new_id != INVALID_UNICHAR_ID &&
980  (box_cb == nullptr ||
981  box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
982  // Raw choice should not be fixed.
983  best_choice->set_unichar_id(new_id, i);
984  modified = true;
// If the cell for position i is not valid in the ratings matrix, widen the
// band by the distance reported by the coordinate.
986  const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
987  if (!coord.Valid(*ratings)) {
988  ratings->IncreaseBandSize(coord.row + 1 - coord.col);
989  }
990  BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
991  if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
992  // Insert a fake result.
993  auto *blob_choice = new BLOB_CHOICE;
994  blob_choice->set_unichar_id(new_id);
995  BLOB_CHOICE_IT bc_it(blob_choices);
996  bc_it.add_before_then_move(blob_choice);
997  }
998  }
999  }
1000  return modified;
1001 }
1002 
1003 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
1004 // all the data to account for the change.
1005 void WERD_RES::MergeAdjacentBlobs(unsigned index) {
1006  if (reject_map.length() == best_choice->length()) {
1007  reject_map.remove_pos(index);
1008  }
1009  best_choice->remove_unichar_id(index + 1);
1010  rebuild_word->MergeBlobs(index, index + 2);
1011  box_word->MergeBoxes(index, index + 2);
1012  if (index + 1 < best_state.size()) {
1013  best_state[index] += best_state[index + 1];
1014  best_state.erase(best_state.begin() + index + 1);
1015  }
1016 }
1017 
1018 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
1019 // training data.
1020 
// Helper for fix_quotes.
// Reports whether the character at signed_str, whose UTF-8 encoding is
// `length` bytes long, is a single-quote character: ASCII apostrophe or
// backtick (1 byte), or the 3-byte sequences e2 80 98 / e2 80 99.
// Returns non-zero for a quote, zero otherwise.
static int is_simple_quote(const char *signed_str, int length) {
  // Compare as unsigned bytes so values >= 0x80 match regardless of whether
  // plain char is signed.
  const auto *bytes = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes: shared e2 80 prefix, then 98 or 99.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
1033 
1034 // Callback helper for fix_quotes returns a double quote if both
1035 // arguments are quote, otherwise INVALID_UNICHAR_ID.
// NOTE(review): listing line 1036 (the function signature, which must declare
// id1 and id2) is absent from this extract.
1037  const char *ch = uch_set->id_to_unichar(id1);
1038  const char *next_ch = uch_set->id_to_unichar(id2);
// strlen supplies the UTF-8 byte length that is_simple_quote expects.
1039  if (is_simple_quote(ch, strlen(ch)) &&
1040  is_simple_quote(next_ch, strlen(next_ch))) {
1041  return uch_set->unichar_to_id("\"");
1042  }
1043  return INVALID_UNICHAR_ID;
1044 }
1045 
1046 // Change pairs of quotes to double quotes.
// NOTE(review): listing lines 1047 and 1049 are absent from this extract: the
// function signature and the second operand of the `||` condition below.
1048  if (!uch_set->contains_unichar("\"") ||
1050  return; // Don't create it if it is disallowed.
1051  }
1052
1053  using namespace std::placeholders; // for _1, _2
// A null box callback is passed, so only BothQuotes decides what is merged.
1054  ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr);
1055 }
1056 
1057 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1058 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
// NOTE(review): listing line 1059 (the function signature, which must declare
// id1 and id2) is absent from this extract.
1060  const char *ch = uch_set->id_to_unichar(id1);
1061  const char *next_ch = uch_set->id_to_unichar(id2);
// Both unichars must be single-byte, and each must be '-' or '~'.
1062  if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') &&
1063  (*next_ch == '-' || *next_ch == '~')) {
1064  return uch_set->unichar_to_id("-");
1065  }
1066  return INVALID_UNICHAR_ID;
1067 }
1068 
1069 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
1070 // (assuming both on the same textline, are in order and a chopped em dash.)
1071 bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) {
1072  return box1.right() >= box2.left();
1073 }
1074 
1075 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
1076 // Typically a long dash which has been segmented.
// NOTE(review): listing lines 1077 and 1079 are absent from this extract: the
// function signature and the second operand of the `||` condition below.
1078  if (!uch_set->contains_unichar("-") ||
1080  return; // Don't create it if it is disallowed.
1081  }
1082
1083  using namespace std::placeholders; // for _1, _2
// Merge requires both callbacks to agree: BothHyphens on the classes and
// HyphenBoxesOverlap on the boxes.
1084  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),
1085  std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));
1086 }
1087 
1088 // Callback helper for merge_tess_fails returns a space if both
1089 // arguments are space, otherwise INVALID_UNICHAR_ID.
// NOTE(review): listing line 1090 (the function signature, which must declare
// id1 and id2) is absent from this extract.
// The ids must be equal and equal to the id that uch_set gives for " ".
1091  if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) {
1092  return id1;
1093  } else {
1094  return INVALID_UNICHAR_ID;
1095  }
1096 }
1097 
1098 // Change pairs of tess failures to a single one
// NOTE(review): listing line 1099 (the function signature) is absent from this
// extract.
1100  using namespace std::placeholders; // for _1, _2
1101  if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2),
1102  nullptr)) {
// After any merge, the reject map and box word must still have exactly one
// entry per character of best_choice.
1103  unsigned len = best_choice->length();
1104  ASSERT_HOST(reject_map.length() == len);
1105  ASSERT_HOST(box_word->length() == len);
1106  }
1107 }
1108 
1109 // Returns true if the collection of count pieces, starting at start, are all
1110 // natural connected components, ie there are no real chops involved.
1111 bool WERD_RES::PiecesAllNatural(int start, int count) const {
1112  // all seams must have no splits.
1113  for (int index = start; index < start + count - 1; ++index) {
1114  if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) {
1115  SEAM *seam = seam_array[index];
1116  if (seam != nullptr && seam->HasAnySplits()) {
1117  return false;
1118  }
1119  }
1120  }
1121  return true;
1122 }
1123 
// NOTE(review): listing line 1124 (the signature of this function) is absent
// from this extract. The visible body only delegates to Clear().
1125  Clear();
1126 }
1127 
// NOTE(review): listing line 1128 (the function signature) is absent from this
// extract.
// The word is deleted here only when `combination` is set; either way the
// pointer is nulled afterwards.
1129  if (combination) {
1130  delete word;
1131  }
1132  word = nullptr;
1133  delete blamer_bundle;
1134  blamer_bundle = nullptr;
1135  ClearResults();
1136 }
1137 
// NOTE(review): listing lines 1138 and 1164 are absent from this extract: the
// function signature and the statement inside the final `if`, which is
// therefore shown with an empty body.
1139  done = false;
1140  fontinfo = nullptr;
1141  fontinfo2 = nullptr;
1142  fontinfo_id_count = 0;
1143  fontinfo_id2_count = 0;
// Each member deleted below is nulled right after its delete.
1144  delete bln_boxes;
1145  bln_boxes = nullptr;
1146  blob_row = nullptr;
1147  delete chopped_word;
1148  chopped_word = nullptr;
1149  delete rebuild_word;
1150  rebuild_word = nullptr;
1151  delete box_word;
1152  box_word = nullptr;
1153  best_state.clear();
1154  correct_text.clear();
// seam_array holds raw pointers that are deleted here before the vector is
// emptied.
1155  for (auto data : seam_array) {
1156  delete data;
1157  }
1158  seam_array.clear();
1159  blob_widths.clear();
1160  blob_gaps.clear();
1161  ClearRatings();
1162  ClearWordChoices();
1163  if (blamer_bundle != nullptr) {
1165  }
1166 }
// NOTE(review): listing line 1167 (the function signature) is absent from this
// extract.
// best_choice is only nulled, not deleted, while raw_choice and ep_choice are
// deleted; presumably best_choice points into best_choices -- confirm.
1168  best_choice = nullptr;
1169  delete raw_choice;
1170  raw_choice = nullptr;
1171  best_choices.clear();
1172  delete ep_choice;
1173  ep_choice = nullptr;
1174 }
// NOTE(review): listing lines 1175 and 1177 are absent from this extract: the
// function signature and one statement that runs before `delete ratings`
// (presumably the release of the matrix contents -- confirm).
1176  if (ratings != nullptr) {
1178  delete ratings;
1179  ratings = nullptr;
1180  }
1181 }
1182 
1183 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1184  ASSERT_HOST(page_res == other.page_res);
1185  if (other.block_res == nullptr) {
1186  // other points to the end of the page.
1187  if (block_res == nullptr) {
1188  return 0;
1189  }
1190  return -1;
1191  }
1192  if (block_res == nullptr) {
1193  return 1; // we point to the end of the page.
1194  }
1195  if (block_res == other.block_res) {
1196  if (other.row_res == nullptr || row_res == nullptr) {
1197  // this should only happen if we hit an image block.
1198  return 0;
1199  }
1200  if (row_res == other.row_res) {
1201  // we point to the same block and row.
1202  ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1203  if (word_res == other.word_res) {
1204  // we point to the same word!
1205  return 0;
1206  }
1207 
1208  WERD_RES_IT word_res_it(&row_res->word_res_list);
1209  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1210  word_res_it.forward()) {
1211  if (word_res_it.data() == word_res) {
1212  return -1;
1213  } else if (word_res_it.data() == other.word_res) {
1214  return 1;
1215  }
1216  }
1217  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1218  }
1219 
1220  // we both point to the same block, but different rows.
1221  ROW_RES_IT row_res_it(&block_res->row_res_list);
1222  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1223  row_res_it.forward()) {
1224  if (row_res_it.data() == row_res) {
1225  return -1;
1226  } else if (row_res_it.data() == other.row_res) {
1227  return 1;
1228  }
1229  }
1230  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1231  }
1232 
1233  // We point to different blocks.
1234  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1235  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
1236  block_res_it.forward()) {
1237  if (block_res_it.data() == block_res) {
1238  return -1;
1239  } else if (block_res_it.data() == other.block_res) {
1240  return 1;
1241  }
1242  }
1243  // Shouldn't happen...
1244  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1245  return 0;
1246 }
1247 
1248 // Inserts the new_word as a combination owned by a corresponding WERD_RES
1249 // before the current position. The simple fields of the WERD_RES are copied
1250 // from clone_res and the resulting WERD_RES is returned for further setup
1251 // with best_choice etc.
// NOTE(review): listing line 1252 (the first line of the signature, which must
// declare clone_res) is absent from this extract.
1253  WERD *new_word) {
1254  // Make a WERD_RES for the new_word.
1255  auto *new_res = new WERD_RES(new_word);
1256  new_res->CopySimpleFields(clone_res);
1257  new_res->combination = true;
1258  // Insert into the appropriate place in the ROW_RES.
// Linear search of the current row's list for the current word_res.
1259  WERD_RES_IT wr_it(&row()->word_res_list);
1260  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1261  WERD_RES *word = wr_it.data();
1262  if (word == word_res) {
1263  break;
1264  }
1265  }
// The current word must be present in its own row's list.
1266  ASSERT_HOST(!wr_it.cycled_list());
1267  wr_it.add_before_then_move(new_res);
1268  if (wr_it.at_first()) {
1269  // This is the new first word, so reset the member iterator so it
1270  // detects the cycled_list state correctly.
1271  ResetWordIterator();
1272  }
1273  return new_res;
1274 }
1275 
1276 // Helper computes the boundaries between blobs in the word. The blob bounds
1277 // are likely very poor, if they come from LSTM, where it only outputs the
1278 // character at one pixel within it, so we find the midpoints between them.
1279 static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
1280  C_BLOB_LIST *next_word_blobs,
1281  std::vector<int> *blob_ends) {
1282  C_BLOB_IT blob_it(word.word->cblob_list());
1283  for (int length : word.best_state) {
1284  // Get the bounding box of the fake blobs
1285  TBOX blob_box = blob_it.data()->bounding_box();
1286  blob_it.forward();
1287  for (int b = 1; b < length; ++b) {
1288  blob_box += blob_it.data()->bounding_box();
1289  blob_it.forward();
1290  }
1291  // This blob_box is crap, so for now we are only looking for the
1292  // boundaries between them.
1293  int blob_end = INT32_MAX;
1294  if (!blob_it.at_first() || next_word_blobs != nullptr) {
1295  if (blob_it.at_first()) {
1296  blob_it.set_to_list(next_word_blobs);
1297  }
1298  blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1299  }
1300  blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
1301  blob_ends->push_back(blob_end);
1302  }
1303  blob_ends->back() = clip_box.right();
1304 }
1305 
1306 // Helper computes the bounds of a word by restricting it to existing words
1307 // that significantly overlap.
1308 static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
1309  int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1310  constexpr int kSignificantOverlapFraction = 4;
1311  TBOX clipped_box;
1312  TBOX current_box = words[w_index]->word->bounding_box();
1313  TBOX next_box;
1314  if (static_cast<size_t>(w_index + 1) < words.size() &&
1315  words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) {
1316  next_box = words[w_index + 1]->word->bounding_box();
1317  }
1318  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1319  w_it.forward()) {
1320  if (w_it.data() == nullptr || w_it.data()->word == nullptr) {
1321  continue;
1322  }
1323  TBOX w_box = w_it.data()->word->bounding_box();
1324  int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1325  int width_limit = w_box.width() / kSignificantOverlapFraction;
1326  int min_significant_overlap = std::max(height_limit, width_limit);
1327  int overlap = w_box.intersection(current_box).width();
1328  int prev_overlap = w_box.intersection(prev_box).width();
1329  int next_overlap = w_box.intersection(next_box).width();
1330  if (overlap > min_significant_overlap) {
1331  if (prev_overlap > min_significant_overlap) {
1332  // We have no choice but to use the LSTM word edge.
1333  clipped_box.set_left(current_box.left());
1334  } else if (next_overlap > min_significant_overlap) {
1335  // We have no choice but to use the LSTM word edge.
1336  clipped_box.set_right(current_box.right());
1337  } else {
1338  clipped_box += w_box;
1339  }
1340  }
1341  }
1342  if (clipped_box.height() <= 0) {
1343  clipped_box.set_top(current_box.top());
1344  clipped_box.set_bottom(current_box.bottom());
1345  }
1346  if (clipped_box.width() <= 0) {
1347  clipped_box = current_box;
1348  }
1349  return clipped_box;
1350 }
1351 
1352 // Helper moves the blob from src to dest. If it isn't contained by clip_box,
1353 // the blob is replaced by a fake that is contained.
1354 static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
1355  const TBOX &clip_box) {
1356  C_BLOB *src_blob = src_it->extract();
1357  TBOX box = src_blob->bounding_box();
1358  if (!clip_box.contains(box)) {
1359  int left =
1360  ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1361  int right =
1362  ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1363  int top =
1364  ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1365  int bottom =
1366  ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1367  box = TBOX(left, bottom, right, top);
1368  delete src_blob;
1369  src_blob = C_BLOB::FakeBlob(box);
1370  }
1371  dest_it->add_after_then_move(src_blob);
1372  return box;
1373 }
1374 
1375 // Replaces the current WERD/WERD_RES with the given words. The given words
1376 // contain fake blobs that indicate the position of the characters. These are
1377 // replaced with real blobs from the current word as much as possible.
// NOTE(review): listing lines 1378-1379 (the function signature, which must
// declare `words`) are absent from this extract.
// An empty `words` simply deletes the current word.
1380  if (words->empty()) {
1381  DeleteCurrentWord();
1382  return;
1383  }
1384  WERD_RES *input_word = word();
1385  // Set the BOL/EOL flags on the words from the input word.
// When the input is not at beginning-of-line, the first replacement instead
// inherits the input word's leading space count.
1386  if (input_word->word->flag(W_BOL)) {
1387  (*words)[0]->word->set_flag(W_BOL, true);
1388  } else {
1389  (*words)[0]->word->set_blanks(input_word->word->space());
1390  }
1391  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1392
1393  // Move the blobs from the input word to the new set of words.
1394  // If the input word_res is a combination, then the replacements will also be
1395  // combinations, and will own their own words. If the input word_res is not a
1396  // combination, then the final replacements will not be either, (although it
1397  // is allowed for the input words to be combinations) and their words
1398  // will get put on the row list. This maintains the ownership rules.
1399  WERD_IT w_it(row()->row->word_list());
1400  if (!input_word->combination) {
1401  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1402  WERD *word = w_it.data();
1403  if (word == input_word->word) {
1404  break;
1405  }
1406  }
1407  // w_it is now set to the input_word's word.
1408  ASSERT_HOST(!w_it.cycled_list());
1409  }
1410  // Insert into the appropriate place in the ROW_RES.
1411  WERD_RES_IT wr_it(&row()->word_res_list);
1412  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1413  WERD_RES *word = wr_it.data();
1414  if (word == input_word) {
1415  break;
1416  }
1417  }
1418  ASSERT_HOST(!wr_it.cycled_list());
1419  // Since we only have an estimate of the bounds between blobs, use the blob
1420  // x-middle as the determiner of where to put the blobs
// Both source lists are sorted by x-middle so the loops below can consume
// them from the front.
1421  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1422  src_b_it.sort(&C_BLOB::SortByXMiddle);
1423  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1424  rej_b_it.sort(&C_BLOB::SortByXMiddle);
// clip_box carries over between iterations: the value from word w - 1 is
// passed to ComputeWordBounds as the previous box for word w.
1425  TBOX clip_box;
1426  for (size_t w = 0; w < words->size(); ++w) {
1427  WERD_RES *word_w = (*words)[w];
1428  clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1429  // Compute blob boundaries.
1430  std::vector<int> blob_ends;
1431  C_BLOB_LIST *next_word_blobs =
1432  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1433  ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1434  // Remove the fake blobs on the current word, but keep safe for back-up if
1435  // no blob can be found.
1436  C_BLOB_LIST fake_blobs;
1437  C_BLOB_IT fake_b_it(&fake_blobs);
1438  fake_b_it.add_list_after(word_w->word->cblob_list());
1439  fake_b_it.move_to_first();
1440  word_w->word->cblob_list()->clear();
1441  C_BLOB_IT dest_it(word_w->word->cblob_list());
1442  // Build the box word as we move the blobs.
1443  auto *box_word = new tesseract::BoxWord;
// One output box per blob end; fake_b_it advances in step with i.
1444  for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1445  int end_x = blob_ends[i];
1446  TBOX blob_box;
1447  // Add the blobs up to end_x.
1448  while (!src_b_it.empty() &&
1449  src_b_it.data()->bounding_box().x_middle() < end_x) {
1450  blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1451  src_b_it.forward();
1452  }
// Rejected blobs left of end_x are moved across in the same way.
1453  while (!rej_b_it.empty() &&
1454  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1455  blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1456  rej_b_it.forward();
1457  }
1458  if (blob_box.null_box()) {
1459  // Use the original box as a back-up.
1460  blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1461  }
1462  box_word->InsertBox(i, blob_box);
1463  }
1464  delete word_w->box_word;
1465  word_w->box_word = box_word;
1466  if (!input_word->combination) {
1467  // Insert word_w->word into the ROW. It doesn't own its word, so the
1468  // ROW needs to own it.
1469  w_it.add_before_stay_put(word_w->word);
1470  word_w->combination = false;
1471  }
1472  (*words)[w] = nullptr; // We are taking ownership.
1473  wr_it.add_before_stay_put(word_w);
1474  }
1475  // We have taken ownership of the words.
1476  words->clear();
1477  // Delete the current word, which has been replaced. We could just call
1478  // DeleteCurrentWord, but that would iterate both lists again, and we know
1479  // we are already in the right place.
1480  if (!input_word->combination) {
1481  delete w_it.extract();
1482  }
1483  delete wr_it.extract();
1484  ResetWordIterator();
1485 }
1486 
1487 // Deletes the current WERD_RES and its underlying WERD.
// NOTE(review): listing line 1488 (the function signature) is absent from this
// extract.
1489  // Check that this word is as we expect. part_of_combos are NEVER iterated
1490  // by the normal iterator, so we should never be trying to delete them.
1491  ASSERT_HOST(!word_res->part_of_combo);
1492  if (!word_res->combination) {
1493  // Combinations own their own word, so we won't find the word on the
1494  // row's word_list, but it is legitimate to try to delete them.
1495  // Delete word from the ROW when not a combination.
1496  WERD_IT w_it(row()->row->word_list());
1497  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1498  if (w_it.data() == word_res->word) {
1499  break;
1500  }
1501  }
1502  ASSERT_HOST(!w_it.cycled_list());
1503  delete w_it.extract();
1504  }
1505  // Find the current WERD_RES in the ROW_RES list; the member pointer is
1506  // cleared as soon as it is found, and the entry is deleted below.
1507  WERD_RES_IT wr_it(&row()->word_res_list);
1508  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1509  if (wr_it.data() == word_res) {
1510  word_res = nullptr;
1511  break;
1512  }
1513  }
1514  ASSERT_HOST(!wr_it.cycled_list());
1515  delete wr_it.extract();
1516  ResetWordIterator();
1517 }
1518 
1519 // Makes the current word a fuzzy space if not already fuzzy. Updates
1520 // corresponding part of combo if required.
// NOTE(review): listing line 1521 (the function signature) is absent from this
// extract.
1522  WERD *real_word = word_res->word;
// Nothing is changed if either fuzzy flag is already set.
1523  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1524  real_word->set_flag(W_FUZZY_SP, true);
1525  if (word_res->combination) {
1526  // The next word should be the corresponding part of combo, but we have
1527  // already stepped past it, so find it by search.
1528  WERD_RES_IT wr_it(&row()->word_res_list);
// Empty-bodied loop: advances wr_it until it sits on word_res (or the list
// has cycled).
1529  for (wr_it.mark_cycle_pt();
1530  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1531  }
1532  wr_it.forward();
1533  ASSERT_HOST(wr_it.data()->part_of_combo);
1534  real_word = wr_it.data()->word;
1535  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1536  !real_word->flag(W_FUZZY_NON));
1537  real_word->set_flag(W_FUZZY_SP, true);
1538  }
1539  }
1540 }
1541 
1542 /*************************************************************************
1543  * PAGE_RES_IT::restart_page
1544  *
1545  * Set things up at the start of the page
1546  *************************************************************************/
1547
// NOTE(review): listing line 1548 (the function signature, which must declare
// empty_ok) is absent from this extract.
1549  block_res_it.set_to_list(&page_res->block_res_list);
1550  block_res_it.mark_cycle_pt();
// Clear all previous/current/next pointers.
1551  prev_block_res = nullptr;
1552  prev_row_res = nullptr;
1553  prev_word_res = nullptr;
1554  block_res = nullptr;
1555  row_res = nullptr;
1556  word_res = nullptr;
1557  next_block_res = nullptr;
1558  next_row_res = nullptr;
1559  next_word_res = nullptr;
// Two steps are needed: the first call (new_block = true) fills the next_*
// pointers, the second shifts them into the current pointers and returns the
// current word.
1560  internal_forward(true, empty_ok);
1561  return internal_forward(false, empty_ok);
1562 }
1563 
1564 // Recovers from operations on the current word, such as in InsertCloneWord
1565 // and DeleteCurrentWord.
1566 // Resets the word_res_it so that it is one past the next_word_res, as
1567 // it should be after internal_forward. If next_row_res != row_res,
1568 // then the next_word_res is in the next row, so there is no need to do
1569 // anything to word_res_it, but it is still a good idea to reset the pointers
1570 // word_res and prev_word_res, which are still in the current row.
// NOTE(review): listing line 1571 (the function signature) is absent from this
// extract.
1572  if (row_res == next_row_res) {
1573  // Reset the member iterator so it can move forward and detect the
1574  // cycled_list state correctly.
1575  word_res_it.move_to_first();
// Walk up to next_word_res; each non-combo entry passed becomes word_res, and
// the previous word_res becomes prev_word_res when it is in the same row.
1576  for (word_res_it.mark_cycle_pt();
1577  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1578  word_res_it.forward()) {
1579  if (!word_res_it.data()->part_of_combo) {
1580  if (prev_row_res == row_res) {
1581  prev_word_res = word_res;
1582  }
1583  word_res = word_res_it.data();
1584  }
1585  }
1586  ASSERT_HOST(!word_res_it.cycled_list());
1587  wr_it_of_next_word = word_res_it;
1588  word_res_it.forward();
1589  } else {
1590  // word_res_it is OK, but reset word_res and prev_word_res if needed.
// A local iterator is used so the member word_res_it is left untouched; after
// the full pass word_res is the last non-combo entry of the row.
1591  WERD_RES_IT wr_it(&row_res->word_res_list);
1592  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1593  if (!wr_it.data()->part_of_combo) {
1594  if (prev_row_res == row_res) {
1595  prev_word_res = word_res;
1596  }
1597  word_res = wr_it.data();
1598  }
1599  }
1600  }
1601 }
1602 
1603 /*************************************************************************
1604  * PAGE_RES_IT::internal_forward
1605  *
1606  * Find the next word on the page. If empty_ok is true, then non-text blocks
1607  * and text blocks with no text are visited as if they contain a single
1608  * imaginary word in a single imaginary row. (word() and row() both return
1609  *nullptr in such a block and the return value is nullptr.) If empty_ok is
1610  *false, the old behaviour is maintained. Each real word is visited and empty
1611  *and non-text blocks and rows are skipped. new_block is used to initialize the
1612  *iterators for a new block. The iterator maintains pointers to block, row and
1613  *word for the previous, current and next words. These are correct, regardless
1614  *of block/row boundaries. nullptr values denote start and end of the page.
1615  *************************************************************************/
1616
1617 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1618  bool new_row = false;
1619
// Shift the window: current becomes previous, next becomes current, and the
// next_* pointers are cleared until the search below refills them.
1620  prev_block_res = block_res;
1621  prev_row_res = row_res;
1622  prev_word_res = word_res;
1623  block_res = next_block_res;
1624  row_res = next_row_res;
1625  word_res = next_word_res;
1626  wr_it_of_current_word = wr_it_of_next_word;
1627  next_block_res = nullptr;
1628  next_row_res = nullptr;
1629  next_word_res = nullptr;
1630
// Resume the block/row/word scan from where the member iterators were left.
1631  while (!block_res_it.cycled_list()) {
1632  if (new_block) {
1633  new_block = false;
1634  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1635  row_res_it.mark_cycle_pt();
// A block without rows is reported as a position of its own when empty_ok:
// only next_block_res is set, the next row and word stay null.
1636  if (row_res_it.empty() && empty_ok) {
1637  next_block_res = block_res_it.data();
1638  break;
1639  }
1640  new_row = true;
1641  }
1642  while (!row_res_it.cycled_list()) {
1643  if (new_row) {
1644  new_row = false;
1645  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1646  word_res_it.mark_cycle_pt();
1647  }
1648  // Skip any part_of_combo words.
1649  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) {
1650  word_res_it.forward();
1651  }
1652  if (!word_res_it.cycled_list()) {
// Found the next word: record it, remember an iterator positioned on it, and
// leave word_res_it one step past it.
1653  next_block_res = block_res_it.data();
1654  next_row_res = row_res_it.data();
1655  next_word_res = word_res_it.data();
1656  wr_it_of_next_word = word_res_it;
1657  word_res_it.forward();
1658  goto foundword;
1659  }
1660  // end of row reached
1661  row_res_it.forward();
1662  new_row = true;
1663  }
1664  // end of block reached
1665  block_res_it.forward();
1666  new_block = true;
1667  }
// Also reached by falling out of the loop (end of page) or via the break for
// an empty block, not only through the goto.
1668 foundword:
1669  // Update prev_word_best_choice pointer.
1670  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
1671  *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr)
1672  ? nullptr
1673  : prev_word_res->best_choice;
1674  }
// The word returned is the new current word, not the one just located.
1675  return word_res;
1676 }
1677 
1678 /*************************************************************************
1679  * PAGE_RES_IT::restart_row()
1680  *
1681  * Move to the beginning (leftmost word) of the current row.
1682  *************************************************************************/
// NOTE(review): listing line 1683 (the function signature) is absent from this
// extract.
1684  ROW_RES *row = this->row();
// With no current row there is nothing to restart.
1685  if (!row) {
1686  return nullptr;
1687  }
// Restart from the top of the page and step forward until the remembered row
// is the current row again.
1688  for (restart_page(); this->row() != row; forward()) {
1689  // pass
1690  }
1691  return word();
1692 }
1693 
1694 /*************************************************************************
1695  * PAGE_RES_IT::forward_paragraph
1696  *
1697  * Move to the beginning of the next paragraph, allowing empty blocks.
1698  *************************************************************************/
1699
// NOTE(review): listing line 1700 (the function signature) is absent from this
// extract.
// Keep stepping while the next word is in the same block and its row reports
// the same para() as the current row.
// NOTE(review): row_res and row_res->row are dereferenced without a null
// check; only next_row_res is tested -- confirm this cannot be reached with a
// null current row.
1701  while (block_res == next_block_res &&
1702  (next_row_res != nullptr && next_row_res->row != nullptr &&
1703  row_res->row->para() == next_row_res->row->para())) {
1704  internal_forward(false, true);
1705  }
// One more step moves onto the first word of the following paragraph.
1706  return internal_forward(false, true);
1707 }
1708 
1709 /*************************************************************************
1710  * PAGE_RES_IT::forward_block
1711  *
1712  * Move to the beginning of the next block, allowing empty blocks.
1713  *************************************************************************/
1714
// NOTE(review): listing line 1715 (the function signature) is absent from this
// extract.
// Step until the next position is in a different block, then step onto it.
1716  while (block_res == next_block_res) {
1717  internal_forward(false, true);
1718  }
1719  return internal_forward(false, true);
1720 }
1721 
// NOTE(review): listing line 1722 (the function signature) is absent from this
// extract.
// Adds the current word's character and reject counts to the page, block and
// row totals.
1723  int16_t chars_in_word;
1724  int16_t rejects_in_word = 0;
1725
// The character count is taken from the length of the word's reject map.
1726  chars_in_word = word_res->reject_map.length();
1727  page_res->char_count += chars_in_word;
1728  block_res->char_count += chars_in_word;
1729  row_res->char_count += chars_in_word;
1730
1731  rejects_in_word = word_res->reject_map.reject_count();
1732
1733  page_res->rej_count += rejects_in_word;
1734  block_res->rej_count += rejects_in_word;
1735  row_res->rej_count += rejects_in_word;
// A word whose every character is rejected also counts towards the row's
// whole-word reject total.
1736  if (chars_in_word == rejects_in_word) {
1737  row_res->whole_word_rej_count += rejects_in_word;
1738  }
1739 }
1740 
1741 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ TBOX
@ W_BOL
start of line
Definition: werd.h:34
@ W_INVERSE
white on black
Definition: werd.h:43
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_SCRIPT_HAS_XHEIGHT
x-height concept makes sense.
Definition: werd.h:37
@ W_EOL
end of line
Definition: werd.h:35
@ W_SCRIPT_IS_LATIN
Special case latin for y. splitting.
Definition: werd.h:38
@ W_REP_CHAR
repeated character
Definition: werd.h:40
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const double kMaxWordSizeRatio
Definition: pageres.cpp:58
const double kMaxLineSizeRatio
Definition: pageres.cpp:60
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:55
int UNICHAR_ID
Definition: unichar.h:36
@ UNICHAR_SPACE
Definition: unicharset.h:36
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177
void start_seam_list(TWERD *word, std::vector< SEAM * > *seam_array)
Definition: seam.cpp:262
PermuterType
Definition: ratngs.h:231
@ TOP_CHOICE_PERM
Definition: ratngs.h:234
const double kMaxWordGapRatio
Definition: pageres.cpp:62
T get(ICOORD pos) const
Definition: matrix.h:268
void put(ICOORD pos, const T &thing)
Definition: matrix.h:260
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:214
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:220
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:151
TBOX bounding_box() const
Definition: blobs.cpp:466
void ComputeBoundingBoxes()
Definition: blobs.cpp:857
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:778
std::vector< TBLOB * > blobs
Definition: blobs.h:462
void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:792
unsigned NumBlobs() const
Definition: blobs.h:449
void MergeBlobs(unsigned start, unsigned end)
Definition: blobs.cpp:874
void MergeBoxes(unsigned start, unsigned end)
Definition: boxword.cpp:138
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56
unsigned length() const
Definition: boxword.h:81
void InsertBox(unsigned index, const TBOX &box)
Definition: boxword.cpp:157
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92
int dimension() const
Definition: matrix.h:612
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:61
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:52
bool Valid(const MATRIX &m) const
Definition: matrix.h:697
const BLOCK * block() const
Definition: normalis.h:265
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:111
float body_size() const
Definition: ocrrow.h:75
WERD_LIST * word_list()
Definition: ocrrow.h:57
float x_height() const
Definition: ocrrow.h:66
float ascenders() const
Definition: ocrrow.h:84
float descenders() const
Definition: ocrrow.h:87
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:85
ROW_RES_LIST row_res_list
Definition: pageres.h:129
int16_t font_class
Definition: pageres.h:123
int32_t char_count
Definition: pageres.h:121
WERD_RES_LIST word_res_list
Definition: pageres.h:148
int32_t whole_word_rej_count
Definition: pageres.h:147
int32_t rej_count
Definition: pageres.h:146
int32_t char_count
Definition: pageres.h:145
void copy_on(WERD_RES *word_res)
Definition: pageres.h:667
const FontInfo * fontinfo2
Definition: pageres.h:308
void CloneChoppedToRebuild()
Definition: pageres.cpp:865
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:930
tesseract::Tesseract * tesseract
Definition: pageres.h:278
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1036
WERD_CHOICE * best_choice
Definition: pageres.h:239
WERD_CHOICE * raw_choice
Definition: pageres.h:244
int8_t fontinfo_id2_count
Definition: pageres.h:310
std::vector< std::string > correct_text
Definition: pageres.h:287
int GetBlobsGap(unsigned blob_index) const
Definition: pageres.cpp:757
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:570
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:518
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:908
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:385
TWERD * chopped_word
Definition: pageres.h:210
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:419
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:279
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:304
bool ConditionalBlobMerge(const std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, const std::function< bool(const TBOX &, const TBOX &)> &box_cb)
Definition: pageres.cpp:971
tesseract::BoxWord * bln_boxes
Definition: pageres.h:193
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:629
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:785
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:503
void SetScriptPositions()
Definition: pageres.cpp:888
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1059
BlamerBundle * blamer_bundle
Definition: pageres.h:250
void RebuildBestState()
Definition: pageres.cpp:837
int8_t fontinfo_id_count
Definition: pageres.h:309
const UNICHARSET * uch_set
Definition: pageres.h:201
const FontInfo * fontinfo
Definition: pageres.h:307
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:779
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:441
void BestChoiceToCorrectText()
Definition: pageres.cpp:956
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247
MATRIX * ratings
Definition: pageres.h:235
std::vector< int > best_state
Definition: pageres.h:283
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:401
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:895
tesseract::BoxWord * box_word
Definition: pageres.h:270
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:824
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:353
std::vector< int > blob_widths
Definition: pageres.h:214
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:483
void PrintBestChoices() const
Definition: pageres.cpp:731
BLOB_CHOICE * GetBlobChoice(unsigned index) const
Definition: pageres.cpp:768
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1090
void SetupBlamerBundle()
Definition: pageres.cpp:394
int GetBlobsWidth(int start_blob, int last_blob) const
Definition: pageres.cpp:746
void MergeAdjacentBlobs(unsigned index)
Definition: pageres.cpp:1005
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:613
std::vector< int > blob_gaps
Definition: pageres.h:217
float baseline_shift
Definition: pageres.h:316
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1071
TWERD * rebuild_word
Definition: pageres.h:264
WERD_CHOICE * ep_choice
Definition: pageres.h:291
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1111
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:253
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:186
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:344
std::vector< SEAM * > seam_array
Definition: pageres.h:212
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1548
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1700
WERD_RES * restart_row()
Definition: pageres.cpp:1683
WERD_RES * forward_block()
Definition: pageres.cpp:1715
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1183
void ReplaceCurrentWord(PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1378
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1252
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
bool IsText() const
Definition: polyblk.h:52
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
float certainty() const
Definition: ratngs.h:87
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81
float rating() const
Definition: ratngs.h:84
unsigned TotalOfStates() const
Definition: ratngs.cpp:676
float certainty() const
Definition: ratngs.h:311
void remove_unichar_id(unsigned index)
Definition: ratngs.h:454
MATRIX_COORD MatrixCoord(unsigned index) const
Definition: ratngs.cpp:286
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:340
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
bool empty() const
Definition: ratngs.h:280
static const float kBadRating
Definition: ratngs.h:256
bool dangerous_ambig_found() const
Definition: ratngs.h:344
unsigned state(unsigned index) const
Definition: ratngs.h:299
void set_permuter(uint8_t perm)
Definition: ratngs.h:356
BLOB_CHOICE_LIST * blob_choices(unsigned index, MATRIX *ratings) const
Definition: ratngs.cpp:274
unsigned length() const
Definition: ratngs.h:283
void print() const
Definition: ratngs.h:557
void SetAllScriptPositions(ScriptPos position)
Definition: ratngs.cpp:592
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:664
float rating() const
Definition: ratngs.h:308
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:528
float adjust_factor() const
Definition: ratngs.h:286
std::string & unichar_string()
Definition: ratngs.h:515
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
bool null_box() const
Definition: rect.h:60
TDimension right() const
Definition: rect.h:89
void remove_pos(uint16_t pos)
Definition: rejctmap.cpp:100
uint16_t length() const
Definition: rejctmap.h:333
void initialise(uint16_t length)
Definition: rejctmap.cpp:67
static void JoinPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:204
bool HasAnySplits() const
Definition: seam.h:52
bool PrepareToInsertSeam(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:54
static void BreakPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:181
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
C_BLOB_LIST * cblob_list()
Definition: werd.h:96
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:91
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
void set_script_id(int id)
Definition: werd.h:109
TBOX bounding_box() const
Definition: werd.cpp:155
uint8_t space() const
Definition: werd.h:100
void operator=(const ELIST_LINK &)
Definition: elst.h:100
unsigned size() const
Definition: genericvector.h:74
int default_sid() const
Definition: unicharset.h:947
bool script_has_xheight() const
Definition: unicharset.h:959
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
int latin_sid() const
Definition: unicharset.h:923
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:912