tesseract  5.0.0
applybox.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: applybox.cpp (Formerly applybox.c)
3  * Description: Re segment rows according to box file data
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1993, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef DISABLED_LEGACY_ENGINE
20 # include <allheaders.h>
21 # include <cctype>
22 # include <cerrno>
23 # include <cstring>
24 # include "boxread.h"
25 #endif // ndef DISABLED_LEGACY_ENGINE
26 #include <tesseract/unichar.h>
27 #include "pageres.h"
28 #include "tesseractclass.h"
29 #include "unicharset.h"
30 
31 #ifndef DISABLED_LEGACY_ENGINE
33 const int kMaxGroupSize = 4;
36 const double kMaxXHeightDeviationFraction = 0.125;
37 #endif // ndef DISABLED_LEGACY_ENGINE
38 
74 namespace tesseract {
75 
76 #ifndef DISABLED_LEGACY_ENGINE
77 static void clear_any_old_text(BLOCK_LIST *block_list) {
78  BLOCK_IT block_it(block_list);
79  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
80  ROW_IT row_it(block_it.data()->row_list());
81  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
82  WERD_IT word_it(row_it.data()->word_list());
83  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
84  word_it.data()->set_text("");
85  }
86  }
87  }
88 }
89 
90 // Applies the box file based on the image name filename, and resegments
91 // the words in the block_list (page), with:
92 // blob-mode: one blob per line in the box file, words as input.
93 // word/line-mode: one blob per space-delimited unit after the #, and one word
94 // per line in the box file. (See comment above for box file format.)
95 // If find_segmentation is true, (word/line mode) then the classifier is used
96 // to re-segment words/lines to match the space-delimited truth string for
97 // each box. In this case, the input box may be for a word or even a whole
98 // text line, and the output words will contain multiple blobs corresponding
99 // to the space-delimited input string.
100 // With find_segmentation false, no classifier is needed, but the chopper
101 // can still be used to correctly segment touching characters with the help
102 // of the input boxes.
103 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
104 // from normal classification, ie. with a word, chopped_word, rebuild_word,
105 // seam_array, denorm, box_word, and best_state, but NO best_choice or
106 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
107 // Instead, the correct_text member of WERD_RES is set, and this may be later
108 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
109 // is not required before calling ApplyBoxTraining.
110 PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
111  BLOCK_LIST *block_list) {
112  std::vector<TBOX> boxes;
113  std::vector<std::string> texts, full_texts;
114  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
115  return nullptr; // Can't do it.
116  }
117 
118  const int box_count = boxes.size();
119  int box_failures = 0;
120 
121  // In word mode, we use the boxes to make a word for each box, but
122  // in blob mode we use the existing words and maximally chop them first.
123  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
124  clear_any_old_text(block_list);
125 
126  for (int i = 0; i < box_count; i++) {
127  bool foundit = false;
128  if (page_res != nullptr) {
129  foundit =
130  ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
131  (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
132  } else {
133  foundit = ResegmentWordBox(block_list, boxes[i],
134  (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
135  }
136  if (!foundit) {
137  box_failures++;
138  ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
139  }
140  }
141 
142  if (page_res == nullptr) {
143  // In word/line mode, we now maximally chop all the words and resegment
144  // them with the classifier.
145  page_res = SetupApplyBoxes(boxes, block_list);
146  ReSegmentByClassification(page_res);
147  }
148  if (applybox_debug > 0) {
149  tprintf("APPLY_BOXES:\n");
150  tprintf(" Boxes read from boxfile: %6d\n", box_count);
151  if (box_failures > 0) {
152  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
153  }
154  }
155  TidyUp(page_res);
156  return page_res;
157 }
158 
159 // Helper computes median xheight in the image.
160 static double MedianXHeight(BLOCK_LIST *block_list) {
161  BLOCK_IT block_it(block_list);
162  STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
163  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
164  ROW_IT row_it(block_it.data()->row_list());
165  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
166  xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
167  }
168  }
169  return xheights.median();
170 }
171 
174 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
175  const double median_xheight = MedianXHeight(block_list);
176  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
177  // Strip all fuzzy space markers to simplify the PAGE_RES.
178  BLOCK_IT b_it(block_list);
179  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
180  BLOCK *block = b_it.data();
181  ROW_IT r_it(block->row_list());
182  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
183  ROW *row = r_it.data();
184  const double diff = fabs(row->x_height() - median_xheight);
185  if (diff > max_deviation) {
186  if (applybox_debug) {
187  tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
188  }
189  row->set_x_height(static_cast<float>(median_xheight));
190  }
191  }
192  }
193 }
194 
197 PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
198  PreenXHeights(block_list);
199  // Strip all fuzzy space markers to simplify the PAGE_RES.
200  BLOCK_IT b_it(block_list);
201  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
202  BLOCK *block = b_it.data();
203  ROW_IT r_it(block->row_list());
204  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
205  ROW *row = r_it.data();
206  WERD_IT w_it(row->word_list());
207  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
208  WERD *word = w_it.data();
209  if (word->cblob_list()->empty()) {
210  delete w_it.extract();
211  } else {
212  word->set_flag(W_FUZZY_SP, false);
213  word->set_flag(W_FUZZY_NON, false);
214  }
215  }
216  }
217  }
218  auto *page_res = new PAGE_RES(false, block_list, nullptr);
219  PAGE_RES_IT pr_it(page_res);
220  WERD_RES *word_res;
221  while ((word_res = pr_it.word()) != nullptr) {
222  MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
223  pr_it.forward();
224  }
225  return page_res;
226 }
227 
231 void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
232  WERD_RES *word_res) {
233  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
234  classify_bln_numeric_mode, textord_use_cjk_fp_model,
235  poly_allow_detailed_fx, row, block)) {
236  word_res->CloneChoppedToRebuild();
237  return;
238  }
239  if (chop_debug) {
240  tprintf("Maximally chopping word at:");
241  word_res->word->bounding_box().print();
242  }
243  std::vector<BLOB_CHOICE *> blob_choices;
244  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
245  auto rating = static_cast<float>(INT8_MAX);
246  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
247  // The rating and certainty are not quite arbitrary. Since
248  // select_blob_to_chop uses the worst certainty to choose, they all have
249  // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
250  // in here, and then divide by e each time they are chopped, which
251  // should guarantee a set of unequal values for the whole tree of blobs
252  // produced, however much chopping is required. The chops are thus only
253  // limited by the ability of the chopper to find suitable chop points,
254  // and not by the value of the certainties.
255  auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
256  blob_choices.push_back(choice);
257  rating -= 0.125f;
258  }
259  const double e = exp(1.0); // The base of natural logs.
260  unsigned blob_number;
261  int right_chop_index = 0;
262  if (!assume_fixed_pitch_char_segment) {
263  // We only chop if the language is not fixed pitch like CJK.
264  SEAM *seam = nullptr;
265  while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
266  word_res->InsertSeam(blob_number, seam);
267  BLOB_CHOICE *left_choice = blob_choices[blob_number];
268  rating = left_choice->rating() / e;
269  left_choice->set_rating(rating);
270  left_choice->set_certainty(-rating);
271  // combine confidence w/ serial #
272  auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
273  0.0f, 0.0f, BCC_FAKE);
274  blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
275  }
276  }
277  word_res->CloneChoppedToRebuild();
278  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
279 }
280 
292 static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
293  const int overlap_area = box1.intersection(box2).area();
294  const int a = box1.area();
295  const int b = box2.area();
296  ASSERT_HOST(a != 0 && b != 0);
297  return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
298 }
299 
310 bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
311  const TBOX *next_box, const char *correct_text) {
312  if (applybox_debug > 1) {
313  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
314  }
315  PAGE_RES_IT page_res_it(page_res);
316  WERD_RES *word_res;
317  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
318  if (!word_res->box_word->bounding_box().major_overlap(box)) {
319  continue;
320  }
321  if (applybox_debug > 1) {
322  tprintf("Checking word box:");
323  word_res->box_word->bounding_box().print();
324  }
325  int word_len = word_res->box_word->length();
326  for (int i = 0; i < word_len; ++i) {
327  TBOX char_box = TBOX();
328  int blob_count = 0;
329  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
330  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
331  if (!blob_box.major_overlap(box)) {
332  break;
333  }
334  if (word_res->correct_text[i + blob_count].length() > 0) {
335  break; // Blob is claimed already.
336  }
337  if (next_box != nullptr) {
338  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
339  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
340  if (applybox_debug > 2) {
341  tprintf("Checking blob:");
342  blob_box.print();
343  tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
344  next_box_miss_metric);
345  }
346  if (current_box_miss_metric > next_box_miss_metric) {
347  break; // Blob is a better match for next box.
348  }
349  }
350  char_box += blob_box;
351  }
352  if (blob_count > 0) {
353  if (applybox_debug > 1) {
354  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
355  }
356  if (!char_box.almost_equal(box, 3) &&
357  ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
358  (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
359  return false;
360  }
361  // We refine just the box_word, best_state and correct_text here.
362  // The rebuild_word is made in TidyUp.
363  // blob_count blobs are put together to match the box. Merge the
364  // box_word boxes, save the blob_count in the state and the text.
365  word_res->box_word->MergeBoxes(i, i + blob_count);
366  word_res->best_state[i] = blob_count;
367  word_res->correct_text[i] = correct_text;
368  if (applybox_debug > 2) {
369  tprintf("%d Blobs match: blob box:", blob_count);
370  word_res->box_word->BlobBox(i).print();
371  tprintf("Matches box:");
372  box.print();
373  if (next_box != nullptr) {
374  tprintf("With next box:");
375  next_box->print();
376  }
377  }
378  // Eliminated best_state and correct_text entries for the consumed
379  // blobs.
380  for (int j = 1; j < blob_count; ++j) {
381  word_res->best_state.erase(word_res->best_state.begin() + i + 1);
382  word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
383  }
384  // Assume that no box spans multiple source words, so we are done with
385  // this box.
386  if (applybox_debug > 1) {
387  tprintf("Best state = ");
388  for (auto best_state : word_res->best_state) {
389  tprintf("%d ", best_state);
390  }
391  tprintf("\n");
392  tprintf("Correct text = [[ ");
393  for (auto &it : word_res->correct_text) {
394  tprintf("%s ", it.c_str());
395  }
396  tprintf("]]\n");
397  }
398  return true;
399  }
400  }
401  }
402  if (applybox_debug > 0) {
403  tprintf("FAIL!\n");
404  }
405  return false; // Failure.
406 }
407 
414 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
415  const char *correct_text) {
416  if (applybox_debug > 1) {
417  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
418  }
419  WERD *new_word = nullptr;
420  BLOCK_IT b_it(block_list);
421  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
422  BLOCK *block = b_it.data();
423  if (!box.major_overlap(block->pdblk.bounding_box())) {
424  continue;
425  }
426  ROW_IT r_it(block->row_list());
427  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
428  ROW *row = r_it.data();
429  if (!box.major_overlap(row->bounding_box())) {
430  continue;
431  }
432  WERD_IT w_it(row->word_list());
433  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
434  WERD *word = w_it.data();
435  if (applybox_debug > 2) {
436  tprintf("Checking word:");
437  word->bounding_box().print();
438  }
439  if (word->text() != nullptr && word->text()[0] != '\0') {
440  continue; // Ignore words that are already done.
441  }
442  if (!box.major_overlap(word->bounding_box())) {
443  continue;
444  }
445  C_BLOB_IT blob_it(word->cblob_list());
446  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
447  C_BLOB *blob = blob_it.data();
448  TBOX blob_box = blob->bounding_box();
449  if (!blob_box.major_overlap(box)) {
450  continue;
451  }
452  if (next_box != nullptr) {
453  const double current_box_miss_metric = BoxMissMetric(blob_box, box);
454  const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
455  if (applybox_debug > 2) {
456  tprintf("Checking blob:");
457  blob_box.print();
458  tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
459  next_box_miss_metric);
460  }
461  if (current_box_miss_metric > next_box_miss_metric) {
462  continue; // Blob is a better match for next box.
463  }
464  }
465  if (applybox_debug > 2) {
466  tprintf("Blob match: blob:");
467  blob_box.print();
468  tprintf("Matches box:");
469  box.print();
470  if (next_box != nullptr) {
471  tprintf("With next box:");
472  next_box->print();
473  }
474  }
475  if (new_word == nullptr) {
476  // Make a new word with a single blob.
477  new_word = word->shallow_copy();
478  new_word->set_text(correct_text);
479  w_it.add_to_end(new_word);
480  }
481  C_BLOB_IT new_blob_it(new_word->cblob_list());
482  new_blob_it.add_to_end(blob_it.extract());
483  }
484  }
485  }
486  }
487  if (new_word == nullptr && applybox_debug > 0) {
488  tprintf("FAIL!\n");
489  }
490  return new_word != nullptr;
491 }
492 
495 void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
496  PAGE_RES_IT pr_it(page_res);
497  WERD_RES *word_res;
498  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
499  const WERD *word = word_res->word;
500  if (word->text() == nullptr || word->text()[0] == '\0') {
501  continue; // Ignore words that have no text.
502  }
503  // Convert the correct text to a vector of UNICHAR_ID
504  std::vector<UNICHAR_ID> target_text;
505  if (!ConvertStringToUnichars(word->text(), &target_text)) {
506  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
507  pr_it.DeleteCurrentWord();
508  continue;
509  }
510  if (!FindSegmentation(target_text, word_res)) {
511  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
512  pr_it.DeleteCurrentWord();
513  continue;
514  }
515  }
516 }
517 
520 bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
521  for (int step = 0; *utf8 != '\0'; utf8 += step) {
522  const char *next_space = strchr(utf8, ' ');
523  if (next_space == nullptr) {
524  next_space = utf8 + strlen(utf8);
525  }
526  step = next_space - utf8;
527  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
528  if (class_id == INVALID_UNICHAR_ID) {
529  return false;
530  }
531  while (utf8[step] == ' ') {
532  ++step;
533  }
534  class_ids->push_back(class_id);
535  }
536  return true;
537 }
538 
545 bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
546  // Classify all required combinations of blobs and save results in choices.
547  const int word_length = word_res->box_word->length();
548  auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
549  for (int i = 0; i < word_length; ++i) {
550  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
551  BLOB_CHOICE_LIST *match_result =
552  classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
553  word_res->blamer_bundle);
554  if (applybox_debug > 2) {
555  tprintf("%d+%d:", i, j);
556  print_ratings_list("Segment:", match_result, unicharset);
557  }
558  choices[i].push_back(match_result);
559  }
560  }
561  // Search the segmentation graph for the target text. Must be an exact
562  // match. Using wildcards makes it difficult to find the correct
563  // segmentation even when it is there.
564  word_res->best_state.clear();
565  std::vector<int> search_segmentation;
566  float best_rating = 0.0f;
567  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
568  &word_res->best_state);
569  for (int i = 0; i < word_length; ++i) {
570  for (auto choice : choices[i]) {
571  delete choice;
572  }
573  }
574  delete[] choices;
575  if (word_res->best_state.empty()) {
576  // Build the original segmentation and if it is the same length as the
577  // truth, assume it will do.
578  int blob_count = 1;
579  for (auto s : word_res->seam_array) {
580  SEAM *seam = s;
581  if (!seam->HasAnySplits()) {
582  word_res->best_state.push_back(blob_count);
583  blob_count = 1;
584  } else {
585  ++blob_count;
586  }
587  }
588  word_res->best_state.push_back(blob_count);
589  if (word_res->best_state.size() != target_text.size()) {
590  word_res->best_state.clear(); // No good. Original segmentation bad size.
591  return false;
592  }
593  }
594  word_res->correct_text.clear();
595  for (auto &text : target_text) {
596  word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
597  }
598  return true;
599 }
600 
615 void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
616  unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
617  unsigned text_index, float rating, std::vector<int> *segmentation,
618  float *best_rating, std::vector<int> *best_segmentation) {
620  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
621  // Rating of matching choice or worst choice if no match.
622  float choice_rating = 0.0f;
623  // Find the corresponding best BLOB_CHOICE.
624  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
625  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
626  const BLOB_CHOICE *choice = choice_it.data();
627  choice_rating = choice->rating();
628  auto class_id = choice->unichar_id();
629  if (class_id == target_text[text_index]) {
630  break;
631  }
632  // Search ambigs table.
633  if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
634  AmbigSpec_IT spec_it(table[class_id]);
635  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
636  const AmbigSpec *ambig_spec = spec_it.data();
637  // We'll only do 1-1.
638  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
639  ambig_spec->correct_ngram_id == target_text[text_index]) {
640  break;
641  }
642  }
643  if (!spec_it.cycled_list()) {
644  break; // Found an ambig.
645  }
646  }
647  }
648  if (choice_it.cycled_list()) {
649  continue; // No match.
650  }
651  segmentation->push_back(length);
652  if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
653  // This is a complete match. If the rating is good record a new best.
654  if (applybox_debug > 2) {
655  tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
656  rating + choice_rating, *best_rating, segmentation->size(),
657  best_segmentation->size());
658  }
659  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
660  *best_segmentation = *segmentation;
661  *best_rating = rating + choice_rating;
662  }
663  } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
664  if (applybox_debug > 3) {
665  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
666  unicharset.id_to_unichar(target_text[text_index]),
667  choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
668  choices_pos, length);
669  }
670  SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
671  rating + choice_rating, segmentation, best_rating, best_segmentation);
672  if (applybox_debug > 3) {
673  tprintf("End recursion for %d=%s\n", target_text[text_index],
674  unicharset.id_to_unichar(target_text[text_index]));
675  }
676  }
677  segmentation->resize(segmentation->size() - 1);
678  }
679 }
680 
685 void Tesseract::TidyUp(PAGE_RES *page_res) {
686  int ok_blob_count = 0;
687  int bad_blob_count = 0;
688  int ok_word_count = 0;
689  int unlabelled_words = 0;
690  PAGE_RES_IT pr_it(page_res);
691  WERD_RES *word_res;
692  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
693  int ok_in_word = 0;
694  int blob_count = word_res->correct_text.size();
695  auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
696  word_choice->set_permuter(TOP_CHOICE_PERM);
697  for (int c = 0; c < blob_count; ++c) {
698  if (word_res->correct_text[c].length() > 0) {
699  ++ok_in_word;
700  }
701  // Since we only need a fake word_res->best_choice, the actual
702  // unichar_ids do not matter. Which is fortunate, since TidyUp()
703  // can be called while training Tesseract, at the stage where
704  // unicharset is not meaningful yet.
705  word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
706  1.0f, -1.0f);
707  }
708  if (ok_in_word > 0) {
709  ok_blob_count += ok_in_word;
710  bad_blob_count += word_res->correct_text.size() - ok_in_word;
711  word_res->LogNewRawChoice(word_choice);
712  word_res->LogNewCookedChoice(1, false, word_choice);
713  } else {
714  ++unlabelled_words;
715  if (applybox_debug > 0) {
716  tprintf("APPLY_BOXES: Unlabelled word at :");
717  word_res->word->bounding_box().print();
718  }
719  pr_it.DeleteCurrentWord();
720  delete word_choice;
721  }
722  }
723  pr_it.restart_page();
724  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
725  // Denormalize back to a BoxWord.
726  word_res->RebuildBestState();
727  word_res->SetupBoxWord();
728  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
729  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
730  }
731  if (applybox_debug > 0) {
732  tprintf(" Found %d good blobs.\n", ok_blob_count);
733  if (bad_blob_count > 0) {
734  tprintf(" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
735  }
736  if (unlabelled_words > 0) {
737  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
738  }
739  }
740 }
741 
743 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
744  const char *err_msg) {
745  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
746  box.left(), box.bottom(), box.right(), box.top(), err_msg);
747 }
748 
751 void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
752  PAGE_RES_IT pr_it(page_res);
753  int word_count = 0;
754  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
755  LearnWord(fontname.c_str(), word_res);
756  ++word_count;
757  }
758  tprintf("Generated training data for %d words\n", word_count);
759 }
760 
761 #endif // ndef DISABLED_LEGACY_ENGINE
762 
764 void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
765  PAGE_RES_IT pr_it(page_res);
766  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
767  auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
768  for (auto &correct_text : word_res->correct_text) {
769  // The part before the first space is the real ground truth, and the
770  // rest is the bounding box location and page number.
771  std::vector<std::string> tokens = split(correct_text, ' ');
772  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
773  choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
774  }
775  word_res->ClearWordChoices();
776  word_res->LogNewRawChoice(choice);
777  word_res->LogNewCookedChoice(1, false, choice);
778  }
779 }
780 
781 } // namespace tesseract
const int kMaxGroupSize
Definition: applybox.cpp:33
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:36
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ TBOX
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:41
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:140
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:175
int UNICHAR_ID
Definition: unichar.h:36
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:75
@ BCC_FAKE
Definition: ratngs.h:53
@ TOP_CHOICE_PERM
Definition: ratngs.h:234
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
PAGE_RES * ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:110
void TidyUp(PAGE_RES *page_res)
void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res)
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
Definition: applybox.cpp:310
void ReSegmentByClassification(PAGE_RES *page_res)
bool ConvertStringToUnichars(const char *utf8, std::vector< UNICHAR_ID > *class_ids)
Dict & getDict() override
Image BestPix() const
void SearchForText(const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:174
void CorrectClassifyWords(PAGE_RES *page_res)
void MaximallyChopWord(const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:231
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
bool FindSegmentation(const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
void MergeBoxes(unsigned start, unsigned end)
Definition: boxword.cpp:138
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84
const TBOX & bounding_box() const
Definition: boxword.h:78
unsigned length() const
Definition: boxword.h:81
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:111
WERD_LIST * word_list()
Definition: ocrrow.h:57
float x_height() const
Definition: ocrrow.h:66
void set_x_height(float new_xheight)
Definition: ocrrow.h:69
void CloneChoppedToRebuild()
Definition: pageres.cpp:865
std::vector< std::string > correct_text
Definition: pageres.h:287
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:908
TWERD * chopped_word
Definition: pageres.h:210
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:419
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:304
std::vector< int > best_state
Definition: pageres.h:283
tesseract::BoxWord * box_word
Definition: pageres.h:270
WERD_RES * forward()
Definition: pageres.h:743
ROW_RES * row() const
Definition: pageres.h:766
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
void set_certainty(float newrat)
Definition: ratngs.h:150
void set_rating(float newrat)
Definition: ratngs.h:147
float rating() const
Definition: ratngs.h:84
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:272
int x_gap(const TBOX &box) const
Definition: rect.h:238
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:84
void print() const
Definition: rect.h:289
bool major_overlap(const TBOX &box) const
Definition: rect.h:374
int32_t area() const
Definition: rect.h:134
C_BLOB_LIST * cblob_list()
Definition: werd.h:96
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
TBOX bounding_box() const
Definition: werd.cpp:155
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:157
UNICHARSET unicharset
Definition: ccutil.h:61
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111
SEAM * chop_one_blob(const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
Definition: chopper.cpp:367
virtual BLOB_CHOICE_LIST * classify_piece(const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:49