tesseract  5.0.0
fixspace.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: fixspace.cpp (Formerly fixspace.c)
3  * Description: Implements a pass over the page res, exploring the alternative
4  * spacing possibilities, trying to use context to improve the
5  * word spacing
6  * Author: Phil Cheatle
7  *
8  * (C) Copyright 1993, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "fixspace.h"
22 
23 #include "blobs.h" // for TWERD, TBLOB, TESSLINE
24 #include "boxword.h" // for BoxWord
25 #include "errcode.h" // for ASSERT_HOST
26 #include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset
27 #include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
28 #include "params.h" // for IntParam, StringParam, BoolParam, DoubleParam, ...
29 #include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
30 #include "rect.h" // for TBOX
31 #include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
32 #include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
33 #include "tessvars.h" // for debug_fp
34 #include "tprintf.h" // for tprintf
35 #include "unicharset.h" // for UNICHARSET
36 #include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
37 
38 #include <tesseract/ocrclass.h> // for ETEXT_DESC
39 #include <tesseract/unichar.h> // for UNICHAR_ID
40 
41 #include <cstdint> // for INT16_MAX, int16_t, int32_t
42 
43 namespace tesseract {
44 
45 class BLOCK;
46 class ROW;
47 
48 #define PERFECT_WERDS 999
49 
50 /**********************************************************************
51  * c_blob_comparator()
52  *
53  * Blob comparator used to sort a blob list so that blobs are in increasing
54  * order of left edge.
55  **********************************************************************/
56 
57 static int c_blob_comparator( // sort blobs
58  const void *blob1p, // ptr to ptr to blob1
59  const void *blob2p // ptr to ptr to blob2
60 ) {
61  const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p);
62  const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p);
63 
64  return blob1->bounding_box().left() - blob2->bounding_box().left();
65 }
66 
77 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
78  BLOCK_RES_IT block_res_it;
79  ROW_RES_IT row_res_it;
80  WERD_RES_IT word_res_it_from;
81  WERD_RES_IT word_res_it_to;
82  WERD_RES *word_res;
83  WERD_RES_LIST fuzzy_space_words;
84  int16_t new_length;
85  bool prevent_null_wd_fixsp; // DON'T process blobless wds
86  int32_t word_index; // current word
87 
88  block_res_it.set_to_list(&page_res->block_res_list);
89  word_index = 0;
90  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
91  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
92  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
93  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
94  while (!word_res_it_from.at_last()) {
95  word_res = word_res_it_from.data();
96  while (!word_res_it_from.at_last() &&
97  !(word_res->combination ||
98  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
99  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
100  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
101  word_res = word_res_it_from.forward();
102  word_index++;
103  if (monitor != nullptr) {
104  monitor->ocr_alive = true;
105  monitor->progress = 90 + 5 * word_index / word_count;
106  if (monitor->deadline_exceeded() ||
107  (monitor->cancel != nullptr &&
108  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
109  return;
110  }
111  }
112  }
113 
114  if (!word_res_it_from.at_last()) {
115  word_res_it_to = word_res_it_from;
116  prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
117  if (check_debug_pt(word_res, 60)) {
118  debug_fix_space_level.set_value(10);
119  }
120  word_res_it_to.forward();
121  word_index++;
122  if (monitor != nullptr) {
123  monitor->ocr_alive = true;
124  monitor->progress = 90 + 5 * word_index / word_count;
125  if (monitor->deadline_exceeded() ||
126  (monitor->cancel != nullptr &&
127  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
128  return;
129  }
130  }
131  while (!word_res_it_to.at_last() &&
132  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
133  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
134  if (check_debug_pt(word_res, 60)) {
135  debug_fix_space_level.set_value(10);
136  }
137  if (word_res->word->cblob_list()->empty()) {
138  prevent_null_wd_fixsp = true;
139  }
140  word_res = word_res_it_to.forward();
141  }
142  if (check_debug_pt(word_res, 60)) {
143  debug_fix_space_level.set_value(10);
144  }
145  if (word_res->word->cblob_list()->empty()) {
146  prevent_null_wd_fixsp = true;
147  }
148  if (prevent_null_wd_fixsp) {
149  word_res_it_from = word_res_it_to;
150  } else {
151  fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
152  fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
153  block_res_it.data()->block);
154  new_length = fuzzy_space_words.length();
155  word_res_it_from.add_list_before(&fuzzy_space_words);
156  for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
157  word_res_it_from.forward();
158  }
159  }
160  if (test_pt) {
161  debug_fix_space_level.set_value(0);
162  }
163  }
164  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
165  // Last word in row
166  }
167  }
168  }
169 }
170 
171 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
172  int16_t best_score;
173  WERD_RES_LIST current_perm;
174  int16_t current_score;
175  bool improved = false;
176 
177  best_score = eval_word_spacing(best_perm); // default score
178  dump_words(best_perm, best_score, 1, improved);
179 
180  if (best_score != PERFECT_WERDS) {
181  initialise_search(best_perm, current_perm);
182  }
183 
184  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
185  match_current_words(current_perm, row, block);
186  current_score = eval_word_spacing(current_perm);
187  dump_words(current_perm, current_score, 2, improved);
188  if (current_score > best_score) {
189  best_perm.clear();
190  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
191  best_score = current_score;
192  improved = true;
193  }
194  if (current_score < PERFECT_WERDS) {
195  transform_to_next_perm(current_perm);
196  }
197  }
198  dump_words(best_perm, best_score, 3, improved);
199 }
200 
201 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
202  WERD_RES_IT src_it(&src_list);
203  WERD_RES_IT new_it(&new_list);
204  WERD_RES *src_wd;
205  WERD_RES *new_wd;
206 
207  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
208  src_wd = src_it.data();
209  if (!src_wd->combination) {
210  new_wd = WERD_RES::deep_copy(src_wd);
211  new_wd->combination = false;
212  new_wd->part_of_combo = false;
213  new_it.add_after_then_move(new_wd);
214  }
215  }
216 }
217 
218 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
219  WERD_RES_IT word_it(&words);
220  WERD_RES *word;
221  // Since we are not using PAGE_RES to iterate over words, we need to update
222  // prev_word_best_choice_ before calling classify_word_pass2().
223  prev_word_best_choice_ = nullptr;
224  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
225  word = word_it.data();
226  if ((!word->part_of_combo) && (word->box_word == nullptr)) {
227  WordData word_data(block, row, word);
228  SetupWordPassN(2, &word_data);
229  classify_word_and_language(2, nullptr, &word_data);
230  }
232  }
233 }
234 
260 int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
261  WERD_RES_IT word_res_it(&word_res_list);
262  int16_t total_score = 0;
263  int16_t word_count = 0;
264  int16_t done_word_count = 0;
265  int i;
266  int16_t offset;
267  int16_t prev_word_score = 0;
268  bool prev_word_done = false;
269  bool prev_char_1 = false; // prev ch a "1/I/l"?
270  bool prev_char_digit = false; // prev ch 2..9 or 0
271  const char *punct_chars = "!\"`',.:;";
272  bool prev_char_punct = false;
273 
274  do {
275  // current word
276  WERD_RES *word = word_res_it.data();
277  bool word_done = fixspace_thinks_word_done(word);
278  word_count++;
279  if (word->tess_failed) {
280  total_score += prev_word_score;
281  if (prev_word_done) {
282  done_word_count++;
283  }
284  prev_word_score = 0;
285  prev_char_1 = false;
286  prev_char_digit = false;
287  prev_word_done = false;
288  } else {
289  /*
290  Can we add the prev word score and potentially count this word?
291  Yes IF it didn't end in a 1 when the first char of this word is a digit
292  AND it didn't end in a digit when the first char of this word is a 1
293 */
294  auto word_len = word->reject_map.length();
295  bool current_word_ok_so_far = false;
296  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
297  (prev_char_digit &&
298  ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
299  word->best_choice->unichar_string()[0] == '1') ||
300  (!word_done &&
301  conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
302  total_score += prev_word_score;
303  if (prev_word_done) {
304  done_word_count++;
305  }
306  current_word_ok_so_far = word_done;
307  }
308 
309  if (current_word_ok_so_far) {
310  prev_word_done = true;
311  prev_word_score = word_len;
312  } else {
313  prev_word_done = false;
314  prev_word_score = 0;
315  }
316 
317  /* Add 1 to total score for every joined 1 regardless of context and
318  rejtn */
319  for (i = 0, prev_char_1 = false; i < word_len; i++) {
320  bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
321  if (prev_char_1 || (current_char_1 && (i > 0))) {
322  total_score++;
323  }
324  prev_char_1 = current_char_1;
325  }
326 
327  /* Add 1 to total score for every joined punctuation regardless of context
328  and rejtn */
329  if (tessedit_prefer_joined_punct) {
330  for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
331  offset += word->best_choice->unichar_lengths()[i++]) {
332  bool current_char_punct =
333  strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
334  if (prev_char_punct || (current_char_punct && i > 0)) {
335  total_score++;
336  }
337  prev_char_punct = current_char_punct;
338  }
339  }
340  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
341  for (i = 0, offset = 0; i < word_len - 1;
342  offset += word->best_choice->unichar_lengths()[i++]) {
343  ;
344  }
345  prev_char_1 =
346  ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
347  (!word_done &&
348  conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
349  }
350  /* Find next word */
351  do {
352  word_res_it.forward();
353  } while (word_res_it.data()->part_of_combo);
354  } while (!word_res_it.at_first());
355  total_score += prev_word_score;
356  if (prev_word_done) {
357  done_word_count++;
358  }
359  if (done_word_count == word_count) {
360  return PERFECT_WERDS;
361  } else {
362  return total_score;
363  }
364 }
365 
366 bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
367  int i;
368  int offset;
369 
370  for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) {
371  ;
372  }
373  return (
374  word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
375  word->best_choice->unichar_lengths()[i]) ||
376  (word->best_choice->permuter() == NUMBER_PERM &&
377  numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
378 }
379 
391 void transform_to_next_perm(WERD_RES_LIST &words) {
392  WERD_RES_IT word_it(&words);
393  WERD_RES_IT prev_word_it(&words);
394  WERD_RES *word;
395  WERD_RES *prev_word;
396  WERD_RES *combo;
397  WERD *copy_word;
398  int16_t prev_right = -INT16_MAX;
399  TBOX box;
400  int16_t gap;
401  int16_t min_gap = INT16_MAX;
402 
403  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
404  word = word_it.data();
405  if (!word->part_of_combo) {
406  box = word->word->bounding_box();
407  if (prev_right > -INT16_MAX) {
408  gap = box.left() - prev_right;
409  if (gap < min_gap) {
410  min_gap = gap;
411  }
412  }
413  prev_right = box.right();
414  }
415  }
416  if (min_gap < INT16_MAX) {
417  prev_right = -INT16_MAX; // back to start
418  word_it.set_to_list(&words);
419  // Note: we can't use cycle_pt due to inserted combos at start of list.
420  for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
421  word = word_it.data();
422  if (!word->part_of_combo) {
423  box = word->word->bounding_box();
424  if (prev_right > -INT16_MAX) {
425  gap = box.left() - prev_right;
426  if (gap <= min_gap) {
427  prev_word = prev_word_it.data();
428  if (prev_word->combination) {
429  combo = prev_word;
430  } else {
431  /* Make a new combination and insert before
432  * the first word being joined. */
433  copy_word = new WERD;
434  *copy_word = *(prev_word->word);
435  // deep copy
436  combo = new WERD_RES(copy_word);
437  combo->combination = true;
438  combo->x_height = prev_word->x_height;
439  prev_word->part_of_combo = true;
440  prev_word_it.add_before_then_move(combo);
441  }
442  combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
443  if (word->combination) {
444  combo->word->join_on(word->word);
445  // Move blobs to combo
446  // old combo no longer needed
447  delete word_it.extract();
448  } else {
449  // Copy current wd to combo
450  combo->copy_on(word);
451  word->part_of_combo = true;
452  }
453  combo->done = false;
454  combo->ClearResults();
455  } else {
456  prev_word_it = word_it; // catch up
457  }
458  }
459  prev_right = box.right();
460  }
461  }
462  } else {
463  words.clear(); // signal termination
464  }
465 }
466 
467 void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
468  WERD_RES_IT word_res_it(&perm);
469 
470  if (debug_fix_space_level > 0) {
471  if (mode == 1) {
472  stats_.dump_words_str = "";
473  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
474  if (!word_res_it.data()->part_of_combo) {
475  stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
476  stats_.dump_words_str += ' ';
477  }
478  }
479  }
480 
481  if (debug_fix_space_level > 1) {
482  switch (mode) {
483  case 1:
484  tprintf("EXTRACTED (%d): \"", score);
485  break;
486  case 2:
487  tprintf("TESTED (%d): \"", score);
488  break;
489  case 3:
490  tprintf("RETURNED (%d): \"", score);
491  break;
492  }
493 
494  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
495  if (!word_res_it.data()->part_of_combo) {
496  tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
497  static_cast<int>(word_res_it.data()->best_choice->permuter()));
498  }
499  }
500  tprintf("\"\n");
501  } else if (improved) {
502  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
503  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
504  if (!word_res_it.data()->part_of_combo) {
505  tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
506  static_cast<int>(word_res_it.data()->best_choice->permuter()));
507  }
508  }
509  tprintf("\"\n");
510  }
511  }
512 }
513 
515  if (word->done) {
516  return true;
517  }
518 
519  /*
520  Use all the standard pass 2 conditions for mode 5 in set_done() in
521  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
522  CARE WHETHER WE HAVE of/at on/an etc.
523 */
524  if (fixsp_done_mode > 0 &&
525  (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
526  fixsp_done_mode == 3) &&
527  (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
528  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
529  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
530  (word->best_choice->permuter() == USER_DAWG_PERM) ||
531  (word->best_choice->permuter() == NUMBER_PERM))) {
532  return true;
533  } else {
534  return false;
535  }
536 }
537 
545 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
546  WERD_RES *word_res;
547  WERD_RES_LIST sub_word_list;
548  WERD_RES_IT sub_word_list_it(&sub_word_list);
549  int16_t blob_index;
550  int16_t new_length;
551  float junk;
552 
553  word_res = word_res_it.data();
554  if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
555  !word_res->word->flag(W_DONT_CHOP)) {
556  return;
557  }
558 
559  blob_index = worst_noise_blob(word_res, &junk);
560  if (blob_index < 0) {
561  return;
562  }
563 
564  if (debug_fix_space_level > 1) {
565  tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
566  }
567  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
568  sub_word_list_it.add_after_stay_put(word_res_it.extract());
569  fix_noisy_space_list(sub_word_list, row, block);
570  new_length = sub_word_list.length();
571  word_res_it.add_list_before(&sub_word_list);
572  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
573  word_res_it.forward();
574  }
575 }
576 
577 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
578  int16_t best_score;
579  WERD_RES_IT best_perm_it(&best_perm);
580  WERD_RES_LIST current_perm;
581  WERD_RES_IT current_perm_it(&current_perm);
582  WERD_RES *old_word_res;
583  int16_t current_score;
584  bool improved = false;
585 
586  best_score = fp_eval_word_spacing(best_perm); // default score
587 
588  dump_words(best_perm, best_score, 1, improved);
589 
590  old_word_res = best_perm_it.data();
591  // Even deep_copy doesn't copy the underlying WERD unless its combination
592  // flag is true!.
593  old_word_res->combination = true; // Kludge to force deep copy
594  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
595  old_word_res->combination = false; // Undo kludge
596 
597  break_noisiest_blob_word(current_perm);
598 
599  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
600  match_current_words(current_perm, row, block);
601  current_score = fp_eval_word_spacing(current_perm);
602  dump_words(current_perm, current_score, 2, improved);
603  if (current_score > best_score) {
604  best_perm.clear();
605  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
606  best_score = current_score;
607  improved = true;
608  }
609  if (current_score < PERFECT_WERDS) {
610  break_noisiest_blob_word(current_perm);
611  }
612  }
613  dump_words(best_perm, best_score, 3, improved);
614 }
615 
621 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
622  WERD_RES_IT word_it(&words);
623  WERD_RES_IT worst_word_it;
624  float worst_noise_score = 9999;
625  int worst_blob_index = -1; // Noisiest blob of noisiest wd
626  int blob_index; // of wds noisiest blob
627  float noise_score; // of wds noisiest blob
628  WERD_RES *word_res;
629  C_BLOB_IT blob_it;
630  C_BLOB_IT rej_cblob_it;
631  C_BLOB_LIST new_blob_list;
632  C_BLOB_IT new_blob_it;
633  C_BLOB_IT new_rej_cblob_it;
634  WERD *new_word;
635  int16_t start_of_noise_blob;
636  int16_t i;
637 
638  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
639  blob_index = worst_noise_blob(word_it.data(), &noise_score);
640  if (blob_index > -1 && worst_noise_score > noise_score) {
641  worst_noise_score = noise_score;
642  worst_blob_index = blob_index;
643  worst_word_it = word_it;
644  }
645  }
646  if (worst_blob_index < 0) {
647  words.clear(); // signal termination
648  return;
649  }
650 
651  /* Now split the worst_word_it */
652 
653  word_res = worst_word_it.data();
654 
655  /* Move blobs before noise blob to a new bloblist */
656 
657  new_blob_it.set_to_list(&new_blob_list);
658  blob_it.set_to_list(word_res->word->cblob_list());
659  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
660  new_blob_it.add_after_then_move(blob_it.extract());
661  }
662  start_of_noise_blob = blob_it.data()->bounding_box().left();
663  delete blob_it.extract(); // throw out noise blob
664 
665  new_word = new WERD(&new_blob_list, word_res->word);
666  new_word->set_flag(W_EOL, false);
667  word_res->word->set_flag(W_BOL, false);
668  word_res->word->set_blanks(1); // After break
669 
670  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
671  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
672  for (; (!rej_cblob_it.empty() &&
673  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
674  rej_cblob_it.forward()) {
675  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
676  }
677 
678  auto *new_word_res = new WERD_RES(new_word);
679  new_word_res->combination = true;
680  worst_word_it.add_before_then_move(new_word_res);
681 
682  word_res->ClearResults();
683 }
684 
685 int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
686  float noise_score[512];
687  int min_noise_blob; // 1st contender
688  int max_noise_blob; // last contender
689  int non_noise_count;
690  int worst_noise_blob; // Worst blob
691  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
692  float non_noise_limit = kBlnXHeight * 0.8;
693 
694  if (word_res->rebuild_word == nullptr) {
695  return -1; // Can't handle cube words.
696  }
697 
698  // Normalised.
699  auto blob_count = word_res->box_word->length();
700  ASSERT_HOST(blob_count <= 512);
701  if (blob_count < 5) {
702  return -1; // too short to split
703  }
704 
705  /* Get the noise scores for all blobs */
706 
707 #ifndef SECURE_NAMES
708  if (debug_fix_space_level > 5) {
709  tprintf("FP fixspace Noise metrics for \"%s\": ",
710  word_res->best_choice->unichar_string().c_str());
711  }
712 #endif
713 
714  for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
715  TBLOB *blob = word_res->rebuild_word->blobs[i];
716  if (word_res->reject_map[i].accepted()) {
717  noise_score[i] = non_noise_limit;
718  } else {
719  noise_score[i] = blob_noise_score(blob);
720  }
721 
722  if (debug_fix_space_level > 5) {
723  tprintf("%1.1f ", noise_score[i]);
724  }
725  }
726  if (debug_fix_space_level > 5) {
727  tprintf("\n");
728  }
729 
730  /* Now find the worst one which is far enough away from the end of the word */
731 
732  non_noise_count = 0;
733  int i;
734  for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
735  if (noise_score[i] >= non_noise_limit) {
736  non_noise_count++;
737  }
738  }
739  if (non_noise_count < fixsp_non_noise_limit) {
740  return -1;
741  }
742 
743  min_noise_blob = i;
744 
745  non_noise_count = 0;
746  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
747  if (noise_score[i] >= non_noise_limit) {
748  non_noise_count++;
749  }
750  }
751  if (non_noise_count < fixsp_non_noise_limit) {
752  return -1;
753  }
754 
755  max_noise_blob = i;
756 
757  if (min_noise_blob > max_noise_blob) {
758  return -1;
759  }
760 
761  *worst_noise_score = small_limit;
762  worst_noise_blob = -1;
763  for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
764  if (noise_score[i] < *worst_noise_score) {
765  worst_noise_blob = i;
766  *worst_noise_score = noise_score[i];
767  }
768  }
769  return worst_noise_blob;
770 }
771 
773  TBOX box; // BB of outline
774  int16_t outline_count = 0;
775  int16_t max_dimension;
776  int16_t largest_outline_dimension = 0;
777 
778  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
779  outline_count++;
780  box = ol->bounding_box();
781  if (box.height() > box.width()) {
782  max_dimension = box.height();
783  } else {
784  max_dimension = box.width();
785  }
786 
787  if (largest_outline_dimension < max_dimension) {
788  largest_outline_dimension = max_dimension;
789  }
790  }
791 
792  if (outline_count > 5) {
793  // penalise LOTS of blobs
794  largest_outline_dimension *= 2;
795  }
796 
797  box = blob->bounding_box();
798  if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {
799  // Lax blob is if high or low
800  largest_outline_dimension /= 2;
801  }
802 
803  return largest_outline_dimension;
804 }
805 
806 void fixspace_dbg(WERD_RES *word) {
807  TBOX box = word->word->bounding_box();
808  const bool show_map_detail = false;
809  int16_t i;
810 
811  box.print();
812  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
813  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
814  word->rebuild_word->NumBlobs(), word->box_word->length());
815  word->reject_map.print(debug_fp);
816  tprintf("\n");
817  if (show_map_detail) {
818  tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
819  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
820  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
821  word->reject_map[i].full_print(debug_fp);
822  }
823  }
824 
825  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
826  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
827 }
828 
837 int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
838  WERD_RES_IT word_it(&word_res_list);
839  WERD_RES *word;
840  int16_t score = 0;
841  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
842 
843  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
844  word = word_it.data();
845  if (word->rebuild_word == nullptr) {
846  continue; // Can't handle cube words.
847  }
848  if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
849  word->best_choice->permuter() == FREQ_DAWG_PERM ||
850  word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
851  auto num_blobs = word->rebuild_word->NumBlobs();
852  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
853  for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
854  TBLOB *blob = word->rebuild_word->blobs[i];
855  if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
856  score -= 1; // penalise possibly erroneous non-space
857  } else if (word->reject_map[i].accepted()) {
858  score++;
859  }
860  }
861  }
862  }
863  if (score < 0) {
864  score = 0;
865  }
866  return score;
867 }
868 
869 } // namespace tesseract
#define PERFECT_WERDS
Definition: fixspace.cpp:48
FILE * debug_fp
Definition: tessvars.cpp:24
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
@ W_REP_CHAR
repeated character
Definition: werd.h:40
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kBlnXHeight
Definition: normalis.h:33
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:391
int UNICHAR_ID
Definition: unichar.h:36
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:201
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ NUMBER_PERM
Definition: ratngs.h:238
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:806
const int kBlnBaselineOffset
Definition: normalis.h:34
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:109
bool deadline_exceeded() const
Definition: ocrclass.h:135
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:115
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:104
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:111
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:514
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:577
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:685
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:545
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:772
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:171
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:467
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:366
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:621
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1811
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:837
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:260
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:218
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:77
TESSLINE * next
Definition: blobs.h:288
TBOX bounding_box() const
Definition: blobs.cpp:466
TESSLINE * outlines
Definition: blobs.h:404
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
unsigned length() const
Definition: boxword.h:81
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
void copy_on(WERD_RES *word_res)
Definition: pageres.h:667
WERD_CHOICE * best_choice
Definition: pageres.h:239
const UNICHARSET * uch_set
Definition: pageres.h:201
tesseract::BoxWord * box_word
Definition: pageres.h:270
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:655
TWERD * rebuild_word
Definition: pageres.h:264
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
uint8_t permuter() const
Definition: ratngs.h:327
const std::string & unichar_lengths() const
Definition: ratngs.h:529
unsigned length() const
Definition: ratngs.h:283
std::string & unichar_string()
Definition: ratngs.h:515
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void print(FILE *fp) const
Definition: rejctmap.cpp:112
int16_t reject_count() const
Definition: rejctmap.h:339
uint16_t length() const
Definition: rejctmap.h:333
void full_print(FILE *fp) const
Definition: rejctmap.cpp:120
C_BLOB_LIST * cblob_list()
Definition: werd.h:96
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:91
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
TBOX bounding_box() const
Definition: werd.cpp:155
void join_on(WERD *other)
Definition: werd.cpp:208
void set_blanks(uint8_t new_blanks)
Definition: werd.h:103
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387