tesseract  5.0.0
tfacepp.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tfacepp.cpp (Formerly tface++.c)
3  * Description: C++ side of the C/C++ Tess/Editor interface.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include <cmath>
20 
21 #include "blamer.h"
22 #include "errcode.h"
23 #include "ratngs.h"
24 #include "reject.h"
25 #include "tesseractclass.h"
26 #include "werd.h"
27 
28 #define MAX_UNDIVIDED_LENGTH 24
29 
30 /**********************************************************************
31  * recog_word
32  *
33  * Convert the word to tess form and pass it to the tess segmenter.
34  * Convert the output back to editor form.
35  **********************************************************************/
36 namespace tesseract {
38  if (wordrec_skip_no_truth_words &&
39  (word->blamer_bundle == nullptr ||
41  if (classify_debug_level) {
42  tprintf("No truth for word - skipping\n");
43  }
44  word->tess_failed = true;
45  return;
46  }
47  ASSERT_HOST(!word->chopped_word->blobs.empty());
49  word->SetupBoxWord();
50  ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());
51  // Check that the ratings matrix size matches the sum of all the
52  // segmentation states.
53  if (!word->StatesAllValid()) {
54  tprintf("Not all words have valid states relative to ratings matrix!!");
55  word->DebugWordChoices(true, nullptr);
56  ASSERT_HOST(word->StatesAllValid());
57  }
58  if (tessedit_override_permuter) {
59  /* Override the permuter type if a straight dictionary check disagrees. */
60  uint8_t perm_type = word->best_choice->permuter();
61  if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&
62  (perm_type != USER_DAWG_PERM)) {
63  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
64  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||
65  (real_dict_perm_type == USER_DAWG_PERM)) &&
66  (alpha_count(word->best_choice->unichar_string().c_str(),
67  word->best_choice->unichar_lengths().c_str()) > 0)) {
68  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
69  }
70  }
71  if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
72  tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
73  }
74  }
75  // Factored out from control.cpp
76  ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
77  if (word->best_choice == nullptr || word->best_choice->empty() ||
78  strspn(word->best_choice->unichar_string().c_str(), " ") ==
79  word->best_choice->length()) {
80  word->tess_failed = true;
81  word->reject_map.initialise(word->box_word->length());
83  } else {
84  word->tess_failed = false;
85  }
86 }
87 
88 /**********************************************************************
89  * recog_word_recursive
90  *
91  * Convert the word to tess form and pass it to the tess segmenter.
92  * Convert the output back to editor form.
93  **********************************************************************/
95  auto word_length = word->chopped_word->NumBlobs(); // no of blobs
96  if (word_length > MAX_UNDIVIDED_LENGTH) {
97  return split_and_recog_word(word);
98  }
99  cc_recog(word);
100  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
101 
102  // Do sanity checks and minor fixes on best_choice.
103  if (word->best_choice->length() > word_length) {
104  word->best_choice->make_bad(); // should never happen
105  tprintf(
106  "recog_word: Discarded long string \"%s\""
107  " (%d characters vs %d blobs)\n",
108  word->best_choice->unichar_string().c_str(), word->best_choice->length(), word_length);
109  tprintf("Word is at:");
110  word->word->bounding_box().print();
111  }
112  if (word->best_choice->length() < word_length) {
113  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
114  while (word->best_choice->length() < word_length) {
115  word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());
116  }
117  }
118 }
119 
120 /**********************************************************************
121  * split_and_recog_word
122  *
123  * Split the word into 2 smaller pieces at the largest gap.
124  * Recognize the pieces and stick the results back together.
125  **********************************************************************/
127  // Find the biggest blob gap in the chopped_word.
128  int bestgap = -INT32_MAX;
129  int split_index = 0;
130  for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) {
131  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
132  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
133  int gap = blob_box.left() - prev_box.right();
134  if (gap > bestgap) {
135  bestgap = gap;
136  split_index = b;
137  }
138  }
139  ASSERT_HOST(split_index > 0);
140 
141  WERD_RES *word2 = nullptr;
142  BlamerBundle *orig_bb = nullptr;
143  split_word(word, split_index, &word2, &orig_bb);
144 
145  // Recognize the first part of the word.
146  recog_word_recursive(word);
147  // Recognize the second part of the word.
148  recog_word_recursive(word2);
149 
150  join_words(word, word2, orig_bb);
151 }
152 
153 /**********************************************************************
154  * split_word
155  *
156  * Split a given WERD_RES in place into two smaller words for recognition.
157  * split_pt is the index of the first blob to go in the second word.
158  * The underlying word is left alone, only the TWERD (and subsequent data)
159  * are split up. orig_blamer_bundle is set to the original blamer bundle,
160  * and will now be owned by the caller. New blamer bundles are forged for the
161  * two pieces.
162  **********************************************************************/
163 void Tesseract::split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,
164  BlamerBundle **orig_blamer_bundle) const {
165  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
166 
167  // Save a copy of the blamer bundle so we can try to reconstruct it below.
168  BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
169 
170  auto *word2 = new WERD_RES(*word);
171 
172  // blow away the copied chopped_word, as we want to work with
173  // the blobs from the input chopped_word so seam_arrays can be merged.
174  TWERD *chopped = word->chopped_word;
175  auto *chopped2 = new TWERD;
176  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
177  for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {
178  chopped2->blobs.push_back(chopped->blobs[i]);
179  }
180  chopped->blobs.resize(split_pt);
181  word->chopped_word = nullptr;
182  delete word2->chopped_word;
183  word2->chopped_word = nullptr;
184 
185  const UNICHARSET &unicharset = *word->uch_set;
186  word->ClearResults();
187  word2->ClearResults();
188  word->chopped_word = chopped;
189  word2->chopped_word = chopped2;
191  word2->SetupBasicsFromChoppedWord(unicharset);
192 
193  // Try to adjust the blamer bundle.
194  if (orig_bb != nullptr) {
195  // TODO(rays) Looks like a leak to me.
196  // orig_bb should take, rather than copy.
197  word->blamer_bundle = new BlamerBundle();
198  word2->blamer_bundle = new BlamerBundle();
199  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
200  word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
201  word->blamer_bundle, word2->blamer_bundle);
202  }
203 
204  *right_piece = word2;
205  *orig_blamer_bundle = orig_bb;
206 }
207 
208 /**********************************************************************
209  * join_words
210  *
211  * The opposite of split_word():
212  * join word2 (including any recognized data / seam array / etc)
213  * onto the right of word and then delete word2.
214  * Also, if orig_bb is provided, stitch it back into word.
215  **********************************************************************/
216 void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {
217  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
218  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
219  // Tack the word2 outputs onto the end of the word outputs.
220  word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
221  word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
222  word2->chopped_word->blobs.clear();
223  word2->rebuild_word->blobs.clear();
224  TPOINT split_pt;
225  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
226  split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
227  // Move the word2 seams onto the end of the word1 seam_array.
228  // Since the seam list is one element short, an empty seam marking the
229  // end of the last blob in the first word is needed first.
230  word->seam_array.push_back(new SEAM(0.0f, split_pt));
231  word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
232  word2->seam_array.clear();
233  // Fix widths and gaps.
234  word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
235  word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
236  // Fix the ratings matrix.
237  int rat1 = word->ratings->dimension();
238  int rat2 = word2->ratings->dimension();
239  word->ratings->AttachOnCorner(word2->ratings);
240  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
241  word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
242  // Append the word choices.
243  *word->raw_choice += *word2->raw_choice;
244 
245  // How many alt choices from each should we try to get?
246  const int kAltsPerPiece = 2;
247  // When do we start throwing away extra alt choices?
248  const int kTooManyAltChoices = 100;
249 
250  // Construct the cartesian product of the best_choices of word(1) and word2.
251  WERD_CHOICE_LIST joined_choices;
252  WERD_CHOICE_IT jc_it(&joined_choices);
253  WERD_CHOICE_IT bc1_it(&word->best_choices);
254  WERD_CHOICE_IT bc2_it(&word2->best_choices);
255  int num_word1_choices = word->best_choices.length();
256  int total_joined_choices = num_word1_choices;
257  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
258  // word2 choices, and put them in the joined_choices list. The 1st word2
259  // choice gets added to the original word1 choices in-place after we have
260  // finished with them.
261  int bc2_index = 1;
262  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
263  if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
264  break;
265  }
266  int bc1_index = 0;
267  for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
268  if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
269  break;
270  }
271  auto *wc = new WERD_CHOICE(*bc1_it.data());
272  *wc += *bc2_it.data();
273  jc_it.add_after_then_move(wc);
274  ++total_joined_choices;
275  }
276  }
277  // Now that we've filled in as many alternates as we want, paste the best
278  // choice for word2 onto the original word alt_choices.
279  bc1_it.move_to_first();
280  bc2_it.move_to_first();
281  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
282  *bc1_it.data() += *bc2_it.data();
283  }
284  bc1_it.move_to_last();
285  bc1_it.add_list_after(&joined_choices);
286 
287  // Restore the pointer to original blamer bundle and combine blamer
288  // information recorded in the splits.
289  if (orig_bb != nullptr) {
290  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
291  delete word->blamer_bundle;
292  word->blamer_bundle = orig_bb;
293  }
294  word->SetupBoxWord();
295  word->reject_map.initialise(word->box_word->length());
296  delete word2;
297 }
298 
299 } // namespace tesseract
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:28
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ IRR_NO_TRUTH
Definition: blamer.h:98
int UNICHAR_ID
Definition: unichar.h:36
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:483
void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:163
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:126
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:94
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:216
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:37
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:174
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:226
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:131
TDimension x
Definition: blobs.h:89
TDimension y
Definition: blobs.h:90
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
unsigned length() const
Definition: boxword.h:81
int dimension() const
Definition: matrix.h:612
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:633
WERD_CHOICE * best_choice
Definition: pageres.h:239
WERD_CHOICE * raw_choice
Definition: pageres.h:244
TWERD * chopped_word
Definition: pageres.h:210
BlamerBundle * blamer_bundle
Definition: pageres.h:250
const UNICHARSET * uch_set
Definition: pageres.h:201
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247
MATRIX * ratings
Definition: pageres.h:235
std::vector< int > best_state
Definition: pageres.h:283
tesseract::BoxWord * box_word
Definition: pageres.h:270
std::vector< int > blob_widths
Definition: pageres.h:214
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:483
std::vector< int > blob_gaps
Definition: pageres.h:217
TWERD * rebuild_word
Definition: pageres.h:264
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:344
std::vector< SEAM * > seam_array
Definition: pageres.h:212
float certainty() const
Definition: ratngs.h:311
bool empty() const
Definition: ratngs.h:280
uint8_t permuter() const
Definition: ratngs.h:327
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:415
void set_permuter(uint8_t perm)
Definition: ratngs.h:356
const std::string & unichar_lengths() const
Definition: ratngs.h:529
unsigned length() const
Definition: ratngs.h:283
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
std::string & unichar_string()
Definition: ratngs.h:515
TDimension left() const
Definition: rect.h:82
TDimension top() const
Definition: rect.h:68
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void rej_word_tess_failure()
Definition: rejctmap.cpp:133
void initialise(uint16_t length)
Definition: rejctmap.cpp:67
TBOX bounding_box() const
Definition: werd.cpp:155
UNICHARSET unicharset
Definition: ccutil.h:61
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:86
void cc_recog(WERD_RES *word)
Definition: tface.cpp:119