tesseract  5.0.0
segsearch.cpp
Go to the documentation of this file.
1 // File: segsearch.cpp
3 // Description: Segmentation search functions.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <cstdint> // for INT32_MAX
20 #include "blamer.h" // for BlamerBundle
21 #include "errcode.h" // for ASSERT_HOST
22 #include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi...
23 #include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry
24 #include "matrix.h" // for MATRIX_COORD, MATRIX
25 #include "pageres.h" // for WERD_RES
26 #include "params.h" // for BoolParam, IntParam, DoubleParam
27 #include "ratngs.h" // for BLOB_CHOICE_LIST, BLOB_CHOICE_IT
28 #include "tprintf.h" // for tprintf
29 #include "wordrec.h" // for Wordrec, SegSearchPending (ptr only)
30 
31 namespace tesseract {
32 
33 void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
34  BlamerBundle *blamer_bundle) {
35  LMPainPoints pain_points(segsearch_max_pain_points, segsearch_max_char_wh_ratio,
36  assume_fixed_pitch_char_segment, &getDict(), segsearch_debug_level);
37  // Compute scaling factor that will help us recover blob outline length
38  // from classifier rating and certainty for the blob.
39  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
40  std::vector<SegSearchPending> pending;
41  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle);
42 
43  if (!SegSearchDone(0)) { // find a better choice
44  if (chop_enable && word_res->chopped_word != nullptr) {
45  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle, blamer_bundle,
46  &pain_points, &pending);
47  }
48  if (chop_debug) {
49  SEAM::PrintSeams("Final seam list:", word_res->seam_array);
50  }
51 
52  if (blamer_bundle != nullptr && !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
53  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
54  }
55  }
56  // Keep trying to find a better path by fixing the "pain points".
57 
58  MATRIX_COORD pain_point;
59  float pain_point_priority;
60  int num_futile_classifications = 0;
61  std::string blamer_debug;
62  while (wordrec_enable_assoc &&
63  (!SegSearchDone(num_futile_classifications) ||
64  (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()))) {
65  // Get the next valid "pain point".
66  bool found_nothing = true;
67  LMPainPointsType pp_type;
68  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) != LM_PPTYPE_NUM) {
69  if (!pain_point.Valid(*word_res->ratings)) {
70  word_res->ratings->IncreaseBandSize(pain_point.row - pain_point.col + 1);
71  }
72  if (pain_point.Valid(*word_res->ratings) &&
73  !word_res->ratings->Classified(pain_point.col, pain_point.row, getDict().WildcardID())) {
74  found_nothing = false;
75  break;
76  }
77  }
78  if (found_nothing) {
79  if (segsearch_debug_level > 0) {
80  tprintf("Pain points queue is empty\n");
81  }
82  break;
83  }
84  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
85  LMPainPoints::PainPointDescription(pp_type), &pending, word_res,
86  &pain_points, blamer_bundle);
87 
88  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending, word_res, &pain_points,
89  best_choice_bundle, blamer_bundle);
90  if (!best_choice_bundle->updated) {
91  ++num_futile_classifications;
92  }
93 
94  if (segsearch_debug_level > 0) {
95  tprintf("num_futile_classifications %d\n", num_futile_classifications);
96  }
97 
98  best_choice_bundle->updated = false; // reset updated
99 
100  // See if it's time to terminate SegSearch or time for starting a guided
101  // search for the true path to find the blame for the incorrect best_choice.
102  if (SegSearchDone(num_futile_classifications) && blamer_bundle != nullptr &&
103  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
104  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle, blamer_debug);
105  }
106  } // end while loop exploring alternative paths
107  if (blamer_bundle != nullptr) {
108  blamer_bundle->FinishSegSearch(word_res->best_choice, wordrec_debug_blamer, blamer_debug);
109  }
110 
111  if (segsearch_debug_level > 0) {
112  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
113  language_model_->AcceptableChoiceFound());
114  }
115 }
116 
117 // Setup and run just the initial segsearch on an established matrix,
118 // without doing any additional chopping or joining.
119 // (Internal factored version that can be used as part of the main SegSearch.)
120 void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
121  std::vector<SegSearchPending> *pending,
122  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {
123  if (segsearch_debug_level > 0) {
124  tprintf("Starting SegSearch on ratings matrix%s:\n",
125  wordrec_enable_assoc ? " (with assoc)" : "");
126  word_res->ratings->print(getDict().getUnicharset());
127  }
128 
129  pain_points->GenerateInitial(word_res);
130 
131  // Compute scaling factor that will help us recover blob outline length
132  // from classifier rating and certainty for the blob.
133  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
134 
135  language_model_->InitForWord(prev_word_best_choice_, assume_fixed_pitch_char_segment,
136  segsearch_max_char_wh_ratio, rating_cert_scale);
137 
138  // Initialize blamer-related information: map character boxes recorded in
139  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
140  // ratings matrix. We expect this step to succeed, since when running the
141  // chopper we checked that the correct chops are present.
142  if (blamer_bundle != nullptr) {
143  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word, wordrec_debug_blamer);
144  }
145 
146  // pending[col] tells whether there is update work to do to combine
147  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
148  // As the language model state is updated, pending entries are modified to
149  // minimize duplication of work. It is important that during the update the
150  // children are considered in the non-decreasing order of their column, since
151  // this guarantees that all the parents would be up to date before an update
152  // of a child is done.
153  pending->clear();
154  pending->resize(word_res->ratings->dimension(), SegSearchPending());
155 
156  // Search the ratings matrix for the initial best path.
157  (*pending)[0].SetColumnClassified();
158  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res, pain_points, best_choice_bundle,
159  blamer_bundle);
160 }
161 
162 void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
163  std::vector<SegSearchPending> *pending, WERD_RES *word_res,
164  LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
165  BlamerBundle *blamer_bundle) {
166  MATRIX *ratings = word_res->ratings;
167  ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == pending->size());
168  ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == best_choice_bundle->beam.size());
169  for (int col = starting_col; col < ratings->dimension(); ++col) {
170  if (!(*pending)[col].WorkToDo()) {
171  continue;
172  }
173  int first_row = col;
174  int last_row = std::min(ratings->dimension() - 1, col + ratings->bandwidth() - 1);
175  if ((*pending)[col].SingleRow() >= 0) {
176  first_row = last_row = (*pending)[col].SingleRow();
177  }
178  if (segsearch_debug_level > 0) {
179  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n", col, first_row,
180  last_row, (*pending)[col].IsRowJustClassified(INT32_MAX));
181  }
182  // Iterate over the pending list for this column.
183  for (int row = first_row; row <= last_row; ++row) {
184  // Update language model state of this child+parent pair.
185  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
186  LanguageModelState *parent_node = col == 0 ? nullptr : best_choice_bundle->beam[col - 1];
187  if (current_node != nullptr &&
188  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row), col, row,
189  current_node, parent_node, pain_points, word_res,
190  best_choice_bundle, blamer_bundle) &&
191  row + 1 < ratings->dimension()) {
192  // Since the language model state of this entry changed, process all
193  // the child column.
194  (*pending)[row + 1].RevisitWholeColumn();
195  if (segsearch_debug_level > 0) {
196  tprintf("Added child col=%d to pending\n", row + 1);
197  }
198  } // end if UpdateState.
199  } // end for row.
200  } // end for col.
201  if (best_choice_bundle->best_vse != nullptr) {
202  ASSERT_HOST(word_res->StatesAllValid());
203  if (best_choice_bundle->best_vse->updated) {
204  pain_points->GenerateFromPath(rating_cert_scale, best_choice_bundle->best_vse, word_res);
205  if (!best_choice_bundle->fixpt.empty()) {
206  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt, best_choice_bundle->best_vse,
207  word_res);
208  }
209  }
210  }
211  // The segsearch is completed. Reset all updated flags on all VSEs and reset
212  // all pendings.
213  for (unsigned col = 0; col < pending->size(); ++col) {
214  (*pending)[col].Clear();
215  ViterbiStateEntry_IT vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
216  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
217  vse_it.data()->updated = false;
218  }
219  }
220 }
221 
222 void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
223  const char *pain_point_type,
224  std::vector<SegSearchPending> *pending,
225  WERD_RES *word_res, LMPainPoints *pain_points,
226  BlamerBundle *blamer_bundle) {
227  if (segsearch_debug_level > 0) {
228  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n", pain_point_type,
229  pain_point_priority, pain_point.col, pain_point.row);
230  }
231  ASSERT_HOST(pain_points != nullptr);
232  MATRIX *ratings = word_res->ratings;
233  // Classify blob [pain_point.col pain_point.row]
234  if (!pain_point.Valid(*ratings)) {
235  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
236  }
237  ASSERT_HOST(pain_point.Valid(*ratings));
238  BLOB_CHOICE_LIST *classified =
239  classify_piece(word_res->seam_array, pain_point.col, pain_point.row, pain_point_type,
240  word_res->chopped_word, blamer_bundle);
241  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
242  if (lst == nullptr) {
243  ratings->put(pain_point.col, pain_point.row, classified);
244  } else {
245  // We can not delete old BLOB_CHOICEs, since they might contain
246  // ViterbiStateEntries that are parents of other "active" entries.
247  // Thus if the matrix cell already contains classifications we add
248  // the new ones to the beginning of the list.
249  BLOB_CHOICE_IT it(lst);
250  it.add_list_before(classified);
251  delete classified; // safe to delete, since empty after add_list_before()
252  classified = nullptr;
253  }
254 
255  if (segsearch_debug_level > 0) {
256  print_ratings_list("Updated ratings matrix with a new entry:",
257  ratings->get(pain_point.col, pain_point.row), getDict().getUnicharset());
258  ratings->print(getDict().getUnicharset());
259  }
260 
261  // Insert initial "pain points" to join the newly classified blob
262  // with its left and right neighbors.
263  if (classified != nullptr && !classified->empty()) {
264  if (pain_point.col > 0) {
265  pain_points->GeneratePainPoint(pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0, true,
266  segsearch_max_char_wh_ratio, word_res);
267  }
268  if (pain_point.row + 1 < ratings->dimension()) {
269  pain_points->GeneratePainPoint(pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0, true,
270  segsearch_max_char_wh_ratio, word_res);
271  }
272  }
273  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
274 }
275 
276 // Resets enough of the results so that the Viterbi search is re-run.
277 // Needed when the n-gram model is enabled, as the multi-length comparison
278 // implementation will re-value existing paths to worse values.
279 void Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
280  std::vector<SegSearchPending> &pending) {
281  // TODO(rays) More refactoring required here.
282  // Delete existing viterbi states.
283  for (auto &col : best_choice_bundle->beam) {
284  col->Clear();
285  }
286  // Reset best_choice_bundle.
287  word_res->ClearWordChoices();
288  best_choice_bundle->best_vse = nullptr;
289  // Clear out all existing pendings and add a new one for the first column.
290  pending[0].SetColumnClassified();
291  for (auto &data : pending) {
292  data.Clear();
293  }
294 }
295 
297  BlamerBundle *blamer_bundle, std::string &blamer_debug) {
298  pain_points->Clear(); // Clear pain points heap.
299  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings, getDict().WildcardID(),
300  wordrec_debug_blamer, blamer_debug, pain_points,
301  segsearch_max_char_wh_ratio, word_res);
302 }
303 
304 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
T get(ICOORD pos) const
Definition: matrix.h:268
void put(ICOORD pos, const T &thing)
Definition: matrix.h:260
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:461
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:498
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str)
Definition: blamer.cpp:503
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:309
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points, double max_char_wh_ratio, WERD_RES *word_res)
Definition: blamer.cpp:468
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:399
int dimension() const
Definition: matrix.h:612
int bandwidth() const
Definition: matrix.h:616
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:36
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:52
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:115
bool Valid(const MATRIX &m) const
Definition: matrix.h:697
WERD_CHOICE * best_choice
Definition: pageres.h:239
TWERD * chopped_word
Definition: pageres.h:210
MATRIX * ratings
Definition: pageres.h:235
std::vector< SEAM * > seam_array
Definition: pageres.h:212
static void PrintSeams(const char *label, const std::vector< SEAM * > &seams)
Definition: seam.cpp:158
virtual Dict & getDict()
Definition: classify.h:98
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
void GenerateInitial(WERD_RES *word_res)
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
static const char * PainPointDescription(LMPainPointsType type)
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)
bool updated
set to true if the entry has just been created/updated
Definition: lm_state.h:198
Struct to store information maintained by various language model components.
Definition: lm_state.h:204
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:226
std::vector< LanguageModelState * > beam
Definition: lm_state.h:246
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:242
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:248
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:240
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending)
Definition: chopper.cpp:445
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, std::string &blamer_debug)
Definition: segsearch.cpp:296
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:162
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, std::vector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:120
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, std::vector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:222
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:394
virtual BLOB_CHOICE_LIST * classify_piece(const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:49
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, std::vector< SegSearchPending > &pending)
Definition: segsearch.cpp:279
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:33
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382