tesseract  5.0.0
blamer.h
Go to the documentation of this file.
1 // File: blamer.h
3 // Description: Module allowing precise error causes to be allocated.
4 // Author: Rike Antonova
5 // Refactored: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_
21 #define TESSERACT_CCSTRUCT_BLAMER_H_
22 
23 #ifdef HAVE_CONFIG_H
24 # include "config_auto.h" // DISABLED_LEGACY_ENGINE
25 #endif
26 #include "boxword.h" // for BoxWord
27 #ifndef DISABLED_LEGACY_ENGINE
28 # include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
29 #endif // ndef DISABLED_LEGACY_ENGINE
30 #include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only)
31 #include "rect.h" // for TBOX
32 #include "tprintf.h" // for tprintf
33 
34 #include <tesseract/unichar.h> // for UNICHAR_ID
35 
36 #include <cstdint> // for int16_t
37 #include <cstring> // for memcpy
38 #include <vector> // for std::vector
39 
40 namespace tesseract {
41 
42 class DENORM;
43 class MATRIX;
44 class UNICHARSET;
45 class WERD_RES;
46 
47 struct MATRIX_COORD;
48 struct TWERD;
49 
50 class LMPainPoints;
51 
// Tolerance used by the blamer when comparing bounding boxes.
// NOTE(review): presumably expressed in image pixels - confirm against the
// users of this constant in blamer.cpp.
static constexpr int16_t kBlamerBoxTolerance = 5;
53 
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
// The enumerators are deliberately left with their implicit values (0, 1, ...)
// so that IRR_NUM_REASONS equals the number of real reasons.
enum IncorrectResultReason {
  // The text recorded in best choice == truth text.
  IRR_CORRECT,
  // Either: Top choice is incorrect and is a dictionary word (language model
  // is unlikely to help correct such errors, so blame the classifier).
  // Or: the correct unichar was not included in the shortlist produced by the
  // classifier at all.
  IRR_CLASSIFIER,
  // Chopper has not found one or more splits that correspond to the correct
  // character bounding boxes recorded in BlamerBundle::truth_word.
  IRR_CHOPPER,
  // Classifier did include correct unichars for each blob in the correct
  // segmentation, however its rating could have been too bad to allow the
  // language model to pull out the correct choice. On the other hand the
  // strength of the language model might have been too weak to favor the
  // correct answer, thus we call this case a classifier-language model
  // tradeoff error.
  IRR_CLASS_LM_TRADEOFF,
  // Page layout failed to produce the correct bounding box. Blame page layout
  // if the truth was not found for the word, which implies that the bounding
  // box of the word was incorrect (no truth word had a similar bounding box).
  IRR_PAGE_LAYOUT,
  // SegSearch heuristic prevented one or more blobs from the correct
  // segmentation state to be classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,
  // The correct segmentation state was not explored because of poor SegSearch
  // pain point prioritization. We blame SegSearch pain point prioritization
  // if the best rating of a choice constructed from correct segmentation is
  // better than that of the best choice (i.e. if we got to explore the correct
  // segmentation state, language model would have picked the correct choice).
  IRR_SEGSEARCH_PP,
  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
  // and thus use the old language model (permuters).
  // TODO(antonova): integrate the new language model with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,
  // If there is an incorrect adaptive template match with a better score than
  // a correct one (either pre-trained or adapted), mark this as adaption error.
  IRR_ADAPTION,
  // split_and_recog_word() failed to find a suitable split in truth.
  IRR_NO_TRUTH_SPLIT,
  // Truth is not available for this word (e.g. when words in corrected content
  // file are turned into ~~~~ because an appropriate alignment was not found).
  IRR_NO_TRUTH,
  // The text recorded in best choice != truth text, but none of the above
  // reasons are set.
  IRR_UNKNOWN,

  // Number of reasons above; not a reason itself. Must stay last.
  IRR_NUM_REASONS
};
105 
106 // Blamer-related information to determine the source of errors.
107 struct BlamerBundle {
108  static const char *IncorrectReasonName(IncorrectResultReason irr);
110  : truth_has_char_boxes_(false)
111  , incorrect_result_reason_(IRR_CORRECT)
112  , lattice_data_(nullptr) {
113  ClearResults();
114  }
115  BlamerBundle(const BlamerBundle &other) {
116  this->CopyTruth(other);
117  this->CopyResults(other);
118  }
120  delete[] lattice_data_;
121  }
122 
123  // Accessors.
124  std::string TruthString() const {
125  std::string truth_str;
126  for (auto &text : truth_text_) {
127  truth_str += text;
128  }
129  return truth_str;
130  }
132  return incorrect_result_reason_;
133  }
134  bool NoTruth() const {
135  return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;
136  }
137  bool HasDebugInfo() const {
138  return debug_.length() > 0 || misadaption_debug_.length() > 0;
139  }
140  const std::string &debug() const {
141  return debug_;
142  }
143  const std::string &misadaption_debug() const {
144  return misadaption_debug_;
145  }
146  void UpdateBestRating(float rating) {
147  if (rating < best_correctly_segmented_rating_) {
148  best_correctly_segmented_rating_ = rating;
149  }
150  }
152  return correct_segmentation_cols_.size();
153  }
154  // Returns true if the given ratings matrix col,row position is included
155  // in the correct segmentation path at the given index.
156  bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) {
157  return correct_segmentation_cols_[index] == coord.col &&
158  correct_segmentation_rows_[index] == coord.row;
159  }
161  best_choice_is_dict_and_top_choice_ = value;
162  }
163  const char *lattice_data() const {
164  return lattice_data_;
165  }
166  int lattice_size() const {
167  return lattice_size_; // size of lattice_data in bytes
168  }
169  void set_lattice_data(const char *data, int size) {
170  lattice_size_ = size;
171  delete[] lattice_data_;
172  lattice_data_ = new char[lattice_size_];
173  memcpy(lattice_data_, data, lattice_size_);
174  }
175 #ifndef DISABLED_LEGACY_ENGINE
177  return params_training_bundle_;
178  }
179  // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
181  params_training_bundle_.AddHypothesis(hypo);
182  }
183 #endif // ndef DISABLED_LEGACY_ENGINE
184 
185  // Functions to setup the blamer.
186  // Whole word string, whole word bounding box.
187  void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);
188  // Single "character" string, "character" bounding box.
189  // May be called multiple times to indicate the characters in a word.
190  void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box);
191  // Marks that there is something wrong with the truth text, like it contains
192  // reject characters.
193  void SetRejectedTruth();
194 
195  // Returns true if the provided word_choice is correct.
196  bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;
197 
198  void ClearResults() {
199  norm_truth_word_.DeleteAllBoxes();
200  norm_box_tolerance_ = 0;
201  if (!NoTruth()) {
202  incorrect_result_reason_ = IRR_CORRECT;
203  }
204  debug_ = "";
205  segsearch_is_looking_for_blame_ = false;
206  best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
207  correct_segmentation_cols_.clear();
208  correct_segmentation_rows_.clear();
209  best_choice_is_dict_and_top_choice_ = false;
210  delete[] lattice_data_;
211  lattice_data_ = nullptr;
212  lattice_size_ = 0;
213  }
214  void CopyTruth(const BlamerBundle &other) {
215  truth_has_char_boxes_ = other.truth_has_char_boxes_;
216  truth_word_ = other.truth_word_;
217  truth_text_ = other.truth_text_;
218  incorrect_result_reason_ = (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
219  }
220  void CopyResults(const BlamerBundle &other) {
221  norm_truth_word_ = other.norm_truth_word_;
222  norm_box_tolerance_ = other.norm_box_tolerance_;
223  incorrect_result_reason_ = other.incorrect_result_reason_;
224  segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
225  best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
226  correct_segmentation_cols_ = other.correct_segmentation_cols_;
227  correct_segmentation_rows_ = other.correct_segmentation_rows_;
228  best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;
229  if (other.lattice_data_ != nullptr) {
230  lattice_data_ = new char[other.lattice_size_];
231  memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
232  lattice_size_ = other.lattice_size_;
233  } else {
234  lattice_data_ = nullptr;
235  }
236  }
237  const char *IncorrectReason() const;
238 
239  // Appends choice and truth details to the given debug string.
240  void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);
241 
242  // Sets up the norm_truth_word from truth_word using the given DENORM.
243  void SetupNormTruthWord(const DENORM &denorm);
244 
245  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
246  // bundles) where the right edge/ of the left-hand word is word1_right,
247  // and the left edge of the right-hand word is word2_left.
248  void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
249  BlamerBundle *bundle2) const;
250  // "Joins" the blames from bundle1 and bundle2 into *this.
251  void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);
252 
253  // If a blob with the same bounding box as one of the truth character
254  // bounding boxes is not classified as the corresponding truth character
255  // blames character classifier for incorrect answer.
256  void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
257  const BLOB_CHOICE_LIST &choices, bool debug);
258 
259  // Checks whether chops were made at all the character bounding box
260  // boundaries in word->truth_word. If not - blames the chopper for an
261  // incorrect answer.
262  void SetChopperBlame(const WERD_RES *word, bool debug);
263  // Blames the classifier or the language model if, after running only the
264  // chopper, best_choice is incorrect and no blame has been yet set.
265  // Blames the classifier if best_choice is classifier's top choice and is a
266  // dictionary word (i.e. language model could not have helped).
267  // Otherwise, blames the language model (formerly permuter word adjustment).
268  void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
269  bool valid_permuter, bool debug);
270  // Sets up the correct_segmentation_* to mark the correct bounding boxes.
271  void SetupCorrectSegmentation(const TWERD *word, bool debug);
272 
273  // Returns true if a guided segmentation search is needed.
274  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
275  // Setup ready to guide the segmentation search to the correct segmentation.
276  void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,
277  bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,
278  double max_char_wh_ratio, WERD_RES *word_res);
279  // Returns true if the guided segsearch is in progress.
280  bool GuidedSegsearchStillGoing() const;
281  // The segmentation search has ended. Sets the blame appropriately.
282  void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);
283 
284  // If the bundle is null or still does not indicate the correct result,
285  // fix it and use some backup reason for the blame.
286  static void LastChanceBlame(bool debug, WERD_RES *word);
287 
288  // Sets the misadaption debug if this word is incorrect, as this word is
289  // being adapted to.
290  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
291 
292 private:
293  // Copy assignment operator (currently unused, therefore private).
294  BlamerBundle &operator=(const BlamerBundle &other) = delete;
295  void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,
296  bool debug) {
297  incorrect_result_reason_ = irr;
298  debug_ = IncorrectReason();
299  debug_ += " to blame: ";
300  FillDebugString(msg, choice, debug_);
301  if (debug) {
302  tprintf("SetBlame(): %s", debug_.c_str());
303  }
304  }
305 
306 private:
307  // Set to true when bounding boxes for individual unichars are recorded.
308  bool truth_has_char_boxes_;
309  // Variables used by the segmentation search when looking for the blame.
310  // Set to true while segmentation search is continued after the usual
311  // termination condition in order to look for the blame.
312  bool segsearch_is_looking_for_blame_;
313  // Set to true if best choice is a dictionary word and
314  // classifier's top choice.
315  bool best_choice_is_dict_and_top_choice_;
316  // Tolerance for bounding box comparisons in normalized space.
317  int norm_box_tolerance_;
318  // The true_word (in the original image coordinate space) contains ground
319  // truth bounding boxes for this WERD_RES.
320  tesseract::BoxWord truth_word_;
321  // Same as above, but in normalized coordinates
322  // (filled in by WERD_RES::SetupForRecognition()).
323  tesseract::BoxWord norm_truth_word_;
324  // Contains ground truth unichar for each of the bounding boxes in truth_word.
325  std::vector<std::string> truth_text_;
326  // The reason for incorrect OCR result.
327  IncorrectResultReason incorrect_result_reason_;
328  // Debug text associated with the blame.
329  std::string debug_;
330  // Misadaption debug information (filled in if this word was misadapted to).
331  std::string misadaption_debug_;
332  // Vectors populated by SegSearch to indicate column and row indices that
333  // correspond to blobs with correct bounding boxes.
334  std::vector<int> correct_segmentation_cols_;
335  std::vector<int> correct_segmentation_rows_;
336  // Best rating for correctly segmented path
337  // (set and used by SegSearch when looking for blame).
338  float best_correctly_segmented_rating_;
339  int lattice_size_; // size of lattice_data in bytes
340  // Serialized segmentation search lattice.
341  char *lattice_data_;
342  // Information about hypotheses (paths) explored by the segmentation search.
343 #ifndef DISABLED_LEGACY_ENGINE
344  tesseract::ParamsTrainingBundle params_training_bundle_;
345 #endif // ndef DISABLED_LEGACY_ENGINE
346 };
347 
348 } // namespace tesseract
349 
350 #endif // TESSERACT_CCSTRUCT_BLAMER_H_
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
IncorrectResultReason
Definition: blamer.h:56
@ IRR_CLASS_OLD_LM_TRADEOFF
Definition: blamer.h:90
@ IRR_SEGSEARCH_HEUR
Definition: blamer.h:80
@ IRR_CORRECT
Definition: blamer.h:58
@ IRR_SEGSEARCH_PP
Definition: blamer.h:86
@ IRR_CHOPPER
Definition: blamer.h:66
@ IRR_PAGE_LAYOUT
Definition: blamer.h:77
@ IRR_UNKNOWN
Definition: blamer.h:101
@ IRR_CLASS_LM_TRADEOFF
Definition: blamer.h:73
@ IRR_NUM_REASONS
Definition: blamer.h:103
@ IRR_CLASSIFIER
Definition: blamer.h:63
@ IRR_NO_TRUTH
Definition: blamer.h:98
@ IRR_NO_TRUTH_SPLIT
Definition: blamer.h:95
@ IRR_ADAPTION
Definition: blamer.h:93
int UNICHAR_ID
Definition: unichar.h:36
const std::string & debug() const
Definition: blamer.h:140
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:461
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:498
BlamerBundle(const BlamerBundle &other)
Definition: blamer.h:115
void set_lattice_data(const char *data, int size)
Definition: blamer.h:169
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
std::string TruthString() const
Definition: blamer.h:124
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str)
Definition: blamer.cpp:503
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:540
const tesseract::ParamsTrainingBundle & params_training_bundle() const
Definition: blamer.h:176
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:160
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:174
int lattice_size() const
Definition: blamer.h:166
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:180
bool HasDebugInfo() const
Definition: blamer.h:137
void UpdateBestRating(float rating)
Definition: blamer.h:146
bool NoTruth() const
Definition: blamer.h:134
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:226
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:156
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box)
Definition: blamer.cpp:66
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:309
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points, double max_char_wh_ratio, WERD_RES *word_res)
Definition: blamer.cpp:468
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:564
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:363
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:214
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:220
const char * IncorrectReason() const
Definition: blamer.cpp:60
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box)
Definition: blamer.cpp:88
const std::string & misadaption_debug() const
Definition: blamer.h:143
const char * lattice_data() const
Definition: blamer.h:163
void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug)
Definition: blamer.cpp:129
int correct_segmentation_length() const
Definition: blamer.h:151
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:260
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:151
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:399
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:131
void DeleteAllBoxes()
Definition: boxword.cpp:184
ParamsTrainingHypothesis & AddHypothesis(const ParamsTrainingHypothesis &other)
static const float kBadRating
Definition: ratngs.h:256