tesseract  5.0.0
tesseractclass.h
Go to the documentation of this file.
1 // File: tesseractclass.h
3 // Description: The Tesseract class. It holds/owns everything needed
4 // to run Tesseract on a single language, and also a set of
5 // sub-Tesseracts to run sub-languages. For thread safety, *every*
6 // global variable goes in here, directly, or indirectly.
7 // This makes it safe to run multiple Tesseracts in different
8 // threads in parallel, and keeps the different language
9 // instances separate.
10 // Author: Ray Smith
11 //
12 // (C) Copyright 2008, Google Inc.
13 // Licensed under the Apache License, Version 2.0 (the "License");
14 // you may not use this file except in compliance with the License.
15 // You may obtain a copy of the License at
16 // http://www.apache.org/licenses/LICENSE-2.0
17 // Unless required by applicable law or agreed to in writing, software
18 // distributed under the License is distributed on an "AS IS" BASIS,
19 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 // See the License for the specific language governing permissions and
21 // limitations under the License.
22 //
24 
25 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
26 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_
27 
28 #ifdef HAVE_CONFIG_H
29 # include "config_auto.h" // DISABLED_LEGACY_ENGINE
30 #endif
31 
32 #include "control.h" // for ACCEPTABLE_WERD_TYPE
33 #include "debugpixa.h" // for DebugPixa
34 #include "devanagari_processing.h" // for ShiroRekhaSplitter
35 #ifndef DISABLED_LEGACY_ENGINE
36 # include "docqual.h" // for GARBAGE_LEVEL
37 #endif
38 #include "genericvector.h" // for PointerVector
39 #include "pageres.h" // for WERD_RES (ptr only), PAGE_RES (pt...
40 #include "params.h" // for BOOL_VAR_H, BoolParam, DoubleParam
41 #include "points.h" // for FCOORD
42 #include "ratngs.h" // for ScriptPos, WERD_CHOICE (ptr only)
43 #include "tessdatamanager.h" // for TessdataManager
44 #include "textord.h" // for Textord
45 #include "wordrec.h" // for Wordrec
46 
47 #include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
48 #include <tesseract/unichar.h> // for UNICHAR_ID
49 
50 #include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...
51 
52 #include <cstdint> // for int16_t, int32_t, uint16_t
53 #include <cstdio> // for FILE
54 
55 namespace tesseract {
56 
57 class BLOCK_LIST;
58 class ETEXT_DESC;
59 struct OSResults;
60 class PAGE_RES;
61 class PAGE_RES_IT;
62 class ROW;
63 class SVMenuNode;
64 class TBOX;
65 class TO_BLOCK_LIST;
66 class WERD;
67 class WERD_CHOICE;
68 class WERD_RES;
69 
70 class ColumnFinder;
71 class DocumentData;
72 #ifndef DISABLED_LEGACY_ENGINE
73 class EquationDetect;
74 #endif // ndef DISABLED_LEGACY_ENGINE
75 class ImageData;
76 class LSTMRecognizer;
77 class Tesseract;
78 
79 // Top-level class for all tesseract global instance data.
80 // This class either holds or points to all data used by an instance
81 // of Tesseract, including the memory allocator. When this is
82 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
83 //
84 // NOTE to developers: Do not create cyclic dependencies through this class!
85 // The directory dependency tree must remain a tree! To keep this clean,
86 // lower-level code (eg in ccutil, the bottom level) must never need to
87 // know about the content of a higher-level directory.
88 // The following scheme will grant the easiest access to lower-level
89 // global members without creating a cyclic dependency:
90 //
91 // Class Hierarchy (^ = inheritance):
92 //
93 // CCUtil (ccutil/ccutil.h)
94 // ^ Members include: UNICHARSET
95 // CCStruct (ccstruct/ccstruct.h)
96 // ^ Members include: Image
97 // Classify (classify/classify.h)
98 // ^ Members include: Dict
99 // WordRec (wordrec/wordrec.h)
100 // ^ Members include: WERD*, DENORM*
101 // Tesseract (ccmain/tesseractclass.h)
102 // Members include: Pix*
103 //
104 // Other important classes:
105 //
106 // TessBaseAPI (tesseract/baseapi.h)
107 // Members include: BLOCK_LIST*, PAGE_RES*,
108 // Tesseract*, ImageThresholder*
109 // Dict (dict/dict.h)
110 // Members include: Image* (private)
111 //
112 // NOTE: each level contains members that correspond to global
113 // data that is defined (and used) at that level, not necessarily where
114 // the type is defined. So, for instance:
115 // BOOL_VAR_H(textord_show_blobs);
116 // goes inside the Textord class, not the cc_util class.
117 
118 // A collection of various variables for statistics and debugging.
122  , doc_blob_quality(0)
123  , doc_outline_errs(0)
124  , doc_char_quality(0)
125  , good_char_count(0)
127  , word_count(0)
128  , dict_words(0)
129  , tilde_crunch_written(false)
130  , last_char_was_newline(true)
131  , last_char_was_tilde(false)
132  , write_results_empty_block(true) {}
133 
140  int32_t word_count; // count of words in the document
141  int32_t dict_words; // number of dictionary words in the document
142  std::string dump_words_str; // accumulator used by dump_words()
143  // Flags used by write_results()
148 };
149 
150 // Struct to hold all the pointers to relevant data for processing a word.
151 struct WordData {
152  WordData() : word(nullptr), row(nullptr), block(nullptr), prev_word(nullptr) {}
153  explicit WordData(const PAGE_RES_IT &page_res_it)
154  : word(page_res_it.word())
155  , row(page_res_it.row()->row)
156  , block(page_res_it.block()->block)
157  , prev_word(nullptr) {}
158  WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
159  : word(word_res), row(row_in), block(block_in), prev_word(nullptr) {}
160 
166 };
167 
168 // Definition of a Tesseract WordRecognizer. The WordData provides the context
169 // of row/block, in_word holds an initialized, possibly pre-classified word,
170 // that the recognizer may or may not consume (but if so it sets
171 // *in_word=nullptr) and produces one or more output words in out_words, which
172 // may be the consumed in_word, or may be generated independently. This api
173 // allows both a conventional tesseract classifier to work, or a line-level
174 // classifier that generates multiple words from a merged input.
175 using WordRecognizer = void (Tesseract::*)(const WordData &, WERD_RES **,
177 
178 class TESS_API Tesseract : public Wordrec {
179 public:
180  Tesseract();
181  ~Tesseract() override;
182 
183  // Return appropriate dictionary
184  Dict &getDict() override;
185 
186  // Clear as much used memory as possible without resetting the adaptive
187  // classifier or losing any other classifier data.
188  void Clear();
189  // Clear all memory of adaption for this and all subclassifiers.
190  void ResetAdaptiveClassifier();
191  // Clear the document dictionary for this and all subclassifiers.
192  void ResetDocumentDictionary();
193 
194 #ifndef DISABLED_LEGACY_ENGINE
195  // Set the equation detector.
196  void SetEquationDetect(EquationDetect *detector);
197 #endif // ndef DISABLED_LEGACY_ENGINE
198 
199  // Simple accessors.
200  const FCOORD &reskew() const {
     // Read-only accessor: hands back a const reference to the reskew_ member.
201  return reskew_;
202  }
203  // Destroy any existing pix and return a pointer to the pointer.
205  pix_binary_.destroy();
206  return &pix_binary_;
207  }
208  Image pix_binary() const {
     // Returns the pix_binary_ member; the Image is returned by value.
209  return pix_binary_;
210  }
211  Image pix_grey() const {
     // Returns the pix_grey_ member; the Image is returned by value.
212  return pix_grey_;
213  }
214  void set_pix_grey(Image grey_pix) {
     // Replaces the stored grey image: destroy() is called on the current
     // pix_grey_ before grey_pix is stored in its place.
215  pix_grey_.destroy();
216  pix_grey_ = grey_pix;
217  }
218  Image pix_original() const {
     // Returns the pix_original_ member; the Image is returned by value.
219  return pix_original_;
220  }
221  // Takes ownership of the given original_pix.
     // destroy() is called on the previously stored pix_original_ first.
     // Each entry of sub_langs_ is then given the result of
     // original_pix.clone(), or nullptr when original_pix tests false.
222  void set_pix_original(Image original_pix) {
223  pix_original_.destroy();
224  pix_original_ = original_pix;
225  // Clone to sublangs as well.
226  for (auto &lang : sub_langs_) {
227  lang->set_pix_original(original_pix ? original_pix.clone() : nullptr);
228  }
229  }
230  // Returns a pointer to a Pix representing the best available resolution image
     // Selection order, as implemented below: pix_original_ when its width
     // equals ImageWidth() (the width of pix_binary_), otherwise pix_grey_
     // when it is non-null, otherwise pix_binary_.
231  // of the page, with best available bit depth as second priority. Result can
232  // be of any bit depth, but never color-mapped, as that has always been
233  // removed. Note that in grey and color, 0 is black and 255 is
234  // white. If the input was binary, then black is 1 and white is 0.
235  // To tell the difference pixGetDepth() will return 32, 8 or 1.
236  // In any case, the return value is a borrowed Pix, and should not be
237  // deleted or pixDestroyed.
238  Image BestPix() const {
     // A width match is the only test applied to pix_original_ here.
239  if (pixGetWidth(pix_original_) == ImageWidth()) {
240  return pix_original_;
241  } else if (pix_grey_ != nullptr) {
242  return pix_grey_;
243  } else {
     // Last resort: the binary image, which may itself be unset.
244  return pix_binary_;
245  }
246  }
247  void set_pix_thresholds(Image thresholds) {
     // Replaces the stored thresholds image: destroy() is called on the
     // current pix_thresholds_ before thresholds is stored in its place.
248  pix_thresholds_.destroy();
249  pix_thresholds_ = thresholds;
250  }
251  int source_resolution() const {
     // Returns the value last stored in source_resolution_.
252  return source_resolution_;
253  }
254  void set_source_resolution(int ppi) {
     // Stores ppi in source_resolution_ without any validation.
     // NOTE(review): the parameter name suggests pixels per inch - confirm units.
255  source_resolution_ = ppi;
256  }
257  int ImageWidth() const {
     // Width of the binary image, as reported by pixGetWidth(pix_binary_).
258  return pixGetWidth(pix_binary_);
259  }
260  int ImageHeight() const {
     // Height of the binary image, as reported by pixGetHeight(pix_binary_).
261  return pixGetHeight(pix_binary_);
262  }
263  Image scaled_color() const {
     // Returns the scaled_color_ member; the Image is returned by value.
264  return scaled_color_;
265  }
266  int scaled_factor() const {
     // Returns the scaled_factor_ member, as set by SetScaledColor().
267  return scaled_factor_;
268  }
269  void SetScaledColor(int factor, Image color) {
     // Stores factor in scaled_factor_ and color in scaled_color_.
     // NOTE(review): unlike set_pix_grey() and set_pix_thresholds(), the
     // previous scaled_color_ is not destroy()ed here before being
     // overwritten - confirm who releases the old image.
270  scaled_factor_ = factor;
271  scaled_color_ = color;
272  }
273  const Textord &textord() const {
     // Read-only accessor: hands back a const reference to the textord_ member.
274  return textord_;
275  }
277  return &textord_;
278  }
279 
280  bool right_to_left() const {
     // Returns the current value of the right_to_left_ flag.
281  return right_to_left_;
282  }
283  int num_sub_langs() const {
     // Number of entries in sub_langs_; the container's size() result is
     // implicitly converted to int by the return type.
284  return sub_langs_.size();
285  }
286  Tesseract *get_sub_lang(int index) const {
     // Returns the sub_langs_ entry at index. No range check is performed
     // here, so callers are expected to keep index within num_sub_langs().
287  return sub_langs_[index];
288  }
289  // Returns true if any language uses Tesseract (as opposed to LSTM).
     // This instance's tessedit_ocr_engine_mode is checked first, then that of
     // every entry in sub_langs_; any mode other than OEM_LSTM_ONLY counts.
290  bool AnyTessLang() const {
291  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
292  return true;
293  }
294  for (auto &lang : sub_langs_) {
295  if (lang->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
296  return true;
297  }
298  }
     // Neither this instance nor any sub-language left LSTM-only mode.
299  return false;
300  }
301  // Returns true if any language uses the LSTM.
     // This instance's tessedit_ocr_engine_mode is checked first, then that of
     // every entry in sub_langs_; any mode other than OEM_TESSERACT_ONLY counts.
302  bool AnyLSTMLang() const {
303  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
304  return true;
305  }
306  for (auto &lang : sub_langs_) {
307  if (lang->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) {
308  return true;
309  }
310  }
     // Every instance examined is in OEM_TESSERACT_ONLY mode.
311  return false;
312  }
313 
314  void SetBlackAndWhitelist();
315 
316  // Perform steps to prepare underlying binary image/other data structures for
317  // page segmentation. Uses the strategy specified in the global variable
318  // pageseg_devanagari_split_strategy for performing splitting while preparing for
319  // page segmentation.
320  void PrepareForPageseg();
321 
322  // Perform steps to prepare underlying binary image/other data structures for
323  // Tesseract OCR. The current segmentation is required by this method.
324  // Uses the strategy specified in the global variable
325  // ocr_devanagari_split_strategy for performing splitting while preparing for
326  // Tesseract ocr.
327  void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr);
328 
329  int SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr);
330  void SetupWordScripts(BLOCK_LIST *blocks);
331  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
332  BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr);
333  ColumnFinder *SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks,
334  Tesseract *osd_tess, OSResults *osr,
335  TO_BLOCK_LIST *to_blocks, Image *photo_mask_pix,
336  Image *music_mask_pix);
337  // par_control.cpp
338  void PrerecAllWordsPar(const std::vector<WordData> &words);
339 
341  // Generates training data for training a line recognizer, eg LSTM.
342  // Breaks the page into lines, according to the boxes, and writes them to a
343  // serialized DocumentData based on output_basename.
344  // Return true if successful, false if an error occurred.
345  bool TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
346  BLOCK_LIST *block_list);
347  // Generates training data for training a line recognizer, eg LSTM.
348  // Breaks the boxes into lines, normalizes them, converts to ImageData and
349  // appends them to the given training_data.
350  void TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
351  BLOCK_LIST *block_list, DocumentData *training_data);
352 
353  // Returns an ImageData containing the image of the given textline,
354  // and ground truth boxes/truth text if available in the input.
355  // The image is not normalized in any way.
356  ImageData *GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
357  const std::vector<std::string> &texts, int start_box, int end_box,
358  const BLOCK &block);
359  // Helper gets the image of a rectangle, using the block.re_rotation() if
360  // needed to get to the image, and rotating the result back to horizontal
361  // layout. (CJK characters will be on their left sides) The vertical text flag
362  // is set in the returned ImageData if the text was originally vertical, which
363  // can be used to invoke a different CJK recognition engine. The revised_box
364  // is also returned to enable calculation of output bounding boxes.
365  ImageData *GetRectImage(const TBOX &box, const BLOCK &block, int padding,
366  TBOX *revised_box) const;
367  // Recognizes a word or group of words, converting to WERD_RES in *words.
368  // Analogous to classify_word_pass1, but can handle a group of words as well.
369  void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
370  PointerVector<WERD_RES> *words);
371  // Apply segmentation search to the given set of words, within the constraints
372  // of the existing ratings matrix. If there is already a best_choice on a word
373  // leaves it untouched and just sets the done/accepted etc flags.
374  void SearchWords(PointerVector<WERD_RES> *words);
375 
377  bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config,
378  int pass);
379  // Sets up the words ready for whichever engine is to be run
380  void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,
381  PAGE_RES *page_res, std::vector<WordData> *words);
382  // Sets up the single word ready for whichever engine is to be run.
383  void SetupWordPassN(int pass_n, WordData *word);
384  // Runs word recognition on all the words.
385  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
386  std::vector<WordData> *words);
387  bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box,
388  const char *word_config, int dopasses);
389  void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box,
390  const char *word_config);
391  void bigram_correction_pass(PAGE_RES *page_res);
392  void blamer_pass(PAGE_RES *page_res);
393  // Sets script positions and detects smallcaps on all output words.
394  void script_pos_pass(PAGE_RES *page_res);
395  // Helper to recognize the word using the given (language-specific) tesseract.
396  // Returns positive if this recognizer found more new best words than the
397  // number kept from best_words.
398  int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,
399  WERD_RES **in_word, PointerVector<WERD_RES> *best_words);
400  // Moves good-looking "noise"/diacritics from the reject list to the main
401  // blob list on the current word. Returns true if anything was done, and
402  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
403  bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy);
404  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
405  // Input: a set of noisy outlines that probably belong to the real_word.
406  // Output: outlines that overlapped blobs are set to nullptr and put back into
407  // the word, either in the blobs or in the reject list.
408  void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
409  WERD *real_word, PAGE_RES_IT *pr_it,
410  std::vector<bool> *word_wanted,
411  std::vector<bool> *overlapped_any_blob,
412  std::vector<C_BLOB *> *target_blobs);
413  // Attempts to assign non-overlapping outlines to their nearest blobs or
414  // make new blobs out of them.
415  void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
416  WERD *real_word, PAGE_RES_IT *pr_it,
417  std::vector<bool> *word_wanted,
418  std::vector<C_BLOB *> *target_blobs);
419  // Starting with ok_outlines set to indicate which outlines overlap the blob,
420  // chooses the optimal set (approximately) and returns true if any outlines
421  // are desired, in which case ok_outlines indicates which ones.
422  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
423  C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines,
424  int num_outlines, std::vector<bool> *ok_outlines);
425  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
426  // the inclusion of the outlines, and returns the certainty of the raw choice.
427  float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
428  const std::vector<C_OUTLINE *> &outlines, int pass_n,
429  PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str);
430  // Classifies the given blob (part of word_data->word->word) as an individual
431  // word, using languages, chopper etc, returning only the certainty of the
432  // best raw choice, and undoing all the work done to fake out the word.
433  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,
434  float *c2);
435  void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data);
436  void classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
437  PointerVector<WERD_RES> *out_words);
438  void recog_pseudo_word(PAGE_RES *page_res, // blocks to check
439  TBOX &selection_box);
440 
441  void fix_rep_char(PAGE_RES_IT *page_res_it);
442 
443  ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s,
444  const char *lengths);
445  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block);
446  void classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
447  PointerVector<WERD_RES> *out_words);
448  void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word);
449  bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row);
450  bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row);
451  // Runs recognition with the test baseline shift and x-height and returns true
452  // if there was an improvement in recognition result.
453  bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,
454  WERD_RES *word, BLOCK *block, ROW *row);
455  bool recog_interactive(PAGE_RES_IT *pr_it);
456 
457  // Set fonts of this word.
458  void set_word_fonts(WERD_RES *word);
459  void font_recognition_pass(PAGE_RES *page_res);
460  void dictionary_correction_pass(PAGE_RES *page_res);
461  bool check_debug_pt(WERD_RES *word, int location);
462 
464  bool SubAndSuperscriptFix(WERD_RES *word_res);
465  void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,
466  ScriptPos *leading_pos, float *leading_certainty,
467  int *num_rebuilt_trailing, ScriptPos *trailing_pos,
468  float *trailing_certainty, float *avg_certainty,
469  float *unlikely_threshold);
470  WERD_RES *TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,
471  ScriptPos leading_pos, int num_chopped_trailing,
472  float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word,
473  bool *is_good, int *retry_leading, int *retry_trailing);
474  bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
475  int *left_ok, int *right_ok) const;
476 
478 
479  void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
480  void write_results(PAGE_RES_IT &page_res_it, // full info
481  char newline_type, // type of newline
482  bool force_eol // override tilde crunch?
483  );
484  void set_unlv_suspects(WERD_RES *word);
485  UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?
486  bool acceptable_number_string(const char *s, const char *lengths);
487  int16_t count_alphanums(const WERD_CHOICE &word);
488  int16_t count_alphas(const WERD_CHOICE &word);
489 
490  void read_config_file(const char *filename, SetParamConstraint constraint);
491  // Initialize for potentially a set of languages defined by the language
492  // string and recursively any additional languages required by any language
493  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
494  // See init_tesseract_internal for args.
495  int init_tesseract(const std::string &arg0, const std::string &textbase,
496  const std::string &language, OcrEngineMode oem, char **configs,
497  int configs_size, const std::vector<std::string> *vars_vec,
498  const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
499  TessdataManager *mgr);
500  int init_tesseract(const std::string &datapath, const std::string &language, OcrEngineMode oem) {
     // Convenience overload: forwards to the ten-argument init_tesseract with
     // datapath passed as arg0, an empty textbase, no config files
     // (nullptr, 0), no variable name/value vectors,
     // set_only_non_debug_params == false, and a TessdataManager that lives
     // only for the duration of this call. Returns that overload's result.
501  TessdataManager mgr;
502  return init_tesseract(datapath, {}, language, oem, nullptr, 0, nullptr, nullptr, false, &mgr);
503  }
504  // Common initialization for a single language.
505  // arg0 is the datapath for the tessdata directory, which could be the
506  // path of the tessdata directory with no trailing /, or (if tessdata
507  // lives in the same directory as the executable) the path of the executable,
508  // hence the name arg0.
509  // textbase is an optional output file basename (used only for training)
510  // language is the language code to load.
511  // oem controls which engine(s) will operate on the image
512  // configs (argv) is an array of config filenames to load variables from.
513  // May be nullptr.
514  // configs_size (argc) is the number of elements in configs.
515  // vars_vec is an optional vector of variables to set.
516  // vars_values is an optional corresponding vector of values for the variables
517  // in vars_vec.
518  // If set_only_non_debug_params is true, only params that do not contain
519  // "debug" in the name will be set.
520  int init_tesseract_internal(const std::string &arg0, const std::string &textbase,
521  const std::string &language, OcrEngineMode oem, char **configs,
522  int configs_size, const std::vector<std::string> *vars_vec,
523  const std::vector<std::string> *vars_values,
524  bool set_only_non_debug_params, TessdataManager *mgr);
525 
526  // Set the universal_id member of each font to be unique among all
527  // instances of the same font loaded.
528  void SetupUniversalFontIds();
529 
530  void recognize_page(std::string &image_name);
531  void end_tesseract();
532 
533  bool init_tesseract_lang_data(const std::string &arg0,
534  const std::string &language, OcrEngineMode oem, char **configs,
535  int configs_size, const std::vector<std::string> *vars_vec,
536  const std::vector<std::string> *vars_values,
537  bool set_only_non_debug_params, TessdataManager *mgr);
538 
539  void ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
540  std::vector<std::string> *not_to_load);
541 
543  SVMenuNode *build_menu_new();
544 #ifndef GRAPHICS_DISABLED
545  void pgeditor_main(int width, int height, PAGE_RES *page_res);
546 
547  void process_image_event( // action in image win
548  const SVEvent &event);
549  bool process_cmd_win_event( // UI command semantics
550  int32_t cmd_event, // which menu item?
551  char *new_value // any prompt data
552  );
553 #endif // !GRAPHICS_DISABLED
554  void debug_word(PAGE_RES *page_res, const TBOX &selection_box);
555  void do_re_display(bool (tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it));
556  bool word_display(PAGE_RES_IT *pr_it);
557  bool word_bln_display(PAGE_RES_IT *pr_it);
558  bool word_blank_and_set_display(PAGE_RES_IT *pr_its);
559  bool word_set_display(PAGE_RES_IT *pr_it);
560  // #ifndef GRAPHICS_DISABLED
561  bool word_dumper(PAGE_RES_IT *pr_it);
562  // #endif // !GRAPHICS_DISABLED
563  void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box);
565  // make rej map for word
566  void make_reject_map(WERD_RES *word, ROW *row, int16_t pass);
567  bool one_ell_conflict(WERD_RES *word_res, bool update_map);
568  int16_t first_alphanum_index(const char *word, const char *word_lengths);
569  int16_t first_alphanum_offset(const char *word, const char *word_lengths);
570  int16_t alpha_count(const char *word, const char *word_lengths);
571  bool word_contains_non_1_digit(const char *word, const char *word_lengths);
572  void dont_allow_1Il(WERD_RES *word);
573  int16_t count_alphanums( // how many alphanums
574  WERD_RES *word);
575  void flip_0O(WERD_RES *word);
576  bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id);
577  bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id);
578  bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
579  void nn_match_word( // Match a word
580  WERD_RES *word, ROW *row);
581  void nn_recover_rejects(WERD_RES *word, ROW *row);
582  void set_done( // set done flag
583  WERD_RES *word, int16_t pass);
584  int16_t safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict?
585  void flip_hyphens(WERD_RES *word);
586  void reject_I_1_L(WERD_RES *word);
587  void reject_edge_blobs(WERD_RES *word);
588  void reject_mostly_rejects(WERD_RES *word);
590  bool word_adaptable( // should we adapt?
591  WERD_RES *word, uint16_t mode);
592 
594  void recog_word_recursive(WERD_RES *word);
595  void recog_word(WERD_RES *word);
596  void split_and_recog_word(WERD_RES *word);
597  void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,
598  BlamerBundle **orig_blamer_bundle) const;
599  void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const;
601  bool digit_or_numeric_punct(WERD_RES *word, int char_position);
602  int16_t eval_word_spacing(WERD_RES_LIST &word_res_list);
603  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block);
604  int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
605  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block);
606  void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block);
607  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block);
608  void fix_fuzzy_spaces( // find fuzzy words
609  ETEXT_DESC *monitor, // progress monitor
610  int32_t word_count, // count of words in doc
611  PAGE_RES *page_res);
612  void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved);
613  bool fixspace_thinks_word_done(WERD_RES *word);
614  int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
615  float blob_noise_score(TBLOB *blob);
616  void break_noisiest_blob_word(WERD_RES_LIST &words);
618 #ifndef DISABLED_LEGACY_ENGINE
619  GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word);
620  bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word);
621 #endif
622  void tilde_crunch(PAGE_RES_IT &page_res_it);
623  void unrej_good_quality_words( // unreject potential
624  PAGE_RES_IT &page_res_it);
625  void doc_and_block_rejection( // reject big chunks
626  PAGE_RES_IT &page_res_it, bool good_quality_doc);
627  void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc);
628  void convert_bad_unlv_chs(WERD_RES *word_res);
629  void tilde_delete(PAGE_RES_IT &page_res_it);
630  int16_t word_blob_quality(WERD_RES *word);
631  void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count);
632  void unrej_good_chs(WERD_RES *word);
633  int16_t count_outline_errs(char c, int16_t outline_count);
634  int16_t word_outline_errs(WERD_RES *word);
635 #ifndef DISABLED_LEGACY_ENGINE
636  bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
637 #endif
638  CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode);
639  int16_t failure_count(WERD_RES *word);
640  bool noise_outlines(TWERD *word);
642  void process_selected_words(PAGE_RES *page_res, // blocks to check
643  // function to call
644  TBOX &selection_box,
645  bool (tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it));
647  void tess_add_doc_word( // test acceptability
648  WERD_CHOICE *word_choice // after context
649  );
650  void tess_segment_pass_n(int pass_n, WERD_RES *word);
651  bool tess_acceptable_word(WERD_RES *word);
652 
654  // Applies the box file based on the image name filename, and resegments
655  // the words in the block_list (page), with:
656  // blob-mode: one blob per line in the box file, words as input.
657  // word/line-mode: one blob per space-delimited unit after the #, and one word
658  // per line in the box file. (See comment above for box file format.)
659  // If find_segmentation is true, (word/line mode) then the classifier is used
660  // to re-segment words/lines to match the space-delimited truth string for
661  // each box. In this case, the input box may be for a word or even a whole
662  // text line, and the output words will contain multiple blobs corresponding
663  // to the space-delimited input string.
664  // With find_segmentation false, no classifier is needed, but the chopper
665  // can still be used to correctly segment touching characters with the help
666  // of the input boxes.
667  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
668  // from normal classification, ie. with a word, chopped_word, rebuild_word,
669  // seam_array, denorm, box_word, and best_state, but NO best_choice or
670  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
671  // Instead, the correct_text member of WERD_RES is set, and this may be later
672  // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
673  // is not required before calling ApplyBoxTraining.
674  PAGE_RES *ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list);
675 
676  // Any row xheight that is significantly different from the median is set
677  // to the median.
678  void PreenXHeights(BLOCK_LIST *block_list);
679 
680  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
681  // All fuzzy spaces are removed, and all the words are maximally chopped.
682  PAGE_RES *SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list);
683  // Tests the chopper by exhaustively running chop_one_blob.
684  // The word_res will contain filled chopped_word, seam_array, denorm,
685  // box_word and best_state for the maximally chopped word.
686  void MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
687  WERD_RES *word_res);
688  // Gather consecutive blobs that match the given box into the best_state
689  // and corresponding correct_text.
690  // Fights over which box owns which blobs are settled by pre-chopping and
691  // applying the blobs to box or next_box with the least non-overlap.
692  // Returns false if the box was in error, which can only be caused by
693  // failing to find an appropriate blob for a box.
694  // This means that occasionally, blobs may be incorrectly segmented if the
695  // chopper fails to find a suitable chop point.
696  bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
697  const TBOX *next_box, const char *correct_text);
698  // Consume all source blobs that strongly overlap the given box,
699  // putting them into a new word, with the correct_text label.
700  // Fights over which box owns which blobs are settled by
701  // applying the blobs to box or next_box with the least non-overlap.
702  // Returns false if the box was in error, which can only be caused by
703  // failing to find an overlapping blob for a box.
704  bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
705  const char *correct_text);
706  // Resegments the words by running the classifier in an attempt to find the
707  // correct segmentation that produces the required string.
708  void ReSegmentByClassification(PAGE_RES *page_res);
709  // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
710  // Returns false if an invalid UNICHAR_ID is encountered.
711  bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids);
712  // Resegments the word to achieve the target_text from the classifier.
713  // Returns false if the re-segmentation fails.
714  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
715  // applies a full search on the classifier results to find the best classified
716  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
717  // substitutions ARE used.
718  bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res);
719  // Recursive helper to find a match to the target_text (from text_index
720  // position) in the choices (from choices_pos position).
721  // Choices is an array of vectors of length choices_length, with each
722  // element representing a starting position in the word, and the
723  // vector holding classification results for a sequence of consecutive
724  // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
725  void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
726  unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
727  unsigned text_index, float rating, std::vector<int> *segmentation,
728  float *best_rating, std::vector<int> *best_segmentation);
729  // Counts up the labelled words and the blobs within.
730  // Deletes all unused or emptied words, counting the unused ones.
731  // Resets W_BOL and W_EOL flags correctly.
732  // Builds the rebuild_word and rebuilds the box_word.
733  void TidyUp(PAGE_RES *page_res);
734  // Logs a bad box by line in the box file and box coords.
735  void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg);
736  // Creates a fake best_choice entry in each WERD_RES with the correct text.
737  void CorrectClassifyWords(PAGE_RES *page_res);
738  // Call LearnWord to extract features for labelled blobs within each word.
739  // Features are stored in an internal buffer.
740  void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res);
741 
743  // Returns the number of misfit blob tops in this word.
744  int CountMisfitTops(WERD_RES *word_res);
745  // Returns a new x-height in pixels (original image coords) that is
746  // maximally compatible with the result in word_res.
747  // Returns 0.0f if no x-height is found that is better than the current
748  // estimate.
749  float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift);
751  // TODO(ocr-team): Find and remove obsolete parameters.
752  BOOL_VAR_H(tessedit_resegment_from_boxes);
753  BOOL_VAR_H(tessedit_resegment_from_line_boxes);
754  BOOL_VAR_H(tessedit_train_from_boxes);
755  BOOL_VAR_H(tessedit_make_boxes_from_boxes);
756  BOOL_VAR_H(tessedit_train_line_recognizer);
757  BOOL_VAR_H(tessedit_dump_pageseg_images);
758  BOOL_VAR_H(tessedit_do_invert);
759  INT_VAR_H(tessedit_pageseg_mode);
760  INT_VAR_H(thresholding_method);
761  BOOL_VAR_H(thresholding_debug);
762  double_VAR_H(thresholding_window_size);
763  double_VAR_H(thresholding_kfactor);
764  double_VAR_H(thresholding_tile_size);
765  double_VAR_H(thresholding_smooth_kernel_size);
766  double_VAR_H(thresholding_score_fraction);
767  INT_VAR_H(tessedit_ocr_engine_mode);
768  STRING_VAR_H(tessedit_char_blacklist);
769  STRING_VAR_H(tessedit_char_whitelist);
770  STRING_VAR_H(tessedit_char_unblacklist);
771  BOOL_VAR_H(tessedit_ambigs_training);
772  INT_VAR_H(pageseg_devanagari_split_strategy);
773  INT_VAR_H(ocr_devanagari_split_strategy);
774  STRING_VAR_H(tessedit_write_params_to_file);
775  BOOL_VAR_H(tessedit_adaption_debug);
776  INT_VAR_H(bidi_debug);
777  INT_VAR_H(applybox_debug);
778  INT_VAR_H(applybox_page);
779  STRING_VAR_H(applybox_exposure_pattern);
780  BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode);
781  BOOL_VAR_H(applybox_learn_ngrams_mode);
782  BOOL_VAR_H(tessedit_display_outwords);
783  BOOL_VAR_H(tessedit_dump_choices);
784  BOOL_VAR_H(tessedit_timing_debug);
785  BOOL_VAR_H(tessedit_fix_fuzzy_spaces);
786  BOOL_VAR_H(tessedit_unrej_any_wd);
787  BOOL_VAR_H(tessedit_fix_hyphens);
788  BOOL_VAR_H(tessedit_enable_doc_dict);
789  BOOL_VAR_H(tessedit_debug_fonts);
790  INT_VAR_H(tessedit_font_id);
791  BOOL_VAR_H(tessedit_debug_block_rejection);
792  BOOL_VAR_H(tessedit_enable_bigram_correction);
793  BOOL_VAR_H(tessedit_enable_dict_correction);
794  INT_VAR_H(tessedit_bigram_debug);
795  BOOL_VAR_H(enable_noise_removal);
796  INT_VAR_H(debug_noise_removal);
797  // Worst (min) certainty, for which a diacritic is allowed to make the base
798  // character worse and still be included.
799  double_VAR_H(noise_cert_basechar);
800  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
801  // make the base character worse and still be included.
802  double_VAR_H(noise_cert_disjoint);
803  // Worst (min) certainty, for which a diacritic is allowed to make a new
804  // stand-alone blob.
805  double_VAR_H(noise_cert_punc);
806  // Factor of certainty margin for adding diacritics to not count as worse.
807  double_VAR_H(noise_cert_factor);
808  INT_VAR_H(noise_maxperblob);
809  INT_VAR_H(noise_maxperword);
810  INT_VAR_H(debug_x_ht_level);
811  STRING_VAR_H(chs_leading_punct);
812  STRING_VAR_H(chs_trailing_punct1);
813  STRING_VAR_H(chs_trailing_punct2);
814  double_VAR_H(quality_rej_pc);
815  double_VAR_H(quality_blob_pc);
816  double_VAR_H(quality_outline_pc);
817  double_VAR_H(quality_char_pc);
818  INT_VAR_H(quality_min_initial_alphas_reqd);
819  INT_VAR_H(tessedit_tess_adaption_mode);
820  BOOL_VAR_H(tessedit_minimal_rej_pass1);
821  BOOL_VAR_H(tessedit_test_adaption);
822  BOOL_VAR_H(test_pt);
823  double_VAR_H(test_pt_x);
824  double_VAR_H(test_pt_y);
825  INT_VAR_H(multilang_debug_level);
826  INT_VAR_H(paragraph_debug_level);
827  BOOL_VAR_H(paragraph_text_based);
828  BOOL_VAR_H(lstm_use_matrix);
829  STRING_VAR_H(outlines_odd);
830  STRING_VAR_H(outlines_2);
831  BOOL_VAR_H(tessedit_good_quality_unrej);
832  BOOL_VAR_H(tessedit_use_reject_spaces);
833  double_VAR_H(tessedit_reject_doc_percent);
834  double_VAR_H(tessedit_reject_block_percent);
835  double_VAR_H(tessedit_reject_row_percent);
836  double_VAR_H(tessedit_whole_wd_rej_row_percent);
837  BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds);
838  BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds);
839  BOOL_VAR_H(tessedit_dont_blkrej_good_wds);
840  BOOL_VAR_H(tessedit_dont_rowrej_good_wds);
841  INT_VAR_H(tessedit_preserve_min_wd_len);
842  BOOL_VAR_H(tessedit_row_rej_good_docs);
843  double_VAR_H(tessedit_good_doc_still_rowrej_wd);
844  BOOL_VAR_H(tessedit_reject_bad_qual_wds);
845  BOOL_VAR_H(tessedit_debug_doc_rejection);
846  BOOL_VAR_H(tessedit_debug_quality_metrics);
847  BOOL_VAR_H(bland_unrej);
848  double_VAR_H(quality_rowrej_pc);
849  BOOL_VAR_H(unlv_tilde_crunching);
850  BOOL_VAR_H(hocr_font_info);
851  BOOL_VAR_H(hocr_char_boxes);
852  BOOL_VAR_H(crunch_early_merge_tess_fails);
853  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs);
854  double_VAR_H(crunch_terrible_rating);
855  BOOL_VAR_H(crunch_terrible_garbage);
856  double_VAR_H(crunch_poor_garbage_cert);
857  double_VAR_H(crunch_poor_garbage_rate);
858  double_VAR_H(crunch_pot_poor_rate);
859  double_VAR_H(crunch_pot_poor_cert);
860  double_VAR_H(crunch_del_rating);
861  double_VAR_H(crunch_del_cert);
862  double_VAR_H(crunch_del_min_ht);
863  double_VAR_H(crunch_del_max_ht);
864  double_VAR_H(crunch_del_min_width);
865  double_VAR_H(crunch_del_high_word);
866  double_VAR_H(crunch_del_low_word);
867  double_VAR_H(crunch_small_outlines_size);
868  INT_VAR_H(crunch_rating_max);
869  INT_VAR_H(crunch_pot_indicators);
870  BOOL_VAR_H(crunch_leave_ok_strings);
871  BOOL_VAR_H(crunch_accept_ok);
872  BOOL_VAR_H(crunch_leave_accept_strings);
873  BOOL_VAR_H(crunch_include_numerals);
874  INT_VAR_H(crunch_leave_lc_strings);
875  INT_VAR_H(crunch_leave_uc_strings);
876  INT_VAR_H(crunch_long_repetitions);
877  INT_VAR_H(crunch_debug);
878  INT_VAR_H(fixsp_non_noise_limit);
879  double_VAR_H(fixsp_small_outlines_size);
880  BOOL_VAR_H(tessedit_prefer_joined_punct);
881  INT_VAR_H(fixsp_done_mode);
882  INT_VAR_H(debug_fix_space_level);
883  STRING_VAR_H(numeric_punctuation);
884  INT_VAR_H(x_ht_acceptance_tolerance);
885  INT_VAR_H(x_ht_min_change);
886  INT_VAR_H(superscript_debug);
887  double_VAR_H(superscript_worse_certainty);
888  double_VAR_H(superscript_bettered_certainty);
889  double_VAR_H(superscript_scaledown_ratio);
890  double_VAR_H(subscript_max_y_top);
891  double_VAR_H(superscript_min_y_bottom);
892  BOOL_VAR_H(tessedit_write_block_separators);
893  BOOL_VAR_H(tessedit_write_rep_codes);
894  BOOL_VAR_H(tessedit_write_unlv);
895  BOOL_VAR_H(tessedit_create_txt);
896  BOOL_VAR_H(tessedit_create_hocr);
897  BOOL_VAR_H(tessedit_create_alto);
898  BOOL_VAR_H(tessedit_create_lstmbox);
899  BOOL_VAR_H(tessedit_create_tsv);
900  BOOL_VAR_H(tessedit_create_wordstrbox);
901  BOOL_VAR_H(tessedit_create_pdf);
902  BOOL_VAR_H(textonly_pdf);
903  INT_VAR_H(jpg_quality);
904  INT_VAR_H(user_defined_dpi);
905  INT_VAR_H(min_characters_to_try);
906  STRING_VAR_H(unrecognised_char);
907  INT_VAR_H(suspect_level);
908  INT_VAR_H(suspect_short_words);
909  BOOL_VAR_H(suspect_constrain_1Il);
910  double_VAR_H(suspect_rating_per_ch);
911  double_VAR_H(suspect_accept_rating);
912  BOOL_VAR_H(tessedit_minimal_rejection);
913  BOOL_VAR_H(tessedit_zero_rejection);
914  BOOL_VAR_H(tessedit_word_for_word);
915  BOOL_VAR_H(tessedit_zero_kelvin_rejection);
916  INT_VAR_H(tessedit_reject_mode);
917  BOOL_VAR_H(tessedit_rejection_debug);
918  BOOL_VAR_H(tessedit_flip_0O);
919  double_VAR_H(tessedit_lower_flip_hyphen);
920  double_VAR_H(tessedit_upper_flip_hyphen);
921  BOOL_VAR_H(rej_trust_doc_dawg);
922  BOOL_VAR_H(rej_1Il_use_dict_word);
923  BOOL_VAR_H(rej_1Il_trust_permuter_type);
924  BOOL_VAR_H(rej_use_tess_accepted);
925  BOOL_VAR_H(rej_use_tess_blanks);
926  BOOL_VAR_H(rej_use_good_perm);
927  BOOL_VAR_H(rej_use_sensible_wd);
928  BOOL_VAR_H(rej_alphas_in_number_perm);
929  double_VAR_H(rej_whole_of_mostly_reject_word_fract);
930  INT_VAR_H(tessedit_image_border);
931  STRING_VAR_H(ok_repeated_ch_non_alphanum_wds);
932  STRING_VAR_H(conflict_set_I_l_1);
933  INT_VAR_H(min_sane_x_ht_pixels);
934  BOOL_VAR_H(tessedit_create_boxfile);
935  INT_VAR_H(tessedit_page_number);
936  BOOL_VAR_H(tessedit_write_images);
937  BOOL_VAR_H(interactive_display_mode);
938  STRING_VAR_H(file_type);
939  BOOL_VAR_H(tessedit_override_permuter);
940  STRING_VAR_H(tessedit_load_sublangs);
941  BOOL_VAR_H(tessedit_use_primary_params_model);
942  // Min acceptable orientation margin (difference in scores between top and 2nd
943  // choice in OSResults::orientations) to believe the page orientation.
944  double_VAR_H(min_orientation_margin);
945  BOOL_VAR_H(textord_tabfind_show_vlines);
946  BOOL_VAR_H(textord_use_cjk_fp_model);
947  BOOL_VAR_H(poly_allow_detailed_fx);
948  BOOL_VAR_H(tessedit_init_config_only);
949 #ifndef DISABLED_LEGACY_ENGINE
950  BOOL_VAR_H(textord_equation_detect);
951 #endif // ndef DISABLED_LEGACY_ENGINE
952  BOOL_VAR_H(textord_tabfind_vertical_text);
953  BOOL_VAR_H(textord_tabfind_force_vertical_text);
954  double_VAR_H(textord_tabfind_vertical_text_ratio);
955  double_VAR_H(textord_tabfind_aligned_gap_fraction);
956  INT_VAR_H(tessedit_parallelize);
957  BOOL_VAR_H(preserve_interword_spaces);
958  STRING_VAR_H(page_separator);
959  INT_VAR_H(lstm_choice_mode);
960  INT_VAR_H(lstm_choice_iterations);
961  double_VAR_H(lstm_rating_coefficient);
962  BOOL_VAR_H(pageseg_apply_music_mask);
963 
965  FILE *init_recog_training(const char *filename);
966  void recog_training_segmented(const char *filename, PAGE_RES *page_res,
967  volatile ETEXT_DESC *monitor, FILE *output_file);
968  void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file);
969 
970 private:
971  // The filename of a backup config file. If not null, then we currently
972  // have a temporary debug config file loaded, and backup_config_file_
973  // will be loaded, and set to null when debug is complete.
974  const char *backup_config_file_;
975  // The filename of a config file to read when processing a debug word.
976  std::string word_config_;
977  // Image used for input to layout analysis and tesseract recognition.
978  // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
979  Image pix_binary_;
980  // Grey-level input image if the input was not binary, otherwise nullptr.
981  Image pix_grey_;
982  // Original input image. Color if the input was color.
983  Image pix_original_;
984  // Thresholds that were used to generate the thresholded image from grey.
985  Image pix_thresholds_;
986  // Debug images. If non-empty, will be written on destruction.
987  DebugPixa pixa_debug_;
988  // Input image resolution after any scaling. The resolution is not well
989  // transmitted by operations on Pix, so we keep an independent record here.
990  int source_resolution_;
991  // The shiro-rekha splitter object which is used to split top-lines in
992  // Devanagari words to provide a better word and grapheme segmentation.
993  ShiroRekhaSplitter splitter_;
994  // Page segmentation/layout
995  Textord textord_;
996  // True if the primary language uses right_to_left reading order.
997  bool right_to_left_;
998  Image scaled_color_;
999  int scaled_factor_;
1000  FCOORD deskew_;
1001  FCOORD reskew_;
1002  TesseractStats stats_;
1003  // Sub-languages to be tried in addition to this.
1004  std::vector<Tesseract *> sub_langs_;
1005  // Most recently used Tesseract out of this and sub_langs_. The default
1006  // language for the next word.
1007  Tesseract *most_recently_used_;
1008  // The size of the font table, ie max possible font id + 1.
1009  int font_table_size_;
1010 #ifndef DISABLED_LEGACY_ENGINE
1011  // Equation detector. Note: this pointer is NOT owned by the class.
1012  EquationDetect *equ_detect_;
1013 #endif // ndef DISABLED_LEGACY_ENGINE
1014  // LSTM recognizer, if available.
1015  LSTMRecognizer *lstm_recognizer_;
1016  // Output "page" number (actually line number) using TrainLineRecognizer.
1017  int train_line_page_num_;
1018 };
1019 
1020 } // namespace tesseract
1021 
1022 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
struct ETEXT_DESC ETEXT_DESC
Definition: capi.h:137
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
@ TBOX
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266
SetParamConstraint
Definition: params.h:38
int16_t word_blob_quality(WERD_RES *word)
void dont_allow_1Il(WERD_RES *word)
bool non_0_digit(const char *str, int length)
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
int UNICHAR_ID
Definition: unichar.h:36
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
GARBAGE_LEVEL
Definition: docqual.h:30
void flip_0O(WERD_RES *word)
void flip_hyphens(WERD_RES *word)
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
WordData(const PAGE_RES_IT &page_res_it)
PointerVector< WERD_RES > lang_words
BOOL_VAR_H(tessedit_unrej_any_wd)
Image * mutable_pix_binary()
BOOL_VAR_H(tessedit_minimal_rej_pass1)
INT_VAR_H(suspect_level)
double_VAR_H(thresholding_score_fraction)
BOOL_VAR_H(tessedit_debug_quality_metrics)
BOOL_VAR_H(rej_use_tess_accepted)
INT_VAR_H(thresholding_method)
int init_tesseract(const std::string &datapath, const std::string &language, OcrEngineMode oem)
BOOL_VAR_H(textord_tabfind_force_vertical_text)
BOOL_VAR_H(tessedit_display_outwords)
BOOL_VAR_H(textord_tabfind_vertical_text)
INT_VAR_H(crunch_debug)
INT_VAR_H(tessedit_font_id)
double_VAR_H(noise_cert_punc)
BOOL_VAR_H(suspect_constrain_1Il)
double_VAR_H(superscript_worse_certainty)
BOOL_VAR_H(unlv_tilde_crunching)
INT_VAR_H(tessedit_image_border)
double_VAR_H(noise_cert_factor)
double_VAR_H(test_pt_x)
BOOL_VAR_H(tessedit_train_line_recognizer)
BOOL_VAR_H(paragraph_text_based)
BOOL_VAR_H(rej_use_tess_blanks)
BOOL_VAR_H(tessedit_zero_rejection)
double_VAR_H(tessedit_lower_flip_hyphen)
double_VAR_H(crunch_pot_poor_cert)
STRING_VAR_H(outlines_odd)
BOOL_VAR_H(tessedit_init_config_only)
double_VAR_H(crunch_terrible_rating)
BOOL_VAR_H(tessedit_fix_fuzzy_spaces)
BOOL_VAR_H(tessedit_fix_hyphens)
INT_VAR_H(crunch_long_repetitions)
INT_VAR_H(tessedit_page_number)
const FCOORD & reskew() const
INT_VAR_H(suspect_short_words)
double_VAR_H(suspect_rating_per_ch)
INT_VAR_H(min_sane_x_ht_pixels)
BOOL_VAR_H(lstm_use_matrix)
void set_pix_grey(Image grey_pix)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
BOOL_VAR_H(applybox_learn_ngrams_mode)
double_VAR_H(tessedit_whole_wd_rej_row_percent)
INT_VAR_H(tessedit_parallelize)
BOOL_VAR_H(tessedit_debug_block_rejection)
BOOL_VAR_H(rej_1Il_trust_permuter_type)
int num_sub_langs() const
STRING_VAR_H(tessedit_char_whitelist)
BOOL_VAR_H(tessedit_word_for_word)
Image scaled_color() const
BOOL_VAR_H(tessedit_create_txt)
void TidyUp(PAGE_RES *page_res)
INT_VAR_H(fixsp_done_mode)
double_VAR_H(quality_blob_pc)
double_VAR_H(tessedit_reject_row_percent)
BOOL_VAR_H(tessedit_rejection_debug)
STRING_VAR_H(chs_leading_punct)
STRING_VAR_H(chs_trailing_punct2)
BOOL_VAR_H(crunch_leave_ok_strings)
void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res)
BOOL_VAR_H(tessedit_do_invert)
BOOL_VAR_H(rej_1Il_use_dict_word)
BOOL_VAR_H(tessedit_zero_kelvin_rejection)
BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds)
BOOL_VAR_H(tessedit_override_permuter)
BOOL_VAR_H(crunch_accept_ok)
BOOL_VAR_H(tessedit_use_primary_params_model)
double_VAR_H(subscript_max_y_top)
BOOL_VAR_H(crunch_include_numerals)
double_VAR_H(noise_cert_disjoint)
double_VAR_H(thresholding_window_size)
INT_VAR_H(x_ht_min_change)
BOOL_VAR_H(tessedit_create_wordstrbox)
double_VAR_H(tessedit_reject_doc_percent)
INT_VAR_H(crunch_leave_uc_strings)
const Textord & textord() const
double_VAR_H(quality_char_pc)
double_VAR_H(fixsp_small_outlines_size)
INT_VAR_H(pageseg_devanagari_split_strategy)
STRING_VAR_H(conflict_set_I_l_1)
INT_VAR_H(quality_min_initial_alphas_reqd)
INT_VAR_H(fixsp_non_noise_limit)
void ReSegmentByClassification(PAGE_RES *page_res)
double_VAR_H(crunch_del_min_ht)
INT_VAR_H(applybox_page)
BOOL_VAR_H(textord_equation_detect)
BOOL_VAR_H(tessedit_enable_bigram_correction)
void set_pix_thresholds(Image thresholds)
BOOL_VAR_H(tessedit_create_alto)
bool ConvertStringToUnichars(const char *utf8, std::vector< UNICHAR_ID > *class_ids)
STRING_VAR_H(chs_trailing_punct1)
INT_VAR_H(multilang_debug_level)
BOOL_VAR_H(tessedit_dont_rowrej_good_wds)
double_VAR_H(superscript_min_y_bottom)
Image pix_original() const
double_VAR_H(superscript_scaledown_ratio)
INT_VAR_H(noise_maxperblob)
double_VAR_H(min_orientation_margin)
double_VAR_H(crunch_poor_garbage_rate)
double_VAR_H(quality_rej_pc)
double_VAR_H(tessedit_reject_block_percent)
Textord * mutable_textord()
BOOL_VAR_H(tessedit_create_tsv)
BOOL_VAR_H(tessedit_resegment_from_boxes)
INT_VAR_H(tessedit_pageseg_mode)
INT_VAR_H(crunch_pot_indicators)
BOOL_VAR_H(tessedit_prefer_joined_punct)
BOOL_VAR_H(tessedit_test_adaption)
BOOL_VAR_H(crunch_terrible_garbage)
INT_VAR_H(min_characters_to_try)
INT_VAR_H(tessedit_ocr_engine_mode)
double_VAR_H(quality_outline_pc)
Image BestPix() const
INT_VAR_H(jpg_quality)
INT_VAR_H(x_ht_acceptance_tolerance)
void set_pix_original(Image original_pix)
double_VAR_H(textord_tabfind_aligned_gap_fraction)
int scaled_factor() const
double_VAR_H(quality_rowrej_pc)
INT_VAR_H(superscript_debug)
void SearchForText(const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
INT_VAR_H(paragraph_debug_level)
INT_VAR_H(tessedit_preserve_min_wd_len)
BOOL_VAR_H(tessedit_ambigs_training)
double_VAR_H(thresholding_tile_size)
BOOL_VAR_H(tessedit_enable_dict_correction)
BOOL_VAR_H(tessedit_dump_pageseg_images)
BOOL_VAR_H(thresholding_debug)
BOOL_VAR_H(tessedit_create_pdf)
BOOL_VAR_H(tessedit_enable_doc_dict)
BOOL_VAR_H(tessedit_create_boxfile)
STRING_VAR_H(tessedit_char_unblacklist)
BOOL_VAR_H(tessedit_row_rej_good_docs)
INT_VAR_H(tessedit_tess_adaption_mode)
INT_VAR_H(lstm_choice_mode)
BOOL_VAR_H(poly_allow_detailed_fx)
double_VAR_H(crunch_pot_poor_rate)
BOOL_VAR_H(tessedit_reject_bad_qual_wds)
BOOL_VAR_H(tessedit_timing_debug)
BOOL_VAR_H(tessedit_good_quality_unrej)
BOOL_VAR_H(tessedit_write_block_separators)
STRING_VAR_H(tessedit_load_sublangs)
void SetupWordScripts(BLOCK_LIST *blocks)
double_VAR_H(crunch_del_low_word)
BOOL_VAR_H(tessedit_write_unlv)
STRING_VAR_H(applybox_exposure_pattern)
Image pix_grey() const
BOOL_VAR_H(textord_tabfind_show_vlines)
INT_VAR_H(tessedit_reject_mode)
double_VAR_H(crunch_del_rating)
void set_source_resolution(int ppi)
void CorrectClassifyWords(PAGE_RES *page_res)
INT_VAR_H(applybox_debug)
double_VAR_H(superscript_bettered_certainty)
double_VAR_H(crunch_del_cert)
Image pix_binary() const
BOOL_VAR_H(bland_unrej)
BOOL_VAR_H(tessedit_debug_doc_rejection)
BOOL_VAR_H(pageseg_apply_music_mask)
void recognize_page(std::string &image_name)
BOOL_VAR_H(hocr_char_boxes)
double_VAR_H(thresholding_smooth_kernel_size)
STRING_VAR_H(tessedit_char_blacklist)
INT_VAR_H(crunch_rating_max)
INT_VAR_H(debug_noise_removal)
BOOL_VAR_H(tessedit_create_hocr)
BOOL_VAR_H(crunch_leave_accept_strings)
double_VAR_H(rej_whole_of_mostly_reject_word_fract)
bool AnyTessLang() const
BOOL_VAR_H(tessedit_debug_fonts)
BOOL_VAR_H(tessedit_create_lstmbox)
BOOL_VAR_H(tessedit_minimal_rejection)
BOOL_VAR_H(enable_noise_removal)
BOOL_VAR_H(tessedit_train_from_boxes)
double_VAR_H(tessedit_good_doc_still_rowrej_wd)
BOOL_VAR_H(rej_use_good_perm)
BOOL_VAR_H(textonly_pdf)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
BOOL_VAR_H(hocr_font_info)
double_VAR_H(noise_cert_basechar)
bool RunOldFixXht(WERD_RES *word, BLOCK *block, ROW *row)
BOOL_VAR_H(preserve_interword_spaces)
BOOL_VAR_H(tessedit_write_rep_codes)
BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode)
void SetScaledColor(int factor, Image color)
STRING_VAR_H(tessedit_write_params_to_file)
double_VAR_H(tessedit_upper_flip_hyphen)
double_VAR_H(crunch_poor_garbage_cert)
STRING_VAR_H(ok_repeated_ch_non_alphanum_wds)
BOOL_VAR_H(tessedit_resegment_from_line_boxes)
INT_VAR_H(bidi_debug)
bool FindSegmentation(const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
STRING_VAR_H(numeric_punctuation)
double_VAR_H(textord_tabfind_vertical_text_ratio)
double_VAR_H(crunch_small_outlines_size)
INT_VAR_H(tessedit_bigram_debug)
INT_VAR_H(user_defined_dpi)
BOOL_VAR_H(tessedit_make_boxes_from_boxes)
void nn_match_word(WERD_RES *word, ROW *row)
STRING_VAR_H(page_separator)
bool AnyLSTMLang() const
double_VAR_H(lstm_rating_coefficient)
BOOL_VAR_H(interactive_display_mode)
double_VAR_H(crunch_del_max_ht)
bool right_to_left() const
INT_VAR_H(ocr_devanagari_split_strategy)
double_VAR_H(crunch_del_min_width)
INT_VAR_H(debug_fix_space_level)
double_VAR_H(test_pt_y)
STRING_VAR_H(unrecognised_char)
void nn_recover_rejects(WERD_RES *word, ROW *row)
INT_VAR_H(debug_x_ht_level)
STRING_VAR_H(file_type)
INT_VAR_H(lstm_choice_iterations)
BOOL_VAR_H(rej_trust_doc_dawg)
BOOL_VAR_H(crunch_early_merge_tess_fails)
int source_resolution() const
Tesseract * get_sub_lang(int index) const
BOOL_VAR_H(tessedit_adaption_debug)
INT_VAR_H(crunch_leave_lc_strings)
BOOL_VAR_H(tessedit_write_images)
BOOL_VAR_H(tessedit_use_reject_spaces)
BOOL_VAR_H(tessedit_dont_blkrej_good_wds)
INT_VAR_H(noise_maxperword)
double_VAR_H(thresholding_kfactor)
BOOL_VAR_H(tessedit_dump_choices)
BOOL_VAR_H(rej_alphas_in_number_perm)
BOOL_VAR_H(tessedit_flip_0O)
BOOL_VAR_H(rej_use_sensible_wd)
double_VAR_H(suspect_accept_rating)
BOOL_VAR_H(textord_use_cjk_fp_model)
STRING_VAR_H(outlines_2)
BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds)
double_VAR_H(crunch_del_high_word)
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs)
Image clone() const
Definition: image.cpp:24
void destroy()
Definition: image.cpp:32
#define TESS_API
Definition: export.h:34