tesseract  5.0.0
dict.h
Go to the documentation of this file.
1 // File: dict.h
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_DICT_DICT_H_
20 #define TESSERACT_DICT_DICT_H_
21 
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h" // DISABLED_LEGACY_ENGINE
24 #endif
25 
26 #ifndef DISABLED_LEGACY_ENGINE
27 # include "ambigs.h"
28 #endif
29 #include "dawg.h"
30 #include "dawg_cache.h"
31 #include "ratngs.h"
32 #include "stopper.h"
33 #include "trie.h"
34 #include "unicharset.h"
35 #ifndef DISABLED_LEGACY_ENGINE
36 # include "params_training_featdef.h"
37 #endif // ndef DISABLED_LEGACY_ENGINE
38 
39 namespace tesseract {
40 
41 class MATRIX;
42 class WERD_RES;
43 
44 #define CHARS_PER_LINE 500
45 #define MAX_WERD_LENGTH (int64_t)128
46 #define NO_RATING -1
47 
53  float rating;
54  float certainty;
55 };
56 
57 using DawgVector = std::vector<Dawg *>;
58 
59 //
60 // Constants
61 //
62 static const int kRatingPad = 4;
63 static const int kDictMaxWildcards = 2; // max wildcards for a word
64 // TODO(daria): If hyphens are different in different languages and can be
65 // inferred from training data we should load their values dynamically.
66 static const char kHyphenSymbol[] = "-";
67 static const char kSlashSymbol[] = "/";
68 static const char kQuestionSymbol[] = "?";
69 static const char kApostropheSymbol[] = "'";
70 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
71 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
72 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
73 static const int kDocDictMaxRepChars = 4;
74 
75 // Enum for describing whether the x-height for the word is consistent:
76 // 0 - everything is good.
77 // 1 - there are one or two secondary (but consistent) baselines
78 // [think subscript and superscript], or there is an oversized
79 // first character.
80 // 2 - the word is inconsistent.
82 
83 struct DawgArgs {
85  : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
86 
90  // True if the current position is a valid word end.
91  bool valid_end;
92 };
93 
94 class TESS_API Dict {
95 public:
96  Dict(CCUtil *image_ptr);
97  ~Dict();
// Const accessor: returns the ccutil_ member as a pointer-to-const.
98  const CCUtil *getCCUtil() const {
99  return ccutil_;
100  }
102  return ccutil_;
103  }
// Const accessor: returns a const reference to the unicharset member of the
// object returned by getCCUtil(). The pointer is dereferenced without a null
// check.
104  const UNICHARSET &getUnicharset() const {
105  return getCCUtil()->unicharset;
106  }
108  return getCCUtil()->unicharset;
109  }
110 #ifndef DISABLED_LEGACY_ENGINE
112  return getCCUtil()->unichar_ambigs;
113  }
114 #endif
115  // Returns true if unichar_id is a word compounding character like - or /.
// unichar_id must be contained in the unicharset (checked with ASSERT_HOST).
// The result is true only when unichar_id normalizes to exactly one id and
// that id equals hyphen_unichar_id_ or slash_unichar_id_.
116  inline bool compound_marker(UNICHAR_ID unichar_id) {
117  const UNICHARSET &unicharset = getUnicharset();
118  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
119  const auto &normed_ids = unicharset.normed_ids(unichar_id);
// Multi-id normalizations never count as a compound marker.
120  return normed_ids.size() == 1 &&
121  (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
122  }
123  // Returns true if unichar_id is an apostrophe-like character that may
124  // separate prefix/suffix words from a main body word.
// unichar_id must be contained in the unicharset (checked with ASSERT_HOST).
// The result is true only when unichar_id normalizes to exactly one id and
// that id equals apostrophe_unichar_id_.
125  inline bool is_apostrophe(UNICHAR_ID unichar_id) {
126  const UNICHARSET &unicharset = getUnicharset();
127  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
128  const auto &normed_ids = unicharset.normed_ids(unichar_id);
129  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130  }
131 
132  /* hyphen.cpp ************************************************************/
133 
// Returns true if the beginning of a hyphenated word has been recorded, i.e.
// last_word_on_line_ is false and hyphen_word_ is non-null.
135  inline bool hyphenated() const {
136  return !last_word_on_line_ && hyphen_word_;
137  }
// Size of the base word (the part on the line before) of a hyphenated word.
// Returns hyphen_word_->length() when hyphenated() is true, otherwise 0.
139  inline int hyphen_base_size() const {
140  return this->hyphenated() ? hyphen_word_->length() : 0;
141  }
// When hyphenated() is true, overwrites *word with a copy of *hyphen_word_;
// otherwise *word is left untouched. word is dereferenced without a null
// check.
145  inline void copy_hyphen_info(WERD_CHOICE *word) const {
146  if (this->hyphenated()) {
147  *word = *hyphen_word_;
// Print the copied word only when hyphen debugging is switched on.
148  if (hyphen_debug_level) {
149  word->print("copy_hyphen_info: ");
150  }
151  }
152  }
// Check whether the word has a hyphen at the end.
// unichar_id is the candidate final unichar and first_pos tells whether it
// sits at the first position of the word. Returns false immediately unless
// last_word_on_line_ is set and first_pos is false. Otherwise unichar_id must
// be contained in *unicharset (checked with ASSERT_HOST), and the result is
// true only when it normalizes to exactly one id equal to hyphen_unichar_id_.
154  inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id,
155  bool first_pos) const {
156  if (!last_word_on_line_ || first_pos) {
157  return false;
158  }
159  ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
160  const auto &normed_ids = unicharset->normed_ids(unichar_id);
161  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
162  }
// Overload of has_hyphen_end() that checks the unichar at the end of word.
// Returns false for an empty word: there is no last unichar to inspect.
// word.length() is unsigned, so it is converted to int before subtracting;
// otherwise length() - 1 would wrap for an empty word and be passed to
// unichar_id() as an out-of-range index.
164  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
165  const int word_index = static_cast<int>(word.length()) - 1;
166  return word_index >= 0 && has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
167  }
171  void reset_hyphen_vars(bool last_word_on_line);
174  void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs);
175 
176  /* permdawg.cpp ************************************************************/
177  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
178  // When this function is refactored, permdawg.cpp can be removed.
179 
// Copies word into *best_choice when word has a strictly smaller rating()
// than the current *best_choice; otherwise *best_choice is left unchanged.
// best_choice is dereferenced without a null check.
182  inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
183  if (word.rating() < best_choice->rating()) {
184  *best_choice = word;
185  }
186  }
190  void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const;
191  // Fill the given vector with the default collection of any-length dawgs
192  void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const;
193 
199  WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
200  float rating_limit);
204  void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
205  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
206  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
207  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args);
208 
210  void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
211  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
212  bool word_ending, WERD_CHOICE *word, float certainties[],
213  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
214  void *void_more_args);
215  //
216  // Helper functions for dawg_permute_and_select().
217  //
218  void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
219  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
220  WERD_CHOICE *word, float certainties[], float *limit,
221  WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
222 
223  void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
224  const BLOB_CHOICE &blob_choice, int char_choice_index,
225  const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
226  float certainties[], float *limit, WERD_CHOICE *best_choice,
227  int *attempts_left, void *more_args);
228 
229  bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
230  const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
231  int word_ending, CHAR_FRAGMENT_INFO *char_frag_info);
232 
233  /* stopper.cpp *************************************************************/
234 #if !defined(DISABLED_LEGACY_ENGINE)
235  bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable,
236  MATRIX *ratings);
237 #endif // !defined(DISABLED_LEGACY_ENGINE)
238  // Replaces the corresponding wrong ngram in werd_choice with the correct
239  // one. The whole correct n-gram is inserted into the ratings matrix and
240  // the werd_choice, leaving no fragments. The rating and certainty of the new
241  // entries in the matrix and werd_choice are, respectively, the sum and the
242  // mean of those of the wrong ngram.
243  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
244  // mystring", with a new entry in the ratings matrix for ".
245  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id,
246  WERD_CHOICE *werd_choice, MATRIX *ratings);
247 
249  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
257  int UniformCertainties(const WERD_CHOICE &word);
259  bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency);
263  bool AcceptableResult(WERD_RES *word) const;
264 #if !defined(DISABLED_LEGACY_ENGINE)
265  void EndDangerousAmbigs();
266 #endif // !defined(DISABLED_LEGACY_ENGINE)
268  void DebugWordChoices();
270  void SettupStopperPass1();
272  void SettupStopperPass2();
273  /* context.cpp *************************************************************/
275  int case_ok(const WERD_CHOICE &word) const;
278  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
279 
280  /* dict.cpp ****************************************************************/
281 
284  static DawgCache *GlobalDawgCache();
285  // Sets up ready for a Load or LoadLSTM.
286  void SetupForLoad(DawgCache *dawg_cache);
287  // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
288  void Load(const std::string &lang, TessdataManager *data_file);
289  // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
290  void LoadLSTM(const std::string &lang, TessdataManager *data_file);
291  // Completes the loading process after Load() and/or LoadLSTM().
292  // Returns false if no dictionaries were loaded.
293  bool FinishLoad();
294  void End();
295 
296  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
298  if (pending_words_ != nullptr) {
299  pending_words_->clear();
300  }
301  if (document_words_ != nullptr) {
302  document_words_->clear();
303  }
304  }
305 
341  //
342  int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
343  bool word_end) const;
344 
345  int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset,
346  UNICHAR_ID unichar_id, bool word_end) const;
// Calls the letter_is_okay_ member function pointer on this object,
// forwarding all four arguments unchanged, and returns its result.
// The pointer is invoked without a null check.
348  int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
349  bool word_end) const {
350  return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
351  }
352 
354  double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes,
355  const char *character, int character_bytes);
// Calls the probability_in_context_ member function pointer on this object.
// The lang argument is taken from getCCUtil()->lang; the remaining arguments
// are forwarded unchanged. The pointer is invoked without a null check.
357  double ProbabilityInContext(const char *context, int context_bytes, const char *character,
358  int character_bytes) {
359  return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,
360  character, character_bytes);
361  }
362 
// Default (no-op) implementation of the probability in context function.
// Its signature matches the probability_in_context_ member function pointer.
// Every argument is ignored and the result is always 0.0.
364  double def_probability_in_context(const char *lang, const char *context, int context_bytes,
365  const char *character, int character_bytes) {
// The void casts only mark the parameters as intentionally unused.
366  (void)lang;
367  (void)context;
368  (void)context_bytes;
369  (void)character;
370  (void)character_bytes;
371  return 0.0;
372  }
373 
// Stores id in wildcard_unichar_id_.
374  inline void SetWildcardID(UNICHAR_ID id) {
375  wildcard_unichar_id_ = id;
376  }
// Returns the current value of wildcard_unichar_id_.
377  inline UNICHAR_ID WildcardID() const {
378  return wildcard_unichar_id_;
379  }
// Return the number of dawgs in the dawgs_ vector.
// NOTE(review): the size() result is implicitly converted to int here.
381  inline int NumDawgs() const {
382  return dawgs_.size();
383  }
// Return the i-th dawg pointer recorded in the dawgs_ vector.
// index is used with operator[] and is not range-checked here.
385  inline const Dawg *GetDawg(int index) const {
386  return dawgs_[index];
387  }
// Return the pointer to the punctuation dawg (the punc_dawg_ member).
389  inline const Dawg *GetPuncDawg() const {
390  return punc_dawg_;
391  }
// Return the pointer to the unambiguous words dawg (the unambig_dawg_ member).
393  inline const Dawg *GetUnambigDawg() const {
394  return unambig_dawg_;
395  }
// Returns the appropriate next node given the EDGE_REF.
// - edge_ref == NO_EDGE: returns 0 without touching dawg.
// - otherwise: returns dawg->next_node(edge_ref), except that a result of 0
//   is reported as NO_EDGE.
// dawg is dereferenced without a null check whenever edge_ref != NO_EDGE.
397  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
398  if (edge_ref == NO_EDGE) {
399  return 0; // beginning to explore the dawg
400  }
401  NODE_REF node = dawg->next_node(edge_ref);
402  if (node == 0) {
403  node = NO_EDGE; // end of word
404  }
405  return node;
406  }
407 
408  // Given a unichar from a string and a given dawg, returns the unichar id
409  // to use when matching in that dawg type. For a DAWG_TYPE_NUMBER dawg, every
410  // digit (per unicharset.get_isdigit) is mapped to Dawg::kPatternUnicharID.
// For a null dawg or any other dawg type, ch is returned unchanged.
411  UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const {
412  if (!dawg) {
413  return ch;
414  }
415  switch (dawg->type()) {
416  case DAWG_TYPE_NUMBER:
417  return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
418  default:
419  return ch;
420  }
421  }
422 
428  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id,
429  bool word_end, DawgArgs *dawg_args,
430  PermuterType *current_permuter) const;
431 
435 
// Returns true if perm is one of the permuter codes accepted as a valid word:
// SYSTEM_DAWG_PERM, FREQ_DAWG_PERM, DOC_DAWG_PERM, USER_DAWG_PERM,
// USER_PATTERN_PERM or COMPOUND_PERM. NUMBER_PERM is accepted as well, but
// only when numbers_ok is true.
437  inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
438  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||
439  perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
440  (numbers_ok && perm == NUMBER_PERM));
441  }
442  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
// Convenience overload: forwards to valid_word(word, numbers_ok) with
// numbers_ok set to false.
443  int valid_word(const WERD_CHOICE &word) const {
444  return valid_word(word, false); // return NO_PERM for words with digits
445  }
// Convenience wrapper: forwards to valid_word(word, numbers_ok) with
// numbers_ok set to true.
446  int valid_word_or_number(const WERD_CHOICE &word) const {
447  return valid_word(word, true); // return NUMBER_PERM for valid numbers
448  }
// Convenience overload for C strings: builds a temporary WERD_CHOICE from
// string using getUnicharset() and returns the result of the single-argument
// valid_word(word) overload for it.
450  int valid_word(const char *string) const {
451  WERD_CHOICE word(string, getUnicharset());
452  return valid_word(word);
453  }
454  // Do the two WERD_CHOICEs form a meaningful bigram?
455  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
460  bool valid_punctuation(const WERD_CHOICE &word);
462  int good_choice(const WERD_CHOICE &choice);
464  void add_document_word(const WERD_CHOICE &best_choice);
466  void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
467  float additional_adjust, bool modify_rating, bool debug);
// Set wordseg_rating_adjust_factor_ to the given value.
469  inline void SetWordsegRatingAdjustFactor(float f) {
470  wordseg_rating_adjust_factor_ = f;
471  }
473  bool IsSpaceDelimitedLang() const;
474 
475 private:
477  CCUtil *ccutil_;
484 #ifndef DISABLED_LEGACY_ENGINE
485  UnicharAmbigs *dang_ambigs_table_ = nullptr;
487  UnicharAmbigs *replace_ambigs_table_ = nullptr;
488 #endif
490  float reject_offset_;
491  // Cached UNICHAR_IDs:
492  UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
493  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
494  UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
495  UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
496  UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
497  // Hyphen-related variables.
498  WERD_CHOICE *hyphen_word_;
499  DawgPositionVector hyphen_active_dawgs_;
500  bool last_word_on_line_;
501  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
502  // matching. The first member of each list is taken as canonical. For
503  // example, the first list contains hyphens and dashes with the first symbol
504  // being the ASCII hyphen minus.
505  std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
506  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
507  DawgCache *dawg_cache_;
508  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
509  // Dawgs.
510  DawgVector dawgs_;
511  SuccessorListsVector successors_;
512  Trie *pending_words_;
515  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
516  // any of them are present on the best choices list for a word pair.
517  // the bigrams are stored as space-separated words where:
518  // (1) leading and trailing punctuation has been removed from each word and
519  // (2) any digits have been replaced with '?' marks.
520  Dawg *bigram_dawg_;
521  // TODO(daria): need to support multiple languages in the future,
522  // so maybe will need to maintain a list of dawgs of each kind.
523  Dawg *freq_dawg_;
524  Dawg *unambig_dawg_;
525  Dawg *punc_dawg_;
526  Trie *document_words_;
529  float wordseg_rating_adjust_factor_;
530  // File for recording ambiguities discovered during dictionary search.
531  FILE *output_ambig_words_file_;
532 
533 public:
537  STRING_VAR_H(user_words_file);
538  STRING_VAR_H(user_words_suffix);
539  STRING_VAR_H(user_patterns_file);
540  STRING_VAR_H(user_patterns_suffix);
541  BOOL_VAR_H(load_system_dawg);
542  BOOL_VAR_H(load_freq_dawg);
543  BOOL_VAR_H(load_unambig_dawg);
544  BOOL_VAR_H(load_punc_dawg);
545  BOOL_VAR_H(load_number_dawg);
546  BOOL_VAR_H(load_bigram_dawg);
547  double_VAR_H(xheight_penalty_subscripts);
548  double_VAR_H(xheight_penalty_inconsistent);
549  double_VAR_H(segment_penalty_dict_frequent_word);
550  double_VAR_H(segment_penalty_dict_case_ok);
551  double_VAR_H(segment_penalty_dict_case_bad);
552  double_VAR_H(segment_penalty_dict_nonword);
553  double_VAR_H(segment_penalty_garbage);
554  STRING_VAR_H(output_ambig_words_file);
555  INT_VAR_H(dawg_debug_level);
556  INT_VAR_H(hyphen_debug_level);
557  BOOL_VAR_H(use_only_first_uft8_step);
558  double_VAR_H(certainty_scale);
559  double_VAR_H(stopper_nondict_certainty_base);
560  double_VAR_H(stopper_phase2_certainty_rejection_offset);
561  INT_VAR_H(stopper_smallword_size);
562  double_VAR_H(stopper_certainty_per_char);
563  double_VAR_H(stopper_allowable_character_badness);
564  INT_VAR_H(stopper_debug_level);
565  BOOL_VAR_H(stopper_no_acceptable_choices);
566  INT_VAR_H(tessedit_truncate_wordchoice_log);
567  STRING_VAR_H(word_to_debug);
568  BOOL_VAR_H(segment_nonalphabetic_script);
569  BOOL_VAR_H(save_doc_words);
570  double_VAR_H(doc_dict_pending_threshold);
571  double_VAR_H(doc_dict_certainty_threshold);
572  INT_VAR_H(max_permuter_attempts);
573 };
574 
575 } // namespace tesseract
576 
577 #endif // TESSERACT_DICT_DICT_H_
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ DAWG_TYPE_NUMBER
Definition: dawg.h:67
int64_t EDGE_REF
Definition: dawg.h:49
XHeightConsistencyEnum
Definition: dict.h:81
@ XH_GOOD
Definition: dict.h:81
@ XH_SUBNORMAL
Definition: dict.h:81
@ XH_INCONSISTENT
Definition: dict.h:81
std::vector< SuccessorList * > SuccessorListsVector
Definition: dawg.h:62
int64_t NODE_REF
Definition: dawg.h:50
@ character
Definition: mfoutline.h:53
std::vector< Dawg * > DawgVector
Definition: dict.h:57
int UNICHAR_ID
Definition: unichar.h:36
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
PermuterType
Definition: ratngs.h:231
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ NUMBER_PERM
Definition: ratngs.h:238
@ COMPOUND_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ USER_PATTERN_PERM
Definition: ratngs.h:239
@ DOC_DAWG_PERM
Definition: ratngs.h:241
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:623
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
unsigned length() const
Definition: ratngs.h:283
void print() const
Definition: ratngs.h:557
float rating() const
Definition: ratngs.h:308
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:869
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
DawgType type() const
Definition: dawg.h:119
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:117
const CHAR_FRAGMENT * fragment
Definition: dict.h:51
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:84
DawgPositionVector * updated_dawgs
Definition: dict.h:88
DawgPositionVector * active_dawgs
Definition: dict.h:87
PermuterType permuter
Definition: dict.h:89
bool valid_end
Definition: dict.h:91
BOOL_VAR_H(use_only_first_uft8_step)
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
double_VAR_H(segment_penalty_dict_case_ok)
UNICHAR_ID WildcardID() const
Definition: dict.h:377
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:469
STRING_VAR_H(user_words_file)
double_VAR_H(doc_dict_certainty_threshold)
BOOL_VAR_H(load_punc_dawg)
double_VAR_H(stopper_nondict_certainty_base)
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:443
BOOL_VAR_H(load_bigram_dawg)
STRING_VAR_H(word_to_debug)
INT_VAR_H(dawg_debug_level)
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:385
STRING_VAR_H(user_patterns_suffix)
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:348
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437
double_VAR_H(segment_penalty_dict_case_bad)
STRING_VAR_H(user_words_suffix)
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:411
INT_VAR_H(hyphen_debug_level)
double_VAR_H(certainty_scale)
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:381
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:450
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:393
double_VAR_H(xheight_penalty_subscripts)
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:364
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:154
const CCUtil * getCCUtil() const
Definition: dict.h:98
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:389
BOOL_VAR_H(load_unambig_dawg)
INT_VAR_H(tessedit_truncate_wordchoice_log)
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:116
double_VAR_H(stopper_phase2_certainty_rejection_offset)
STRING_VAR_H(output_ambig_words_file)
double_VAR_H(doc_dict_pending_threshold)
double_VAR_H(segment_penalty_dict_nonword)
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:397
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:125
void ResetDocumentDictionary()
Definition: dict.h:297
BOOL_VAR_H(load_system_dawg)
BOOL_VAR_H(load_number_dawg)
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:357
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
BOOL_VAR_H(segment_nonalphabetic_script)
double_VAR_H(segment_penalty_garbage)
INT_VAR_H(stopper_debug_level)
UNICHARSET & getUnicharset()
Definition: dict.h:107
double_VAR_H(segment_penalty_dict_frequent_word)
STRING_VAR_H(user_patterns_file)
BOOL_VAR_H(stopper_no_acceptable_choices)
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:164
INT_VAR_H(stopper_smallword_size)
double_VAR_H(stopper_certainty_per_char)
BOOL_VAR_H(save_doc_words)
double_VAR_H(stopper_allowable_character_badness)
BOOL_VAR_H(load_freq_dawg)
INT_VAR_H(max_permuter_attempts)
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:374
CCUtil * getCCUtil()
Definition: dict.h:101
double_VAR_H(xheight_penalty_inconsistent)
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:446
#define TESS_API
Definition: export.h:34