tesseract  5.0.0
dict.cpp
Go to the documentation of this file.
1 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "dict.h"
20 
21 #include "tprintf.h"
22 
23 #include <cstdio>
24 
25 namespace tesseract {
26 
27 class Image;
28 
30  : letter_is_okay_(&tesseract::Dict::def_letter_is_okay)
31  , probability_in_context_(&tesseract::Dict::def_probability_in_context)
32  , ccutil_(ccutil)
33  , wildcard_unichar_id_(INVALID_UNICHAR_ID)
34  , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
35  , question_unichar_id_(INVALID_UNICHAR_ID)
36  , slash_unichar_id_(INVALID_UNICHAR_ID)
37  , hyphen_unichar_id_(INVALID_UNICHAR_ID)
38  , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
39  getCCUtil()->params())
40  , STRING_INIT_MEMBER(user_words_suffix, "",
41  "A suffix of user-provided words located in tessdata.",
42  getCCUtil()->params())
43  , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
44  getCCUtil()->params())
45  , STRING_INIT_MEMBER(user_patterns_suffix, "",
46  "A suffix of user-provided patterns located in "
47  "tessdata.",
48  getCCUtil()->params())
49  , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
50  , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
51  , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
52  getCCUtil()->params())
53  , BOOL_INIT_MEMBER(load_punc_dawg, true,
54  "Load dawg with punctuation"
55  " patterns.",
56  getCCUtil()->params())
57  , BOOL_INIT_MEMBER(load_number_dawg, true,
58  "Load dawg with number"
59  " patterns.",
60  getCCUtil()->params())
61  , BOOL_INIT_MEMBER(load_bigram_dawg, true,
62  "Load dawg with special word "
63  "bigrams.",
64  getCCUtil()->params())
65  , double_MEMBER(xheight_penalty_subscripts, 0.125,
66  "Score penalty (0.1 = 10%) added if there are subscripts "
67  "or superscripts in a word, but it is otherwise OK.",
68  getCCUtil()->params())
69  , double_MEMBER(xheight_penalty_inconsistent, 0.25,
70  "Score penalty (0.1 = 10%) added if an xheight is "
71  "inconsistent.",
72  getCCUtil()->params())
73  , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
74  "Score multiplier for word matches which have good case and"
75  " are frequent in the given language (lower is better).",
76  getCCUtil()->params())
77  , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
78  "Score multiplier for word matches that have good case "
79  "(lower is better).",
80  getCCUtil()->params())
81  , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
82  "Default score multiplier for word matches, which may have "
83  "case issues (lower is better).",
84  getCCUtil()->params())
85  , double_MEMBER(segment_penalty_dict_nonword, 1.25,
86  "Score multiplier for glyph fragment segmentations which "
87  "do not match a dictionary word (lower is better).",
88  getCCUtil()->params())
89  , double_MEMBER(segment_penalty_garbage, 1.50,
90  "Score multiplier for poorly cased strings that are not in"
91  " the dictionary and generally look like garbage (lower is"
92  " better).",
93  getCCUtil()->params())
94  , STRING_MEMBER(output_ambig_words_file, "",
95  "Output file for ambiguities found in the dictionary", getCCUtil()->params())
96  , INT_MEMBER(dawg_debug_level, 0,
97  "Set to 1 for general debug info"
98  ", to 2 for more details, to 3 to see all the debug messages",
99  getCCUtil()->params())
100  , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
101  , BOOL_MEMBER(use_only_first_uft8_step, false,
102  "Use only the first UTF8 step of the given string"
103  " when computing log probabilities.",
104  getCCUtil()->params())
105  , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
106  , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
107  getCCUtil()->params())
108  , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
109  getCCUtil()->params())
110  , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
111  getCCUtil()->params())
112  , double_MEMBER(stopper_certainty_per_char, -0.50,
113  "Certainty to add"
114  " for each dict char above small word size.",
115  getCCUtil()->params())
116  , double_MEMBER(stopper_allowable_character_badness, 3.0,
117  "Max certaintly variation allowed in a word (in sigma)", getCCUtil()->params())
118  , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
119  , BOOL_MEMBER(stopper_no_acceptable_choices, false,
120  "Make AcceptableChoice() always return false. Useful"
121  " when there is a need to explore all segmentations",
122  getCCUtil()->params())
123  , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
124  getCCUtil()->params())
125  , STRING_MEMBER(word_to_debug, "",
126  "Word for which stopper debug"
127  " information should be printed to stdout",
128  getCCUtil()->params())
129  , BOOL_MEMBER(segment_nonalphabetic_script, false,
130  "Don't use any alphabetic-specific tricks."
131  " Set to true in the traineddata config file for"
132  " scripts that are cursive or inherently fixed-pitch",
133  getCCUtil()->params())
134  , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params())
135  , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
136  getCCUtil()->params())
137  , double_MEMBER(doc_dict_certainty_threshold, -2.25,
138  "Worst certainty for words that can be inserted into the"
139  " document dictionary",
140  getCCUtil()->params())
141  , INT_MEMBER(max_permuter_attempts, 10000,
142  "Maximum number of different"
143  " character choices to consider during permutation."
144  " This limit is especially useful when user patterns"
145  " are specified, since overly generic patterns can result in"
146  " dawg search exploring an overly large number of options.",
147  getCCUtil()->params()) {
148  reject_offset_ = 0.0;
149  go_deeper_fxn_ = nullptr;
150  hyphen_word_ = nullptr;
151  last_word_on_line_ = false;
152  document_words_ = nullptr;
153  dawg_cache_ = nullptr;
154  dawg_cache_is_ours_ = false;
155  pending_words_ = nullptr;
156  bigram_dawg_ = nullptr;
157  freq_dawg_ = nullptr;
158  punc_dawg_ = nullptr;
159  unambig_dawg_ = nullptr;
160  wordseg_rating_adjust_factor_ = -1.0f;
161  output_ambig_words_file_ = nullptr;
162 }
163 
165  End();
166  delete hyphen_word_;
167  if (output_ambig_words_file_ != nullptr) {
168  fclose(output_ambig_words_file_);
169  }
170 }
171 
173  // This global cache (a singleton) will outlive every Tesseract instance
174  // (even those that someone else might declare as global statics).
175  static DawgCache cache;
176  return &cache;
177 }
178 
179 // Sets up ready for a Load or LoadLSTM.
180 void Dict::SetupForLoad(DawgCache *dawg_cache) {
181  if (dawgs_.size() != 0) {
182  this->End();
183  }
184 
185  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
186  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
187  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
188  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
189 
190  if (dawg_cache != nullptr) {
191  dawg_cache_ = dawg_cache;
192  dawg_cache_is_ours_ = false;
193  } else {
194  dawg_cache_ = new DawgCache();
195  dawg_cache_is_ours_ = true;
196  }
197 }
198 
199 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
200 void Dict::Load(const std::string &lang, TessdataManager *data_file) {
201  // Load dawgs_.
202  if (load_punc_dawg) {
203  punc_dawg_ =
204  dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
205  if (punc_dawg_) {
206  dawgs_.push_back(punc_dawg_);
207  }
208  }
209  if (load_system_dawg) {
210  Dawg *system_dawg =
211  dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
212  if (system_dawg) {
213  dawgs_.push_back(system_dawg);
214  }
215  }
216  if (load_number_dawg) {
217  Dawg *number_dawg =
218  dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
219  if (number_dawg) {
220  dawgs_.push_back(number_dawg);
221  }
222  }
223  if (load_bigram_dawg) {
224  bigram_dawg_ =
225  dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
226  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
227  // dawgs_!!
228  }
229  if (load_freq_dawg) {
230  freq_dawg_ =
231  dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
232  if (freq_dawg_) {
233  dawgs_.push_back(freq_dawg_);
234  }
235  }
236  if (load_unambig_dawg) {
237  unambig_dawg_ =
238  dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
239  if (unambig_dawg_) {
240  dawgs_.push_back(unambig_dawg_);
241  }
242  }
243 
244  std::string name;
245  if (!user_words_suffix.empty() || !user_words_file.empty()) {
246  Trie *trie_ptr =
247  new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
248  if (!user_words_file.empty()) {
249  name = user_words_file;
250  } else {
252  name += user_words_suffix;
253  }
254  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
256  tprintf("Error: failed to load %s\n", name.c_str());
257  delete trie_ptr;
258  } else {
259  dawgs_.push_back(trie_ptr);
260  }
261  }
262 
263  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
265  dawg_debug_level);
266  trie_ptr->initialize_patterns(&(getUnicharset()));
267  if (!user_patterns_file.empty()) {
268  name = user_patterns_file;
269  } else {
271  name += user_patterns_suffix;
272  }
273  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
274  tprintf("Error: failed to load %s\n", name.c_str());
275  delete trie_ptr;
276  } else {
277  dawgs_.push_back(trie_ptr);
278  }
279  }
280 
281  document_words_ =
282  new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
283  dawgs_.push_back(document_words_);
284 
285  // This dawg is temporary and should not be searched by letter_is_ok.
286  pending_words_ =
287  new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
288 }
289 
290 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
291 void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
292  // Load dawgs_.
293  if (load_punc_dawg) {
294  punc_dawg_ =
295  dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
296  if (punc_dawg_) {
297  dawgs_.push_back(punc_dawg_);
298  }
299  }
300  if (load_system_dawg) {
301  Dawg *system_dawg =
302  dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
303  if (system_dawg) {
304  dawgs_.push_back(system_dawg);
305  }
306  }
307  if (load_number_dawg) {
308  Dawg *number_dawg =
309  dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
310  if (number_dawg) {
311  dawgs_.push_back(number_dawg);
312  }
313  }
314 
315  // stolen from Dict::Load (but needs params_ from Tesseract
316  // langdata/config/api):
317  std::string name;
318  if (!user_words_suffix.empty() || !user_words_file.empty()) {
319  Trie *trie_ptr =
320  new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
321  if (!user_words_file.empty()) {
322  name = user_words_file;
323  } else {
325  name += user_words_suffix;
326  }
327  if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
329  tprintf("Error: failed to load %s\n", name.c_str());
330  delete trie_ptr;
331  } else {
332  dawgs_.push_back(trie_ptr);
333  }
334  }
335 
336  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
337  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
338  dawg_debug_level);
339  trie_ptr->initialize_patterns(&(getUnicharset()));
340  if (!user_patterns_file.empty()) {
341  name = user_patterns_file;
342  } else {
344  name += user_patterns_suffix;
345  }
346  if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
347  tprintf("Error: failed to load %s\n", name.c_str());
348  delete trie_ptr;
349  } else {
350  dawgs_.push_back(trie_ptr);
351  }
352  }
353 }
354 
355 // Completes the loading process after Load() and/or LoadLSTM().
356 // Returns false if no dictionaries were loaded.
358  if (dawgs_.empty()) {
359  return false;
360  }
361  // Construct a list of corresponding successors for each dawg. Each entry, i,
362  // in the successors_ vector is a vector of integers that represent the
363  // indices into the dawgs_ vector of the successors for dawg i.
364  successors_.reserve(dawgs_.size());
365  for (auto dawg : dawgs_) {
366  auto *lst = new SuccessorList();
367  for (unsigned j = 0; j < dawgs_.size(); ++j) {
368  const Dawg *other = dawgs_[j];
369  if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
370  kDawgSuccessors[dawg->type()][other->type()]) {
371  lst->push_back(j);
372  }
373  }
374  successors_.push_back(lst);
375  }
376  return true;
377 }
378 
379 void Dict::End() {
380  if (dawgs_.empty()) {
381  return; // Not safe to call twice.
382  }
383  for (auto &dawg : dawgs_) {
384  if (!dawg_cache_->FreeDawg(dawg)) {
385  delete dawg;
386  }
387  }
388  dawg_cache_->FreeDawg(bigram_dawg_);
389  if (dawg_cache_is_ours_) {
390  delete dawg_cache_;
391  dawg_cache_ = nullptr;
392  }
393  for (auto successor : successors_) {
394  delete successor;
395  }
396  dawgs_.clear();
397  successors_.clear();
398  document_words_ = nullptr;
399  delete pending_words_;
400  pending_words_ = nullptr;
401 }
402 
403 // Returns true if in light of the current state unichar_id is allowed
404 // according to at least one of the dawgs in the dawgs_ vector.
405 // See more extensive comments in dict.h where this function is declared.
406 int Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,
407  UNICHAR_ID unichar_id, bool word_end) const {
408  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
409 
410  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
411 
412  if (dawg_debug_level >= 3) {
413  tprintf(
414  "def_letter_is_okay: current unichar=%s word_end=%d"
415  " num active dawgs=%zu\n",
416  getUnicharset().debug_str(unichar_id).c_str(), word_end, dawg_args->active_dawgs->size());
417  }
418 
419  // Do not accept words that contain kPatternUnicharID.
420  // (otherwise pattern dawgs would not function correctly).
421  // Do not accept words containing INVALID_UNICHAR_IDs.
422  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
423  dawg_args->permuter = NO_PERM;
424  return NO_PERM;
425  }
426 
427  // Initialization.
428  PermuterType curr_perm = NO_PERM;
429  dawg_args->updated_dawgs->clear();
430  dawg_args->valid_end = false;
431 
432  // Go over the active_dawgs vector and insert DawgPosition records
433  // with the updated ref (an edge with the corresponding unichar id) into
434  // dawg_args->updated_pos.
435  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
436  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
437  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
438  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
439 
440  if (!dawg && !punc_dawg) {
441  // shouldn't happen.
442  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
443  continue;
444  }
445  if (!dawg) {
446  // We're in the punctuation dawg. A core dawg has not been chosen.
447  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
448  EDGE_REF punc_transition_edge =
449  punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
450  if (punc_transition_edge != NO_EDGE) {
451  // Find all successors, and see which can transition.
452  const SuccessorList &slist = *(successors_[pos.punc_index]);
453  for (int sdawg_index : slist) {
454  const Dawg *sdawg = dawgs_[sdawg_index];
455  UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
456  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
457  if (dawg_edge != NO_EDGE) {
458  if (dawg_debug_level >= 3) {
459  tprintf("Letter found in dawg %d\n", sdawg_index);
460  }
461  dawg_args->updated_dawgs->add_unique(
462  DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
463  dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
464  if (sdawg->permuter() > curr_perm) {
465  curr_perm = sdawg->permuter();
466  }
467  if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
468  dawg_args->valid_end = true;
469  }
470  }
471  }
472  }
473  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
474  if (punc_edge != NO_EDGE) {
475  if (dawg_debug_level >= 3) {
476  tprintf("Letter found in punctuation dawg\n");
477  }
478  dawg_args->updated_dawgs->add_unique(
479  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
480  "Extend punctuation dawg: ");
481  if (PUNC_PERM > curr_perm) {
482  curr_perm = PUNC_PERM;
483  }
484  if (punc_dawg->end_of_word(punc_edge)) {
485  dawg_args->valid_end = true;
486  }
487  }
488  continue;
489  }
490 
491  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
492  // We can end the main word here.
493  // If we can continue on the punc ref, add that possibility.
494  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
495  EDGE_REF punc_edge =
496  punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
497  if (punc_edge != NO_EDGE) {
498  dawg_args->updated_dawgs->add_unique(
499  DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
500  dawg_debug_level > 0, "Return to punctuation dawg: ");
501  if (dawg->permuter() > curr_perm) {
502  curr_perm = dawg->permuter();
503  }
504  if (punc_dawg->end_of_word(punc_edge)) {
505  dawg_args->valid_end = true;
506  }
507  }
508  }
509 
510  if (pos.back_to_punc) {
511  continue;
512  }
513 
514  // If we are dealing with the pattern dawg, look up all the
515  // possible edges, not only for the exact unichar_id, but also
516  // for all its character classes (alpha, digit, etc).
517  if (dawg->type() == DAWG_TYPE_PATTERN) {
518  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
519  // There can't be any successors to dawg that is of type
520  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
521  continue;
522  }
523 
524  // Find the edge out of the node for the unichar_id.
525  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
526  EDGE_REF edge =
527  (node == NO_EDGE)
528  ? NO_EDGE
529  : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
530 
531  if (dawg_debug_level >= 3) {
532  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
533  }
534 
535  if (edge != NO_EDGE) { // the unichar was found in the current dawg
536  if (dawg_debug_level >= 3) {
537  tprintf("Letter found in dawg %d\n", pos.dawg_index);
538  }
539  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
540  if (dawg_debug_level >= 3) {
541  tprintf("Punctuation constraint not satisfied at end of word.\n");
542  }
543  continue;
544  }
545  if (dawg->permuter() > curr_perm) {
546  curr_perm = dawg->permuter();
547  }
548  if (dawg->end_of_word(edge) &&
549  (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
550  dawg_args->valid_end = true;
551  }
552  dawg_args->updated_dawgs->add_unique(
553  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
554  dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
555  }
556  } // end for
557  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
558  // or if we found the current letter in a non-punctuation dawg. This
559  // allows preserving information on which dawg the "core" word came from.
560  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
561  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
562  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
563  dawg_args->permuter = curr_perm;
564  }
565  if (dawg_debug_level >= 2) {
566  tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
567  }
568  return dawg_args->permuter;
569 }
570 
571 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id,
572  bool word_end, DawgArgs *dawg_args, PermuterType *curr_perm) const {
573  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
574  // Try to find the edge corresponding to the exact unichar_id and to all the
575  // edges corresponding to the character class of unichar_id.
576  std::vector<UNICHAR_ID> unichar_id_patterns;
577  unichar_id_patterns.push_back(unichar_id);
578  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
579  for (int unichar_id_pattern : unichar_id_patterns) {
580  // On the first iteration check all the outgoing edges.
581  // On the second iteration check all self-loops.
582  for (int k = 0; k < 2; ++k) {
583  EDGE_REF edge = (k == 0)
584  ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
585  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
586  if (edge == NO_EDGE) {
587  continue;
588  }
589  if (dawg_debug_level >= 3) {
590  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
591  edge);
592  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
593  }
594  if (dawg->permuter() > *curr_perm) {
595  *curr_perm = dawg->permuter();
596  }
597  if (dawg->end_of_word(edge)) {
598  dawg_args->valid_end = true;
599  }
600  dawg_args->updated_dawgs->add_unique(
601  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
602  dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
603  }
604  }
605 }
606 
607 // Fill the given active_dawgs vector with dawgs that could contain the
608 // beginning of the word. If hyphenated() returns true, copy the entries
609 // from hyphen_active_dawgs_ instead.
610 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const {
611  if (hyphenated()) {
612  *active_dawgs = hyphen_active_dawgs_;
613  if (dawg_debug_level >= 3) {
614  for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) {
615  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
616  hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
617  }
618  }
619  } else {
620  default_dawgs(active_dawgs, ambigs_mode);
621  }
622 }
623 
624 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_patterns) const {
625  bool punc_dawg_available = (punc_dawg_ != nullptr) &&
626  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
627 
628  for (unsigned i = 0; i < dawgs_.size(); i++) {
629  if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
630  int dawg_ty = dawgs_[i]->type();
631  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
632  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
633  dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
634  if (dawg_debug_level >= 3) {
635  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
636  }
637  } else if (!punc_dawg_available || !subsumed_by_punc) {
638  dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
639  if (dawg_debug_level >= 3) {
640  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
641  }
642  }
643  }
644  }
645 }
646 
647 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
648  // Do not add hyphenated word parts to the document dawg.
649  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
650  // called when the first part of the hyphenated word is
651  // discovered and while the second part of the word is recognized.
652  // hyphen_word_ is cleared in cc_recg() before the next word on
653  // the line is recognized.
654  if (hyphen_word_) {
655  return;
656  }
657 
658  int stringlen = best_choice.length();
659 
660  if (valid_word(best_choice) || stringlen < 2) {
661  return;
662  }
663 
664  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
665  if (best_choice.length() >= kDocDictMaxRepChars) {
666  int num_rep_chars = 1;
667  UNICHAR_ID uch_id = best_choice.unichar_id(0);
668  for (unsigned i = 1; i < best_choice.length(); ++i) {
669  if (best_choice.unichar_id(i) != uch_id) {
670  num_rep_chars = 1;
671  uch_id = best_choice.unichar_id(i);
672  } else {
673  ++num_rep_chars;
674  if (num_rep_chars == kDocDictMaxRepChars) {
675  return;
676  }
677  }
678  }
679  }
680 
681  if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
682  if (best_choice.certainty() < doc_dict_pending_threshold) {
683  return;
684  }
685 
686  if (!pending_words_->word_in_dawg(best_choice)) {
687  if (stringlen > 2 ||
688  (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
689  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
690  pending_words_->add_word_to_dawg(best_choice);
691  }
692  return;
693  }
694  }
695 
696  if (save_doc_words) {
697  std::string filename(getCCUtil()->imagefile);
698  filename += ".doc";
699  FILE *doc_word_file = fopen(filename.c_str(), "a");
700  if (doc_word_file == nullptr) {
701  tprintf("Error: Could not open file %s\n", filename.c_str());
702  ASSERT_HOST(doc_word_file);
703  }
704  fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
705  fclose(doc_word_file);
706  }
707  document_words_->add_word_to_dawg(best_choice);
708 }
709 
710 void Dict::adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
711  float additional_adjust, bool modify_rating, bool debug) {
712  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
713  word->GetTopScriptID() == getUnicharset().han_sid());
714  bool case_is_ok = (is_han || case_ok(*word));
715  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
716 
717  float adjust_factor = additional_adjust;
718  float new_rating = word->rating();
719  new_rating += kRatingPad;
720  const char *xheight_triggered = "";
721  if (word->length() > 1) {
722  // Calculate x-height and y-offset consistency penalties.
723  switch (xheight_consistency) {
724  case XH_INCONSISTENT:
725  adjust_factor += xheight_penalty_inconsistent;
726  xheight_triggered = ", xhtBAD";
727  break;
728  case XH_SUBNORMAL:
729  adjust_factor += xheight_penalty_subscripts;
730  xheight_triggered = ", xhtSUB";
731  break;
732  case XH_GOOD:
733  // leave the factor alone - all good!
734  break;
735  }
736  // TODO(eger): if nonword is true, but there is a "core" that is a dict
737  // word, negate nonword status.
738  } else {
739  if (debug) {
740  tprintf("Consistency could not be calculated.\n");
741  }
742  }
743  if (debug) {
744  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(),
745  word->rating(), xheight_triggered);
746  }
747 
748  if (nonword) { // non-dictionary word
749  if (case_is_ok && punc_is_ok) {
750  adjust_factor += segment_penalty_dict_nonword;
751  new_rating *= adjust_factor;
752  if (debug) {
753  tprintf(", W");
754  }
755  } else {
756  adjust_factor += segment_penalty_garbage;
757  new_rating *= adjust_factor;
758  if (debug) {
759  if (!case_is_ok) {
760  tprintf(", C");
761  }
762  if (!punc_is_ok) {
763  tprintf(", P");
764  }
765  }
766  }
767  } else { // dictionary word
768  if (case_is_ok) {
769  if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
771  adjust_factor += segment_penalty_dict_frequent_word;
772  new_rating *= adjust_factor;
773  if (debug) {
774  tprintf(", F");
775  }
776  } else {
777  adjust_factor += segment_penalty_dict_case_ok;
778  new_rating *= adjust_factor;
779  if (debug) {
780  tprintf(", ");
781  }
782  }
783  } else {
784  adjust_factor += segment_penalty_dict_case_bad;
785  new_rating *= adjust_factor;
786  if (debug) {
787  tprintf(", C");
788  }
789  }
790  }
791  new_rating -= kRatingPad;
792  if (modify_rating) {
793  word->set_rating(new_rating);
794  }
795  if (debug) {
796  tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
797  }
798  word->set_adjust_factor(adjust_factor);
799 }
800 
801 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
802  const WERD_CHOICE *word_ptr = &word;
803  WERD_CHOICE temp_word(word.unicharset());
804  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
805  copy_hyphen_info(&temp_word);
806  temp_word += word;
807  word_ptr = &temp_word;
808  }
809  if (word_ptr->empty()) {
810  return NO_PERM;
811  }
812  // Allocate vectors for holding current and updated
813  // active_dawgs and initialize them.
814  DawgPositionVector active_dawgs[2];
815  init_active_dawgs(&(active_dawgs[0]), false);
816  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
817  int last_index = word_ptr->length() - 1;
818  // Call letter_is_okay for each letter in the word.
819  for (int i = hyphen_base_size(); i <= last_index; ++i) {
820  if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
821  i == last_index))) {
822  break;
823  }
824  // Swap active_dawgs, constraints with the corresponding updated vector.
825  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
826  dawg_args.updated_dawgs = &(active_dawgs[0]);
827  ++(dawg_args.active_dawgs);
828  } else {
829  ++(dawg_args.updated_dawgs);
830  dawg_args.active_dawgs = &(active_dawgs[0]);
831  }
832  }
833  return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
834 }
835 
836 bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const {
837  if (bigram_dawg_ == nullptr) {
838  return false;
839  }
840 
841  // Extract the core word from the middle of each word with any digits
842  // replaced with question marks.
843  unsigned w1start, w1end, w2start, w2end;
844  word1.punct_stripped(&w1start, &w1end);
845  word2.punct_stripped(&w2start, &w2end);
846 
847  // We don't want to penalize a single guillemet, hyphen, etc.
848  // But our bigram list doesn't have any information about punctuation.
849  if (w1start >= w1end) {
850  return word1.length() < 3;
851  }
852  if (w2start >= w2end) {
853  return word2.length() < 3;
854  }
855 
856  const UNICHARSET &uchset = getUnicharset();
857  std::vector<UNICHAR_ID> bigram_string;
858  bigram_string.reserve(w1end + w2end + 1);
859  for (auto i = w1start; i < w1end; i++) {
860  const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
861  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
862  bigram_string.push_back(question_unichar_id_);
863  } else {
864  bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
865  }
866  }
867  bigram_string.push_back(UNICHAR_SPACE);
868  for (auto i = w2start; i < w2end; i++) {
869  const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
870  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
871  bigram_string.push_back(question_unichar_id_);
872  } else {
873  bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
874  }
875  }
876  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
877  for (int i : bigram_string) {
878  normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f);
879  }
880  return bigram_dawg_->word_in_dawg(normalized_word);
881 }
882 
884  if (word.empty()) {
885  return NO_PERM;
886  }
887  WERD_CHOICE new_word(word.unicharset());
888  auto last_index = word.length() - 1;
889  int new_len = 0;
890  for (unsigned i = 0; i <= last_index; ++i) {
891  UNICHAR_ID unichar_id = (word.unichar_id(i));
892  if (getUnicharset().get_ispunctuation(unichar_id)) {
893  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
894  } else if (!getUnicharset().get_isalpha(unichar_id) &&
895  !getUnicharset().get_isdigit(unichar_id)) {
896  return false; // neither punc, nor alpha, nor digit
897  } else if ((new_len = new_word.length()) == 0 ||
898  new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
899  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
900  }
901  }
902  for (unsigned i = 0; i < dawgs_.size(); ++i) {
903  if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
904  dawgs_[i]->word_in_dawg(new_word)) {
905  return true;
906  }
907  }
908  return false;
909 }
910 
913  const UNICHARSET &u_set = getUnicharset();
914  if (u_set.han_sid() > 0) {
915  return false;
916  }
917  if (u_set.katakana_sid() > 0) {
918  return false;
919  }
920  if (u_set.thai_sid() > 0) {
921  return false;
922  }
923  return true;
924 }
925 
926 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:368
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:380
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:378
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:374
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:372
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:370
#define REFFORMAT
Definition: dawg.h:85
@ DAWG_TYPE_PATTERN
Definition: dawg.h:68
@ DAWG_TYPE_WORD
Definition: dawg.h:66
@ DAWG_TYPE_PUNCTUATION
Definition: dawg.h:65
int64_t EDGE_REF
Definition: dawg.h:49
std::vector< int > SuccessorList
Definition: dawg.h:61
XHeightConsistencyEnum
Definition: dict.h:81
@ XH_GOOD
Definition: dict.h:81
@ XH_SUBNORMAL
Definition: dict.h:81
@ XH_INCONSISTENT
Definition: dict.h:81
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int64_t NODE_REF
Definition: dawg.h:50
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG
@ TESSDATA_SYSTEM_DAWG
int UNICHAR_ID
Definition: unichar.h:36
@ UNICHAR_SPACE
Definition: unicharset.h:36
PermuterType
Definition: ratngs.h:231
@ COMPOUND_PERM
Definition: ratngs.h:244
@ NO_PERM
Definition: ratngs.h:232
@ PUNC_PERM
Definition: ratngs.h:233
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ USER_PATTERN_PERM
Definition: ratngs.h:239
@ DOC_DAWG_PERM
Definition: ratngs.h:241
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
void punct_stripped(unsigned *start_core, unsigned *end_core) const
Definition: ratngs.cpp:367
std::string debug_string() const
Definition: ratngs.h:475
float certainty() const
Definition: ratngs.h:311
int GetTopScriptID() const
Definition: ratngs.cpp:631
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
bool empty() const
Definition: ratngs.h:280
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:424
void set_permuter(uint8_t perm)
Definition: ratngs.h:356
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
unsigned length() const
Definition: ratngs.h:283
void set_adjust_factor(float factor)
Definition: ratngs.h:289
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
float rating() const
Definition: ratngs.h:308
std::string & unichar_string()
Definition: ratngs.h:515
void set_rating(float new_val)
Definition: ratngs.h:350
std::string language_data_path_prefix
Definition: ccutil.h:60
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:869
int han_sid() const
Definition: unicharset.h:932
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
int null_sid() const
Definition: unicharset.h:917
int katakana_sid() const
Definition: unicharset.h:938
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533
size_t size() const
Definition: unicharset.h:355
int thai_sid() const
Definition: unicharset.h:941
const std::string & lang() const
Definition: dawg.h:122
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
virtual bool end_of_word(EDGE_REF edge_ref) const =0
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const
Definition: dawg.h:181
DawgType type() const
Definition: dawg.h:119
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:192
PermuterType permuter() const
Definition: dawg.h:125
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:117
EDGE_REF punc_ref
Definition: dawg.h:371
EDGE_REF dawg_ref
Definition: dawg.h:370
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:383
Dawg * GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:43
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:37
DawgPositionVector * updated_dawgs
Definition: dict.h:88
DawgPositionVector * active_dawgs
Definition: dict.h:87
PermuterType permuter
Definition: dict.h:89
bool valid_end
Definition: dict.h:91
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:571
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:172
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e. its unicharset has no Han, Katakana, or Thai script).
Definition: dict.cpp:912
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:624
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:291
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:210
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:836
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:411
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:801
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:180
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
void End()
Definition: dict.cpp:379
Dict(CCUtil *image_ptr)
Definition: dict.cpp:29
const CCUtil * getCCUtil() const
Definition: dict.h:98
bool FinishLoad()
Definition: dict.cpp:357
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:397
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:883
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:45
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:647
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:710
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:610
void Load(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:200
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:406
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:273
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:332
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:390
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159