tesseract  5.0.0
language_model.h
Go to the documentation of this file.
1 // File: language_model.h
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help segmentation
5 // search.
6 // Author: Daria Antonova
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
22 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
23 
24 #include "associate.h" // for AssociateStats (ptr only), AssociateUtils
25 #include "dawg.h" // for DawgPositionVector
26 #include "dict.h" // for DawgArgs, Dict
27 #include "lm_consistency.h" // for LMConsistencyInfo
28 #include "lm_state.h" // for ViterbiStateEntry, LanguageModelFlagsType
29 #include "params.h" // for DoubleParam, double_VAR_H, IntParam, Boo...
30 #include "params_model.h" // for ParamsModel
31 #include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST...
32 #include "stopper.h" // for DANGERR
33 
34 #include <cmath> // for exp
35 
36 namespace tesseract {
37 
38 class UNICHARSET;
39 class WERD_RES;
40 
41 struct BlamerBundle;
42 
43 template <typename T>
44 class UnicityTable;
45 
46 class LMPainPoints;
47 struct FontInfo;
48 
49 // This class that contains the data structures and functions necessary
50 // to represent and use the knowledge about the language.
52 public:
53  // Masks for keeping track of top choices that should not be pruned out.
57  static const LanguageModelFlagsType kDigitFlag = 0x8;
59 
60  // Denominator for normalizing per-letter ngram cost when deriving
61  // penalty adjustments.
62  static const float kMaxAvgNgramCost;
63 
64  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
66 
67  // Fills the given floats array with features extracted from path represented
68  // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
69  // for feature information.
70  // Note: the function assumes that features points to an array of size
71  // PTRAIN_NUM_FEATURE_TYPES.
72  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]);
73 
74  // Updates data structures that are used for the duration of the segmentation
75  // search on the current word;
76  void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio,
77  float rating_cert_scale);
78 
79  // Updates language model state of the given BLOB_CHOICE_LIST (from
 80  // the ratings matrix) and its parent. Updates pain_points if new
81  // problematic points are found in the segmentation graph.
82  //
83  // At most language_model_viterbi_list_size are kept in each
84  // LanguageModelState.viterbi_state_entries list.
85  // At most language_model_viterbi_list_max_num_prunable of those are prunable
86  // (non-dictionary) paths.
87  // The entries that represent dictionary word paths are kept at the front
88  // of the list.
 89  // The list is ordered by a cost that is computed collectively by several
90  // language model components (currently dawg and ngram components).
91  bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list,
92  LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res,
93  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
94 
95  // Returns true if an acceptable best choice was discovered.
96  inline bool AcceptableChoiceFound() {
98  }
99  inline void SetAcceptableChoiceFound(bool val) {
101  }
102  // Returns the reference to ParamsModel.
104  return params_model_;
105  }
106 
107 protected:
108  inline float CertaintyScore(float cert) {
109  if (language_model_use_sigmoidal_certainty) {
110  // cert is assumed to be between 0 and -dict_->certainty_scale.
111  // If you enable language_model_use_sigmoidal_certainty, you
112  // need to adjust language_model_ngram_nonmatch_score as well.
113  cert = -cert / dict_->certainty_scale;
114  return 1.0f / (1.0f + exp(10.0f * cert));
115  } else {
116  return (-1.0f / cert);
117  }
118  }
119 
120  inline float ComputeAdjustment(int num_problems, float penalty) {
121  if (num_problems == 0) {
122  return 0.0f;
123  }
124  if (num_problems == 1) {
125  return penalty;
126  }
127  return (penalty + (language_model_penalty_increment * static_cast<float>(num_problems - 1)));
128  }
129 
130  // Computes the adjustment to the ratings sum based on the given
131  // consistency_info. The paths with invalid punctuation, inconsistent
132  // case and character type are penalized proportionally to the number
133  // of inconsistencies on the path.
135  const LMConsistencyInfo &consistency_info) {
136  if (dawg_info != nullptr) {
137  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
138  language_model_penalty_case) +
139  (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f);
140  }
141  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), language_model_penalty_punc) +
142  ComputeAdjustment(consistency_info.NumInconsistentCase(), language_model_penalty_case) +
143  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
144  language_model_penalty_chartype) +
145  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
146  language_model_penalty_spacing) +
147  (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f) +
148  (consistency_info.inconsistent_font ? language_model_penalty_font : 0.0f));
149  }
150 
151  // Returns an adjusted ratings sum that includes inconsistency penalties,
152  // penalties for non-dictionary paths and paths with dips in ngram
153  // probability.
155 
156  // Finds the first lower and upper case letter and first digit in curr_list.
157  // Uses the first character in the list in place of empty results.
158  // Returns true if both alpha and digits are found.
159  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower,
160  BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const;
161  // Forces there to be at least one entry in the overall set of the
162  // viterbi_state_entries of each element of parent_node that has the
163  // top_choice_flag set for lower, upper and digit using the same rules as
164  // GetTopLowerUpperDigit, setting the flag on the first found suitable
165  // candidate, whether or not the flag is set on some other parent.
166  // Returns 1 if both alpha and digits are found among the parents, -1 if no
167  // parents are found at all (a legitimate case), and 0 otherwise.
168  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
169 
170  // Finds the next ViterbiStateEntry with which the given unichar_id can
171  // combine sensibly, taking into account any mixed alnum/mixed case
172  // situation, and whether this combination has been inspected before.
173  ViterbiStateEntry *GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc,
174  LanguageModelFlagsType blob_choice_flags,
175  const UNICHARSET &unicharset, WERD_RES *word_res,
176  ViterbiStateEntry_IT *vse_it,
177  LanguageModelFlagsType *top_choice_flags) const;
178  // Helper function that computes the cost of the path composed of the
179  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
180  // If the new path looks good enough, adds a new ViterbiStateEntry to the
181  // list of viterbi entries in the given BLOB_CHOICE and returns true.
182  bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
183  int curr_col, int curr_row, BLOB_CHOICE *b,
184  LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
185  LMPainPoints *pain_points, WERD_RES *word_res,
186  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
187 
188  // Determines whether a potential entry is a true top choice and
189  // updates changed accordingly.
190  //
191  // Note: The function assumes that b, top_choice_flags and changed
192  // are not nullptr.
193  void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse,
194  LanguageModelState *lms);
195 
196  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
197  // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
198  // with updated active dawgs, constraints and permuter.
199  //
200  // Note: the caller is responsible for deleting the returned pointer.
201  LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int curr_col, int curr_row,
202  const BLOB_CHOICE &b,
203  const ViterbiStateEntry *parent_vse);
204 
205  // Computes p(unichar | parent context) and records it in ngram_cost.
206  // If b.unichar_id() is an unlikely continuation of the parent context
207  // sets found_small_prob to true and returns nullptr.
208  // Otherwise creates a new LanguageModelNgramInfo entry containing the
209  // updated context (that includes b.unichar_id() at the end) and returns it.
210  //
211  // Note: the caller is responsible for deleting the returned pointer.
212  LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, float certainty, float denom,
213  int curr_col, int curr_row, float outline_length,
214  const ViterbiStateEntry *parent_vse);
215 
216  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
217  // for the given unichar in the given context. If there are multiple
218  // unichars at one position - takes the average of their probabilities.
219  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
220  // since probability_in_context() can only handle one at a time (while
221  // unicharset might contain ngrams and glyphs composed from multiple UTF8
222  // characters).
223  float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context,
224  int *unichar_step_len, bool *found_small_prob, float *ngram_prob);
225 
226  // Computes the normalization factors for the classifier confidences
227  // (used by ComputeNgramCost()).
228  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
229 
 230  // Fills the given consistency_info based on parent_vse.consistency_info
231  // and on the consistency of the given unichar_id with parent_vse.
232  void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b,
233  ViterbiStateEntry *parent_vse, WERD_RES *word_res,
234  LMConsistencyInfo *consistency_info);
235 
236  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
237  // on the path represented by the given BLOB_CHOICE and language model
238  // state entries (lmse, dse). The path is re-constructed by following
 239  // the parent pointers in the lang model state entries. If the
240  // constructed WERD_CHOICE is better than the best/raw choice recorded
241  // in the best_choice_bundle, this function updates the corresponding
 242  // fields and sets best_choice_bundle->updated to true.
243  void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res,
244  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
245 
246  // Constructs a WERD_CHOICE by tracing parent pointers starting with
247  // the given LanguageModelStateEntry. Returns the constructed word.
248  // Updates best_char_choices, certainties and state if they are not
249  // nullptr (best_char_choices and certainties are assumed to have the
250  // length equal to lmse->length).
251  // The caller is responsible for freeing memory associated with the
252  // returned WERD_CHOICE.
254  BlamerBundle *blamer_bundle, bool *truth_path);
255 
256  // Wrapper around AssociateUtils::ComputeStats().
257  inline void ComputeAssociateStats(int col, int row, float max_char_wh_ratio,
258  ViterbiStateEntry *parent_vse, WERD_RES *word_res,
259  AssociateStats *associate_stats) {
261  col, row, (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
262  (parent_vse != nullptr) ? parent_vse->length : 0, fixed_pitch_, max_char_wh_ratio, word_res,
263  language_model_debug_level > 2, associate_stats);
264  }
265 
266  // Returns true if the path with such top_choice_flags and dawg_info
267  // could be pruned out (i.e. is neither a system/user/frequent dictionary
268  // nor a top choice path).
269  // In non-space delimited languages all paths can be "somewhat" dictionary
270  // words. In such languages we can not do dictionary-driven path pruning,
271  // so paths with non-empty dawg_info are considered prunable.
272  inline bool PrunablePath(const ViterbiStateEntry &vse) {
273  if (vse.top_choice_flags) {
274  return false;
275  }
276  if (vse.dawg_info != nullptr &&
278  vse.dawg_info->permuter == FREQ_DAWG_PERM)) {
279  return false;
280  }
281  return true;
282  }
283 
284  // Returns true if the given ViterbiStateEntry represents an acceptable path.
285  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
286  return (vse.dawg_info != nullptr || vse.Consistent() ||
287  (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
288  }
289 
290 public:
291  // Parameters.
292  INT_VAR_H(language_model_debug_level);
293  BOOL_VAR_H(language_model_ngram_on);
294  INT_VAR_H(language_model_ngram_order);
295  INT_VAR_H(language_model_viterbi_list_max_num_prunable);
296  INT_VAR_H(language_model_viterbi_list_max_size);
297  double_VAR_H(language_model_ngram_small_prob);
298  double_VAR_H(language_model_ngram_nonmatch_score);
299  BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step);
300  double_VAR_H(language_model_ngram_scale_factor);
301  double_VAR_H(language_model_ngram_rating_factor);
302  BOOL_VAR_H(language_model_ngram_space_delimited_language);
303  INT_VAR_H(language_model_min_compound_length);
304  // Penalties used for adjusting path costs and final word rating.
305  double_VAR_H(language_model_penalty_non_freq_dict_word);
306  double_VAR_H(language_model_penalty_non_dict_word);
307  double_VAR_H(language_model_penalty_punc);
308  double_VAR_H(language_model_penalty_case);
309  double_VAR_H(language_model_penalty_script);
310  double_VAR_H(language_model_penalty_chartype);
311  double_VAR_H(language_model_penalty_font);
312  double_VAR_H(language_model_penalty_spacing);
313  double_VAR_H(language_model_penalty_increment);
314  INT_VAR_H(wordrec_display_segmentations);
315  BOOL_VAR_H(language_model_use_sigmoidal_certainty);
316 
317 protected:
318  // Member Variables.
319 
320  // Temporary DawgArgs struct that is re-used across different words to
321  // avoid dynamic memory re-allocation (should be cleared before each use).
323  // Scaling for recovering blob outline length from rating and certainty.
324  float rating_cert_scale_ = 0.0f;
325 
326  // The following variables are set at construction time.
327 
328  // Pointer to fontinfo table (not owned by LanguageModel).
330 
331  // Pointer to Dict class, that is used for querying the dictionaries
332  // (the pointer is not owned by LanguageModel).
333  Dict *dict_ = nullptr;
334 
335  // TODO(daria): the following variables should become LanguageModel params
336  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
337  //
338  // Set to true if we are dealing with fixed pitch text
339  // (set to assume_fixed_pitch_char_segment).
340  bool fixed_pitch_ = false;
341  // Max char width-to-height ratio allowed
342  // (set to segsearch_max_char_wh_ratio).
343  float max_char_wh_ratio_ = 0.0f;
344 
345  // The following variables are initialized with InitForWord().
346 
347  // String representation of the classification of the previous word
348  // (since this is only used by the character ngram model component,
349  // only the last language_model_ngram_order of the word are stored).
350  std::string prev_word_str_;
352  // Active dawg vector.
355  // Set to true if acceptable choice was discovered.
356  // Note: it would be nice to use this to terminate the search once an
357  // acceptable choices is found. However we do not do that and once an
358  // acceptable choice is found we finish looking for alternative choices
359  // in the current segmentation graph and then exit the search (no more
360  // classifications are done after an acceptable choice is found).
361  // This is needed in order to let the search find the words very close to
362  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
363  // choices. This way the stopper will know that the best choice is not
364  // ambiguous (i.e. there are best choices in the best choice list that have
365  // ratings close to the very best one) and will be less likely to mis-adapt.
367  // Set to true if a choice representing correct segmentation was explored.
369 
370  // Params models containing weights for for computing ViterbiStateEntry costs.
372 };
373 
374 } // namespace tesseract
375 
376 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:35
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ USER_DAWG_PERM
Definition: ratngs.h:242
@ FREQ_DAWG_PERM
Definition: ratngs.h:243
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:33
BOOL_VAR_H(language_model_ngram_space_delimited_language)
void SetAcceptableChoiceFound(bool val)
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
ParamsModel & getParamsModel()
BOOL_VAR_H(language_model_use_sigmoidal_certainty)
INT_VAR_H(language_model_viterbi_list_max_num_prunable)
DawgPositionVector beginning_active_dawgs_
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kXhtConsistentFlag
INT_VAR_H(language_model_ngram_order)
INT_VAR_H(language_model_viterbi_list_max_size)
double_VAR_H(language_model_penalty_font)
float ComputeAdjustment(int num_problems, float penalty)
static const LanguageModelFlagsType kSmallestRatingFlag
double_VAR_H(language_model_penalty_case)
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
static const LanguageModelFlagsType kDigitFlag
double_VAR_H(language_model_penalty_non_freq_dict_word)
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
bool AcceptablePath(const ViterbiStateEntry &vse)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
INT_VAR_H(wordrec_display_segmentations)
double_VAR_H(language_model_ngram_scale_factor)
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
static const LanguageModelFlagsType kLowerCaseFlag
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
double_VAR_H(language_model_penalty_increment)
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
static const float kMaxAvgNgramCost
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
double_VAR_H(language_model_ngram_small_prob)
double_VAR_H(language_model_penalty_punc)
BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step)
double_VAR_H(language_model_ngram_rating_factor)
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
double_VAR_H(language_model_penalty_chartype)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
BOOL_VAR_H(language_model_ngram_on)
DawgPositionVector very_beginning_active_dawgs_
static const LanguageModelFlagsType kUpperCaseFlag
double_VAR_H(language_model_penalty_non_dict_word)
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
double_VAR_H(language_model_penalty_script)
float CertaintyScore(float cert)
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
const UnicityTable< FontInfo > * fontinfo_table_
double_VAR_H(language_model_penalty_spacing)
double_VAR_H(language_model_ngram_nonmatch_score)
INT_VAR_H(language_model_min_compound_length)
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
INT_VAR_H(language_model_debug_level)
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:170
AssociateStats associate_stats
character widths/gaps/seams
Definition: lm_state.h:192
int length
number of characters on the path
Definition: lm_state.h:189
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:174
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:196
Struct to store information maintained by various language model components.
Definition: lm_state.h:204
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:226