tesseract  5.0.0
lm_pain_points.h
Go to the documentation of this file.
1 // File: lm_pain_points.h
3 // Description: Functions that utilize the knowledge about the properties
4 // of the paths explored by the segmentation search in order
5 // to generate "pain points" - the locations in the ratings
6 // matrix which should be classified next.
7 // Author: Rika Antonova
8 //
9 // (C) Copyright 2012, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
23 #define TESSERACT_WORDREC_PAIN_POINTS_H_
24 
25 #include "genericheap.h" // for GenericHeap
26 #include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair
27 #include "stopper.h" // for DANGERR
28 
29 namespace tesseract {
30 
31 class Dict;
32 struct ViterbiStateEntry;
33 class WERD_RES;
34 
35 // Heap of pain points used for determining where to chop/join.
37 
38 // Types of pain points (ordered in the decreasing level of importance).
44 
46 };
47 
// Printable names of the pain point types. The table is indexed by an
// LMPainPointsType value (see LMPainPoints::PainPointDescription), so the
// entries must stay in the same order as the enumerators.
static const char *const LMPainPointsTypeName[] = {
    "LM_PPTYPE_BLAMER",
    "LM_PPTYPE_AMBIGS",
    "LM_PPTYPE_PATH",
    "LM_PPTYPE_SHAPE",
};
54 
55 class LMPainPoints {
56 public:
58  // If there is a significant drop in character ngram probability or a
59  // dangerous ambiguity make the thresholds on what blob combinations
60  // can be classified looser.
61  static const float kLooseMaxCharWhRatio;
62  // Returns a description of the type of a pain point.
63  static const char *PainPointDescription(LMPainPointsType type) {
64  return LMPainPointsTypeName[type];
65  }
66 
67  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
68  : max_heap_size_(max)
69  , max_char_wh_ratio_(rat)
70  , fixed_pitch_(fp)
71  , dict_(d)
72  , debug_level_(deb) {}
73  ~LMPainPoints() = default;
74 
75  // Returns true if the heap of pain points of pp_type is not empty().
76  inline bool HasPainPoints(LMPainPointsType pp_type) const {
77  return !pain_points_heaps_[pp_type].empty();
78  }
79 
80  // Dequeues the next pain point from the pain points queue and copies
81  // its contents and priority to *pp and *priority.
82  // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
83  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
84 
85  // Clears pain points heap.
86  void Clear() {
87  for (auto &pain_points_heap : pain_points_heaps_) {
88  pain_points_heap.clear();
89  }
90  }
91 
92  // For each cell, generate a "pain point" if the cell is not classified
93  // and has a left or right neighbor that was classified.
94  void GenerateInitial(WERD_RES *word_res);
95 
96  // Generate pain points from the given path.
97  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);
98 
99  // Generate pain points from dangerous ambiguities in best choice.
100  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);
101 
102  // Adds a pain point to classify chunks_record->ratings(col, row).
103  // Returns true if a new pain point was added to an appropriate heap.
104  // Pain point priority is set to special_priority for pain points of
105  // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points
106  // AssociateStats::gap_sum is used.
107  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,
108  bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res);
109 
110  // Adjusts the pain point coordinates to cope with expansion of the ratings
111  // matrix due to a split of the blob with the given index.
112  void RemapForSplit(int index);
113 
114 private:
115  // Priority queues containing pain points generated by the language model
116  // The priority is set by the language model components, adjustments like
117  // seam cost and width priority are factored into the priority.
118  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
119  // Maximum number of points to keep in the heap.
120  int max_heap_size_;
121  // Maximum character width/height ratio.
122  float max_char_wh_ratio_;
123  // Set to true if fixed pitch should be assumed.
124  bool fixed_pitch_;
125  // Cached pointer to dictionary.
126  const Dict *dict_;
127  // Debug level for print statements.
128  int debug_level_;
129 };
130 
131 } // namespace tesseract
132 
133 #endif // TESSERACT_WORDREC_PAIN_POINTS_H_
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
bool empty() const
Definition: genericheap.h:68
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
void RemapForSplit(int index)
bool HasPainPoints(LMPainPointsType pp_type) const
LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
void GenerateInitial(WERD_RES *word_res)
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
static const float kLooseMaxCharWhRatio
static const char * PainPointDescription(LMPainPointsType type)
static const float kDefaultPainPointPriorityAdjustment
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)