tesseract  5.0.0
tesseract::LMPainPoints Class Reference

#include <lm_pain_points.h>

Public Member Functions

 LMPainPoints (int max, float rat, bool fp, const Dict *d, int deb)
 
 ~LMPainPoints ()=default
 
bool HasPainPoints (LMPainPointsType pp_type) const
 
LMPainPointsType Deque (MATRIX_COORD *pp, float *priority)
 
void Clear ()
 
void GenerateInitial (WERD_RES *word_res)
 
void GenerateFromPath (float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
 
void GenerateFromAmbigs (const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
 
bool GeneratePainPoint (int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
 
void RemapForSplit (int index)
 

Static Public Member Functions

static const char * PainPointDescription (LMPainPointsType type)
 

Static Public Attributes

static const float kDefaultPainPointPriorityAdjustment = 2.0f
 
static const float kLooseMaxCharWhRatio = 2.5f
 

Detailed Description

Definition at line 55 of file lm_pain_points.h.

Constructor & Destructor Documentation

◆ LMPainPoints()

tesseract::LMPainPoints::LMPainPoints ( int  max,
float  rat,
bool  fp,
const Dict *  d,
int  deb 
)
inline

Definition at line 67 of file lm_pain_points.h.

68  : max_heap_size_(max)
69  , max_char_wh_ratio_(rat)
70  , fixed_pitch_(fp)
71  , dict_(d)
72  , debug_level_(deb) {}

◆ ~LMPainPoints()

tesseract::LMPainPoints::~LMPainPoints ( )
default

Member Function Documentation

◆ Clear()

void tesseract::LMPainPoints::Clear ( )
inline

Definition at line 86 of file lm_pain_points.h.

86  {
87  for (auto &pain_points_heap : pain_points_heaps_) {
88  pain_points_heap.clear();
89  }
90  }

◆ Deque()

LMPainPointsType tesseract::LMPainPoints::Deque ( MATRIX_COORD *  pp,
float *  priority 
)

Definition at line 39 of file lm_pain_points.cpp.

39  {
40  for (int h = 0; h < LM_PPTYPE_NUM; ++h) {
41  if (pain_points_heaps_[h].empty()) {
42  continue;
43  }
44  *priority = pain_points_heaps_[h].PeekTop().key();
45  *pp = pain_points_heaps_[h].PeekTop().data();
46  pain_points_heaps_[h].Pop(nullptr);
47  return static_cast<LMPainPointsType>(h);
48  }
49  return LM_PPTYPE_NUM;
50 }
bool Pop(Pair *entry)
Definition: genericheap.h:120
const Pair & PeekTop() const
Definition: genericheap.h:108
Key & key()
Definition: kdpair.h:47
Data & data()
Definition: kdpair.h:41

◆ GenerateFromAmbigs()

void tesseract::LMPainPoints::GenerateFromAmbigs ( const DANGERR &  fixpt,
ViterbiStateEntry *  vse,
WERD_RES *  word_res 
)

Definition at line 129 of file lm_pain_points.cpp.

130  {
131  // Begins and ends in DANGERR vector now record the blob indices as used
132  // by the ratings matrix.
133  for (auto danger : fixpt) {
134  // Only use dangerous ambiguities.
135  if (danger.dangerous) {
136  GeneratePainPoint(danger.begin, danger.end - 1, LM_PPTYPE_AMBIG, vse->cost, true,
137  kLooseMaxCharWhRatio, word_res);
138  }
139  }
140 }
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
static const float kLooseMaxCharWhRatio

◆ GenerateFromPath()

void tesseract::LMPainPoints::GenerateFromPath ( float  rating_cert_scale,
ViterbiStateEntry *  vse,
WERD_RES *  word_res 
)

Definition at line 72 of file lm_pain_points.cpp.

73  {
74  ViterbiStateEntry *curr_vse = vse;
75  BLOB_CHOICE *curr_b = vse->curr_b;
76  // The following pain point generation and priority calculation approaches
77  // prioritize exploring paths with low average rating of the known part of
78  // the path, while not relying on the ratings of the pieces to be combined.
79  //
80  // A pain point to combine the neighbors is generated for each pair of
81  // neighboring blobs on the path (the path is represented by vse argument
82  // given to GenerateFromPath()). The priority of each pain point is set to
83  // the average rating (per outline length) of the path, not including the
84  // ratings of the blobs to be combined.
85  // The ratings of the blobs to be combined are not used to calculate the
86  // priority, since it is not possible to determine from their magnitude
87  // whether it will be beneficial to combine the blobs. The reason is that
88  // chopped junk blobs (/ | - ') can have very good (low) ratings, however
89  // combining them will be beneficial. Blobs with high ratings might be
90  // over-joined pieces of characters, but also could be blobs from an unseen
91  // font or chopped pieces of complex characters.
92  while (curr_vse->parent_vse != nullptr) {
93  ViterbiStateEntry *parent_vse = curr_vse->parent_vse;
94  const MATRIX_COORD &curr_cell = curr_b->matrix_cell();
95  const MATRIX_COORD &parent_cell = parent_vse->curr_b->matrix_cell();
96  MATRIX_COORD pain_coord(parent_cell.col, curr_cell.row);
97  if (!pain_coord.Valid(*word_res->ratings) ||
98  !word_res->ratings->Classified(parent_cell.col, curr_cell.row, dict_->WildcardID())) {
99  // rat_subtr contains ratings sum of the two adjacent blobs to be merged.
100  // rat_subtr will be subtracted from the ratings sum of the path, since
101  // the blobs will be joined into a new blob, whose rating is yet unknown.
102  float rat_subtr = curr_b->rating() + parent_vse->curr_b->rating();
103  // ol_subtr contains the outline length of the blobs that will be joined.
104  float ol_subtr =
105  AssociateUtils::ComputeOutlineLength(rating_cert_scale, *curr_b) +
106  AssociateUtils::ComputeOutlineLength(rating_cert_scale, *(parent_vse->curr_b));
107  // ol_dif is the outline of the path without the two blobs to be joined.
108  float ol_dif = vse->outline_length - ol_subtr;
109  // priority is set to the average rating of the path per unit of outline,
110  // not counting the ratings of the pieces to be joined.
111  float priority = ol_dif > 0 ? (vse->ratings_sum - rat_subtr) / ol_dif : 0.0;
112  GeneratePainPoint(pain_coord.col, pain_coord.row, LM_PPTYPE_PATH, priority, true,
113  max_char_wh_ratio_, word_res);
114  } else if (debug_level_ > 3) {
115  tprintf("NO pain point (Classified) for col=%d row=%d type=%s\n", pain_coord.col,
116  pain_coord.row, LMPainPointsTypeName[LM_PPTYPE_PATH]);
117  BLOB_CHOICE_IT b_it(word_res->ratings->get(pain_coord.col, pain_coord.row));
118  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
119  BLOB_CHOICE *choice = b_it.data();
120  choice->print_full();
121  }
122  }
123 
124  curr_vse = parent_vse;
125  curr_b = curr_vse->curr_b;
126  }
127 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
UNICHAR_ID WildcardID() const
Definition: dict.h:377
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:84

◆ GenerateInitial()

void tesseract::LMPainPoints::GenerateInitial ( WERD_RES *  word_res)

Definition at line 52 of file lm_pain_points.cpp.

52  {
53  MATRIX *ratings = word_res->ratings;
54  AssociateStats associate_stats;
55  for (int col = 0; col < ratings->dimension(); ++col) {
56  int row_end = std::min(ratings->dimension(), col + ratings->bandwidth() + 1);
57  for (int row = col + 1; row < row_end; ++row) {
58  MATRIX_COORD coord(col, row);
59  if (coord.Valid(*ratings) && ratings->get(col, row) != NOT_CLASSIFIED) {
60  continue;
61  }
62  // Add an initial pain point if needed.
63  if (ratings->Classified(col, row - 1, dict_->WildcardID()) ||
64  (col + 1 < ratings->dimension() &&
65  ratings->Classified(col + 1, row, dict_->WildcardID()))) {
66  GeneratePainPoint(col, row, LM_PPTYPE_SHAPE, 0.0, true, max_char_wh_ratio_, word_res);
67  }
68  }
69  }
70 }
#define NOT_CLASSIFIED
Definition: matrix.h:45

◆ GeneratePainPoint()

bool tesseract::LMPainPoints::GeneratePainPoint ( int  col,
int  row,
LMPainPointsType  pp_type,
float  special_priority,
bool  ok_to_extend,
float  max_char_wh_ratio,
WERD_RES *  word_res 
)

Definition at line 142 of file lm_pain_points.cpp.

144  {
145  MATRIX_COORD coord(col, row);
146  if (coord.Valid(*word_res->ratings) &&
147  word_res->ratings->Classified(col, row, dict_->WildcardID())) {
148  return false;
149  }
150  if (debug_level_ > 3) {
151  tprintf("Generating pain point for col=%d row=%d type=%s\n", col, row,
152  LMPainPointsTypeName[pp_type]);
153  }
154  // Compute associate stats.
155  AssociateStats associate_stats;
156  AssociateUtils::ComputeStats(col, row, nullptr, 0, fixed_pitch_, max_char_wh_ratio, word_res,
157  debug_level_, &associate_stats);
158  // For fixed-pitch fonts/languages: if the current combined blob overlaps
159  // the next blob on the right and it is ok to extend the blob, try extending
160  // the blob until there is no overlap with the next blob on the right or
161  // until the width-to-height ratio becomes too large.
162  if (ok_to_extend) {
163  while (associate_stats.bad_fixed_pitch_right_gap && row + 1 < word_res->ratings->dimension() &&
164  !associate_stats.bad_fixed_pitch_wh_ratio) {
165  AssociateUtils::ComputeStats(col, ++row, nullptr, 0, fixed_pitch_, max_char_wh_ratio,
166  word_res, debug_level_, &associate_stats);
167  }
168  }
169  if (associate_stats.bad_shape) {
170  if (debug_level_ > 3) {
171  tprintf("Discarded pain point with a bad shape\n");
172  }
173  return false;
174  }
175 
176  // Insert the new pain point into pain_points_heap_.
177  if (pain_points_heaps_[pp_type].size() < max_heap_size_) {
178  // Compute pain point priority.
179  float priority;
180  if (pp_type == LM_PPTYPE_PATH) {
181  priority = special_priority;
182  } else {
183  priority = associate_stats.gap_sum;
184  }
185  MatrixCoordPair pain_point(priority, MATRIX_COORD(col, row));
186  pain_points_heaps_[pp_type].Push(&pain_point);
187  if (debug_level_) {
188  tprintf("Added pain point with priority %g\n", priority);
189  }
190  return true;
191  } else {
192  if (debug_level_) {
193  tprintf("Pain points heap is full\n");
194  }
195  return false;
196  }
197 }
KDPairInc< float, MATRIX_COORD > MatrixCoordPair
Definition: matrix.h:724
void Push(Pair *entry)
Definition: genericheap.h:95
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:33

◆ HasPainPoints()

bool tesseract::LMPainPoints::HasPainPoints ( LMPainPointsType  pp_type) const
inline

Definition at line 76 of file lm_pain_points.h.

76  {
77  return !pain_points_heaps_[pp_type].empty();
78  }
bool empty() const
Definition: genericheap.h:68

◆ PainPointDescription()

static const char* tesseract::LMPainPoints::PainPointDescription ( LMPainPointsType  type)
inlinestatic

Definition at line 63 of file lm_pain_points.h.

63  {
64  return LMPainPointsTypeName[type];
65  }

◆ RemapForSplit()

void tesseract::LMPainPoints::RemapForSplit ( int  index)

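Adjusts the pain point coordinates to cope with expansion of the ratings matrix due to a split of the blob with the given index. Note: in the listing below, the inner loop iterates over the heap entries by value (`for (auto entry : heap)`), so `MapForSplit` is applied to a copy of each entry rather than to the entry stored in the heap; iterating by reference (`auto &entry`) is required for the stored coordinates to actually be remapped.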

Definition at line 203 of file lm_pain_points.cpp.

203  {
204  for (auto &pain_points_heap : pain_points_heaps_) {
205  std::vector<MatrixCoordPair> &heap = pain_points_heap.heap();
206  for (auto entry : heap) {
207  entry.data().MapForSplit(index);
208  }
209  }
210 }

Member Data Documentation

◆ kDefaultPainPointPriorityAdjustment

const float tesseract::LMPainPoints::kDefaultPainPointPriorityAdjustment = 2.0f
static

Definition at line 57 of file lm_pain_points.h.

◆ kLooseMaxCharWhRatio

const float tesseract::LMPainPoints::kLooseMaxCharWhRatio = 2.5f
static

Definition at line 61 of file lm_pain_points.h.


The documentation for this class was generated from the following files: