tesseract  5.0.0
rejctmap.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: rejctmap.h (Formerly rejmap.h)
3  * Description: REJ and REJMAP class functions.
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1994, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17 
18 This module may look unnecessarily verbose, but here's the philosophy...
19 
20 ALL processing of the reject map is done in this module. There are lots of
21 separate calls to set reject/accept flags. These have DELIBERATELY been kept
22 distinct so that this module can decide what to do.
23 
24 Basically, there is a flag for each sort of rejection or acceptance. This
25 provides a history of what has happened to EACH character.
26 
27 Determining whether a character is CURRENTLY rejected depends on implicit
28 understanding of the SEQUENCE of possible calls. The flags are defined and
29 grouped in the REJ_FLAGS enum. These groupings are used in determining a
30 character's CURRENT rejection status. Basically, a character is ACCEPTED if
31 
32  none of the permanent rej flags are set
33  AND ( the character has never been rejected
34  OR an accept flag is set which is LATER than the latest reject flag )
35 
36 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
37 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
38 **********************************************************************/
39 
40 #ifndef REJCTMAP_H
41 #define REJCTMAP_H
42 
43 #include "errcode.h"
44 #include "params.h"
45 
46 #include <bitset>
47 #include <memory>
48 
49 namespace tesseract {
50 
// Per-character reject/accept history flags.
//
// The enumerators are used as bit positions in REJ::flags, so their order and
// values must stay stable. They are grouped by the processing stage that sets
// them; REJ's predicates rely on this grouping to decide whether a character
// is CURRENTLY rejected.
enum REJ_FLAGS {
  /* Reject modes which are NEVER overridden */
  R_TESS_FAILURE,   // PERM Tess didn't classify
  R_SMALL_XHT,      // PERM Xht too small
  R_EDGE_CHAR,      // PERM Too close to edge of image
  R_1IL_CONFLICT,   // PERM 1Il confusion
  R_POSTNN_1IL,     // PERM 1Il unrejected by NN
  R_REJ_CBLOB,      // PERM Odd blob
  R_MM_REJECT,      // PERM Matrix match rejection (m's)
  R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend
                    // (labelled TEMP, but it sits in the never-overridden
                    // group and REJ::perm_rejected() tests it)

  /* Initial reject modes (pre NN_ACCEPT) */
  R_POOR_MATCH,        // TEMP Ray's original heuristic (Not used)
  R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD
  R_CONTAINS_BLANKS,   // TEMP Tess failed on other chs in WERD
  R_BAD_PERMUTER,      // POTENTIAL Bad permuter for WERD

  /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */
  R_HYPHEN,       // TEMP Post NN dodgy hyphen or full stop
  R_DUBIOUS,      // TEMP Post NN dodgy chars
  R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN
  R_MOSTLY_REJ,   // TEMP Most of word rejected so rej the rest
  R_XHT_FIXUP,    // TEMP Xht tests unsure

  /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */
  R_BAD_QUALITY, // TEMP Quality metrics bad for WERD

  /* Reject modes generated after QUALITY_ACCEPT but before
     MINIMAL_REJ_ACCEPT */
  R_DOC_REJ,   // TEMP Document rejection
  R_BLOCK_REJ, // TEMP Block rejection
  R_ROW_REJ,   // TEMP Row rejection
  R_UNLV_REJ,  // TEMP ~ turned to - or ^ turned to space

  /* Accept modes which occur between the above rejection groups */
  R_NN_ACCEPT,         // NN acceptance
  R_HYPHEN_ACCEPT,     // Hyphen acceptance
  R_MM_ACCEPT,         // Matrix match acceptance
  R_QUALITY_ACCEPT,    // Accept word in good quality doc
  R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures
};

/* REJECT MAP VALUES */
// Single-character codes returned by REJ::display_char().

#define MAP_ACCEPT '1'
#define MAP_REJECT_PERM '0'
#define MAP_REJECT_TEMP '2'
#define MAP_REJECT_POTENTIAL '3'

// Reject/accept history of a single character.
//
// Each setrej_*() call records one event by setting the matching REJ_FLAGS
// bit; bits are never cleared. The current status is derived from the set of
// bits by the predicates below, which encode the implied temporal order of
// the reject and accept stages.
class REJ {
  // One bit per REJ_FLAGS enumerator.
  std::bitset<32> flags;

  // Guard against the enum outgrowing the bitset.
  static_assert(R_MINIMAL_REJ_ACCEPT < 32,
                "REJ_FLAGS must fit in std::bitset<32>");

  void set_flag(REJ_FLAGS rej_flag) {
    flags.set(rej_flag);
  }

public:
  REJ() = default;

  // Member-wise copy; the class holds only the bitset.
  REJ(const REJ &source) = default;

  REJ &operator=(const REJ &source) = default;

  // Returns true if the given history flag has been set.
  bool flag(REJ_FLAGS rej_flag) const {
    return flags[rej_flag];
  }

  // Returns the MAP_* code for this character. Permanent rejection is tested
  // first, then potential rejection, then any other rejection.
  char display_char() const {
    if (perm_rejected()) {
      return MAP_REJECT_PERM;
    } else if (accept_if_good_quality()) {
      return MAP_REJECT_POTENTIAL;
    } else if (rejected()) {
      return MAP_REJECT_TEMP;
    } else {
      return MAP_ACCEPT;
    }
  }

  // Is char perm reject? True if any flag of the never-overridden group is
  // set.
  bool perm_rejected() const {
    return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) ||
            flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) ||
            flag(R_BAD_REPETITION) || flag(R_MM_REJECT));
  }

private:
  // Any of the initial (pre NN_ACCEPT) reject flags set?
  bool rej_before_nn_accept() const {
    return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) ||
           flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER);
  }

  // Any reject flag from the stage after NN_ACCEPT but before MM_ACCEPT set?
  bool rej_between_nn_and_mm() const {
    return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) ||
           flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP);
  }

  // Any reject flag from the stage after MM_ACCEPT but before QUALITY_ACCEPT
  // set?
  bool rej_between_mm_and_quality_accept() const {
    return flag(R_BAD_QUALITY);
  }

  // Any reject flag from the stage after QUALITY_ACCEPT but before
  // MINIMAL_REJ_ACCEPT set?
  bool rej_between_quality_and_minimal_rej_accept() const {
    return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) ||
           flag(R_UNLV_REJ);
  }

  // Rejected as things stand just before MM_ACCEPT: either rejected after the
  // NN stage, or rejected initially and not rescued by the NN or hyphen
  // accept.
  bool rej_before_mm_accept() const {
    return rej_between_nn_and_mm() ||
           (rej_before_nn_accept() && !flag(R_NN_ACCEPT) &&
            !flag(R_HYPHEN_ACCEPT));
  }

  // Rejected as things stand just before QUALITY_ACCEPT: either rejected after
  // the MM stage, or rejected earlier and not rescued by MM_ACCEPT.
  bool rej_before_quality_accept() const {
    return rej_between_mm_and_quality_accept() ||
           (!flag(R_MM_ACCEPT) && rej_before_mm_accept());
  }

public:
  // Is char rejected?
  // Note: once R_MINIMAL_REJ_ACCEPT is set this returns false
  // unconditionally, i.e. even when perm_rejected() is true.
  bool rejected() const {
    if (flag(R_MINIMAL_REJ_ACCEPT)) {
      return false;
    } else {
      return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() ||
              (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept()));
    }
  }

  // Potential rej? True when the character is currently rejected,
  // R_BAD_PERMUTER is set, and no permanent flag, no other initial reject
  // flag and no later-stage reject flag is set.
  bool accept_if_good_quality() const {
    return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) &&
            !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) &&
            !flag(R_CONTAINS_BLANKS) &&
            (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() &&
             !rej_between_quality_and_minimal_rej_accept()));
  }

  void setrej_tess_failure() { // Tess generated blank
    set_flag(R_TESS_FAILURE);
  }

  void setrej_small_xht() { // Small xht char/wd
    set_flag(R_SMALL_XHT);
  }

  void setrej_edge_char() { // Close to image edge
    set_flag(R_EDGE_CHAR);
  }

  void setrej_1Il_conflict() { // Initial reject map
    set_flag(R_1IL_CONFLICT);
  }

  void setrej_postNN_1Il() { // 1Il after NN
    set_flag(R_POSTNN_1IL);
  }

  void setrej_rej_cblob() { // Insert duff blob
    set_flag(R_REJ_CBLOB);
  }

  void setrej_mm_reject() { // Matrix matcher
    set_flag(R_MM_REJECT);
  }

  void setrej_bad_repetition() { // Odd repeated char
    set_flag(R_BAD_REPETITION);
  }

  void setrej_poor_match() { // Failed Rays heuristic
    set_flag(R_POOR_MATCH);
  }

  void setrej_not_tess_accepted() {
    // TEMP reject_word
    set_flag(R_NOT_TESS_ACCEPTED);
  }

  void setrej_contains_blanks() {
    // TEMP reject_word
    set_flag(R_CONTAINS_BLANKS);
  }

  void setrej_bad_permuter() { // POTENTIAL reject_word
    set_flag(R_BAD_PERMUTER);
  }

  void setrej_hyphen() { // PostNN dubious hyphen or .
    set_flag(R_HYPHEN);
  }

  void setrej_dubious() { // PostNN dubious limit
    set_flag(R_DUBIOUS);
  }

  void setrej_no_alphanums() { // TEMP reject_word
    set_flag(R_NO_ALPHANUMS);
  }

  void setrej_mostly_rej() { // TEMP reject_word
    set_flag(R_MOSTLY_REJ);
  }

  void setrej_xht_fixup() { // xht fixup
    set_flag(R_XHT_FIXUP);
  }

  void setrej_bad_quality() { // TEMP reject_word
    set_flag(R_BAD_QUALITY);
  }

  void setrej_doc_rej() { // TEMP reject_word
    set_flag(R_DOC_REJ);
  }

  void setrej_block_rej() { // TEMP reject_word
    set_flag(R_BLOCK_REJ);
  }

  void setrej_row_rej() { // TEMP reject_word
    set_flag(R_ROW_REJ);
  }

  void setrej_unlv_rej() { // TEMP reject_word
    set_flag(R_UNLV_REJ);
  }

  void setrej_hyphen_accept() { // Hyphen acceptance
    set_flag(R_HYPHEN_ACCEPT);
  }

  void setrej_nn_accept() { // NN Flipped a char
    set_flag(R_NN_ACCEPT);
  }

  void setrej_mm_accept() { // Matrix matcher
    set_flag(R_MM_ACCEPT);
  }

  void setrej_quality_accept() { // Quality flip a char
    set_flag(R_QUALITY_ACCEPT);
  }

  void setrej_minimal_rej_accept() {
    // Accept all except blank
    set_flag(R_MINIMAL_REJ_ACCEPT);
  }

  bool accepted() const { // Is char accepted?
    return !rejected();
  }

  // Currently rejected, but not by one of the permanent flags.
  bool recoverable() const {
    return (rejected() && !perm_rejected());
  }

  // Defined out of line (rejctmap.cpp).
  void full_print(FILE *fp) const;
};
309 
310 class REJMAP {
311  std::unique_ptr<REJ[]> ptr; // ptr to the chars
312  uint16_t len = 0; // Number of chars
313 
314 public:
315  REJMAP() = default;
316 
317  REJMAP(const REJMAP &rejmap) {
318  *this = rejmap;
319  }
320 
321  REJMAP &operator=(const REJMAP &source);
322 
323  // Sets up the ptr array to length, whatever it was before.
324  void initialise(uint16_t length);
325 
326  REJ &operator[]( // access function
327  uint16_t index) const // map index
328  {
329  ASSERT_HOST(index < len);
330  return ptr[index]; // no bounds checks
331  }
332 
333  uint16_t length() const { // map length
334  return len;
335  }
336 
337  int16_t accept_count() const; // How many accepted?
338 
339  int16_t reject_count() const { // How many rejects?
340  return len - accept_count();
341  }
342 
343  // Cut out an element.
344  void remove_pos(uint16_t pos);
345 
346  void print(FILE *fp) const;
347 
348  void full_print(FILE *fp) const;
349 
350  bool recoverable_rejects() const; // Any non perm rejs?
351 
352  bool quality_recoverable_rejects() const;
353  // Any potential rejs?
354 
355  void rej_word_small_xht(); // Reject whole word
356  // Reject whole word
357  void rej_word_tess_failure();
359  // Reject whole word
360  // Reject whole word
362  // Reject whole word
363  void rej_word_bad_permuter();
364  void rej_word_xht_fixup(); // Reject whole word
365  // Reject whole word
366  void rej_word_no_alphanums();
367  void rej_word_mostly_rej(); // Reject whole word
368  void rej_word_bad_quality(); // Reject whole word
369  void rej_word_doc_rej(); // Reject whole word
370  void rej_word_block_rej(); // Reject whole word
371  void rej_word_row_rej(); // Reject whole word
372 };
373 
374 } // namespace tesseract
375 
376 #endif
#define MAP_ACCEPT
Definition: rejctmap.h:94
#define MAP_REJECT_POTENTIAL
Definition: rejctmap.h:97
#define MAP_REJECT_PERM
Definition: rejctmap.h:95
#define MAP_REJECT_TEMP
Definition: rejctmap.h:96
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ R_MINIMAL_REJ_ACCEPT
Definition: rejctmap.h:89
@ R_ROW_REJ
Definition: rejctmap.h:81
@ R_NO_ALPHANUMS
Definition: rejctmap.h:71
@ R_TESS_FAILURE
Definition: rejctmap.h:53
@ R_QUALITY_ACCEPT
Definition: rejctmap.h:88
@ R_DOC_REJ
Definition: rejctmap.h:79
@ R_MM_ACCEPT
Definition: rejctmap.h:87
@ R_MOSTLY_REJ
Definition: rejctmap.h:72
@ R_XHT_FIXUP
Definition: rejctmap.h:73
@ R_POOR_MATCH
Definition: rejctmap.h:63
@ R_SMALL_XHT
Definition: rejctmap.h:54
@ R_BAD_PERMUTER
Definition: rejctmap.h:66
@ R_BAD_REPETITION
Definition: rejctmap.h:60
@ R_BLOCK_REJ
Definition: rejctmap.h:80
@ R_HYPHEN_ACCEPT
Definition: rejctmap.h:86
@ R_HYPHEN
Definition: rejctmap.h:69
@ R_CONTAINS_BLANKS
Definition: rejctmap.h:65
@ R_POSTNN_1IL
Definition: rejctmap.h:57
@ R_REJ_CBLOB
Definition: rejctmap.h:58
@ R_NOT_TESS_ACCEPTED
Definition: rejctmap.h:64
@ R_BAD_QUALITY
Definition: rejctmap.h:76
@ R_UNLV_REJ
Definition: rejctmap.h:82
@ R_NN_ACCEPT
Definition: rejctmap.h:85
@ R_DUBIOUS
Definition: rejctmap.h:70
@ R_MM_REJECT
Definition: rejctmap.h:59
@ R_1IL_CONFLICT
Definition: rejctmap.h:56
@ R_EDGE_CHAR
Definition: rejctmap.h:55
void setrej_1Il_conflict()
Definition: rejctmap.h:200
void setrej_mm_reject()
Definition: rejctmap.h:212
void setrej_rej_cblob()
Definition: rejctmap.h:208
void setrej_doc_rej()
Definition: rejctmap.h:262
void setrej_tess_failure()
Definition: rejctmap.h:188
void setrej_hyphen()
Definition: rejctmap.h:238
bool rejected() const
Definition: rejctmap.h:171
void setrej_small_xht()
Definition: rejctmap.h:192
bool recoverable() const
Definition: rejctmap.h:303
bool perm_rejected() const
Definition: rejctmap.h:133
bool flag(REJ_FLAGS rej_flag) const
Definition: rejctmap.h:117
void setrej_mm_accept()
Definition: rejctmap.h:286
REJ & operator=(const REJ &source)=default
void full_print(FILE *fp) const
Definition: rejctmap.cpp:27
void setrej_xht_fixup()
Definition: rejctmap.h:254
bool accepted() const
Definition: rejctmap.h:299
void setrej_minimal_rej_accept()
Definition: rejctmap.h:294
void setrej_unlv_rej()
Definition: rejctmap.h:274
void setrej_nn_accept()
Definition: rejctmap.h:282
void setrej_postNN_1Il()
Definition: rejctmap.h:204
void setrej_poor_match()
Definition: rejctmap.h:220
void setrej_block_rej()
Definition: rejctmap.h:266
void setrej_contains_blanks()
Definition: rejctmap.h:229
void setrej_no_alphanums()
Definition: rejctmap.h:246
void setrej_row_rej()
Definition: rejctmap.h:270
void setrej_not_tess_accepted()
Definition: rejctmap.h:224
void setrej_mostly_rej()
Definition: rejctmap.h:250
void setrej_bad_permuter()
Definition: rejctmap.h:234
void setrej_bad_repetition()
Definition: rejctmap.h:216
REJ()=default
void setrej_bad_quality()
Definition: rejctmap.h:258
void setrej_quality_accept()
Definition: rejctmap.h:290
void setrej_edge_char()
Definition: rejctmap.h:196
void setrej_dubious()
Definition: rejctmap.h:242
char display_char() const
Definition: rejctmap.h:121
REJ(const REJ &source)
Definition: rejctmap.h:109
bool accept_if_good_quality() const
Definition: rejctmap.h:180
void setrej_hyphen_accept()
Definition: rejctmap.h:278
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:139
void print(FILE *fp) const
Definition: rejctmap.cpp:112
int16_t reject_count() const
Definition: rejctmap.h:339
void rej_word_tess_failure()
Definition: rejctmap.cpp:133
void rej_word_contains_blanks()
Definition: rejctmap.cpp:147
void rej_word_small_xht()
Definition: rejctmap.cpp:127
void rej_word_bad_quality()
Definition: rejctmap.cpp:187
void rej_word_xht_fixup()
Definition: rejctmap.cpp:163
void rej_word_row_rej()
Definition: rejctmap.cpp:211
void remove_pos(uint16_t pos)
Definition: rejctmap.cpp:100
int16_t accept_count() const
Definition: rejctmap.cpp:72
REJ & operator[](uint16_t index) const
Definition: rejctmap.h:326
REJMAP(const REJMAP &rejmap)
Definition: rejctmap.h:317
uint16_t length() const
Definition: rejctmap.h:333
void rej_word_block_rej()
Definition: rejctmap.cpp:203
bool quality_recoverable_rejects() const
Definition: rejctmap.cpp:91
void initialise(uint16_t length)
Definition: rejctmap.cpp:67
void rej_word_bad_permuter()
Definition: rejctmap.cpp:155
REJMAP & operator=(const REJMAP &source)
Definition: rejctmap.cpp:59
void rej_word_no_alphanums()
Definition: rejctmap.cpp:171
void rej_word_doc_rej()
Definition: rejctmap.cpp:195
bool recoverable_rejects() const
Definition: rejctmap.cpp:82
void full_print(FILE *fp) const
Definition: rejctmap.cpp:120
void rej_word_mostly_rej()
Definition: rejctmap.cpp:179