tesseract  5.0.0
output.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: output.cpp (Formerly output.c)
3  * Description: Output pass
4  * Author: Phil Cheatle
5  *
6  * (C) Copyright 1994, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "output.h"
20 
21 #include "control.h"
22 #include "tesseractclass.h"
23 #include "tessvars.h"
24 #ifndef DISABLED_LEGACY_ENGINE
25 # include "docqual.h"
26 # include "reject.h"
27 #endif
28 
29 #include "helpers.h"
30 
31 #include <cctype>
32 #include <cerrno>
33 #include <cstring>
34 
// Control characters returned by determine_newline_type().
#define CTRL_NEWLINE '\x0a'  // line feed (LF)
#define CTRL_HARDLINE '\x0d' // carriage return (CR)
37 
38 namespace tesseract {
39 void Tesseract::output_pass( // Tess output pass //send to api
40  PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
41  BLOCK_RES *block_of_last_word;
42  bool force_eol; // During output
43  BLOCK *nextblock; // block of next word
44  WERD *nextword; // next word
45 
46  page_res_it.restart_page();
47  block_of_last_word = nullptr;
48  while (page_res_it.word() != nullptr) {
49  check_debug_pt(page_res_it.word(), 120);
50 
51  if (target_word_box) {
52  TBOX current_word_box = page_res_it.word()->word->bounding_box();
53  FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
54  (current_word_box.bottom() + current_word_box.top()) / 2);
55  if (!target_word_box->contains(center_pt)) {
56  page_res_it.forward();
57  continue;
58  }
59  }
60  if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
61  block_of_last_word = page_res_it.block();
62  }
63 
64  force_eol =
65  (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
66  (page_res_it.next_word() == nullptr);
67 
68  if (page_res_it.next_word() != nullptr) {
69  nextword = page_res_it.next_word()->word;
70  } else {
71  nextword = nullptr;
72  }
73  if (page_res_it.next_block() != nullptr) {
74  nextblock = page_res_it.next_block()->block;
75  } else {
76  nextblock = nullptr;
77  }
78  // regardless of tilde crunching
79  write_results(page_res_it,
80  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
81  nextword, nextblock),
82  force_eol);
83  page_res_it.forward();
84  }
85 }
86 
87 /*************************************************************************
88  * write_results()
89  *
90  * All recognition and rejection has now been done. Generate the following:
91  * .txt file - giving the final best choices with NO highlighting
92  * .raw file - giving the tesseract top choice output for each word
93  * .map file - showing how the .txt file has been rejected in the .ep file
94  * epchoice list - a list of one element per word, containing the text for the
95  * epaper. Reject strings are inserted.
96  * inset list - a list of bounding boxes of reject insets - indexed by the
97  * reject strings in the epchoice text.
98  *************************************************************************/
100  char newline_type, // type of newline
101  bool force_eol) { // override tilde crunch?
102  WERD_RES *word = page_res_it.word();
103  const UNICHARSET &uchset = *word->uch_set;
104  bool need_reject = false;
105  UNICHAR_ID space = uchset.unichar_to_id(" ");
106 
107  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
108  !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
109  if ((word->unlv_crunch_mode != CR_DELETE) &&
110  (!stats_.tilde_crunch_written ||
111  ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
112  !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
113  if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
114  !word->word->flag(W_FUZZY_SP)) {
115  stats_.last_char_was_tilde = false;
116  }
117  need_reject = true;
118  }
119  if ((need_reject && !stats_.last_char_was_tilde) ||
120  (force_eol && stats_.write_results_empty_block)) {
121  /* Write a reject char - mark as rejected unless zero_rejection mode */
122  stats_.last_char_was_tilde = true;
123  stats_.tilde_crunch_written = true;
124  stats_.last_char_was_newline = false;
125  stats_.write_results_empty_block = false;
126  }
127 
128  if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
129  stats_.tilde_crunch_written = false;
130  stats_.last_char_was_newline = true;
131  stats_.last_char_was_tilde = false;
132  }
133 
134  if (force_eol) {
135  stats_.write_results_empty_block = true;
136  }
137  return;
138  }
139 
140  /* NORMAL PROCESSING of non tilde crunched words */
141 
142  stats_.tilde_crunch_written = false;
143  if (newline_type) {
144  stats_.last_char_was_newline = true;
145  } else {
146  stats_.last_char_was_newline = false;
147  }
148  stats_.write_results_empty_block = force_eol; // about to write a real word
149 
150  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
151  !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
152  (word->best_choice->unichar_id(0) == space)) {
153  /* Prevent adjacent tilde across words - we know that adjacent tildes within
154  words have been removed */
155  word->MergeAdjacentBlobs(0);
156  }
157  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
158  stats_.last_char_was_tilde = false;
159  } else {
160  if (word->reject_map.length() > 0) {
161  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
162  stats_.last_char_was_tilde = true;
163  } else {
164  stats_.last_char_was_tilde = false;
165  }
166  } else if (word->word->space() > 0) {
167  stats_.last_char_was_tilde = false;
168  }
169  /* else it is unchanged as there are no output chars */
170  }
171 
172  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
173 
174  set_unlv_suspects(word);
175  check_debug_pt(word, 120);
176  if (tessedit_rejection_debug) {
177  tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
178  dict_word(*(word->best_choice)));
179  }
180  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
181  if (tessedit_zero_rejection) {
182  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
183  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
184  if (word->reject_map[i].rejected()) {
185  word->reject_map[i].setrej_minimal_rej_accept();
186  }
187  }
188  }
189  if (tessedit_minimal_rejection) {
190  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
191  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
192  if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
193  word->reject_map[i].setrej_minimal_rej_accept();
194  }
195  }
196  }
197  }
198 }
199 
200 /**********************************************************************
201  * determine_newline_type
202  *
203  * Find whether we have a wrapping or hard newline.
204  * Return false if not at end of line.
205  **********************************************************************/
206 
207 char determine_newline_type( // test line ends
208  WERD *word, // word to do
209  BLOCK *block, // current block
210  WERD *next_word, // next word
211  BLOCK *next_block // block of next word
212 ) {
213  int16_t end_gap; // to right edge
214  int16_t width; // of next word
215  TBOX word_box; // bounding
216  TBOX next_box; // next word
217  TBOX block_box; // block bounding
218 
219  if (!word->flag(W_EOL)) {
220  return false; // not end of line
221  }
222  if (next_word == nullptr || next_block == nullptr || block != next_block) {
223  return CTRL_NEWLINE;
224  }
225  if (next_word->space() > 0) {
226  return CTRL_HARDLINE; // it is tabbed
227  }
228  word_box = word->bounding_box();
229  next_box = next_word->bounding_box();
230  block_box = block->pdblk.bounding_box();
231  // gap to eol
232  end_gap = block_box.right() - word_box.right();
233  end_gap -= static_cast<int32_t>(block->space());
234  width = next_box.right() - next_box.left();
235  // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
236  // block_box.right(),word_box.right(),end_gap,
237  // next_box.right(),next_box.left(),width,
238  // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
239  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
240 }
241 
242 /*************************************************************************
243  * get_rep_char()
244  * Return the first accepted character from the repetition string. This is the
245  * character which is repeated - as determined earlier by fix_rep_char()
246  *************************************************************************/
247 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
248  int i;
249  for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
250  ;
251  }
252 
253  if (i < word->reject_map.length()) {
254  return word->best_choice->unichar_id(i);
255  } else {
256  return word->uch_set->unichar_to_id(unrecognised_char.c_str());
257  }
258 }
259 
260 /*************************************************************************
261  * SUSPECT LEVELS
262  *
263  * 0 - don't reject ANYTHING
264  * 1,2 - partial rejection
265  * 3 - BEST
266  *
267  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
268  * tessedit_minimal_rejection.
269  *************************************************************************/
271  int len = word_res->reject_map.length();
272  const WERD_CHOICE &word = *(word_res->best_choice);
273  const UNICHARSET &uchset = *word.unicharset();
274  int i;
275  float rating_per_ch;
276 
277  if (suspect_level == 0) {
278  for (i = 0; i < len; i++) {
279  if (word_res->reject_map[i].rejected()) {
280  word_res->reject_map[i].setrej_minimal_rej_accept();
281  }
282  }
283  return;
284  }
285 
286  if (suspect_level >= 3) {
287  return; // Use defaults
288  }
289 
290  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
291 
292  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
293  /* Unreject alphas in dictionary words */
294  for (i = 0; i < len; ++i) {
295  if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
296  word_res->reject_map[i].setrej_minimal_rej_accept();
297  }
298  }
299  }
300 
301  rating_per_ch = word.rating() / word_res->reject_map.length();
302 
303  if (rating_per_ch >= suspect_rating_per_ch) {
304  return; // Don't touch bad ratings
305  }
306 
307  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
308  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
309  for (i = 0; i < len; ++i) {
310  if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
311  word_res->reject_map[i].setrej_minimal_rej_accept();
312  }
313  }
314  }
315 
316  for (i = 0; i < len; i++) {
317  if (word_res->reject_map[i].rejected()) {
318  if (word_res->reject_map[i].flag(R_DOC_REJ)) {
319  word_res->reject_map[i].setrej_minimal_rej_accept();
320  }
321  if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
322  word_res->reject_map[i].setrej_minimal_rej_accept();
323  }
324  if (word_res->reject_map[i].flag(R_ROW_REJ)) {
325  word_res->reject_map[i].setrej_minimal_rej_accept();
326  }
327  }
328  }
329 
330  if (suspect_level == 2) {
331  return;
332  }
333 
334  if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
335  for (i = 0; i < len; i++) {
336  if (word_res->reject_map[i].rejected()) {
337  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
338  word_res->reject_map[i].flag(R_POSTNN_1IL))) {
339  word_res->reject_map[i].setrej_minimal_rej_accept();
340  }
341 
342  if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
343  word_res->reject_map[i].setrej_minimal_rej_accept();
344  }
345  }
346  }
347  }
348 
349  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
350  word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
351  acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
352  if (word_res->reject_map.length() > suspect_short_words) {
353  for (i = 0; i < len; i++) {
354  if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
355  word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
356  word_res->reject_map[i].flag(R_POSTNN_1IL) ||
357  word_res->reject_map[i].flag(R_MM_REJECT))) {
358  word_res->reject_map[i].setrej_minimal_rej_accept();
359  }
360  }
361  }
362  }
363 }
364 
365 int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
366  int count = 0;
367  for (unsigned i = 0; i < word.length(); ++i) {
368  if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
369  count++;
370  }
371  }
372  return count;
373 }
374 
376  int count = 0;
377  for (unsigned i = 0; i < word.length(); ++i) {
378  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
379  word.unicharset()->get_isdigit(word.unichar_id(i))) {
380  count++;
381  }
382  }
383  return count;
384 }
385 
386 bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
387  bool prev_digit = false;
388 
389  if (*lengths == 1 && *s == '(') {
390  s++;
391  }
392 
393  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
394  s++;
395  }
396 
397  for (; *s != '\0'; s += *(lengths++)) {
398  if (unicharset.get_isdigit(s, *lengths)) {
399  prev_digit = true;
400  } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
401  prev_digit = false;
402  } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
403  ((*s == '%') || (*s == ')'))) {
404  return true;
405  } else if (prev_digit && *lengths == 1 && (*s == '%') &&
406  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
407  (*(s + *lengths + *(lengths + 1)) == '\0')) {
408  return true;
409  } else {
410  return false;
411  }
412  }
413  return true;
414 }
415 } // namespace tesseract
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
#define CTRL_NEWLINE
Definition: output.cpp:35
#define CTRL_HARDLINE
Definition: output.cpp:36
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_REP_CHAR
repeated character
Definition: werd.h:40
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
@ R_ROW_REJ
Definition: rejctmap.h:81
@ R_DOC_REJ
Definition: rejctmap.h:79
@ R_BLOCK_REJ
Definition: rejctmap.h:80
@ R_POSTNN_1IL
Definition: rejctmap.h:57
@ R_MM_REJECT
Definition: rejctmap.h:59
@ R_1IL_CONFLICT
Definition: rejctmap.h:56
@ CR_NONE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:207
int UNICHAR_ID
Definition: unichar.h:36
bool acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:386
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:375
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:270
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1704
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:39
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593
int16_t count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:365
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
Definition: output.cpp:99
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1811
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:247
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
int16_t space() const
return spacing
Definition: ocrblock.h:93
WERD_CHOICE * best_choice
Definition: pageres.h:239
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
const UNICHARSET * uch_set
Definition: pageres.h:201
void MergeAdjacentBlobs(unsigned index)
Definition: pageres.cpp:1005
WERD_RES * next_word() const
Definition: pageres.h:772
BLOCK_RES * next_block() const
Definition: pageres.h:778
WERD_RES * restart_page()
Definition: pageres.h:710
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
std::string debug_string() const
Definition: ratngs.h:475
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
bool empty() const
Definition: ratngs.h:280
const std::string & unichar_lengths() const
Definition: ratngs.h:529
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
unsigned length() const
Definition: ratngs.h:283
float rating() const
Definition: ratngs.h:308
std::string & unichar_string()
Definition: ratngs.h:515
TDimension left() const
Definition: rect.h:82
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
bool contains(const FCOORD pt) const
Definition: rect.h:344
uint16_t length() const
Definition: rejctmap.h:333
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
TBOX bounding_box() const
Definition: werd.cpp:155
uint8_t space() const
Definition: werd.h:100
UNICHARSET unicharset
Definition: ccutil.h:61
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:713
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:86