tesseract  5.0.0
tordmain.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tordmain.cpp (Formerly textordp.c)
3  * Description: C++ top level textord code.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #define _USE_MATH_DEFINES // for M_PI
20 
21 #ifdef HAVE_CONFIG_H
22 # include "config_auto.h"
23 #endif
24 
25 #include "tordmain.h"
26 
27 #include "arrayaccess.h" // for GET_DATA_BYTE
28 #include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B...
29 #include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction
30 #include "clst.h" // for CLISTIZE
31 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE
32 #include "drawtord.h" // for plot_box_list, to_win, create_to_win
33 #include "edgblob.h" // for extract_edges
34 #include "errcode.h" // for ASSERT_HOST, ...
35 #include "makerow.h" // for textord_test_x, textord_test_y, texto...
36 #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
37 #include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base...
38 #include "params.h" // for DoubleParam, BoolParam, IntParam
39 #include "pdblock.h" // for PDBLK
40 #include "points.h" // for FCOORD, ICOORD
41 #include "polyblk.h" // for POLY_BLOCK
42 #include "quadratc.h" // for QUAD_COEFFS
43 #include "quspline.h" // for QSPLINE, tweak_row_baseline
44 #include "rect.h" // for TBOX
45 #include "scrollview.h" // for ScrollView, ScrollView::WHITE
46 #include "statistc.h" // for STATS
47 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
48 #include "textord.h" // for Textord, WordWithBox, WordGrid, WordS...
49 #include "tprintf.h" // for tprintf
50 #include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP
51 
52 #include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate
53 
54 #include <cfloat> // for FLT_MAX
55 #include <cmath> // for ceil, floor, M_PI
56 #include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t
57 #include <memory>
58 
59 namespace tesseract {
60 
61 #define MAX_NEAREST_DIST 600 // for block skew stats
62 
63 /**********************************************************************
64  * SetBlobStrokeWidth
65  *
66  * Set the horizontal and vertical stroke widths in the blob.
67  **********************************************************************/
68 void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob) {
69  // Cut the blob rectangle into a Pix.
70  int pix_height = pixGetHeight(pix);
71  const TBOX &box = blob->bounding_box();
72  int width = box.width();
73  int height = box.height();
74  Box *blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height);
75  Image pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr);
76  boxDestroy(&blob_pix_box);
77  Image dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
78  pix_blob.destroy();
79  // Compute the stroke widths.
80  uint32_t *data = pixGetData(dist_pix);
81  int wpl = pixGetWpl(dist_pix);
82  // Horizontal width of stroke.
83  STATS h_stats(0, width + 1);
84  for (int y = 0; y < height; ++y) {
85  uint32_t *pixels = data + y * wpl;
86  int prev_pixel = 0;
87  int pixel = GET_DATA_BYTE(pixels, 0);
88  for (int x = 1; x < width; ++x) {
89  int next_pixel = GET_DATA_BYTE(pixels, x);
90  // We are looking for a pixel that is equal to its vertical neighbours,
91  // yet greater than its left neighbour.
92  if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
93  (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
94  if (pixel > next_pixel) {
95  // Single local max, so an odd width.
96  h_stats.add(pixel * 2 - 1, 1);
97  } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) {
98  // Double local max, so an even width.
99  h_stats.add(pixel * 2, 1);
100  }
101  }
102  prev_pixel = pixel;
103  pixel = next_pixel;
104  }
105  }
106  // Vertical width of stroke.
107  STATS v_stats(0, height + 1);
108  for (int x = 0; x < width; ++x) {
109  int prev_pixel = 0;
110  int pixel = GET_DATA_BYTE(data, x);
111  for (int y = 1; y < height; ++y) {
112  uint32_t *pixels = data + y * wpl;
113  int next_pixel = GET_DATA_BYTE(pixels, x);
114  // We are looking for a pixel that is equal to its horizontal neighbours,
115  // yet greater than its upper neighbour.
116  if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
117  (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
118  if (pixel > next_pixel) {
119  // Single local max, so an odd width.
120  v_stats.add(pixel * 2 - 1, 1);
121  } else if (pixel == next_pixel && y + 1 < height &&
122  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
123  // Double local max, so an even width.
124  v_stats.add(pixel * 2, 1);
125  }
126  }
127  prev_pixel = pixel;
128  pixel = next_pixel;
129  }
130  }
131  dist_pix.destroy();
132  // Store the horizontal and vertical width in the blob, keeping both
133  // widths if there is enough information, otherwise only the one with
134  // the most samples.
135  // If there are insufficient samples, store zero, rather than using
136  // 2*area/perimeter, as the numbers that gives do not match the numbers
137  // from the distance method.
138  if (h_stats.get_total() >= (width + height) / 4) {
139  blob->set_horz_stroke_width(h_stats.ile(0.5f));
140  if (v_stats.get_total() >= (width + height) / 4) {
141  blob->set_vert_stroke_width(v_stats.ile(0.5f));
142  } else {
143  blob->set_vert_stroke_width(0.0f);
144  }
145  } else {
146  if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) {
147  blob->set_horz_stroke_width(0.0f);
148  blob->set_vert_stroke_width(v_stats.ile(0.5f));
149  } else {
150  blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f);
151  blob->set_vert_stroke_width(0.0f);
152  }
153  }
154 }
155 
156 /**********************************************************************
157  * assign_blobs_to_blocks2
158  *
159  * Make a list of TO_BLOCKs for portrait and landscape orientation.
160  **********************************************************************/
161 
163  BLOCK_LIST *blocks, // blocks to process
164  TO_BLOCK_LIST *port_blocks) { // output list
165  BLOCK_IT block_it = blocks;
166  C_BLOB_IT blob_it; // iterator
167  BLOBNBOX_IT port_box_it; // iterator
168  // destination iterator
169  TO_BLOCK_IT port_block_it = port_blocks;
170 
171  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
172  auto block = block_it.data();
173  auto port_block = new TO_BLOCK(block);
174 
175  // Convert the good outlines to block->blob_list
176  port_box_it.set_to_list(&port_block->blobs);
177  blob_it.set_to_list(block->blob_list());
178  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
179  auto blob = blob_it.extract();
180  auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
181  newblob->set_owns_cblob(true);
182  SetBlobStrokeWidth(pix, newblob);
183  port_box_it.add_after_then_move(newblob);
184  }
185 
186  // Put the rejected outlines in block->noise_blobs, which allows them to
187  // be reconsidered and sorted back into rows and recover outlines mistakenly
188  // rejected.
189  port_box_it.set_to_list(&port_block->noise_blobs);
190  blob_it.set_to_list(block->reject_blobs());
191  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
192  auto blob = blob_it.extract();
193  auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
194  newblob->set_owns_cblob(true);
195  SetBlobStrokeWidth(pix, newblob);
196  port_box_it.add_after_then_move(newblob);
197  }
198 
199  port_block_it.add_after_then_move(port_block);
200  }
201 }
202 
203 /**********************************************************************
204  * find_components
205  *
206  * Find the C_OUTLINEs of the connected components in each block, put them
207  * in C_BLOBs, and filter them by size, putting the different size
208  * grades on different lists in the matching TO_BLOCK in to_blocks.
209  **********************************************************************/
210 
211 void Textord::find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {
212  int width = pixGetWidth(pix);
213  int height = pixGetHeight(pix);
214  if (width > INT16_MAX || height > INT16_MAX) {
215  tprintf("Input image too large! (%d, %d)\n", width, height);
216  return; // Can't handle it.
217  }
218 
219  BLOCK_IT block_it(blocks); // iterator
220  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
221  BLOCK *block = block_it.data();
222  if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) {
223  extract_edges(pix, block);
224  }
225  }
226 
227  assign_blobs_to_blocks2(pix, blocks, to_blocks);
228  ICOORD page_tr(width, height);
229  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
230 }
231 
232 /**********************************************************************
233  * filter_blobs
234  *
235  * Sort the blobs into sizes in all the blocks for later work.
236  **********************************************************************/
237 
238 void Textord::filter_blobs(ICOORD page_tr, // top right
239  TO_BLOCK_LIST *blocks, // output list
240  bool testing_on) { // for plotting
241  TO_BLOCK_IT block_it = blocks; // destination iterator
242  TO_BLOCK *block; // created block
243 
244 #ifndef GRAPHICS_DISABLED
245  if (to_win != nullptr) {
246  to_win->Clear();
247  }
248 #endif // !GRAPHICS_DISABLED
249 
250  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
251  block = block_it.data();
252  block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs,
253  &block->large_blobs);
254  if (block->line_size == 0) {
255  block->line_size = 1;
256  }
257  block->line_spacing =
258  block->line_size *
264 
265 #ifndef GRAPHICS_DISABLED
266  if (textord_show_blobs && testing_on) {
267  if (to_win == nullptr) {
268  create_to_win(page_tr);
269  }
270  block->plot_graded_blobs(to_win);
271  }
272  if (textord_show_boxes && testing_on) {
273  if (to_win == nullptr) {
274  create_to_win(page_tr);
275  }
280  }
281 #endif // !GRAPHICS_DISABLED
282  }
283 }
284 
285 /**********************************************************************
286  * filter_noise_blobs
287  *
288  * Move small blobs to a separate list.
289  **********************************************************************/
290 
291 float Textord::filter_noise_blobs(BLOBNBOX_LIST *src_list, // original list
292  BLOBNBOX_LIST *noise_list, // noise list
293  BLOBNBOX_LIST *small_list, // small blobs
294  BLOBNBOX_LIST *large_list) { // large blobs
295  int16_t height; // height of blob
296  int16_t width; // of blob
297  BLOBNBOX *blob; // current blob
298  float initial_x; // first guess
299  BLOBNBOX_IT src_it = src_list; // iterators
300  BLOBNBOX_IT noise_it = noise_list;
301  BLOBNBOX_IT small_it = small_list;
302  BLOBNBOX_IT large_it = large_list;
303  STATS size_stats(0, MAX_NEAREST_DIST);
304  // blob heights
305  float min_y; // size limits
306  float max_y;
307  float max_x;
308  float max_height; // of good blobs
309 
310  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
311  blob = src_it.data();
312  if (blob->bounding_box().height() < textord_max_noise_size) {
313  noise_it.add_after_then_move(src_it.extract());
314  } else if (blob->enclosed_area() >= blob->bounding_box().height() *
315  blob->bounding_box().width() *
316  textord_noise_area_ratio) {
317  small_it.add_after_then_move(src_it.extract());
318  }
319  }
320  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
321  size_stats.add(src_it.data()->bounding_box().height(), 1);
322  }
323  initial_x = size_stats.ile(textord_initialx_ile);
324  max_y = ceil(initial_x *
328  min_y = std::floor(initial_x / 2);
329  max_x = ceil(initial_x * textord_width_limit);
330  small_it.move_to_first();
331  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
332  height = small_it.data()->bounding_box().height();
333  if (height > max_y) {
334  large_it.add_after_then_move(small_it.extract());
335  } else if (height >= min_y) {
336  src_it.add_after_then_move(small_it.extract());
337  }
338  }
339  size_stats.clear();
340  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
341  height = src_it.data()->bounding_box().height();
342  width = src_it.data()->bounding_box().width();
343  if (height < min_y) {
344  small_it.add_after_then_move(src_it.extract());
345  } else if (height > max_y || width > max_x) {
346  large_it.add_after_then_move(src_it.extract());
347  } else {
348  size_stats.add(height, 1);
349  }
350  }
351  max_height = size_stats.ile(textord_initialasc_ile);
352  // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
353  // max_y,min_y,initial_x,max_height);
355  if (max_height > initial_x) {
356  initial_x = max_height;
357  }
358  // tprintf(" ret=%g\n",initial_x);
359  return initial_x;
360 }
361 
362 // Fixes the block so it obeys all the rules:
363 // Must have at least one ROW.
364 // Must have at least one WERD.
365 // WERDs contain a fake blob.
366 void Textord::cleanup_nontext_block(BLOCK *block) {
367  // Non-text blocks must contain at least one row.
368  ROW_IT row_it(block->row_list());
369  if (row_it.empty()) {
370  const TBOX &box = block->pdblk.bounding_box();
371  float height = box.height();
372  int32_t xstarts[2] = {box.left(), box.right()};
373  double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
374  ROW *row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1);
375  row_it.add_after_then_move(row);
376  }
377  // Each row must contain at least one word.
378  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
379  ROW *row = row_it.data();
380  WERD_IT w_it(row->word_list());
381  if (w_it.empty()) {
382  // Make a fake blob to put in the word.
383  TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box();
384  C_BLOB *blob = C_BLOB::FakeBlob(box);
385  C_BLOB_LIST blobs;
386  C_BLOB_IT blob_it(&blobs);
387  blob_it.add_after_then_move(blob);
388  WERD *word = new WERD(&blobs, 0, nullptr);
389  w_it.add_after_then_move(word);
390  }
391  // Each word must contain a fake blob.
392  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
393  WERD *word = w_it.data();
394  // Just assert that this is true, as it would be useful to find
395  // out why it isn't.
396  ASSERT_HOST(!word->cblob_list()->empty());
397  }
398  row->recalc_bounding_box();
399  }
400 }
401 
402 /**********************************************************************
403  * cleanup_blocks
404  *
405  * Delete empty blocks, rows from the page.
406  **********************************************************************/
407 
408 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
409  BLOCK_IT block_it = blocks; // iterator
410  ROW_IT row_it; // row iterator
411 
412  int num_rows = 0;
413  int num_rows_all = 0;
414  int num_blocks = 0;
415  int num_blocks_all = 0;
416  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
417  BLOCK *block = block_it.data();
418  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
419  cleanup_nontext_block(block);
420  continue;
421  }
422  num_rows = 0;
423  num_rows_all = 0;
424  if (clean_noise) {
425  row_it.set_to_list(block->row_list());
426  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
427  ROW *row = row_it.data();
428  ++num_rows_all;
429  clean_small_noise_from_words(row);
430  if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) ||
431  row->word_list()->empty()) {
432  delete row_it.extract(); // lose empty row.
433  } else {
434  if (textord_noise_rejwords) {
435  clean_noise_from_words(row_it.data());
436  }
437  if (textord_blshift_maxshift >= 0) {
438  tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction);
439  }
440  ++num_rows;
441  }
442  }
443  }
444  if (block->row_list()->empty()) {
445  delete block_it.extract(); // Lose empty text blocks.
446  } else {
447  ++num_blocks;
448  }
449  ++num_blocks_all;
450  if (textord_noise_debug) {
451  tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
452  }
453  }
454  if (textord_noise_debug) {
455  tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
456  }
457 }
458 
459 /**********************************************************************
460  * clean_noise_from_row
461  *
462  * Move blobs of words from rows of garbage into the reject blobs list.
463  **********************************************************************/
464 
465 bool Textord::clean_noise_from_row( // remove empties
466  ROW *row // row to clean
467 ) {
468  bool testing_on;
469  TBOX blob_box; // bounding box
470  C_BLOB *blob; // current blob
471  C_OUTLINE *outline; // current outline
472  WERD *word; // current word
473  int32_t blob_size; // biggest size
474  int32_t trans_count = 0; // no of transitions
475  int32_t trans_threshold; // noise tolerance
476  int32_t dot_count; // small objects
477  int32_t norm_count; // normal objects
478  int32_t super_norm_count; // real char-like
479  // words of row
480  WERD_IT word_it = row->word_list();
481  C_BLOB_IT blob_it; // blob iterator
482  C_OUTLINE_IT out_it; // outline iterator
483 
484  testing_on = textord_test_y > row->base_line(textord_test_x) && textord_show_blobs &&
485  textord_test_y < row->base_line(textord_test_x) + row->x_height();
486  dot_count = 0;
487  norm_count = 0;
488  super_norm_count = 0;
489  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
490  word = word_it.data(); // current word
491  // blobs in word
492  blob_it.set_to_list(word->cblob_list());
493  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
494  blob = blob_it.data();
495  if (!word->flag(W_DONT_CHOP)) {
496  // get outlines
497  out_it.set_to_list(blob->out_list());
498  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
499  outline = out_it.data();
500  blob_box = outline->bounding_box();
501  blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
502  if (blob_size < textord_noise_sizelimit * row->x_height()) {
503  dot_count++; // count small outlines
504  }
505  if (!outline->child()->empty() &&
506  blob_box.height() < (1 + textord_noise_syfract) * row->x_height() &&
507  blob_box.height() > (1 - textord_noise_syfract) * row->x_height() &&
508  blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() &&
509  blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) {
510  super_norm_count++; // count small outlines
511  }
512  }
513  } else {
514  super_norm_count++;
515  }
516  blob_box = blob->bounding_box();
517  blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
518  if (blob_size >= textord_noise_sizelimit * row->x_height() &&
519  blob_size < row->x_height() * 2) {
520  trans_threshold = blob_size / textord_noise_sizefraction;
521  trans_count = blob->count_transitions(trans_threshold);
522  if (trans_count < textord_noise_translimit) {
523  norm_count++;
524  }
525  } else if (blob_box.height() > row->x_height() * 2 &&
526  (!word_it.at_first() || !blob_it.at_first())) {
527  dot_count += 2;
528  }
529  if (testing_on) {
530  tprintf("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left(),
531  blob_box.bottom(), blob_box.right(), blob_box.top(), blob->out_list()->length(),
532  trans_count, blob_box.bottom() - row->base_line(blob_box.left()));
533  }
534  }
535  }
536  if (textord_noise_debug) {
537  tprintf("Row ending at (%d,%g):", blob_box.right(), row->base_line(blob_box.right()));
538  tprintf(" R=%g, dc=%d, nc=%d, %s\n",
539  norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999, dot_count,
540  norm_count,
541  dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED"
542  : "ACCEPTED");
543  }
544  return super_norm_count < textord_noise_sncount &&
545  dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
546 }
547 
548 /**********************************************************************
549  * clean_noise_from_words
550  *
551  * Move blobs of words from rows of garbage into the reject blobs list.
552  **********************************************************************/
553 
554 void Textord::clean_noise_from_words( // remove empties
555  ROW *row // row to clean
556 ) {
557  TBOX blob_box; // bounding box
558  C_BLOB *blob; // current blob
559  C_OUTLINE *outline; // current outline
560  WERD *word; // current word
561  int32_t blob_size; // biggest size
562  int32_t trans_count; // no of transitions
563  int32_t trans_threshold; // noise tolerance
564  int32_t dot_count; // small objects
565  int32_t norm_count; // normal objects
566  int32_t dud_words; // number discarded
567  int32_t ok_words; // number remaining
568  int32_t word_index; // current word
569  // words of row
570  WERD_IT word_it = row->word_list();
571  C_BLOB_IT blob_it; // blob iterator
572  C_OUTLINE_IT out_it; // outline iterator
573 
574  ok_words = word_it.length();
575  if (ok_words == 0 || textord_no_rejects) {
576  return;
577  }
578  // was it chucked
579  std::vector<int8_t> word_dud(ok_words);
580  dud_words = 0;
581  ok_words = 0;
582  word_index = 0;
583  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
584  word = word_it.data(); // current word
585  dot_count = 0;
586  norm_count = 0;
587  // blobs in word
588  blob_it.set_to_list(word->cblob_list());
589  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
590  blob = blob_it.data();
591  if (!word->flag(W_DONT_CHOP)) {
592  // get outlines
593  out_it.set_to_list(blob->out_list());
594  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
595  outline = out_it.data();
596  blob_box = outline->bounding_box();
597  blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
598  if (blob_size < textord_noise_sizelimit * row->x_height()) {
599  dot_count++; // count small outlines
600  }
601  if (!outline->child()->empty() &&
602  blob_box.height() < (1 + textord_noise_syfract) * row->x_height() &&
603  blob_box.height() > (1 - textord_noise_syfract) * row->x_height() &&
604  blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() &&
605  blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) {
606  norm_count++; // count small outlines
607  }
608  }
609  } else {
610  norm_count++;
611  }
612  blob_box = blob->bounding_box();
613  blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
614  if (blob_size >= textord_noise_sizelimit * row->x_height() &&
615  blob_size < row->x_height() * 2) {
616  trans_threshold = blob_size / textord_noise_sizefraction;
617  trans_count = blob->count_transitions(trans_threshold);
618  if (trans_count < textord_noise_translimit) {
619  norm_count++;
620  }
621  } else if (blob_box.height() > row->x_height() * 2 &&
622  (!word_it.at_first() || !blob_it.at_first())) {
623  dot_count += 2;
624  }
625  }
626  if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
627  if (dot_count > norm_count * textord_noise_normratio * 2) {
628  word_dud[word_index] = 2;
629  } else if (dot_count > norm_count * textord_noise_normratio) {
630  word_dud[word_index] = 1;
631  } else {
632  word_dud[word_index] = 0;
633  }
634  } else {
635  word_dud[word_index] = 0;
636  }
637  if (word_dud[word_index] == 2) {
638  dud_words++;
639  } else {
640  ok_words++;
641  }
642  word_index++;
643  }
644 
645  word_index = 0;
646  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
647  if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) {
648  word = word_it.data(); // Current word.
649  // Previously we threw away the entire word.
650  // Now just aggressively throw all small blobs into the reject list, where
651  // the classifier can decide whether they are actually needed.
652  word->CleanNoise(textord_noise_sizelimit * row->x_height());
653  }
654  word_index++;
655  }
656 }
657 
658 // Remove outlines that are a tiny fraction in either width or height
659 // of the word height.
660 void Textord::clean_small_noise_from_words(ROW *row) {
661  WERD_IT word_it(row->word_list());
662  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
663  WERD *word = word_it.data();
664  int min_size = static_cast<int>(textord_noise_hfract * word->bounding_box().height() + 0.5);
665  C_BLOB_IT blob_it(word->cblob_list());
666  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
667  C_BLOB *blob = blob_it.data();
668  C_OUTLINE_IT out_it(blob->out_list());
669  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
670  C_OUTLINE *outline = out_it.data();
671  outline->RemoveSmallRecursive(min_size, &out_it);
672  }
673  if (blob->out_list()->empty()) {
674  delete blob_it.extract();
675  }
676  }
677  if (word->cblob_list()->empty()) {
678  if (!word_it.at_last()) {
679  // The next word is no longer a fuzzy non space if it was before,
680  // since the word before is about to be deleted.
681  WERD *next_word = word_it.data_relative(1);
682  if (next_word->flag(W_FUZZY_NON)) {
683  next_word->set_flag(W_FUZZY_NON, false);
684  }
685  }
686  delete word_it.extract();
687  }
688  }
689 }
690 
691 // Local struct to hold a group of blocks.
692 struct BlockGroup {
693  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
694  explicit BlockGroup(BLOCK *block)
695  : bounding_box(block->pdblk.bounding_box())
696  , rotation(block->re_rotation())
697  , angle(block->re_rotation().angle())
698  , min_xheight(block->x_height()) {
699  blocks.push_back(block);
700  }
701  // Union of block bounding boxes.
703  // Common rotation of the blocks.
705  // Angle of rotation.
706  float angle;
707  // Min xheight of the blocks.
708  float min_xheight;
709  // Collection of borrowed pointers to the blocks in the group.
710  std::vector<BLOCK *> blocks;
711 };
712 
713 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
714 // TransferDiacriticsToWords to copy the diacritic blobs to the most
715 // appropriate words in the group of blocks. Source blobs are not touched.
716 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks) {
717  // Angle difference larger than this is too much to consider equal.
718  // They should only be in multiples of M_PI/2 anyway.
719  const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
720  std::vector<std::unique_ptr<BlockGroup>> groups;
721  BLOCK_IT bk_it(blocks);
722  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
723  BLOCK *block = bk_it.data();
724  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
725  continue;
726  }
727  // Linear search of the groups to find a matching rotation.
728  float block_angle = block->re_rotation().angle();
729  int best_g = 0;
730  float best_angle_diff = FLT_MAX;
731  for (const auto &group : groups) {
732  double angle_diff = std::fabs(block_angle - group->angle);
733  if (angle_diff > M_PI) {
734  angle_diff = fabs(angle_diff - 2.0 * M_PI);
735  }
736  if (angle_diff < best_angle_diff) {
737  best_angle_diff = angle_diff;
738  best_g = &group - &groups[0];
739  }
740  }
741  if (best_angle_diff > kMaxAngleDiff) {
742  groups.push_back(std::make_unique<BlockGroup>(block));
743  } else {
744  groups[best_g]->blocks.push_back(block);
745  groups[best_g]->bounding_box += block->pdblk.bounding_box();
746  float x_height = block->x_height();
747  if (x_height < groups[best_g]->min_xheight) {
748  groups[best_g]->min_xheight = x_height;
749  }
750  }
751  }
752  // Now process each group of blocks.
753  std::vector<std::unique_ptr<WordWithBox>> word_ptrs;
754  for (const auto &group : groups) {
755  if (group->bounding_box.null_box()) {
756  continue;
757  }
758  WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
759  group->bounding_box.topright());
760  for (auto b : group->blocks) {
761  ROW_IT row_it(b->row_list());
762  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
763  ROW *row = row_it.data();
764  // Put the words of the row into the grid.
765  WERD_IT w_it(row->word_list());
766  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
767  WERD *word = w_it.data();
768  auto box_word = std::make_unique<WordWithBox>(word);
769  word_grid.InsertBBox(true, true, box_word.get());
770  // Save the pointer where it will be auto-deleted.
771  word_ptrs.emplace_back(std::move(box_word));
772  }
773  }
774  }
775  FCOORD rotation = group->rotation;
776  // Make it a forward rotation that will transform blob coords to block.
777  rotation.set_y(-rotation.y());
778  TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
779  }
780 }
781 
782 // Places a copy of blobs that are near a word (after applying rotation to the
783 // blob) in the most appropriate word, unless there is doubt, in which case a
784 // blob can end up in two words. Source blobs are not touched.
785 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation,
786  WordGrid *word_grid) {
787  WordSearch ws(word_grid);
788  BLOBNBOX_IT b_it(diacritic_blobs);
789  // Apply rotation to each blob before finding the nearest words. The rotation
790  // allows us to only consider above/below placement and not left/right on
791  // vertical text, because all text is horizontal here.
792  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
793  BLOBNBOX *blobnbox = b_it.data();
794  TBOX blob_box = blobnbox->bounding_box();
795  blob_box.rotate(rotation);
796  ws.StartRectSearch(blob_box);
797  // Above/below refer to word position relative to diacritic. Since some
798  // scripts eg Kannada/Telugu habitually put diacritics below words, and
799  // others eg Thai/Vietnamese/Latin put most diacritics above words, try
800  // for both if there isn't much in it.
801  WordWithBox *best_above_word = nullptr;
802  WordWithBox *best_below_word = nullptr;
803  int best_above_distance = 0;
804  int best_below_distance = 0;
805  for (WordWithBox *word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) {
806  if (word->word()->flag(W_REP_CHAR)) {
807  continue;
808  }
809  TBOX word_box = word->true_bounding_box();
810  int x_distance = blob_box.x_gap(word_box);
811  int y_distance = blob_box.y_gap(word_box);
812  if (x_distance > 0) {
813  // Arbitrarily divide x-distance by 2 if there is a major y overlap,
814  // and the word is to the left of the diacritic. If the
815  // diacritic is a dropped broken character between two words, this will
816  // help send all the pieces to a single word, instead of splitting them
817  // over the 2 words.
818  if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) {
819  x_distance /= 2;
820  }
821  y_distance += x_distance;
822  }
823  if (word_box.y_middle() > blob_box.y_middle() &&
824  (best_above_word == nullptr || y_distance < best_above_distance)) {
825  best_above_word = word;
826  best_above_distance = y_distance;
827  }
828  if (word_box.y_middle() <= blob_box.y_middle() &&
829  (best_below_word == nullptr || y_distance < best_below_distance)) {
830  best_below_word = word;
831  best_below_distance = y_distance;
832  }
833  }
834  bool above_good = best_above_word != nullptr &&
835  (best_below_word == nullptr ||
836  best_above_distance < best_below_distance + blob_box.height());
837  bool below_good = best_below_word != nullptr && best_below_word != best_above_word &&
838  (best_above_word == nullptr ||
839  best_below_distance < best_above_distance + blob_box.height());
840  if (below_good) {
841  C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
842  copied_blob->rotate(rotation);
843  // Put the blob into the word's reject blobs list.
844  C_BLOB_IT blob_it(best_below_word->RejBlobs());
845  blob_it.add_to_end(copied_blob);
846  }
847  if (above_good) {
848  C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
849  copied_blob->rotate(rotation);
850  // Put the blob into the word's reject blobs list.
851  C_BLOB_IT blob_it(best_above_word->RejBlobs());
852  blob_it.add_to_end(copied_blob);
853  }
854  }
855 }
856 
857 /**********************************************************************
858  * tweak_row_baseline
859  *
860  * Shift baseline to fit the blobs more accurately where they are
861  * close enough.
862  **********************************************************************/
863 
864 void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) {
865  TBOX blob_box; // bounding box
866  C_BLOB *blob; // current blob
867  WERD *word; // current word
868  int32_t blob_count; // no of blobs
869  int32_t src_index; // source segment
870  int32_t dest_index; // destination segment
871  float ydiff; // baseline error
872  float x_centre; // centre of blob
873  // words of row
874  WERD_IT word_it = row->word_list();
875  C_BLOB_IT blob_it; // blob iterator
876 
877  blob_count = 0;
878  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
879  word = word_it.data(); // current word
880  // get total blobs
881  blob_count += word->cblob_list()->length();
882  }
883  if (blob_count == 0) {
884  return;
885  }
886  // spline segments
887  std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1);
888  // spline coeffs
889  std::vector<double> coeffs((blob_count + row->baseline.segments) * 3);
890 
891  src_index = 0;
892  dest_index = 0;
893  xstarts[0] = row->baseline.xcoords[0];
894  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
895  word = word_it.data(); // current word
896  // blobs in word
897  blob_it.set_to_list(word->cblob_list());
898  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
899  blob = blob_it.data();
900  blob_box = blob->bounding_box();
901  x_centre = (blob_box.left() + blob_box.right()) / 2.0;
902  ydiff = blob_box.bottom() - row->base_line(x_centre);
903  if (ydiff < 0) {
904  ydiff = -ydiff / row->x_height();
905  } else {
906  ydiff = ydiff / row->x_height();
907  }
908  if (ydiff < blshift_maxshift && blob_box.height() / row->x_height() > blshift_xfraction) {
909  if (xstarts[dest_index] >= x_centre) {
910  xstarts[dest_index] = blob_box.left();
911  }
912  coeffs[dest_index * 3] = 0;
913  coeffs[dest_index * 3 + 1] = 0;
914  coeffs[dest_index * 3 + 2] = blob_box.bottom();
915  // shift it
916  dest_index++;
917  xstarts[dest_index] = blob_box.right() + 1;
918  } else {
919  if (xstarts[dest_index] <= x_centre) {
920  while (row->baseline.xcoords[src_index + 1] <= x_centre &&
921  src_index < row->baseline.segments - 1) {
922  if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) {
923  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
924  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
925  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
926  dest_index++;
927  xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
928  }
929  src_index++;
930  }
931  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
932  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
933  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
934  dest_index++;
935  xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
936  }
937  }
938  }
939  }
940  while (src_index < row->baseline.segments &&
941  row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) {
942  src_index++;
943  }
944  while (src_index < row->baseline.segments) {
945  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
946  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
947  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
948  dest_index++;
949  src_index++;
950  xstarts[dest_index] = row->baseline.xcoords[src_index];
951  }
952  // turn to spline
953  row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]);
954 }
955 
956 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define MAX_NEAREST_DIST
Definition: tordmain.cpp:61
@ TBOX
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
@ W_REP_CHAR
repeated character
Definition: werd.h:40
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordGrid
Definition: textord.h:73
int textord_test_y
Definition: makerow.cpp:65
int textord_test_x
Definition: makerow.cpp:64
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
double textord_excess_blobsize
Definition: makerow.cpp:81
@ baseline
Definition: mfoutline.h:53
ScrollView * to_win
Definition: drawtord.cpp:37
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:69
void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob)
Definition: tordmain.cpp:68
bool textord_test_landscape
Definition: makerow.cpp:52
double textord_min_linesize
Definition: makerow.cpp:80
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:47
double textord_width_limit
Definition: makerow.cpp:75
void assign_blobs_to_blocks2(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:162
void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction)
Definition: tordmain.cpp:864
void extract_edges(Image pix, BLOCK *block)
Definition: edgblob.cpp:347
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:74
const TBOX & bounding_box() const
Definition: blobbox.h:239
int32_t enclosed_area() const
Definition: blobbox.h:262
void set_horz_stroke_width(float width)
Definition: blobbox.h:355
void set_vert_stroke_width(float width)
Definition: blobbox.h:361
BLOBNBOX_LIST blobs
Definition: blobbox.h:776
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:779
void plot_graded_blobs(ScrollView *to_win)
Definition: blobbox.cpp:1058
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:780
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:778
static const double kXHeightCapRatio
Definition: ccstruct.h:37
static const double kXHeightFraction
Definition: ccstruct.h:34
static const double kDescenderFraction
Definition: ccstruct.h:33
static const double kAscenderFraction
Definition: ccstruct.h:35
void destroy()
Definition: image.cpp:32
FCOORD re_rotation() const
Definition: ocrblock.h:129
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
int32_t x_height() const
return xheight
Definition: ocrblock.h:101
WERD_LIST * word_list()
Definition: ocrrow.h:57
float x_height() const
Definition: ocrrow.h:66
float base_line(float xpos) const
Definition: ocrrow.h:61
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
integer coordinate
Definition: points.h:36
float angle() const
find angle
Definition: points.h:246
bool IsText() const
Definition: polyblk.h:52
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t get_total() const
Definition: statistc.h:85
double ile(double frac) const
Definition: statistc.cpp:173
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
TBOX bounding_box() const
Definition: stepblob.cpp:250
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118
C_BLOB_LIST * cblob_list()
Definition: werd.h:96
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on)
Definition: tordmain.cpp:238
void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:211
BlockGroup(BLOCK *block)
Definition: tordmain.cpp:694
std::vector< BLOCK * > blocks
Definition: tordmain.cpp:710