tesseract  5.0.0
wordseg.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: wordseg.cpp (Formerly wspace.c)
3  * Description: Code to segment the blobs into words.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "wordseg.h"
25 
26 #include <cmath>
27 
28 #include "blobbox.h"
29 #include "cjkpitch.h"
30 #include "drawtord.h"
31 #include "fpchop.h"
32 #include "makerow.h"
33 #include "pitsync1.h"
34 #include "statistc.h"
35 #include "textord.h"
36 #include "topitch.h"
37 #include "tovars.h"
38 
39 namespace tesseract {
40 
41 BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42 BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43 
44 #define BLOCK_STATS_CLUSTERS 10
45 
53 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
54  TO_ROW_IT to_row_it(rows);
55  ROW_IT row_it(real_rows);
56  for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57  TO_ROW *row = to_row_it.data();
58  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59  // to create the word.
60  C_BLOB_LIST cblobs;
61  C_BLOB_IT cblob_it(&cblobs);
62  BLOBNBOX_IT box_it(row->blob_list());
63  for (; !box_it.empty(); box_it.forward()) {
64  BLOBNBOX *bblob = box_it.extract();
65  if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
66  auto cblob = bblob->remove_cblob();
67  if (cblob != nullptr) {
68  C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69  cout_it.move_to_last();
70  cout_it.add_list_after(cblob->out_list());
71  delete cblob;
72  }
73  } else {
74  auto cblob = bblob->remove_cblob();
75  if (cblob != nullptr) {
76  cblob_it.add_after_then_move(cblob);
77  }
78  }
79  delete bblob;
80  }
81  // Convert the TO_ROW to a ROW.
82  ROW *real_row =
83  new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84  WERD_IT word_it(real_row->word_list());
85  WERD *word = new WERD(&cblobs, 0, nullptr);
86  word->set_flag(W_BOL, true);
87  word->set_flag(W_EOL, true);
88  word->set_flag(W_DONT_CHOP, one_blob);
89  word_it.add_after_then_move(word);
90  row_it.add_after_then_move(real_row);
91  }
92 }
93 
100  ICOORD page_tr, // top right
101  float gradient, // page skew
102  BLOCK_LIST *blocks, // block list
103  TO_BLOCK_LIST *port_blocks) { // output list
104  TO_BLOCK_IT block_it; // iterator
105  TO_BLOCK *block; // current block
106 
107  if (textord->use_cjk_fp_model()) {
108  compute_fixed_pitch_cjk(page_tr, port_blocks);
109  } else {
110  compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
111  !bool(textord_test_landscape));
112  }
113  textord->to_spacing(page_tr, port_blocks);
114  block_it.set_to_list(port_blocks);
115  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
116  block = block_it.data();
117  make_real_words(textord, block, FCOORD(1.0f, 0.0f));
118  }
119 }
120 
128 void set_row_spaces( // find space sizes
129  TO_BLOCK *block, // block to do
130  FCOORD rotation, // for drawing
131  bool testing_on // correct orientation
132 ) {
133  TO_ROW *row; // current row
134  TO_ROW_IT row_it = block->get_rows();
135 
136  if (row_it.empty()) {
137  return; // empty block
138  }
139  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
140  row = row_it.data();
141  if (row->fixed_pitch == 0) {
142  row->min_space = static_cast<int32_t>(
143  ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
144  row->max_nonspace = static_cast<int32_t>(
145  floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
146  if (testing_on && textord_show_initial_words) {
147  tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
148  row->min_space, row->intercept());
149  }
150  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
151  row->space_size = row->pr_space;
152  row->kern_size = row->pr_nonsp;
153  }
154 #ifndef GRAPHICS_DISABLED
155  if (textord_show_initial_words && testing_on) {
156  plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
157  }
158 #endif
159  }
160 }
161 
168 int32_t row_words( // compute space size
169  TO_BLOCK *block, // block it came from
170  TO_ROW *row, // row to operate on
171  int32_t maxwidth, // max expected space size
172  FCOORD rotation, // for drawing
173  bool testing_on // for debug
174 ) {
175  bool testing_row; // contains testpt
176  bool prev_valid; // if decent size
177  int32_t prev_x; // end of prev blob
178  int32_t cluster_count; // no of clusters
179  int32_t gap_index; // which cluster
180  int32_t smooth_factor; // for smoothing stats
181  BLOBNBOX *blob; // current blob
182  float lower, upper; // clustering parameters
183  float gaps[3]; // gap clusers
184  ICOORD testpt;
185  TBOX blob_box; // bounding box
186  // iterator
187  BLOBNBOX_IT blob_it = row->blob_list();
188  STATS gap_stats(0, maxwidth);
189  STATS cluster_stats[4]; // clusters
190 
192  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
193  // if (testing_on)
194  // tprintf("Row smooth factor=%d\n",smooth_factor);
195  prev_valid = false;
196  prev_x = -INT32_MAX;
197  testing_row = false;
198  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
199  blob = blob_it.data();
200  blob_box = blob->bounding_box();
201  if (blob_box.contains(testpt)) {
202  testing_row = true;
203  }
204  gap_stats.add(blob_box.width(), 1);
205  }
206  gap_stats.clear();
207  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
208  blob = blob_it.data();
209  if (!blob->joined_to_prev()) {
210  blob_box = blob->bounding_box();
211  if (prev_valid && blob_box.left() - prev_x < maxwidth) {
212  gap_stats.add(blob_box.left() - prev_x, 1);
213  }
214  prev_valid = true;
215  prev_x = blob_box.right();
216  }
217  }
218  if (gap_stats.get_total() == 0) {
219  row->min_space = 0; // no evidence
220  row->max_nonspace = 0;
221  return 0;
222  }
223  gap_stats.smooth(smooth_factor);
224  lower = row->xheight * textord_words_initial_lower;
225  upper = row->xheight * textord_words_initial_upper;
226  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
227  while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
228  // shrink gap
229  upper = (upper * 3 + lower) / 4;
230  lower = (lower * 3 + upper) / 4;
231  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
232  }
233  if (cluster_count < 2) {
234  row->min_space = 0; // no evidence
235  row->max_nonspace = 0;
236  return 0;
237  }
238  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
239  gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
240  }
241  // get medians
242  if (cluster_count > 2) {
243  if (testing_on && textord_show_initial_words) {
244  tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
245  cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
246  }
247  lower = gaps[0];
248  if (gaps[1] > lower) {
249  upper = gaps[1]; // prefer most frequent
250  if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
251  upper = gaps[2];
252  }
253  } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
254  upper = gaps[2];
255  } else if (lower >= block->xheight * textord_words_min_minspace) {
256  upper = lower; // not nice
257  lower = gaps[1];
258  if (testing_on && textord_show_initial_words) {
259  tprintf("Had to switch most common from lower to upper!!\n");
260  gap_stats.print();
261  }
262  } else {
263  row->min_space = 0; // no evidence
264  row->max_nonspace = 0;
265  return 0;
266  }
267  } else {
268  if (gaps[1] < gaps[0]) {
269  if (testing_on && textord_show_initial_words) {
270  tprintf("Had to switch most common from lower to upper!!\n");
271  gap_stats.print();
272  }
273  lower = gaps[1];
274  upper = gaps[0];
275  } else {
276  upper = gaps[1];
277  lower = gaps[0];
278  }
279  }
280  if (upper < block->xheight * textord_words_min_minspace) {
281  row->min_space = 0; // no evidence
282  row->max_nonspace = 0;
283  return 0;
284  }
285  if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
286  lower * 3 > block->min_space * 2 + block->max_nonspace) {
287  if (testing_on && textord_show_initial_words) {
288  tprintf("Disagreement between block and row at %g!!\n", row->intercept());
289  tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
290  gap_stats.print();
291  }
292  }
293  row->min_space =
294  static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
295  row->max_nonspace =
296  static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
297  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
298  row->space_size = upper;
299  row->kern_size = lower;
300  if (testing_on && textord_show_initial_words) {
301  if (testing_row) {
302  tprintf("GAP STATS\n");
303  gap_stats.print();
304  tprintf("SPACE stats\n");
305  cluster_stats[2].print_summary();
306  tprintf("NONSPACE stats\n");
307  cluster_stats[1].print_summary();
308  }
309  tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
310  upper, row->max_nonspace, lower);
311  }
312  return cluster_stats[2].get_total();
313 }
314 
321 int32_t row_words2( // compute space size
322  TO_BLOCK *block, // block it came from
323  TO_ROW *row, // row to operate on
324  int32_t maxwidth, // max expected space size
325  FCOORD rotation, // for drawing
326  bool testing_on // for debug
327 ) {
328  bool prev_valid; // if decent size
329  bool this_valid; // current blob big enough
330  int32_t prev_x; // end of prev blob
331  int32_t min_width; // min interesting width
332  int32_t valid_count; // good gaps
333  int32_t total_count; // total gaps
334  int32_t cluster_count; // no of clusters
335  int32_t prev_count; // previous cluster_count
336  int32_t gap_index; // which cluster
337  int32_t smooth_factor; // for smoothing stats
338  BLOBNBOX *blob; // current blob
339  float lower, upper; // clustering parameters
340  ICOORD testpt;
341  TBOX blob_box; // bounding box
342  // iterator
343  BLOBNBOX_IT blob_it = row->blob_list();
344  STATS gap_stats(0, maxwidth);
345  // gap sizes
346  float gaps[BLOCK_STATS_CLUSTERS];
347  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
348  // clusters
349 
351  smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
352  // if (testing_on)
353  // tprintf("Row smooth factor=%d\n",smooth_factor);
354  prev_valid = false;
355  prev_x = -INT16_MAX;
356  const bool testing_row = false;
357  // min blob size
358  min_width = static_cast<int32_t>(block->pr_space);
359  total_count = 0;
360  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
361  blob = blob_it.data();
362  if (!blob->joined_to_prev()) {
363  blob_box = blob->bounding_box();
364  this_valid = blob_box.width() >= min_width;
365  if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
366  gap_stats.add(blob_box.left() - prev_x, 1);
367  }
368  total_count++; // count possibles
369  prev_x = blob_box.right();
370  prev_valid = this_valid;
371  }
372  }
373  valid_count = gap_stats.get_total();
374  if (valid_count < total_count * textord_words_minlarge) {
375  gap_stats.clear();
376  prev_x = -INT16_MAX;
377  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
378  blob = blob_it.data();
379  if (!blob->joined_to_prev()) {
380  blob_box = blob->bounding_box();
381  if (blob_box.left() - prev_x < maxwidth) {
382  gap_stats.add(blob_box.left() - prev_x, 1);
383  }
384  prev_x = blob_box.right();
385  }
386  }
387  }
388  if (gap_stats.get_total() == 0) {
389  row->min_space = 0; // no evidence
390  row->max_nonspace = 0;
391  return 0;
392  }
393 
394  cluster_count = 0;
395  lower = block->xheight * words_initial_lower;
396  upper = block->xheight * words_initial_upper;
397  gap_stats.smooth(smooth_factor);
398  do {
399  prev_count = cluster_count;
400  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
401  BLOCK_STATS_CLUSTERS, cluster_stats);
402  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
403  if (cluster_count < 1) {
404  row->min_space = 0;
405  row->max_nonspace = 0;
406  return 0;
407  }
408  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
409  gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
410  }
411  // get medians
412  if (testing_on) {
413  tprintf("cluster_count=%d:", cluster_count);
414  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
415  tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
416  }
417  tprintf("\n");
418  }
419 
420  // Try to find proportional non-space and space for row.
421  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
422  gap_index++) {
423  ;
424  }
425  if (gap_index < cluster_count) {
426  lower = gaps[gap_index]; // most frequent below
427  } else {
428  if (testing_on) {
429  tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
430  }
431  lower = block->pr_nonsp;
432  }
433  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
434  gap_index++) {
435  ;
436  }
437  if (gap_index < cluster_count) {
438  upper = gaps[gap_index]; // most frequent above
439  } else {
440  if (testing_on) {
441  tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
442  }
443  upper = block->pr_space;
444  }
445  row->min_space =
446  static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
447  row->max_nonspace =
448  static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
449  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
450  row->space_size = upper;
451  row->kern_size = lower;
452  if (testing_on) {
453  if (testing_row) {
454  tprintf("GAP STATS\n");
455  gap_stats.print();
456  tprintf("SPACE stats\n");
457  cluster_stats[2].print_summary();
458  tprintf("NONSPACE stats\n");
459  cluster_stats[1].print_summary();
460  }
461  tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
462  upper, row->max_nonspace, lower);
463  }
464  return 1;
465 }
466 
474  TO_BLOCK *block, // block to do
475  FCOORD rotation // for drawing
476 ) {
477  TO_ROW *row; // current row
478  TO_ROW_IT row_it = block->get_rows();
479  ROW *real_row = nullptr; // output row
480  ROW_IT real_row_it = block->block->row_list();
481 
482  if (row_it.empty()) {
483  return; // empty block
484  }
485  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
486  row = row_it.data();
487  if (row->blob_list()->empty() && !row->rep_words.empty()) {
488  real_row = make_rep_words(row, block);
489  } else if (!row->blob_list()->empty()) {
490  // In a fixed pitch document, some lines may be detected as fixed pitch
491  // while others don't, and will go through different path.
492  // For non-space delimited language like CJK, fixed pitch chop always
493  // leave the entire line as one word. We can force consistent chopping
494  // with force_make_prop_words flag.
495  POLY_BLOCK *pb = block->block->pdblk.poly_block();
496  if (textord_chopper_test) {
497  real_row = textord->make_blob_words(row, rotation);
498  } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
500  real_row = textord->make_prop_words(row, rotation);
501  } else if (row->pitch_decision == PITCH_DEF_FIXED ||
503  real_row = fixed_pitch_words(row, rotation);
504  } else {
505  ASSERT_HOST(false);
506  }
507  }
508  if (real_row != nullptr) {
509  // put row in block
510  real_row_it.add_after_then_move(real_row);
511  }
512  }
513  block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
514  static_cast<int16_t>(block->space_size),
515  static_cast<int16_t>(block->fixed_pitch));
516  block->block->check_pitch();
517 }
518 
526 ROW *make_rep_words( // make a row
527  TO_ROW *row, // row to convert
528  TO_BLOCK *block // block it lives in
529 ) {
530  ROW *real_row; // output row
531  TBOX word_box; // bounding box
532  // iterator
533  WERD_IT word_it = &row->rep_words;
534 
535  if (word_it.empty()) {
536  return nullptr;
537  }
538  word_box = word_it.data()->bounding_box();
539  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
540  word_box += word_it.data()->bounding_box();
541  }
542  row->xheight = block->xheight;
543  real_row =
544  new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
545  word_it.set_to_list(real_row->word_list());
546  // put words in row
547  word_it.add_list_after(&row->rep_words);
548  real_row->recalc_bounding_box();
549  return real_row;
550 }
551 
559 WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
560  int32_t blobcount, // no of blobs to use
561  bool bol, // start of line
562  uint8_t blanks // no of blanks
563 ) {
564  C_OUTLINE_IT cout_it;
565  C_BLOB_LIST cblobs;
566  C_BLOB_IT cblob_it = &cblobs;
567 
568  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
569  auto bblob = box_it->extract();
570  if (bblob->joined_to_prev()) {
571  auto cblob = bblob->remove_cblob();
572  if (cblob != nullptr) {
573  cout_it.set_to_list(cblob_it.data()->out_list());
574  cout_it.move_to_last();
575  cout_it.add_list_after(cblob->out_list());
576  delete cblob;
577  }
578  } else {
579  auto cblob = bblob->remove_cblob();
580  if (cblob != nullptr) {
581  cblob_it.add_after_then_move(cblob);
582  }
583  }
584  delete bblob;
585  box_it->forward(); // next one
586  }
587 
588  if (blanks < 1) {
589  blanks = 1;
590  }
591 
592  auto word = new WERD(&cblobs, blanks, nullptr);
593 
594  if (bol) {
595  word->set_flag(W_BOL, true);
596  }
597  if (box_it->at_first()) {
598  word->set_flag(W_EOL, true); // at end of line
599  }
600 
601  return word;
602 }
603 
604 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
#define BLOCK_STATS_CLUSTERS
Definition: wordseg.cpp:44
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
void compute_fixed_pitch(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, bool testing_on)
Definition: topitch.cpp:75
double words_initial_upper
Definition: tovars.cpp:47
double textord_wordstats_smooth_factor
Definition: tovars.cpp:31
double words_initial_lower
Definition: tovars.cpp:46
int textord_test_y
Definition: makerow.cpp:65
int textord_test_x
Definition: makerow.cpp:64
bool textord_force_make_prop_words
Definition: wordseg.cpp:41
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
WERD * make_real_word(BLOBNBOX_IT *box_it, int32_t blobcount, bool bol, uint8_t blanks)
Definition: wordseg.cpp:559
ScrollView * to_win
Definition: drawtord.cpp:37
ROW * fixed_pitch_words(TO_ROW *row, FCOORD rotation)
Definition: fpchop.cpp:65
@ PITCH_DEF_FIXED
Definition: blobbox.h:49
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_FIXED
Definition: blobbox.h:53
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void set_row_spaces(TO_BLOCK *block, FCOORD rotation, bool testing_on)
Definition: wordseg.cpp:128
double textord_words_min_minspace
Definition: tovars.cpp:35
double textord_words_initial_upper
Definition: tovars.cpp:38
void make_real_words(tesseract::Textord *textord, TO_BLOCK *block, FCOORD rotation)
Definition: wordseg.cpp:473
void make_words(tesseract::Textord *textord, ICOORD page_tr, float gradient, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: wordseg.cpp:99
bool textord_chopper_test
Definition: wordseg.cpp:42
bool textord_test_landscape
Definition: makerow.cpp:52
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:238
double textord_words_initial_lower
Definition: tovars.cpp:37
double textord_words_definite_spread
Definition: tovars.cpp:51
void compute_fixed_pitch_cjk(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
Definition: cjkpitch.cpp:1103
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows)
Definition: wordseg.cpp:53
bool textord_show_initial_words
Definition: tovars.cpp:25
int32_t row_words2(TO_BLOCK *block, TO_ROW *row, int32_t maxwidth, FCOORD rotation, bool testing_on)
Definition: wordseg.cpp:321
ROW * make_rep_words(TO_ROW *row, TO_BLOCK *block)
Definition: wordseg.cpp:526
double textord_spacesize_ratioprop
Definition: tovars.cpp:52
int32_t row_words(TO_BLOCK *block, TO_ROW *row, int32_t maxwidth, FCOORD rotation, bool testing_on)
Definition: wordseg.cpp:168
double textord_words_minlarge
Definition: tovars.cpp:39
const TBOX & bounding_box() const
Definition: blobbox.h:239
C_BLOB * remove_cblob()
Definition: blobbox.h:280
bool joined_to_prev() const
Definition: blobbox.h:265
int32_t min_space
Definition: blobbox.h:669
WERD_LIST rep_words
Definition: blobbox.h:674
int32_t max_nonspace
Definition: blobbox.h:670
float space_size
Definition: blobbox.h:673
float fixed_pitch
Definition: blobbox.h:657
int32_t space_threshold
Definition: blobbox.h:671
float intercept() const
Definition: blobbox.h:598
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
PITCH_TYPE pitch_decision
Definition: blobbox.h:656
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
int32_t min_space
Definition: blobbox.h:796
int32_t max_nonspace
Definition: blobbox.h:797
void check_pitch()
check proportional
Definition: ocrblock.cpp:164
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
void set_stats(bool prop, int16_t kern, int16_t space, int16_t ch_pitch)
Definition: ocrblock.h:56
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:111
WERD_LIST * word_list()
Definition: ocrrow.h:57
void recalc_bounding_box()
Definition: ocrrow.cpp:100
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
integer coordinate
Definition: points.h:36
bool IsText() const
Definition: polyblk.h:52
TDimension left() const
Definition: rect.h:82
TDimension width() const
Definition: rect.h:126
TDimension right() const
Definition: rect.h:89
bool contains(const FCOORD pt) const
Definition: rect.h:344
void print_summary() const
Definition: statistc.cpp:573
void print() const
Definition: statistc.cpp:548
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t get_total() const
Definition: statistc.h:85
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:335
void smooth(int32_t factor)
Definition: statistc.cpp:302
double ile(double frac) const
Definition: statistc.cpp:173
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
bool use_cjk_fp_model() const
Definition: textord.h:98
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:844
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:45
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1118