tesseract  5.0.0
tospace.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 /**********************************************************************
11  * tospace.cpp
12  *
13  * Compute fuzzy word spacing thresholds for each row.
14  * I.e. set : max_nonspace
15  * space_threshold
16  * min_space
17  * kern_size
18  * space_size
19  * for each row.
20  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21  *
22  * Note: functions in this file were originally not members of any
23  * class or enclosed by any namespace. Now they are all static members
24  * of the Textord class.
25  *
26  **********************************************************************/
27 
28 #include "drawtord.h"
29 #include "statistc.h"
30 #include "textord.h"
31 #include "tovars.h"
32 
33 // Include automatically generated configuration file if running autoconf.
34 #ifdef HAVE_CONFIG_H
35 # include "config_auto.h"
36 #endif
37 
38 #include <algorithm>
39 #include <cmath>
40 #include <memory>
41 
42 #define MAXSPACING 128 /*max expected spacing in pix */
43 
44 namespace tesseract {
45 void Textord::to_spacing(ICOORD page_tr, // topright of page
46  TO_BLOCK_LIST *blocks // blocks on page
47 ) {
48  TO_BLOCK_IT block_it; // iterator
49  TO_BLOCK *block; // current block;
50  TO_ROW *row; // current row
51  int block_index; // block number
52  int row_index; // row number
53  // estimated width of real spaces for whole block
54  int16_t block_space_gap_width;
55  // estimated width of non space gaps for whole block
56  int16_t block_non_space_gap_width;
57  bool old_text_ord_proportional; // old fixed/prop result
58 
59  block_it.set_to_list(blocks);
60  block_index = 1;
61  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62  block = block_it.data();
63  std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk
64  block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65  block_non_space_gap_width);
66  // Make sure relative values of block-level space and non-space gap
67  // widths are reasonable. The ratio of 1:3 is also used in
68  // block_spacing_stats, to correct the block_space_gap_width.
69  // Useful for arabic and hindi, when the non-space gap width is
70  // often over-estimated and should not be trusted. A similar ratio
71  // is found in block_spacing_stats.
72  if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73  block_non_space_gap_width > block_space_gap_width / 3) {
74  block_non_space_gap_width = block_space_gap_width / 3;
75  }
76  // row iterator
77  TO_ROW_IT row_it(block->get_rows());
78  row_index = 1;
79  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80  row = row_it.data();
81  if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) {
82  if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83  tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index);
84  }
85  row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86  block_non_space_gap_width);
87  } else {
88  if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89  tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
90  row_index, row->pitch_decision, row->fixed_pitch);
91  }
92  }
93 #ifndef GRAPHICS_DISABLED
95  plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
96  }
97 #endif
98  row_index++;
99  }
100  block_index++;
101  }
102 }
103 
104 /*************************************************************************
105  * block_spacing_stats()
106  *************************************************************************/
107 
108 void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
109  int16_t &block_space_gap_width, // resulting estimate
110  int16_t &block_non_space_gap_width // resulting estimate
111 ) {
112  TO_ROW *row; // current row
113  BLOBNBOX_IT blob_it; // iterator
114 
115  STATS centre_to_centre_stats(0, MAXSPACING);
116  // DEBUG USE ONLY
117  STATS all_gap_stats(0, MAXSPACING);
118  STATS space_gap_stats(0, MAXSPACING);
119  int16_t minwidth = MAXSPACING; // narrowest blob
120  TBOX blob_box;
121  TBOX prev_blob_box;
122  int16_t centre_to_centre;
123  int16_t gap_width;
124  float real_space_threshold;
125  float iqr_centre_to_centre; // DEBUG USE ONLY
126  float iqr_all_gap_stats; // DEBUG USE ONLY
127  int32_t end_of_row;
128  int32_t row_length;
129 
130  // row iterator
131  TO_ROW_IT row_it(block->get_rows());
132  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
133  row = row_it.data();
134  if (!row->blob_list()->empty() &&
135  (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
136  (row->pitch_decision == PITCH_CORR_PROP))) {
137  blob_it.set_to_list(row->blob_list());
138  blob_it.mark_cycle_pt();
139  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
140  if (tosp_use_pre_chopping) {
141  blob_box = box_next_pre_chopped(&blob_it);
142  } else if (tosp_stats_use_xht_gaps) {
143  blob_box = reduced_box_next(row, &blob_it);
144  } else {
145  blob_box = box_next(&blob_it);
146  }
147  row_length = end_of_row - blob_box.left();
148  if (blob_box.width() < minwidth) {
149  minwidth = blob_box.width();
150  }
151  prev_blob_box = blob_box;
152  while (!blob_it.cycled_list()) {
153  if (tosp_use_pre_chopping) {
154  blob_box = box_next_pre_chopped(&blob_it);
155  } else if (tosp_stats_use_xht_gaps) {
156  blob_box = reduced_box_next(row, &blob_it);
157  } else {
158  blob_box = box_next(&blob_it);
159  }
160  if (blob_box.width() < minwidth) {
161  minwidth = blob_box.width();
162  }
163  int16_t left = prev_blob_box.right();
164  int16_t right = blob_box.left();
165  gap_width = right - left;
166  if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
167  all_gap_stats.add(gap_width, 1);
168 
169  centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
170  // DEBUG
171  centre_to_centre_stats.add(centre_to_centre, 1);
172  // DEBUG
173  }
174  prev_blob_box = blob_box;
175  }
176  }
177  }
178 
179  // Inadequate samples
180  if (all_gap_stats.get_total() <= 1) {
181  block_non_space_gap_width = minwidth;
182  block_space_gap_width = -1; // No est. space width
183  // DEBUG
184  old_text_ord_proportional = true;
185  } else {
186  /* For debug only ..... */
187  iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
188  iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
189  old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
190  /* .......For debug only */
191 
192  /*
193 The median of the gaps is used as an estimate of the NON-SPACE gap width.
194 This RELIES on the assumption that there are more gaps WITHIN words than
195 BETWEEN words in a block
196 
197 Now try to estimate the width of a real space for all real spaces in the
198 block. Do this by using a crude threshold to ignore "narrow" gaps, then
199 find the median of the "wide" gaps and use this.
200 */
201  block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
202  // median gap
203 
204  row_it.set_to_list(block->get_rows());
205  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
206  row = row_it.data();
207  if (!row->blob_list()->empty() &&
208  (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
209  (row->pitch_decision == PITCH_CORR_PROP))) {
210  real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
211  tosp_init_guess_xht_mult * row->xheight);
212  blob_it.set_to_list(row->blob_list());
213  blob_it.mark_cycle_pt();
214  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
215  if (tosp_use_pre_chopping) {
216  blob_box = box_next_pre_chopped(&blob_it);
217  } else if (tosp_stats_use_xht_gaps) {
218  blob_box = reduced_box_next(row, &blob_it);
219  } else {
220  blob_box = box_next(&blob_it);
221  }
222  row_length = blob_box.left() - end_of_row;
223  prev_blob_box = blob_box;
224  while (!blob_it.cycled_list()) {
225  if (tosp_use_pre_chopping) {
226  blob_box = box_next_pre_chopped(&blob_it);
227  } else if (tosp_stats_use_xht_gaps) {
228  blob_box = reduced_box_next(row, &blob_it);
229  } else {
230  blob_box = box_next(&blob_it);
231  }
232  int16_t left = prev_blob_box.right();
233  int16_t right = blob_box.left();
234  gap_width = right - left;
235  if ((gap_width > real_space_threshold) &&
236  !ignore_big_gap(row, row_length, gapmap, left, right)) {
237  /*
238 If tosp_use_cert_spaces is enabled, the estimate of the space gap is
239 restricted to obvious spaces - those wider than half the xht or
240 those with wide blobs on both sides - i.e not things that are
241 suspect 1's or punctuation that is sometimes widely spaced.
242 */
243  if (!tosp_block_use_cert_spaces ||
244  (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
245  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
246  (!tosp_narrow_blobs_not_cert ||
247  (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
248  (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
249  space_gap_stats.add(gap_width, 1);
250  }
251  }
252  prev_blob_box = blob_box;
253  }
254  }
255  }
256  // Inadequate samples
257  if (space_gap_stats.get_total() <= 2) {
258  block_space_gap_width = -1; // No est. space width
259  } else {
260  block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
261  static_cast<int16_t>(3 * block_non_space_gap_width));
262  }
263  }
264 }
265 
266 /*************************************************************************
267  * row_spacing_stats()
268  * Set values for min_space, max_non_space based on row stats only
269  * If failure - return 0 values.
270  *************************************************************************/
271 void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
272  int16_t block_space_gap_width, // estimate for block
273  int16_t block_non_space_gap_width // estimate for block
274 ) {
275  // iterator
276  BLOBNBOX_IT blob_it = row->blob_list();
277  STATS all_gap_stats(0, MAXSPACING);
278  STATS cert_space_gap_stats(0, MAXSPACING);
279  STATS all_space_gap_stats(0, MAXSPACING);
280  STATS small_gap_stats(0, MAXSPACING);
281  TBOX blob_box;
282  TBOX prev_blob_box;
283  int16_t gap_width;
284  int16_t real_space_threshold = 0;
285  int16_t max = 0;
286  int16_t index;
287  int16_t large_gap_count = 0;
288  bool suspected_table;
289  int32_t max_max_nonspace; // upper bound
290  bool good_block_space_estimate = block_space_gap_width > 0;
291  int32_t end_of_row;
292  int32_t row_length = 0;
293  float sane_space;
294  int32_t sane_threshold;
295 
296  /* Collect first pass stats for row */
297 
298  if (!good_block_space_estimate) {
299  block_space_gap_width = int16_t(std::floor(row->xheight / 2));
300  }
301  if (!row->blob_list()->empty()) {
302  if (tosp_threshold_bias1 > 0) {
303  real_space_threshold =
304  block_non_space_gap_width +
305  int16_t(floor(0.5 + tosp_threshold_bias1 *
306  (block_space_gap_width - block_non_space_gap_width)));
307  } else {
308  real_space_threshold = // Old TO method
309  (block_space_gap_width + block_non_space_gap_width) / 2;
310  }
311  blob_it.set_to_list(row->blob_list());
312  blob_it.mark_cycle_pt();
313  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
314  if (tosp_use_pre_chopping) {
315  blob_box = box_next_pre_chopped(&blob_it);
316  } else if (tosp_stats_use_xht_gaps) {
317  blob_box = reduced_box_next(row, &blob_it);
318  } else {
319  blob_box = box_next(&blob_it);
320  }
321  row_length = end_of_row - blob_box.left();
322  prev_blob_box = blob_box;
323  while (!blob_it.cycled_list()) {
324  if (tosp_use_pre_chopping) {
325  blob_box = box_next_pre_chopped(&blob_it);
326  } else if (tosp_stats_use_xht_gaps) {
327  blob_box = reduced_box_next(row, &blob_it);
328  } else {
329  blob_box = box_next(&blob_it);
330  }
331  int16_t left = prev_blob_box.right();
332  int16_t right = blob_box.left();
333  gap_width = right - left;
334  if (ignore_big_gap(row, row_length, gapmap, left, right)) {
335  large_gap_count++;
336  } else {
337  if (gap_width >= real_space_threshold) {
338  if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
339  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
340  (!tosp_narrow_blobs_not_cert ||
341  (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
342  (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
343  cert_space_gap_stats.add(gap_width, 1);
344  }
345  all_space_gap_stats.add(gap_width, 1);
346  } else {
347  small_gap_stats.add(gap_width, 1);
348  }
349  all_gap_stats.add(gap_width, 1);
350  }
351  prev_blob_box = blob_box;
352  }
353  }
354  suspected_table = (large_gap_count > 1) ||
355  ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));
356 
357  /* Now determine row kern size, space size and threshold */
358 
359  if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
360  ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
361  cert_space_gap_stats.get_total() > 0)) {
362  old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
363  block_space_gap_width, block_non_space_gap_width);
364  } else {
365  if (!tosp_recovery_isolated_row_stats ||
366  !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
367  if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
368  tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
369  }
370  if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
371  // Use block default
372  row->space_size = block_space_gap_width;
373  if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
374  row->kern_size = all_gap_stats.median();
375  } else {
376  row->kern_size = block_non_space_gap_width;
377  }
378  row->space_threshold =
379  int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
380  } else {
381  old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
382  block_space_gap_width, block_non_space_gap_width);
383  }
384  }
385  }
386 
387  if (tosp_improve_thresh && !suspected_table) {
388  improve_row_threshold(row, &all_gap_stats);
389  }
390 
391  /* Now lets try to be careful not to do anything silly with tables when we
392 are ignoring big gaps*/
393  if (tosp_sanity_method == 0) {
394  if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
395  if (tosp_debug_level > 5) {
396  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
397  row->kern_size, row->space_threshold, row->space_size);
398  }
399  row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
400  row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
401  }
402  } else if (tosp_sanity_method == 1) {
403  sane_space = row->space_size;
404  /* NEVER let space size get too close to kern size */
405  if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
406  ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
407  if (good_block_space_estimate &&
408  (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
409  sane_space = block_space_gap_width;
410  } else {
411  sane_space =
412  std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
413  row->xheight / 2.0f);
414  }
415  if (tosp_debug_level > 5) {
416  tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
417  row->kern_size, row->space_threshold, row->space_size, sane_space);
418  }
419  row->space_size = sane_space;
420  row->space_threshold =
421  int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
422  }
423  /* NEVER let threshold get VERY far away from kern */
424  sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
425  if (row->space_threshold > sane_threshold) {
426  if (tosp_debug_level > 5) {
427  tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
428  row->kern_size, row->space_threshold, row->space_size, sane_threshold);
429  }
430  row->space_threshold = sane_threshold;
431  if (row->space_size <= sane_threshold) {
432  row->space_size = row->space_threshold + 1.0f;
433  }
434  }
435  /* Beware of tables - there may be NO spaces */
436  if (suspected_table) {
437  sane_space =
438  std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
439  sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
440 
441  if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
442  if (tosp_debug_level > 5) {
443  tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
444  row->kern_size, row->space_threshold, row->space_size);
445  }
446  // the minimum sane value
447  row->space_threshold = static_cast<int32_t>(sane_space);
448  row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
449  }
450  }
451  }
452 
453  /* Now lets try to put some error limits on the threshold */
454 
455  if (tosp_old_to_method) {
456  /* Old textord made a space if gap >= threshold */
457  // NO FUZZY SPACES YET
458  row->max_nonspace = row->space_threshold;
459  // NO FUZZY SPACES YET
460  row->min_space = row->space_threshold + 1;
461  } else {
462  /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
463  row->min_space =
464  std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
465  if (row->min_space <= row->space_threshold) {
466  // Don't be silly
467  row->min_space = row->space_threshold + 1;
468  }
469  /*
470 Lets try to guess the max certain kern gap by looking at the cluster of
471 kerns for the row. The row is proportional so the kerns should cluster
472 tightly at the bottom of the distribution. We also expect most gaps to be
473 kerns. Find the maximum of the kern piles between 0 and twice the kern
474 estimate. Piles before the first one with less than 1/10 the maximum
475 number of samples can be taken as certain kerns.
476 
477  Of course, there are some cases where the kern peak and space peaks merge,
478  so we will put an UPPER limit on the max certain kern gap of some fraction
479  below the threshold.
480 */
481 
482  max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
483 
484  // default
485  row->max_nonspace = max_max_nonspace;
486  for (index = 0; index <= max_max_nonspace; index++) {
487  if (all_gap_stats.pile_count(index) > max) {
488  max = all_gap_stats.pile_count(index);
489  }
490  if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
491  row->max_nonspace = index;
492  break;
493  }
494  }
495  }
496 
497  /* Yet another algorithm - simpler this time - just choose a fraction of the
498 threshold to space range */
499 
500  if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
501  row->min_space = std::max(
502  row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
503  tosp_fuzzy_sp_fraction *
504  (row->space_size - row->space_threshold))));
505  }
506 
507  /* Ensure that ANY space less than some multiplier times the kern size is
508 fuzzy. In tables there is a risk of erroneously setting a small space size
509 when there are no real spaces. Sometimes tables have text squashed into
510 columns so that the kn->sp ratio is small anyway - this means that we can't
511 use this to force a wider separation - hence we rely on context to join any
512 dubious breaks. */
513 
514  if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
515  row->min_space = std::max(
516  row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
517  }
518 
519  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
520  row->max_nonspace = static_cast<int32_t>(floor(
521  0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
522  }
523  if (row->max_nonspace > row->space_threshold) {
524  // Don't be silly
525  row->max_nonspace = row->space_threshold;
526  }
527 
528  if (tosp_debug_level > 5) {
529  tprintf(
530  "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
531  "Sp:%3.2f\n",
532  block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
533  real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
534  row->min_space, row->space_size);
535  }
536  if (tosp_debug_level > 10) {
537  tprintf(
538  "row->kern_size = %3.2f, row->space_size = %3.2f, "
539  "row->space_threshold = %d\n",
540  row->kern_size, row->space_size, row->space_threshold);
541  }
542 }
543 
544 void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
545  STATS *small_gap_stats,
546  int16_t block_space_gap_width, // estimate for block
547  int16_t block_non_space_gap_width // estimate for block
548 ) {
549  /* First, estimate row space size */
550  /* Old to condition was > 2 */
551  if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {
552  // Adequate samples
553  /* Set space size to median of spaces BUT limits it if it seems wildly out
554  */
555  row->space_size = space_gap_stats->median();
556  if (row->space_size > block_space_gap_width * 1.5) {
557  if (tosp_old_to_bug_fix) {
558  row->space_size = block_space_gap_width * 1.5;
559  } else {
560  // BUG??? should be *1.5
561  row->space_size = block_space_gap_width;
562  }
563  }
564  if (row->space_size < (block_non_space_gap_width * 2) + 1) {
565  row->space_size = (block_non_space_gap_width * 2) + 1;
566  }
567  }
568  // Only 1 or 2 samples
569  else if (space_gap_stats->get_total() >= 1) {
570  // hence mean not median
571  row->space_size = space_gap_stats->mean();
572  if (row->space_size > block_space_gap_width * 1.5) {
573  if (tosp_old_to_bug_fix) {
574  row->space_size = block_space_gap_width * 1.5;
575  } else {
576  // BUG??? should be *1.5
577  row->space_size = block_space_gap_width;
578  }
579  }
580  if (row->space_size < (block_non_space_gap_width * 3) + 1) {
581  row->space_size = (block_non_space_gap_width * 3) + 1;
582  }
583  } else {
584  // Use block default
585  row->space_size = block_space_gap_width;
586  }
587 
588  /* Next, estimate row kern size */
589  if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
590  row->kern_size = small_gap_stats->median();
591  } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
592  row->kern_size = all_gap_stats->median();
593  } else { // old TO -SAME FOR ALL ROWS
594  row->kern_size = block_non_space_gap_width;
595  }
596 
597  /* Finally, estimate row space threshold */
598  if (tosp_threshold_bias2 > 0) {
599  row->space_threshold = int32_t(
600  floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
601  } else {
602  /*
603  NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
604 and holds this in a float. The use is with a >= test
605 NEW textord uses an integer threshold and a > test
606 It comes to the same thing.
607  (Though there is a difference in that old textor has integer space_size
608  and kern_size.)
609 */
610  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
611  }
612 
613  // Apply the same logic and ratios as in row_spacing_stats to
614  // restrict relative values of the row's space_size, kern_size, and
615  // space_threshold
616  if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
617  ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
618  ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {
619  if (row->kern_size > 2.5) {
620  row->kern_size = row->space_size / tosp_min_sane_kn_sp;
621  }
622  row->space_threshold =
623  int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
624  }
625 }
626 
627 /*************************************************************************
628  * isolated_row_stats()
629  * Set values for min_space, max_non_space based on row stats only
630  *************************************************************************/
631 bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
632  bool suspected_table, int16_t block_idx, int16_t row_idx) {
633  float kern_estimate;
634  float crude_threshold_estimate;
635  int16_t small_gaps_count;
636  int16_t total;
637  // iterator
638  BLOBNBOX_IT blob_it = row->blob_list();
639  STATS cert_space_gap_stats(0, MAXSPACING);
640  STATS all_space_gap_stats(0, MAXSPACING);
641  STATS small_gap_stats(0, MAXSPACING);
642  TBOX blob_box;
643  TBOX prev_blob_box;
644  int16_t gap_width;
645  int32_t end_of_row;
646  int32_t row_length;
647 
648  kern_estimate = all_gap_stats->median();
649  crude_threshold_estimate =
650  std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
651  small_gaps_count =
652  stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));
653  total = all_gap_stats->get_total();
654 
655  if ((total <= tosp_redo_kern_limit) ||
656  ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
657  (total - small_gaps_count < 1)) {
658  if (tosp_debug_level > 5) {
659  tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
660  }
661  return false;
662  }
663  blob_it.set_to_list(row->blob_list());
664  blob_it.mark_cycle_pt();
665  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
666  if (tosp_use_pre_chopping) {
667  blob_box = box_next_pre_chopped(&blob_it);
668  } else if (tosp_stats_use_xht_gaps) {
669  blob_box = reduced_box_next(row, &blob_it);
670  } else {
671  blob_box = box_next(&blob_it);
672  }
673  row_length = end_of_row - blob_box.left();
674  prev_blob_box = blob_box;
675  while (!blob_it.cycled_list()) {
676  if (tosp_use_pre_chopping) {
677  blob_box = box_next_pre_chopped(&blob_it);
678  } else if (tosp_stats_use_xht_gaps) {
679  blob_box = reduced_box_next(row, &blob_it);
680  } else {
681  blob_box = box_next(&blob_it);
682  }
683  int16_t left = prev_blob_box.right();
684  int16_t right = blob_box.left();
685  gap_width = right - left;
686  if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
687  (gap_width > crude_threshold_estimate)) {
688  if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
689  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
690  (!tosp_narrow_blobs_not_cert ||
691  (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
692  (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
693  cert_space_gap_stats.add(gap_width, 1);
694  }
695  all_space_gap_stats.add(gap_width, 1);
696  }
697  if (gap_width < crude_threshold_estimate) {
698  small_gap_stats.add(gap_width, 1);
699  }
700 
701  prev_blob_box = blob_box;
702  }
703  if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
704  // median
705  row->space_size = cert_space_gap_stats.median();
706  } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
707  // to avoid spaced
708  row->space_size = cert_space_gap_stats.mean();
709  // 1's in tables
710  } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
711  // median
712  row->space_size = all_space_gap_stats.median();
713  } else {
714  row->space_size = all_space_gap_stats.mean();
715  }
716 
717  if (tosp_only_small_gaps_for_kern) {
718  row->kern_size = small_gap_stats.median();
719  } else {
720  row->kern_size = all_gap_stats->median();
721  }
722  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
723  /* Sanity check */
724  if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||
725  (row->space_threshold <= 0)) {
726  if (tosp_debug_level > 5) {
727  tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
728  row->kern_size, row->space_threshold, row->space_size);
729  }
730  row->kern_size = 0.0f;
731  row->space_threshold = 0;
732  row->space_size = 0.0f;
733  return false;
734  }
735 
736  if (tosp_debug_level > 5) {
737  tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
738  row->space_threshold, row->space_size);
739  }
740  return true;
741 }
742 
743 int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
744  int16_t index;
745  int16_t total = 0;
746 
747  for (index = 0; index < threshold; index++) {
748  total += stats->pile_count(index);
749  }
750  return total;
751 }
752 
753 /*************************************************************************
754  * improve_row_threshold()
755  * Try to recognise a "normal line" -
756  * > 25 gaps
757  * && space > 3 * kn && space > 10
758  * (I.e. reasonably large space and kn:sp ratio)
759  * && > 3/4 # gaps < kn + (sp - kn)/3
760  * (I.e. most gaps are well away from space estimate)
761  * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found
762  * somewhere in the histogram between kn and sp
763  * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
764  * NO!!!!! the bristol line has "11" with a gap of 12 between the
765  *1's!!! try moving the default threshold to within this band but leave the
766  * fuzzy limit calculation as at present.
767  *************************************************************************/
768 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
769  float sp = row->space_size;
770  float kn = row->kern_size;
771  int16_t reqd_zero_width = 0;
772  int16_t zero_width = 0;
773  int16_t zero_start = 0;
774  int16_t index = 0;
775 
776  if (tosp_debug_level > 10) {
777  tprintf("Improve row threshold 0");
778  }
779  if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
780  (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <
781  (0.75 * all_gap_stats->get_total()))) {
782  return;
783  }
784  if (tosp_debug_level > 10) {
785  tprintf(" 1");
786  }
787  /*
788 Look for the first region of all 0's in the histogram which is wider than
789 max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
790 threshold is not within it, move the threshold so that is is just inside it.
791 */
792  reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));
793  if (reqd_zero_width < 3) {
794  reqd_zero_width = 3;
795  }
796 
797  for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
798  if (all_gap_stats->pile_count(index) == 0) {
799  if (zero_width == 0) {
800  zero_start = index;
801  }
802  zero_width++;
803  } else {
804  if (zero_width >= reqd_zero_width) {
805  break;
806  } else {
807  zero_width = 0;
808  }
809  }
810  }
811  index--;
812  if (tosp_debug_level > 10) {
813  tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width,
814  zero_width, zero_start, row->space_threshold);
815  }
816  if ((zero_width < reqd_zero_width) ||
817  ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
818  return;
819  }
820  if (tosp_debug_level > 10) {
821  tprintf(" 2");
822  }
823  if (row->space_threshold < zero_start) {
824  if (tosp_debug_level > 5) {
825  tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
826  index, row->space_threshold, zero_start);
827  }
828  row->space_threshold = zero_start;
829  }
830  if (row->space_threshold > index) {
831  if (tosp_debug_level > 5) {
832  tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
833  index, row->space_threshold, index);
834  }
835  row->space_threshold = index;
836  }
837 }
838 
839 /**********************************************************************
840  * make_prop_words
841  *
842  * Convert a TO_ROW to a ROW.
843  **********************************************************************/
844 ROW *Textord::make_prop_words(TO_ROW *row, // row to make
845  FCOORD rotation // for drawing
846 ) {
847  bool bol; // start of line
848  /* prev_ values are for start of word being built. non prev_ values are for
849 the gap between the word being built and the next one. */
850  bool prev_fuzzy_sp; // probably space
851  bool prev_fuzzy_non; // probably not
852  uint8_t prev_blanks; // in front of word
853  bool fuzzy_sp = false; // probably space
854  bool fuzzy_non = false; // probably not
855  uint8_t blanks = 0; // in front of word
856  bool prev_gap_was_a_space = false;
857  bool break_at_next_gap = false;
858  ROW *real_row; // output row
859  C_OUTLINE_IT cout_it;
860  C_BLOB_LIST cblobs;
861  C_BLOB_IT cblob_it = &cblobs;
862  WERD_LIST words;
863  WERD *word; // new word
864  int32_t next_rep_char_word_right = INT32_MAX;
865  float repetition_spacing; // gap between repetitions
866  int32_t xstarts[2]; // row ends
867  int32_t prev_x; // end of prev blob
868  BLOBNBOX_IT box_it; // iterator
869  TBOX prev_blob_box;
870  TBOX next_blob_box;
871  int16_t prev_gap = INT16_MAX;
872  int16_t current_gap = INT16_MAX;
873  int16_t next_gap = INT16_MAX;
874  int16_t prev_within_xht_gap = INT16_MAX;
875  int16_t current_within_xht_gap = INT16_MAX;
876  int16_t next_within_xht_gap = INT16_MAX;
877  int16_t word_count = 0;
878 
879  // repeated char words
880  WERD_IT rep_char_it(&(row->rep_words));
881  if (!rep_char_it.empty()) {
882  next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
883  }
884 
885  prev_x = -INT16_MAX;
886  cblob_it.set_to_list(&cblobs);
887  box_it.set_to_list(row->blob_list());
888  // new words
889  WERD_IT word_it(&words);
890  bol = true;
891  prev_blanks = 0;
892  prev_fuzzy_sp = false;
893  prev_fuzzy_non = false;
894  if (!box_it.empty()) {
895  xstarts[0] = box_it.data()->bounding_box().left();
896  if (xstarts[0] > next_rep_char_word_right) {
897  /* We need to insert a repeated char word at the start of the row */
898  word = rep_char_it.extract();
899  word_it.add_after_then_move(word);
900  /* Set spaces before repeated char word */
901  word->set_flag(W_BOL, true);
902  bol = false;
903  word->set_blanks(0);
904  // NO uncertainty
905  word->set_flag(W_FUZZY_SP, false);
906  word->set_flag(W_FUZZY_NON, false);
907  xstarts[0] = word->bounding_box().left();
908  /* Set spaces after repeated char word (and leave current word set) */
909  repetition_spacing = find_mean_blob_spacing(word);
910  current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
911  current_within_xht_gap = current_gap;
912  if (current_gap > tosp_rep_space * repetition_spacing) {
913  prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
914  if (prev_blanks < 1) {
915  prev_blanks = 1;
916  }
917  } else {
918  prev_blanks = 0;
919  }
920  if (tosp_debug_level > 5) {
921  tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
922  box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
923  repetition_spacing, current_gap);
924  }
925  prev_fuzzy_sp = false;
926  prev_fuzzy_non = false;
927  if (rep_char_it.empty()) {
928  next_rep_char_word_right = INT32_MAX;
929  } else {
930  rep_char_it.forward();
931  next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
932  }
933  }
934 
935  peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
936  do {
937  auto bblob = box_it.data();
938  auto blob_box = bblob->bounding_box();
939  if (bblob->joined_to_prev()) {
940  auto cblob = bblob->remove_cblob();
941  if (cblob != nullptr) {
942  cout_it.set_to_list(cblob_it.data()->out_list());
943  cout_it.move_to_last();
944  cout_it.add_list_after(cblob->out_list());
945  delete cblob;
946  }
947  } else {
948  auto cblob = bblob->cblob();
949  if (cblob != nullptr) {
950  bblob->set_owns_cblob(false);
951  cblob_it.add_after_then_move(cblob);
952  }
953  prev_x = blob_box.right();
954  }
955  box_it.forward(); // next one
956  bblob = box_it.data();
957  blob_box = bblob->bounding_box();
958 
959  if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
960  /* Real Blob - not multiple outlines or pre-chopped */
961  prev_gap = current_gap;
962  prev_within_xht_gap = current_within_xht_gap;
963  prev_blob_box = next_blob_box;
964  current_gap = next_gap;
965  current_within_xht_gap = next_within_xht_gap;
966  peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
967 
968  int16_t prev_gap_arg = prev_gap;
969  int16_t next_gap_arg = next_gap;
970  if (tosp_only_use_xht_gaps) {
971  prev_gap_arg = prev_within_xht_gap;
972  next_gap_arg = next_within_xht_gap;
973  }
974  // Decide if a word-break should be inserted
975  if (blob_box.left() > next_rep_char_word_right ||
976  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
977  current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
978  fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
979  box_it.at_first()) {
980  /* Form a new word out of the blobs collected */
981  word = new WERD(&cblobs, prev_blanks, nullptr);
982  word_count++;
983  word_it.add_after_then_move(word);
984  if (bol) {
985  word->set_flag(W_BOL, true);
986  bol = false;
987  }
988  if (prev_fuzzy_sp) {
989  // probably space
990  word->set_flag(W_FUZZY_SP, true);
991  } else if (prev_fuzzy_non) {
992  word->set_flag(W_FUZZY_NON, true);
993  }
994  // probably not
995 
996  if (blob_box.left() > next_rep_char_word_right) {
997  /* We need to insert a repeated char word */
998  word = rep_char_it.extract();
999  word_it.add_after_then_move(word);
1000 
1001  /* Set spaces before repeated char word */
1002  repetition_spacing = find_mean_blob_spacing(word);
1003  current_gap = word->bounding_box().left() - prev_x;
1004  current_within_xht_gap = current_gap;
1005  if (current_gap > tosp_rep_space * repetition_spacing) {
1006  blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1007  if (blanks < 1) {
1008  blanks = 1;
1009  }
1010  } else {
1011  blanks = 0;
1012  }
1013  if (tosp_debug_level > 5) {
1014  tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1015  word->bounding_box().left(), word->bounding_box().bottom(),
1016  repetition_spacing, current_gap, blanks);
1017  }
1018  word->set_blanks(blanks);
1019  // NO uncertainty
1020  word->set_flag(W_FUZZY_SP, false);
1021  word->set_flag(W_FUZZY_NON, false);
1022 
1023  /* Set spaces after repeated char word (and leave current word set)
1024  */
1025  current_gap = blob_box.left() - next_rep_char_word_right;
1026  if (current_gap > tosp_rep_space * repetition_spacing) {
1027  blanks = static_cast<uint8_t>(current_gap / row->space_size);
1028  if (blanks < 1) {
1029  blanks = 1;
1030  }
1031  } else {
1032  blanks = 0;
1033  }
1034  if (tosp_debug_level > 5) {
1035  tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
1036  }
1037  fuzzy_sp = false;
1038  fuzzy_non = false;
1039 
1040  if (rep_char_it.empty()) {
1041  next_rep_char_word_right = INT32_MAX;
1042  } else {
1043  rep_char_it.forward();
1044  next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
1045  }
1046  }
1047 
1048  if (box_it.at_first() && rep_char_it.empty()) {
1049  // at end of line
1050  word->set_flag(W_EOL, true);
1051  xstarts[1] = prev_x;
1052  } else {
1053  prev_blanks = blanks;
1054  prev_fuzzy_sp = fuzzy_sp;
1055  prev_fuzzy_non = fuzzy_non;
1056  }
1057  }
1058  }
1059  } while (!box_it.at_first()); // until back at start
1060 
1061  /* Insert any further repeated char words */
1062  while (!rep_char_it.empty()) {
1063  word = rep_char_it.extract();
1064  word_it.add_after_then_move(word);
1065 
1066  /* Set spaces before repeated char word */
1067  repetition_spacing = find_mean_blob_spacing(word);
1068  current_gap = word->bounding_box().left() - prev_x;
1069  if (current_gap > tosp_rep_space * repetition_spacing) {
1070  blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1071  if (blanks < 1) {
1072  blanks = 1;
1073  }
1074  } else {
1075  blanks = 0;
1076  }
1077  if (tosp_debug_level > 5) {
1078  tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1079  word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
1080  current_gap, blanks);
1081  }
1082  word->set_blanks(blanks);
1083  // NO uncertainty
1084  word->set_flag(W_FUZZY_SP, false);
1085  word->set_flag(W_FUZZY_NON, false);
1086  prev_x = word->bounding_box().right();
1087  if (rep_char_it.empty()) {
1088  // at end of line
1089  word->set_flag(W_EOL, true);
1090  xstarts[1] = prev_x;
1091  } else {
1092  rep_char_it.forward();
1093  }
1094  }
1095  real_row =
1096  new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1097  word_it.set_to_list(real_row->word_list());
1098  // put words in row
1099  word_it.add_list_after(&words);
1100  real_row->recalc_bounding_box();
1101 
1102  if (tosp_debug_level > 4) {
1103  tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1104  real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1105  real_row->bounding_box().right(), real_row->bounding_box().top());
1106  }
1107  return real_row;
1108  }
1109  return nullptr;
1110 }
1111 
1112 /**********************************************************************
1113  * make_blob_words
1114  *
1115  * Converts words into blobs so that each blob is a single character.
1116  * Used for chopper test.
1117  **********************************************************************/
1118 ROW *Textord::make_blob_words(TO_ROW *row, // row to make
1119  FCOORD rotation // for drawing
1120 ) {
1121  bool bol; // start of line
1122  ROW *real_row; // output row
1123  C_OUTLINE_IT cout_it;
1124  C_BLOB_LIST cblobs;
1125  C_BLOB_IT cblob_it = &cblobs;
1126  WERD_LIST words;
1127  WERD *word; // new word
1128  BLOBNBOX_IT box_it; // iterator
1129  int16_t word_count = 0;
1130 
1131  cblob_it.set_to_list(&cblobs);
1132  box_it.set_to_list(row->blob_list());
1133  // new words
1134  WERD_IT word_it(&words);
1135  bol = true;
1136  if (!box_it.empty()) {
1137  do {
1138  auto bblob = box_it.data();
1139  auto blob_box = bblob->bounding_box();
1140  if (bblob->joined_to_prev()) {
1141  auto cblob = bblob->remove_cblob();
1142  if (cblob != nullptr) {
1143  cout_it.set_to_list(cblob_it.data()->out_list());
1144  cout_it.move_to_last();
1145  cout_it.add_list_after(cblob->out_list());
1146  delete cblob;
1147  }
1148  } else {
1149  auto cblob = bblob->cblob();
1150  if (cblob != nullptr) {
1151  bblob->set_owns_cblob(false);
1152  cblob_it.add_after_then_move(cblob);
1153  }
1154  }
1155  box_it.forward(); // next one
1156  bblob = box_it.data();
1157  blob_box = bblob->bounding_box();
1158 
1159  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1160  word = new WERD(&cblobs, 1, nullptr);
1161  word_count++;
1162  word_it.add_after_then_move(word);
1163  if (bol) {
1164  word->set_flag(W_BOL, true);
1165  bol = false;
1166  }
1167  if (box_it.at_first()) { // at end of line
1168  word->set_flag(W_EOL, true);
1169  }
1170  }
1171  } while (!box_it.at_first()); // until back at start
1172  /* Setup the row with created words. */
1173  real_row =
1174  new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1175  word_it.set_to_list(real_row->word_list());
1176  // put words in row
1177  word_it.add_list_after(&words);
1178  real_row->recalc_bounding_box();
1179  if (tosp_debug_level > 4) {
1180  tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1181  real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1182  real_row->bounding_box().right(), real_row->bounding_box().top());
1183  }
1184  return real_row;
1185  }
1186  return nullptr;
1187 }
1188 
1189 bool Textord::make_a_word_break(TO_ROW *row, // row being made
1190  TBOX blob_box, // for next_blob // how many blanks?
1191  int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
1192  int16_t within_xht_current_gap, TBOX next_blob_box,
1193  int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
1194  bool &prev_gap_was_a_space, bool &break_at_next_gap) {
1195  bool space;
1196  int16_t current_gap;
1197  float fuzzy_sp_to_kn_limit;
1198 
1199  if (break_at_next_gap) {
1200  break_at_next_gap = false;
1201  return true;
1202  }
1203  /* Inhibit using the reduced gap if
1204  The kerning is large - chars are not kerned and reducing "f"s can cause
1205  erroneous blanks
1206 OR The real gap is less than 0
1207 OR The real gap is less than the kerning estimate
1208 */
1209  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1210  ((tosp_dont_fool_with_small_kerns >= 0) &&
1211  (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
1212  // Ignore the difference
1213  within_xht_current_gap = real_current_gap;
1214  }
1215 
1216  if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
1217  current_gap = within_xht_current_gap;
1218  } else {
1219  current_gap = real_current_gap;
1220  }
1221 
1222  if (tosp_old_to_method) {
1223  // Boring old method
1224  space = current_gap > row->max_nonspace;
1225  if (space && (current_gap < INT16_MAX)) {
1226  if (current_gap < row->min_space) {
1227  if (current_gap > row->space_threshold) {
1228  blanks = 1;
1229  fuzzy_sp = true;
1230  fuzzy_non = false;
1231  } else {
1232  blanks = 0;
1233  fuzzy_sp = false;
1234  fuzzy_non = true;
1235  }
1236  } else {
1237  if (row->space_size == 0.0f) {
1238  // Avoid FP division by 0.
1239  blanks = 1;
1240  } else {
1241  blanks = static_cast<uint8_t>(current_gap / row->space_size);
1242  if (blanks < 1) {
1243  blanks = 1;
1244  }
1245  }
1246  fuzzy_sp = false;
1247  fuzzy_non = false;
1248  }
1249  }
1250  return space;
1251  } else {
1252  /* New exciting heuristic method */
1253  if (prev_blob_box.null_box()) { // Beginning of row
1254  prev_gap_was_a_space = true;
1255  }
1256 
1257  // Default as old TO
1258  space = current_gap > row->space_threshold;
1259 
1260  /* Set defaults for the word break in case we find one. Currently there are
1261 no fuzzy spaces. Depending on the reliability of the different heuristics
1262 we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1263 be used if the function returns true - ie the word is to be broken.
1264 */
1265  int num_blanks = current_gap;
1266  if (row->space_size > 1.0f) {
1267  num_blanks = IntCastRounded(current_gap / row->space_size);
1268  }
1269  blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1270  fuzzy_sp = false;
1271  fuzzy_non = false;
1272  /*
1273 If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1274 despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1275 context.
1276 */
1277  if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
1278  (within_xht_current_gap > row->max_nonspace)) {
1279  space = true;
1280  fuzzy_non = true;
1281 #ifndef GRAPHICS_DISABLED
1282  mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1283  next_gap);
1284 #endif
1285  } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
1286  (within_xht_current_gap > row->space_threshold)) {
1287  space = true;
1288  if (tosp_flip_fuzz_kn_to_sp) {
1289  fuzzy_sp = true;
1290  } else {
1291  fuzzy_non = true;
1292  }
1293 #ifndef GRAPHICS_DISABLED
1294  mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1295  next_gap);
1296 #endif
1297  } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
1298  (within_xht_current_gap >= row->min_space)) {
1299  space = true;
1300 #ifndef GRAPHICS_DISABLED
1301  mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1302  next_gap);
1303 #endif
1304  } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
1305  suspected_punct_blob(row, blob_box)) {
1306  break_at_next_gap = true;
1307  }
1308  /* Now continue with normal heuristics */
1309  else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
1310  /* Heuristics to turn dubious spaces to kerns */
1311  if (tosp_pass_wide_fuzz_sp_to_context > 0) {
1312  fuzzy_sp_to_kn_limit =
1313  row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
1314  } else {
1315  fuzzy_sp_to_kn_limit = 99999.0f;
1316  }
1317 
1318  /* If current gap is significantly smaller than the previous space the
1319 other side of a narrow blob then this gap is a kern. */
1320  if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
1321  (current_gap <= tosp_gap_factor * prev_gap)) {
1322  if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1323  if (tosp_flip_fuzz_sp_to_kn) {
1324  fuzzy_non = true;
1325  } else {
1326  fuzzy_sp = true;
1327  }
1328  } else {
1329  space = false;
1330  }
1331 #ifndef GRAPHICS_DISABLED
1332  mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1333  next_gap);
1334 #endif
1335  }
1336  /* If current gap not much bigger than the previous kern the other side of
1337 a narrow blob then this gap is a kern as well */
1338  else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
1339  !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
1340  if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1341  if (tosp_flip_fuzz_sp_to_kn) {
1342  fuzzy_non = true;
1343  } else {
1344  fuzzy_sp = true;
1345  }
1346  } else {
1347  space = false;
1348  }
1349 #ifndef GRAPHICS_DISABLED
1350  mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1351  next_gap);
1352 #endif
1353  } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1354  (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
1355  if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1356  if (tosp_flip_fuzz_sp_to_kn) {
1357  fuzzy_non = true;
1358  } else {
1359  fuzzy_sp = true;
1360  }
1361  } else {
1362  space = false;
1363  }
1364 #ifndef GRAPHICS_DISABLED
1365  mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1366  next_gap);
1367 #endif
1368  } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1369  (next_gap <= row->space_threshold) &&
1370  (current_gap * tosp_gap_factor <= next_gap)) {
1371  if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1372  if (tosp_flip_fuzz_sp_to_kn) {
1373  fuzzy_non = true;
1374  } else {
1375  fuzzy_sp = true;
1376  }
1377  } else {
1378  space = false;
1379  }
1380 #ifndef GRAPHICS_DISABLED
1381  mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1382  next_gap);
1383 #endif
1384  } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
1385  ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
1386  fuzzy_sp = true;
1387 #ifndef GRAPHICS_DISABLED
1388  mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1389  next_gap);
1390 #endif
1391  }
1392  } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
1393  /* Heuristics to turn dubious kerns to spaces */
1394  /* TRIED THIS BUT IT MADE THINGS WORSE
1395  if (prev_gap == INT16_MAX)
1396  prev_gap = 0; // start of row
1397  if (next_gap == INT16_MAX)
1398  next_gap = 0; // end of row
1399 */
1400  if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
1401  (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1402  wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
1403  space = true;
1404  /*
1405 tosp_flip_caution is an attempt to stop the default changing in cases
1406 where there is a large difference between the kern and space estimates.
1407  See problem in 'chiefs' where "have" gets split in the quotation.
1408 */
1409  if ((tosp_flip_fuzz_kn_to_sp) &&
1410  ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
1411  fuzzy_sp = true;
1412  } else {
1413  fuzzy_non = true;
1414  }
1415 #ifndef GRAPHICS_DISABLED
1416  mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1417  next_gap);
1418 #endif
1419  } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
1420  current_gap > 5 && // Rule 9 handles small gap, big ratio.
1421  current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1422  !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
1423  !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
1424  space = true;
1425  fuzzy_non = true;
1426 #ifndef GRAPHICS_DISABLED
1427  mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1428  next_gap);
1429 #endif
1430  } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
1431  (next_blob_box.width() > 0) &&
1432  (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1433  (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
1434  !suspected_punct_blob(row, next_blob_box)))) {
1435  space = true;
1436  fuzzy_non = true;
1437 #ifndef GRAPHICS_DISABLED
1438  mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1439  next_gap);
1440 #endif
1441  }
1442  }
1443  if (tosp_debug_level > 10) {
1444  tprintf(
1445  "word break = %d current_gap = %d, prev_gap = %d, "
1446  "next_gap = %d\n",
1447  space ? 1 : 0, current_gap, prev_gap, next_gap);
1448  }
1449  prev_gap_was_a_space = space && !(fuzzy_non);
1450  return space;
1451  }
1452 }
1453 
1454 bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
1455  bool result;
1456  result =
1457  ((blob_box.width() <= tosp_narrow_fraction * row->xheight) ||
1458  ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio));
1459  return result;
1460 }
1461 
1462 bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
1463  bool result;
1464  if (tosp_wide_fraction > 0) {
1465  if (tosp_wide_aspect_ratio > 0) {
1466  result =
1467  ((blob_box.width() >= tosp_wide_fraction * row->xheight) &&
1468  ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio));
1469  } else {
1470  result = (blob_box.width() >= tosp_wide_fraction * row->xheight);
1471  }
1472  } else {
1473  result = !narrow_blob(row, blob_box);
1474  }
1475  return result;
1476 }
1477 
1478 bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
1479  bool result;
1480  float baseline;
1481  float blob_x_centre;
1482  /* Find baseline of centre of blob */
1483  blob_x_centre = (box.right() + box.left()) / 2.0;
1484  baseline = row->baseline.y(blob_x_centre);
1485 
1486  result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) ||
1487  (box.bottom() > baseline + row->xheight / 2.0);
1488  return result;
1489 }
1490 
1491 void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
1492  int16_t &next_gap, int16_t &next_within_xht_gap) {
1493  TBOX next_reduced_blob_box;
1494  TBOX bit_beyond;
1495  BLOBNBOX_IT reduced_box_it = box_it;
1496 
1497  next_blob_box = box_next(&box_it);
1498  next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);
1499  if (box_it.at_first()) {
1500  next_gap = INT16_MAX;
1501  next_within_xht_gap = INT16_MAX;
1502  } else {
1503  bit_beyond = box_it.data()->bounding_box();
1504  next_gap = bit_beyond.left() - next_blob_box.right();
1505  bit_beyond = reduced_box_next(row, &reduced_box_it);
1506  next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();
1507  }
1508 }
1509 
1510 #ifndef GRAPHICS_DISABLED
1511 void Textord::mark_gap(TBOX blob, // blob following gap
1512  int16_t rule, // heuristic id
1513  int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
1514  int16_t next_blob_width, int16_t next_gap) {
1515  ScrollView::Color col; // of ellipse marking flipped gap
1516 
1517  switch (rule) {
1518  case 1:
1519  col = ScrollView::RED;
1520  break;
1521  case 2:
1522  col = ScrollView::CYAN;
1523  break;
1524  case 3:
1525  col = ScrollView::GREEN;
1526  break;
1527  case 4:
1528  col = ScrollView::BLACK;
1529  break;
1530  case 5:
1531  col = ScrollView::MAGENTA;
1532  break;
1533  case 6:
1534  col = ScrollView::BLUE;
1535  break;
1536 
1537  case 7:
1538  col = ScrollView::WHITE;
1539  break;
1540  case 8:
1541  col = ScrollView::YELLOW;
1542  break;
1543  case 9:
1544  col = ScrollView::BLACK;
1545  break;
1546 
1547  case 20:
1548  col = ScrollView::CYAN;
1549  break;
1550  case 21:
1551  col = ScrollView::GREEN;
1552  break;
1553  case 22:
1554  col = ScrollView::MAGENTA;
1555  break;
1556  default:
1557  col = ScrollView::BLACK;
1558  }
1560  to_win->Pen(col);
1561  /* if (rule < 20)
1562  //interior_style(to_win, INT_SOLID, false);
1563  else
1564  //interior_style(to_win, INT_HOLLOW, true);*/
1565  // x radius
1566  to_win->Ellipse(current_gap / 2.0f,
1567  blob.height() / 2.0f, // y radius
1568  // x centre
1569  blob.left() - current_gap / 2.0f,
1570  // y centre
1571  blob.bottom() + blob.height() / 2.0f);
1572  }
1573  if (tosp_debug_level > 5) {
1574  tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
1575  blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
1576  }
1577 }
1578 #endif
1579 
1580 float Textord::find_mean_blob_spacing(WERD *word) {
1581  C_BLOB_IT cblob_it;
1582  TBOX blob_box;
1583  int32_t gap_sum = 0;
1584  int16_t gap_count = 0;
1585  int16_t prev_right;
1586 
1587  cblob_it.set_to_list(word->cblob_list());
1588  if (!cblob_it.empty()) {
1589  cblob_it.mark_cycle_pt();
1590  prev_right = cblob_it.data()->bounding_box().right();
1591  // first blob
1592  cblob_it.forward();
1593  for (; !cblob_it.cycled_list(); cblob_it.forward()) {
1594  blob_box = cblob_it.data()->bounding_box();
1595  gap_sum += blob_box.left() - prev_right;
1596  gap_count++;
1597  prev_right = blob_box.right();
1598  }
1599  }
1600  if (gap_count > 0) {
1601  return (gap_sum / static_cast<float>(gap_count));
1602  } else {
1603  return 0.0f;
1604  }
1605 }
1606 
1607 bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
1608  int16_t right) {
1609  int16_t gap = right - left + 1;
1610 
1611  if (tosp_ignore_big_gaps > 999) {
1612  return false; // Don't ignore
1613  }
1614  if (tosp_ignore_big_gaps > 0) {
1615  return (gap > tosp_ignore_big_gaps * row->xheight);
1616  }
1617  if (gap > tosp_ignore_very_big_gaps * row->xheight) {
1618  return true;
1619  }
1620  if (tosp_ignore_big_gaps == 0) {
1621  if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {
1622  return true;
1623  }
1624  if ((gap > 1.75 * row->xheight) &&
1625  ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {
1626  return true;
1627  }
1628  } else {
1629  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
1630  */
1631  if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {
1632  return true;
1633  }
1634  }
1635  return false;
1636 }
1637 
1638 /**********************************************************************
1639  * reduced_box_next
1640  *
1641  * Compute the bounding box of this blob with merging of x overlaps
1642  * but no pre-chopping.
1643  * Then move the iterator on to the start of the next blob.
1644  * DON'T reduce the box for small things - eg punctuation.
1645  **********************************************************************/
1646 TBOX Textord::reduced_box_next(TO_ROW *row, // current row
1647  BLOBNBOX_IT *it // iterator to blobds
1648 ) {
1649  BLOBNBOX *blob; // current blob
1650  BLOBNBOX *head_blob; // place to store box
1651  TBOX full_box; // full blob boundg box
1652  TBOX reduced_box; // box of significant part
1653  int16_t left_above_xht; // ABOVE xht left limit
1654  int16_t new_left_above_xht; // ABOVE xht left limit
1655 
1656  blob = it->data();
1657  if (blob->red_box_set()) {
1658  reduced_box = blob->reduced_box();
1659  do {
1660  it->forward();
1661  blob = it->data();
1662  } while (blob->cblob() == nullptr || blob->joined_to_prev());
1663  return reduced_box;
1664  }
1665  head_blob = blob;
1666  full_box = blob->bounding_box();
1667  reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);
1668  do {
1669  it->forward();
1670  blob = it->data();
1671  if (blob->cblob() == nullptr) {
1672  // was pre-chopped
1673  full_box += blob->bounding_box();
1674  } else if (blob->joined_to_prev()) {
1675  reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht);
1676  left_above_xht = std::min(left_above_xht, new_left_above_xht);
1677  }
1678  }
1679  // until next real blob
1680  while (blob->cblob() == nullptr || blob->joined_to_prev());
1681 
1682  if ((reduced_box.width() > 0) &&
1683  ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&
1684  (reduced_box.height() > 0.7 * row->xheight)) {
1685 #ifndef GRAPHICS_DISABLED
1687  reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
1688  }
1689 #endif
1690  } else {
1691  reduced_box = full_box;
1692  }
1693  head_blob->set_reduced_box(reduced_box);
1694  return reduced_box;
1695 }
1696 
1697 /*************************************************************************
1698  * reduced_box_for_blob()
1699  * Find box for blob which is the same height and y position as the whole blob,
1700  * but whose left limit is the left most position of the blob ABOVE the
1701  * baseline and whose right limit is the right most position of the blob BELOW
1702  * the xheight.
1703  *
1704  *
1705  * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1706  * "home". Perhaps we need something which say if the width ABOVE the
1707  * xht alone includes the whole of the reduced width, then use the full
1708  * blob box - Might still fail on italic F
1709  *
1710  * Alternatively we could be a little less severe and only reduce the
1711  * left and right edges by half the difference between the full box and
1712  * the reduced box.
1713  *
1714  * NOTE that we need to rotate all the coordinates as
1715  * find_blob_limits finds the y min and max within a specified x band
1716  *************************************************************************/
1717 TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
1718  float baseline;
1719  float blob_x_centre;
1720  float left_limit;
1721  float right_limit;
1722  float junk;
1723  TBOX blob_box;
1724 
1725  /* Find baseline of centre of blob */
1726 
1727  blob_box = blob->bounding_box();
1728  blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;
1729  baseline = row->baseline.y(blob_x_centre);
1730 
1731  /*
1732 Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1733 caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1734 */
1735  left_limit = static_cast<float>(INT32_MAX);
1736  junk = static_cast<float>(-INT32_MAX);
1737  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),
1738  left_limit, junk);
1739  if (left_limit > junk) {
1740  *left_above_xht = INT16_MAX; // No area above xht
1741  } else {
1742  *left_above_xht = static_cast<int16_t>(std::floor(left_limit));
1743  }
1744  /*
1745 Find reduced LH limit of blob - the left extent of the region ABOVE the
1746 baseline.
1747 */
1748  left_limit = static_cast<float>(INT32_MAX);
1749  junk = static_cast<float>(-INT32_MAX);
1750  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk);
1751 
1752  if (left_limit > junk) {
1753  return TBOX(); // no area within xht so return empty box
1754  }
1755  /*
1756 Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1757 */
1758  junk = static_cast<float>(INT32_MAX);
1759  right_limit = static_cast<float>(-INT32_MAX);
1760  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk,
1761  right_limit);
1762  if (junk > right_limit) {
1763  return TBOX(); // no area within xht so return empty box
1764  }
1765 
1766  return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()),
1767  ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top()));
1768 }
1769 } // namespace tesseract
#define MAXSPACING
Definition: tospace.cpp:42
@ TBOX
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
double gapmap_big_gaps
Definition: gap_map.cpp:20
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:175
@ baseline
Definition: mfoutline.h:53
ScrollView * to_win
Definition: drawtord.cpp:37
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:579
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:238
bool textord_show_initial_words
Definition: tovars.cpp:25
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:667
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:638
int32_t min_space
Definition: blobbox.h:669
WERD_LIST rep_words
Definition: blobbox.h:674
int32_t max_nonspace
Definition: blobbox.h:670
float space_size
Definition: blobbox.h:673
float fixed_pitch
Definition: blobbox.h:657
int32_t space_threshold
Definition: blobbox.h:671
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
PITCH_TYPE pitch_decision
Definition: blobbox.h:656
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
WERD_LIST * word_list()
Definition: ocrrow.h:57
void recalc_bounding_box()
Definition: ocrrow.cpp:100
TBOX bounding_box() const
Definition: ocrrow.h:90
integer coordinate
Definition: points.h:36
TDimension left() const
Definition: rect.h:82
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
bool null_box() const
Definition: rect.h:60
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
TBOX bounding_box() const
Definition: werd.cpp:155
void set_blanks(uint8_t new_blanks)
Definition: werd.h:103
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:844
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:45
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1118
void Pen(Color color)
Definition: scrollview.cpp:723
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:598