tesseract  5.0.0
topitch.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: topitch.cpp (Formerly to_pitch.c)
3  * Description: Code to determine fixed pitchness and the pitch if fixed.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1993, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "topitch.h"
25 
26 #include "blobbox.h"
27 #include "drawtord.h"
28 #include "makerow.h"
29 #include "pithsync.h"
30 #include "pitsync1.h"
31 #include "statistc.h"
32 #include "tovars.h"
33 #include "wordseg.h"
34 
35 #include "helpers.h"
36 
37 #include <memory>
38 
39 namespace tesseract {
40 
41 static BOOL_VAR(textord_all_prop, false, "All doc is proportial text");
42 BOOL_VAR(textord_debug_pitch_test, false, "Debug on fixed pitch test");
43 static BOOL_VAR(textord_disable_pitch_test, false, "Turn off dp fixed pitch algorithm");
44 BOOL_VAR(textord_fast_pitch_test, false, "Do even faster pitch algorithm");
45 BOOL_VAR(textord_debug_pitch_metric, false, "Write full metric stuff");
46 BOOL_VAR(textord_show_row_cuts, false, "Draw row-level cuts");
47 BOOL_VAR(textord_show_page_cuts, false, "Draw page-level cuts");
48 BOOL_VAR(textord_blockndoc_fixed, false, "Attempt whole doc/block fixed pitch");
49 double_VAR(textord_projection_scale, 0.200, "Ding rate for mid-cuts");
50 double_VAR(textord_balance_factor, 1.0, "Ding rate for unbalanced char cells");
51 
52 #define BLOCK_STATS_CLUSTERS 10
53 #define MAX_ALLOWED_PITCH 100 // max pixel pitch.
54 
// Comparison callback for qsort() over an array of float.
// arg1/arg2 point at the two floats being compared. Returns 1 when *arg1 is
// greater, -1 when it is smaller and 0 otherwise, giving ascending order.
static int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);
  // The sign of the difference decides the ordering.
  const float delta = lhs - rhs;
  return static_cast<int>(delta > 0) - static_cast<int>(delta < 0);
}
66 
67 /**********************************************************************
68  * compute_fixed_pitch
69  *
70  * Decide whether each row is fixed pitch individually.
71  * Correlate definite and uncertain results to obtain an individual
72  * result for each row in the TO_ROW class.
73  **********************************************************************/
74 
// Drives pitch classification for every block in port_blocks.
//   page_tr     - forwarded to create_to_win() and try_doc_fixed().
//   port_blocks - list of TO_BLOCKs to process.
//   gradient    - forwarded to try_doc_fixed().
//   rotation    - forwarded to compute_block_pitch().
//   testing_on  - gates the debug window and is forwarded to the helpers.
// Three passes are made over port_blocks: compute_block_pitch() per block,
// then the document/block/row fixed-pitch attempts, then fix_row_pitch() on
// each row of every text block.
75 void compute_fixed_pitch(ICOORD page_tr, // top right
76  TO_BLOCK_LIST *port_blocks, // input list
77  float gradient, // page skew
78  FCOORD rotation, // for drawing
79  bool testing_on) { // correct orientation
80  TO_BLOCK_IT block_it; // iterator
81  TO_BLOCK *block; // current block;
82  TO_ROW *row; // current row
83  int block_index; // block number
84  int row_index; // row number
85 
86 #ifndef GRAPHICS_DISABLED
    // Create the debug window on first use when word display is requested.
87  if (textord_show_initial_words && testing_on) {
88  if (to_win == nullptr) {
89  create_to_win(page_tr);
90  }
91  }
92 #endif
93 
    // Pass 1: per-block pitch estimation; blocks are numbered from 1.
94  block_it.set_to_list(port_blocks);
95  block_index = 1;
96  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
97  block = block_it.data();
98  compute_block_pitch(block, rotation, block_index, testing_on);
99  block_index++;
100  }
101 
    // Pass 2: only when the whole-document attempt returns false, try each
    // block as a unit and fall back to row-by-row decisions.
102  if (!try_doc_fixed(page_tr, port_blocks, gradient)) {
103  block_index = 1;
104  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
105  block = block_it.data();
106  if (!try_block_fixed(block, block_index)) {
107  try_rows_fixed(block, block_index, testing_on);
108  }
109  block_index++;
110  }
111  }
112 
    // Pass 3: resolve each row's decision by voting (fix_row_pitch).
    // Blocks with a non-text poly_block are skipped by `continue` before
    // block_index++ runs, so block_index here counts only the blocks that
    // are processed (unlike passes 1 and 2, which count every block).
113  block_index = 1;
114  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
115  block = block_it.data();
116  POLY_BLOCK *pb = block->block->pdblk.poly_block();
117  if (pb != nullptr && !pb->IsText()) {
118  continue; // Non-text doesn't exist!
119  }
120  // row iterator
121  TO_ROW_IT row_it(block->get_rows());
122  row_index = 1;
123  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
124  row = row_it.data();
125  fix_row_pitch(row, block, port_blocks, row_index, block_index);
126  row_index++;
127  }
128  block_index++;
129  }
130 #ifndef GRAPHICS_DISABLED
131  if (textord_show_initial_words && testing_on) {
    // NOTE(review): the listing skips source line 132 here, so the body of
    // this debug-only branch is not visible.
133  }
134 #endif
135 }
136 
137 /**********************************************************************
138  * fix_row_pitch
139  *
140  * Get a pitch_decision for this row by voting among similar rows in the
141  * block, then similar rows over all the page, or any other rows at all.
142  **********************************************************************/
143 
// Settles the pitch decision of bad_row by polling the other rows on the page.
//   bad_row      - row whose pitch_decision / spacing fields are updated.
//   bad_block    - block containing bad_row (not referenced in the visible
//                  lines of this function).
//   blocks       - all blocks whose rows may vote.
//   row_target   - row number, used only in the warning messages.
//   block_target - number of bad_row's block; rows of that block vote in
//                  block_votes, rows of other blocks in like_votes.
// Voting only happens when bad_row is not already PITCH_DEF_FIXED or
// PITCH_DEF_PROP. Afterwards the spacing fields are derived for a
// PITCH_CORR_FIXED row, or the fixed pitch data is cleared for a
// PITCH_CORR_PROP / PITCH_DEF_PROP row.
144 void fix_row_pitch(TO_ROW *bad_row, // row to fix
145  TO_BLOCK *bad_block, // block of bad_row
146  TO_BLOCK_LIST *blocks, // blocks to scan
147  int32_t row_target, // number of row
148  int32_t block_target) { // number of block
149  int16_t mid_cuts;
150  int block_votes; // votes in block
151  int like_votes; // votes over page
152  int other_votes; // votes of unlike blocks
153  int block_index; // number of block
154  int row_index; // number of row
155  int maxwidth; // max pitch
156  TO_BLOCK_IT block_it = blocks; // block iterator
157  TO_BLOCK *block; // current block
158  TO_ROW *row; // current row
159  float sp_sd; // space deviation
160  STATS block_stats; // pitches in block
161  STATS like_stats; // pitches in page
162 
163  block_votes = like_votes = other_votes = 0;
    // Histogram range for the pitches collected from voting rows.
164  maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace));
165  if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) {
166  block_stats.set_range(0, maxwidth);
167  like_stats.set_range(0, maxwidth);
168  block_index = 1;
169  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
170  block = block_it.data();
171  POLY_BLOCK *pb = block->block->pdblk.poly_block();
    // Skipped blocks do not advance block_index (the increment is at the
    // end of the loop body), matching the numbering used by the caller's
    // final pass in compute_fixed_pitch.
172  if (pb != nullptr && !pb->IsText()) {
173  continue; // Non text doesn't exist!
174  }
175  row_index = 1;
176  TO_ROW_IT row_it(block->get_rows());
177  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
178  row = row_it.data();
    // A row is "like" bad_row when its height lies strictly within
    // +/- textord_pitch_rowsimilarity of bad_row's: xheight + ascrise is
    // compared for an all_caps bad_row, plain xheight otherwise.
179  if ((bad_row->all_caps &&
180  row->xheight + row->ascrise <
181  (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) &&
182  row->xheight + row->ascrise >
183  (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) ||
184  (!bad_row->all_caps &&
185  row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) &&
186  row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
    // Similar row in the same block: definite decisions weigh
    // textord_words_veto_power, the other visible branches weigh 1.
    // Fixed votes also record the voter's pitch in block_stats.
187  if (block_index == block_target) {
188  if (row->pitch_decision == PITCH_DEF_FIXED) {
189  block_votes += textord_words_veto_power;
190  block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
191  } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
    // NOTE(review): the listing skips source line 192; the second operand
    // of this condition is not visible.
193  block_votes++;
194  block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
195  } else if (row->pitch_decision == PITCH_DEF_PROP) {
196  block_votes -= textord_words_veto_power;
197  } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
    // NOTE(review): the listing skips source line 198 (rest of condition).
199  block_votes--;
200  }
201  } else {
    // Similar row in a different block: same weighting, tallied in
    // like_votes / like_stats.
202  if (row->pitch_decision == PITCH_DEF_FIXED) {
203  like_votes += textord_words_veto_power;
204  like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
205  } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
    // NOTE(review): the listing skips source line 206 (rest of condition).
207  like_votes++;
208  like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
209  } else if (row->pitch_decision == PITCH_DEF_PROP) {
210  like_votes -= textord_words_veto_power;
211  } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
    // NOTE(review): the listing skips source line 212 (rest of condition).
213  like_votes--;
214  }
215  }
216  } else {
    // Dissimilar row: only counted in other_votes; no pitch is recorded.
217  if (row->pitch_decision == PITCH_DEF_FIXED) {
218  other_votes += textord_words_veto_power;
219  } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
    // NOTE(review): the listing skips source line 220 (rest of condition).
221  other_votes++;
222  } else if (row->pitch_decision == PITCH_DEF_PROP) {
223  other_votes -= textord_words_veto_power;
224  } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
    // NOTE(review): the listing skips source line 225 (rest of condition).
226  other_votes--;
227  }
228  }
229  row_index++;
230  }
231  block_index++;
232  }
    // Decide from the tallies: the block's own rows win when they exceed
    // the veto power, otherwise a positive page-wide "like" vote makes the
    // row fixed; the pitch is the median of the corresponding histogram.
233  if (block_votes > textord_words_veto_power) {
234  bad_row->fixed_pitch = block_stats.ile(0.5);
235  bad_row->pitch_decision = PITCH_CORR_FIXED;
    // The block_votes test below is always true on this else path.
236  } else if (block_votes <= textord_words_veto_power && like_votes > 0) {
237  bad_row->fixed_pitch = like_stats.ile(0.5);
238  bad_row->pitch_decision = PITCH_CORR_FIXED;
239  } else {
240  bad_row->pitch_decision = PITCH_CORR_PROP;
241  if (block_votes == 0 && like_votes == 0 && other_votes > 0 &&
    // NOTE(review): the listing skips source line 242 (rest of condition).
243  tprintf(
244  "Warning:row %d of block %d set prop with no like rows against "
245  "trend\n",
246  row_target, block_target);
247  }
248  }
249  }
    // NOTE(review): the listing skips source line 250; the statement that
    // opens the block closed at line 253 (guarding these prints) is not
    // visible.
251  tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes);
252  tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
253  }
254  if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
    // Repair an implausibly small pitch from the vote medians, falling
    // back to the row's xheight, then clamp to textord_min_xheight.
255  if (bad_row->fixed_pitch < textord_min_xheight) {
256  if (block_votes > 0) {
257  bad_row->fixed_pitch = block_stats.ile(0.5);
258  } else if (block_votes == 0 && like_votes > 0) {
259  bad_row->fixed_pitch = like_stats.ile(0.5);
260  } else {
261  tprintf("Warning:guessing pitch as xheight on row %d, block %d\n", row_target,
262  block_target);
263  bad_row->fixed_pitch = bad_row->xheight;
264  }
265  }
266  if (bad_row->fixed_pitch < textord_min_xheight) {
267  bad_row->fixed_pitch = (float)textord_min_xheight;
268  }
    // Spacing fields are fixed fractions of the pitch: kern 1/4,
    // min_space 0.6, max_nonspace 0.4, threshold midway between the two.
269  bad_row->kern_size = bad_row->fixed_pitch / 4;
270  bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6);
271  bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4);
272  bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2;
273  bad_row->space_size = bad_row->fixed_pitch;
    // Build character cells only when the row has blobs but no cells yet.
    // sp_sd and mid_cuts are passed uninitialised and not read afterwards
    // here; presumably tune_row_pitch writes them - TODO confirm.
274  if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
275  tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left,
276  bad_row->projection_right,
277  (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
278  sp_sd, mid_cuts, &bad_row->char_cells, false);
279  }
280  } else if (bad_row->pitch_decision == PITCH_CORR_PROP ||
281  bad_row->pitch_decision == PITCH_DEF_PROP) {
    // Proportional rows carry no fixed pitch and no character cells.
282  bad_row->fixed_pitch = 0.0f;
283  bad_row->char_cells.clear();
284  }
285 }
286 
287 /**********************************************************************
288  * compute_block_pitch
289  *
290  * Decide whether each block is fixed pitch individually.
291  **********************************************************************/
292 
// Initialises the block-level spacing defaults from block->xheight and, when
// the block has rows, runs find_repeated_chars() and compute_rows_pitch().
//   block       - block to initialise and analyse.
//   rotation    - not referenced in the visible lines of this function.
//   block_index - block number, used in debug output and forwarded.
//   testing_on  - combined with the debug flags to enable diagnostics.
293 void compute_block_pitch(TO_BLOCK *block, // input list
294  FCOORD rotation, // for drawing
295  int32_t block_index, // block number
296  bool testing_on) { // correct orientation
297  TBOX block_box; // bounding box
298 
299  block_box = block->block->pdblk.bounding_box();
300  if (testing_on && textord_debug_pitch_test) {
301  tprintf("Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.left(), block_box.bottom(),
302  block_box.right(), block_box.top());
303  }
    // Defaults scaled from xheight; the block starts out with no fixed pitch.
304  block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace));
305  block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace));
306  block->fixed_pitch = 0.0f;
307  block->space_size = static_cast<float>(block->min_space);
308  block->kern_size = static_cast<float>(block->max_nonspace);
309  block->pr_nonsp = block->xheight * words_default_prop_nonspace;
    // NOTE(review): the listing skips source line 310 here.
311  if (!block->get_rows()->empty()) {
312  ASSERT_HOST(block->xheight > 0);
313  find_repeated_chars(block, textord_show_initial_words && testing_on);
314 #ifndef GRAPHICS_DISABLED
315  if (textord_show_initial_words && testing_on) {
316  // overlap_picture_ops(true);
    // NOTE(review): the listing skips source line 317 (body of this branch).
318  }
319 #endif
320  compute_rows_pitch(block, block_index, textord_debug_pitch_test && testing_on);
321  }
322 }
323 
324 /**********************************************************************
325  * compute_rows_pitch
326  *
327  * Decide whether each row is fixed pitch individually.
328  **********************************************************************/
329 
// Runs row_pitch_stats() and find_row_pitch() on every row of the block.
//   block       - block whose rows are analysed.
//   block_index - block number, forwarded to find_row_pitch().
//   testing_on  - forwarded to both helpers to enable debug output.
// Always returns false.
330 bool compute_rows_pitch( // find line stats
331  TO_BLOCK *block, // block to do
332  int32_t block_index, // block number
333  bool testing_on // correct orientation
334 ) {
335  int32_t maxwidth; // of spaces
336  TO_ROW *row; // current row
337  int32_t row_index; // row number.
338  float lower, upper; // cluster thresholds
339  TO_ROW_IT row_it = block->get_rows();
340 
341  row_index = 1;
342  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
343  row = row_it.data();
344  ASSERT_HOST(row->xheight > 0);
    // NOTE(review): the listing skips source line 345 here.
346  maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace));
    // find_row_pitch is only attempted when gap statistics were obtained.
347  if (row_pitch_stats(row, maxwidth, testing_on) &&
348  find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index,
349  testing_on)) {
    // With no fixed pitch, use the proportional estimates for the row's
    // space and kern sizes.
350  if (row->fixed_pitch == 0) {
351  lower = row->pr_nonsp;
352  upper = row->pr_space;
353  row->space_size = upper;
354  row->kern_size = lower;
355  }
356  } else {
357  row->fixed_pitch = 0.0f; // insufficient data
    // NOTE(review): the listing skips source line 358 here.
359  }
360  row_index++;
361  }
362  return false;
363 }
364 
365 /**********************************************************************
366  * try_doc_fixed
367  *
368  * Attempt to call the entire document fixed pitch.
369  **********************************************************************/
370 
// Builds a page-wide projection and median pitch from all rows and runs
// tune_row_pitch() on it, optionally printing/plotting the result.
//   page_tr     - not referenced in the visible lines of this function.
//   port_blocks - blocks whose rows contribute to the page statistics.
//   gradient    - page skew; used to derive shift_factor.
// Every return statement visible here returns false.
371 bool try_doc_fixed( // determine pitch
372  ICOORD page_tr, // top right
373  TO_BLOCK_LIST *port_blocks, // input list
374  float gradient // page skew
375 ) {
376  int16_t master_x; // uniform shifts
377  int16_t pitch; // median pitch.
378  int x; // profile coord
379  int prop_blocks; // correct counts
380  int fixed_blocks;
381  int total_row_count; // total in page
382  // iterator
383  TO_BLOCK_IT block_it = port_blocks;
384  TO_BLOCK *block; // current block;
385  TO_ROW *row; // current row
386  int16_t projection_left; // edges
387  int16_t projection_right;
388  int16_t row_left; // edges of row
389  int16_t row_right;
390  float master_y; // uniform shifts
391  float shift_factor; // page skew correction
392  float final_pitch; // output pitch
393  float row_y; // baseline
394  STATS projection; // entire page
395  STATS pitches(0, MAX_ALLOWED_PITCH);
396  // for median
397  float sp_sd; // space sd
398  int16_t mid_cuts; // no of cheap cuts
399  float pitch_sd; // sync rating
400 
401  if (block_it.empty()
402  // || block_it.data()==block_it.data_relative(1)
    // NOTE(review): the listing skips source line 403; the remainder of
    // this condition is not visible.
404  return false;
405  }
406  shift_factor = gradient / (gradient * gradient + 1);
407  // row iterator
    // The first row of the first block supplies the reference point
    // (master_x, master_y) against which other rows are shifted.
408  TO_ROW_IT row_it(block_it.data()->get_rows());
409  master_x = row_it.data()->projection_left;
410  master_y = row_it.data()->baseline.y(master_x);
411  projection_left = INT16_MAX;
412  projection_right = -INT16_MAX;
    // prop_blocks and fixed_blocks are zeroed here and, in the visible
    // lines, only read by the debug print below.
413  prop_blocks = 0;
414  fixed_blocks = 0;
415  total_row_count = 0;
416 
    // First sweep: collect positive row pitches and the page-wide extent
    // of the skew-shifted row projections.
417  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
418  block = block_it.data();
419  row_it.set_to_list(block->get_rows());
420  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
421  row = row_it.data();
422  total_row_count++;
423  if (row->fixed_pitch > 0) {
424  pitches.add(static_cast<int32_t>(row->fixed_pitch), 1);
425  }
426  // find median
427  row_y = row->baseline.y(master_x);
428  row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
429  row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y));
430  if (row_left < projection_left) {
431  projection_left = row_left;
432  }
433  if (row_right > projection_right) {
434  projection_right = row_right;
435  }
436  }
437  }
    // No row had a positive fixed pitch: nothing to work with.
438  if (pitches.get_total() == 0) {
439  return false;
440  }
441  projection.set_range(projection_left, projection_right);
442 
    // Second sweep: accumulate every row's projection into the page
    // projection at its shifted x position.
443  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
444  block = block_it.data();
445  row_it.set_to_list(block->get_rows());
446  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
447  row = row_it.data();
448  row_y = row->baseline.y(master_x);
449  row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
450  for (x = row->projection_left; x < row->projection_right; x++, row_left++) {
451  projection.add(row_left, row->projection.pile_count(x));
452  }
453  }
454  }
455 
    // `row` is re-pointed at the first row of the block the iterator is
    // currently on; its char_cells receive the page-level cells.
456  row_it.set_to_list(block_it.data()->get_rows());
457  row = row_it.data();
458 #ifndef GRAPHICS_DISABLED
459  if (textord_show_page_cuts && to_win != nullptr) {
460  projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
461  }
462 #endif
    // Median row pitch drives the page-level tune_row_pitch call.
463  final_pitch = pitches.ile(0.5);
464  pitch = static_cast<int16_t>(final_pitch);
465  pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,
466  final_pitch, sp_sd, mid_cuts, &row->char_cells, false);
467 
    // NOTE(review): the listing skips source line 468; the statement that
    // opens the block closed at line 474 (guarding this print) is not
    // visible.
469  tprintf(
470  "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%"
471  "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
472  prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,
473  pitch_sd / pitch, pitch_sd / total_row_count / pitch);
474  }
475 
476 #ifndef GRAPHICS_DISABLED
477  if (textord_show_page_cuts && to_win != nullptr) {
478  float row_shift; // shift for row
479  ICOORDELT_LIST *master_cells; // cells for page
480  master_cells = &row->char_cells;
    // Plot the page-level cells against every row, shifted for skew.
    // This loop reassigns `row`.
481  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
482  block = block_it.data();
483  row_it.set_to_list(block->get_rows());
484  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
485  row = row_it.data();
486  row_y = row->baseline.y(master_x);
487  row_shift = shift_factor * (master_y - row_y);
488  plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
489  }
490  }
491  }
492 #endif
    // NOTE(review): when the plotting branch above ran, `row` is the last
    // row plotted, so this clears that row's cells rather than the
    // master_cells list filled by tune_row_pitch - confirm this is intended.
493  row->char_cells.clear();
494  return false;
495 }
496 
497 /**********************************************************************
498  * try_block_fixed
499  *
500  * Try to call the entire block fixed.
501  **********************************************************************/
502 
503 bool try_block_fixed( // find line stats
504  TO_BLOCK *block, // block to do
505  int32_t block_index // block number
506 ) {
507  return false;
508 }
509 
510 /**********************************************************************
511  * try_rows_fixed
512  *
513  * Decide whether each row is fixed pitch individually.
514  **********************************************************************/
515 
// Runs fixed_pitch_row() on each row that already has a positive fixed_pitch,
// then tallies the rows' decisions with count_block_votes() to choose the
// block's pitch_decision.
//   block       - block whose rows are examined; its pitch_decision is set.
//   block_index - block number, forwarded to helpers and debug output.
//   testing_on  - part of the condition guarding the debug print.
// Always returns false.
516 bool try_rows_fixed( // find line stats
517  TO_BLOCK *block, // block to do
518  int32_t block_index, // block number
519  bool testing_on // correct orientation
520 ) {
521  TO_ROW *row; // current row
522  int32_t row_index; // row number.
523  int32_t def_fixed = 0; // counters
524  int32_t def_prop = 0;
525  int32_t maybe_fixed = 0;
526  int32_t maybe_prop = 0;
527  int32_t dunno = 0;
528  int32_t corr_fixed = 0;
529  int32_t corr_prop = 0;
530  float lower, upper; // cluster thresholds
531  TO_ROW_IT row_it = block->get_rows();
532 
533  row_index = 1;
534  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
535  row = row_it.data();
536  ASSERT_HOST(row->xheight > 0);
    // fixed_pitch_row is only called for rows with a positive pitch; the
    // inner test re-reads fixed_pitch after that call returned true.
537  if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) {
538  if (row->fixed_pitch == 0) {
539  lower = row->pr_nonsp;
540  upper = row->pr_space;
541  row->space_size = upper;
542  row->kern_size = lower;
543  }
544  }
545  row_index++;
546  }
547  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
548  dunno);
549  if (testing_on &&
    // NOTE(review): the listing skips source line 550; the rest of this
    // condition is not visible.
551  tprintf("Initially:");
552  print_block_counts(block, block_index);
553  }
    // Definite votes are compared first, each side needing to outweigh the
    // other by textord_words_veto_power; the "maybe" votes are consulted
    // only when there are no definite votes at all.
554  if (def_fixed > def_prop * textord_words_veto_power) {
    // NOTE(review): the listing skips source line 555 (body of this branch).
556  } else if (def_prop > def_fixed * textord_words_veto_power) {
    // NOTE(review): the listing skips source line 557 (body of this branch).
558  } else if (def_fixed > 0 || def_prop > 0) {
559  block->pitch_decision = PITCH_DUNNO;
560  } else if (maybe_fixed > maybe_prop * textord_words_veto_power) {
    // NOTE(review): the listing skips source line 561 (body of this branch).
562  } else if (maybe_prop > maybe_fixed * textord_words_veto_power) {
    // NOTE(review): the listing skips source line 563 (body of this branch).
564  } else {
565  block->pitch_decision = PITCH_DUNNO;
566  }
567  return false;
568 }
569 
570 /**********************************************************************
571  * print_block_counts
572  *
573  * Count up how many rows have what decision and print the results.
574  **********************************************************************/
575 
576 void print_block_counts( // find line stats
577  TO_BLOCK *block, // block to do
578  int32_t block_index // block number
579 ) {
580  int32_t def_fixed = 0; // counters
581  int32_t def_prop = 0;
582  int32_t maybe_fixed = 0;
583  int32_t maybe_prop = 0;
584  int32_t dunno = 0;
585  int32_t corr_fixed = 0;
586  int32_t corr_prop = 0;
587 
588  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
589  dunno);
590  tprintf("Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed);
591  if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) {
592  tprintf(" (Wrongly)");
593  }
594  tprintf(" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
595  if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) {
596  tprintf(" (Wrongly)");
597  }
598  tprintf(" prop, %d dunno\n", dunno);
599 }
600 
601 /**********************************************************************
602  * count_block_votes
603  *
604  * Count the number of rows in the block with each kind of pitch_decision.
605  **********************************************************************/
606 
607 void count_block_votes( // find line stats
608  TO_BLOCK *block, // block to do
609  int32_t &def_fixed, // add to counts
610  int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,
611  int32_t &corr_prop, int32_t &dunno) {
612  TO_ROW *row; // current row
613  TO_ROW_IT row_it = block->get_rows();
614 
615  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
616  row = row_it.data();
617  switch (row->pitch_decision) {
618  case PITCH_DUNNO:
619  dunno++;
620  break;
621  case PITCH_DEF_PROP:
622  def_prop++;
623  break;
624  case PITCH_MAYBE_PROP:
625  maybe_prop++;
626  break;
627  case PITCH_DEF_FIXED:
628  def_fixed++;
629  break;
630  case PITCH_MAYBE_FIXED:
631  maybe_fixed++;
632  break;
633  case PITCH_CORR_PROP:
634  corr_prop++;
635  break;
636  case PITCH_CORR_FIXED:
637  corr_fixed++;
638  break;
639  }
640  }
641 }
642 
643 /**********************************************************************
644  * row_pitch_stats
645  *
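646  * Collect the gaps between blobs in a row, cluster them, and derive the
 *  row's initial proportional (pr_nonsp, pr_space) and fixed-pitch
 *  (fp_nonsp, fp_space) estimates. Returns false when there is no usable data.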
647  **********************************************************************/
648 
// Collects the horizontal gaps between the row's blobs, clusters them and
// derives initial space/non-space estimates for the row.
//   row        - row to analyse; pr_nonsp, pr_space, fp_nonsp and fp_space
//                are written on success.
//   maxwidth   - gaps of this size or larger are ignored; also the upper
//                bound of the gap histogram.
//   testing_on - enables debug printing.
// Returns false when no gap was recorded or no cluster was found, true
// otherwise.
649 bool row_pitch_stats( // find line stats
650  TO_ROW *row, // current row
651  int32_t maxwidth, // of spaces
652  bool testing_on // correct orientation
653 ) {
654  BLOBNBOX *blob; // current blob
655  int gap_index; // current gap
656  int32_t prev_x; // end of prev blob
657  int32_t cluster_count; // no of clusters
658  int32_t prev_count; // of clusters
659  int32_t smooth_factor; // for smoothing stats
660  TBOX blob_box; // bounding box
661  float lower, upper; // cluster thresholds
662  // gap sizes
663  float gaps[BLOCK_STATS_CLUSTERS];
664  // blobs
665  BLOBNBOX_IT blob_it = row->blob_list();
666  STATS gap_stats(0, maxwidth);
667  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
668  // clusters
669 
670  smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5);
    // Walk the blobs once, recording the gap from the previous blob's right
    // edge to this blob's left edge. Blobs flagged joined_to_prev are
    // skipped and do not move prev_x.
671  if (!blob_it.empty()) {
672  prev_x = blob_it.data()->bounding_box().right();
673  blob_it.forward();
674  while (!blob_it.at_first()) {
675  blob = blob_it.data();
676  if (!blob->joined_to_prev()) {
677  blob_box = blob->bounding_box();
678  if (blob_box.left() - prev_x < maxwidth) {
679  gap_stats.add(blob_box.left() - prev_x, 1);
680  }
681  prev_x = blob_box.right();
682  }
683  blob_it.forward();
684  }
685  }
686  if (gap_stats.get_total() == 0) {
687  return false;
688  }
689  cluster_count = 0;
690  lower = row->xheight * words_initial_lower;
691  upper = row->xheight * words_initial_upper;
692  gap_stats.smooth(smooth_factor);
    // Re-run clustering while it keeps producing more clusters, stopping
    // once the count stops growing or reaches BLOCK_STATS_CLUSTERS.
693  do {
694  prev_count = cluster_count;
695  cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
696  BLOCK_STATS_CLUSTERS, cluster_stats);
697  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
698  if (cluster_count < 1) {
699  return false;
700  }
    // Cluster medians are read from cluster_stats[1..cluster_count];
    // element 0 is not read here.
701  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
702  gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
703  }
704  // get medians
705  if (testing_on) {
706  tprintf("cluster_count=%d:", cluster_count);
707  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
708  tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
709  }
710  tprintf("\n");
711  }
    // Sort the medians ascending so the threshold scans below can stop at
    // the first gap that reaches a threshold.
712  qsort(gaps, cluster_count, sizeof(float), sort_floats);
713 
714  // Try to find proportional non-space and space for row.
715  lower = row->xheight * words_default_prop_nonspace;
716  upper = row->xheight * textord_words_min_minspace;
    // Empty-bodied loop: advance gap_index to the first median >= lower.
717  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) {
718  ;
719  }
720  if (gap_index == 0) {
721  if (testing_on) {
722  tprintf("No clusters below nonspace threshold!!\n");
723  }
    // Every median is at or above the non-space threshold: use the two
    // smallest medians, or the threshold itself when only one exists.
724  if (cluster_count > 1) {
725  row->pr_nonsp = gaps[0];
726  row->pr_space = gaps[1];
727  } else {
728  row->pr_nonsp = lower;
729  row->pr_space = gaps[0];
730  }
731  } else {
    // Non-space is the largest median below `lower`; space is the first
    // median at or above `upper`, if there is one.
732  row->pr_nonsp = gaps[gap_index - 1];
733  while (gap_index < cluster_count && gaps[gap_index] < upper) {
734  gap_index++;
735  }
736  if (gap_index == cluster_count) {
737  if (testing_on) {
738  tprintf("No clusters above nonspace threshold!!\n");
739  }
740  row->pr_space = lower * textord_spacesize_ratioprop;
741  } else {
742  row->pr_space = gaps[gap_index];
743  }
744  }
745 
746  // Now try to find the fixed pitch space and non-space.
747  upper = row->xheight * words_default_fixed_space;
    // Empty-bodied loop: advance gap_index to the first median >= upper.
748  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) {
749  ;
750  }
751  if (gap_index == 0) {
752  if (testing_on) {
753  tprintf("No clusters below space threshold!!\n");
754  }
755  row->fp_nonsp = upper;
756  row->fp_space = gaps[0];
757  } else {
758  row->fp_nonsp = gaps[gap_index - 1];
759  if (gap_index == cluster_count) {
760  if (testing_on) {
761  tprintf("No clusters above space threshold!!\n");
762  }
    // No median reaches the threshold: fall back to the row's xheight.
763  row->fp_space = row->xheight;
764  } else {
765  row->fp_space = gaps[gap_index];
766  }
767  }
768  if (testing_on) {
769  tprintf(
770  "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, "
771  "fp_space=%g\n",
772  row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
773  }
774  return true; // computed some stats
775 }
776 
777 /**********************************************************************
778  * find_row_pitch
779  *
780  * Check to see if this row could be fixed pitch using the given spacings.
781  * Blobs with gaps smaller than the lower threshold are assumed to be one.
782  * The larger threshold is the word gap threshold.
783  **********************************************************************/
784 
// Estimates a pitch for the row from count_pitch_stats(), once with dm_gap and
// once with a gap of 0, keeps whichever model compares better, and stores the
// resulting spacing fields on the row.
//   row         - row to analyse; fp_space/fp_nonsp are read, fixed_pitch,
//                 kern_size, min_space, max_nonspace, space_size,
//                 space_threshold and used_dm_model are written.
//   maxwidth    - histogram range; also the "insufficient data" IQR limit.
//   dm_gap      - gap argument for the first count_pitch_stats() call.
//   block       - supplies block->xheight for the debug print and final test.
//   block_index, row_index - used only in debug output.
//   testing_on  - enables debug output.
// Returns false when both models have a pitch IQR above maxwidth, true
// otherwise.
785 bool find_row_pitch( // find lines
786  TO_ROW *row, // row to do
787  int32_t maxwidth, // max permitted space
788  int32_t dm_gap, // ignorable gaps
789  TO_BLOCK *block, // block of row
790  int32_t block_index, // block_number
791  int32_t row_index, // number of row
792  bool testing_on // correct orientation
793 ) {
794  bool used_dm_model; // looks like dot matrix
795  float min_space; // estimate threshold
796  float non_space; // gap size
797  float gap_iqr; // interquartile range
798  float pitch_iqr;
799  float dm_gap_iqr; // interquartile range
800  float dm_pitch_iqr;
801  float dm_pitch; // pitch with dm on
802  float pitch; // revised estimate
803  float initial_pitch; // guess at pitch
804  STATS gap_stats(0, maxwidth);
805  // centre-centre
806  STATS pitch_stats(0, maxwidth);
807 
    // Start from the fp_space/fp_nonsp estimates, capping the pitch guess
    // at xheight and the non-space at the pitch guess.
808  row->fixed_pitch = 0.0f;
809  initial_pitch = row->fp_space;
810  if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) {
811  initial_pitch = row->xheight; // keep pitch decent
812  }
813  non_space = row->fp_nonsp;
814  if (non_space > initial_pitch) {
815  non_space = initial_pitch;
816  }
817  min_space = (initial_pitch + non_space) / 2;
818 
    // Model 1: statistics gathered with dm_gap. On failure the IQRs are set
    // so that this model loses the comparison further down.
819  if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false,
820  dm_gap)) {
821  dm_gap_iqr = 0.0001f;
822  dm_pitch_iqr = maxwidth * 2.0f;
823  dm_pitch = initial_pitch;
824  } else {
825  dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
826  dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
827  dm_pitch = pitch_stats.ile(0.5);
828  }
    // Model 2: the same statistics with a gap argument of 0.
829  gap_stats.clear();
830  pitch_stats.clear();
831  if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) {
832  gap_iqr = 0.0001f;
833  pitch_iqr = maxwidth * 3.0f;
834  } else {
835  gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
836  pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
837  if (testing_on) {
838  tprintf(
839  "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
840  "pitch=%g\n",
841  initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
842  }
843  initial_pitch = pitch_stats.ile(0.5);
    // If the measured pitch fell below min_space, run once more with the
    // pitch as the space threshold. The stats objects are not cleared
    // before this call; presumably count_pitch_stats resets them - TODO
    // confirm.
844  if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch,
845  initial_pitch, true, false, 0)) {
846  min_space = initial_pitch;
847  gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
848  pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
849  if (testing_on) {
850  tprintf(
851  "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
852  "pitch=%g\n",
853  initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
854  }
855  initial_pitch = pitch_stats.ile(0.5);
856  }
857  }
    // NOTE(review): the listing skips source line 858; the statement that
    // opens the block closed at line 864 (guarding this print) is not
    // visible.
859  tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index,
860  row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
861  pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth
862  ? 'D'
863  : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
864  }
    // Both models too spread out: give up on this row.
865  if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
    // NOTE(review): the listing skips source lines 866-867 here.
868  tprintf("\n");
869  }
870  return false; // insufficient data
871  }
    // Pick the model by cross-multiplied IQRs, i.e. compare
    // pitch_iqr / gap_iqr against dm_pitch_iqr / dm_gap_iqr without dividing.
872  if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
873  if (testing_on) {
874  tprintf(
875  "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
876  "dm_gap_iqr=%g\n",
877  pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
878  }
879  gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
880  pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
881  pitch = pitch_stats.ile(0.5);
882  used_dm_model = false;
883  } else {
884  if (testing_on) {
885  tprintf(
886  "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
887  "dm_gap_iqr=%g\n",
888  pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
889  }
890  gap_iqr = dm_gap_iqr;
891  pitch_iqr = dm_pitch_iqr;
892  pitch = dm_pitch;
893  used_dm_model = true;
894  }
    // NOTE(review): the listing skips source line 895; the statement that
    // opens the block closed at line 903 (guarding these prints) is not
    // visible.
896  tprintf("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch);
897  tprintf("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
898  pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
899  pitch_iqr < block->xheight * textord_max_pitch_iqr &&
900  pitch < block->xheight * textord_words_default_maxspace
901  ? 'F'
902  : 'P');
903  }
904  if (pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
905  pitch_iqr < block->xheight * textord_max_pitch_iqr &&
906  pitch < block->xheight * textord_words_default_maxspace) {
    // NOTE(review): the listing skips source line 907 (body of this branch).
908  } else {
    // NOTE(review): the listing skips source line 909 (body of this branch).
910  }
    // Store the result. kern_size is read from gap_stats, which at this
    // point holds the statistics of the gap-0 runs whichever model was
    // chosen above.
911  row->fixed_pitch = pitch;
912  row->kern_size = gap_stats.ile(0.5);
    // The cast applies to the sum; the division by 2 is integer division.
913  row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2;
914  if (row->min_space > row->fixed_pitch) {
915  row->min_space = static_cast<int32_t>(row->fixed_pitch);
916  }
    // max_nonspace equals min_space, so space_threshold is the same value.
917  row->max_nonspace = row->min_space;
918  row->space_size = row->fixed_pitch;
919  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
920  row->used_dm_model = used_dm_model;
921  return true;
922 }
923 
924 /**********************************************************************
925  * fixed_pitch_row
926  *
927  * Check to see if this row could be fixed pitch using the given spacings.
928  * Blobs with gaps smaller than the lower threshold are assumed to be one.
929  * The larger threshold is the word gap threshold.
930  **********************************************************************/
931 
932 bool fixed_pitch_row(TO_ROW *row, // row to do
933  BLOCK *block,
934  int32_t block_index // block_number
935 ) {
936  const char *res_string; // pitch result
937  int16_t mid_cuts; // no of cheap cuts
938  float non_space; // gap size
939  float pitch_sd; // error on pitch
940  float sp_sd = 0.0f; // space sd
941 
942  non_space = row->fp_nonsp;
943  if (non_space > row->fixed_pitch) {
944  non_space = row->fixed_pitch;
945  }
946  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
947  if (textord_all_prop || (pb != nullptr && !pb->IsText())) {
948  // Set the decision to definitely proportional.
949  pitch_sd = textord_words_def_prop * row->fixed_pitch;
951  } else {
952  pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right,
953  (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd,
954  mid_cuts, &row->char_cells, block_index == textord_debug_block);
955  if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch &&
956  ((pitsync_linear_version & 3) < 3 ||
957  ((pitsync_linear_version & 3) >= 3 &&
958  (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) {
959  if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps &&
960  ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) {
962  } else {
964  }
965  } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 ||
966  pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
967  if (pitch_sd < textord_words_def_prop * row->fixed_pitch) {
969  } else {
971  }
972  } else {
974  }
975  }
976 
978  res_string = "??";
979  switch (row->pitch_decision) {
980  case PITCH_DEF_PROP:
981  res_string = "DP";
982  break;
983  case PITCH_MAYBE_PROP:
984  res_string = "MP";
985  break;
986  case PITCH_DEF_FIXED:
987  res_string = "DF";
988  break;
989  case PITCH_MAYBE_FIXED:
990  res_string = "MF";
991  break;
992  default:
993  res_string = "??";
994  }
995  tprintf(":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->fixed_pitch, sp_sd, res_string);
996  }
997  return true;
998 }
999 
1000 /**********************************************************************
1001  * count_pitch_stats
1002  *
1003  * Count up the gap and pitch stats on the block to see if it is fixed pitch.
1004  * Blobs with gaps smaller than the lower threshold are assumed to be one.
1005  * The larger threshold is the word gap threshold.
1006  * The return value indicates whether there were any decent values to use.
1007  **********************************************************************/
1008 
1009 bool count_pitch_stats( // find lines
1010  TO_ROW *row, // row to do
1011  STATS *gap_stats, // blob gaps
1012  STATS *pitch_stats, // centre-centre stats
1013  float initial_pitch, // guess at pitch
1014  float min_space, // estimate space size
1015  bool ignore_outsize, // discard big objects
1016  bool split_outsize, // split big objects
1017  int32_t dm_gap // ignorable gaps
1018 ) {
1019  bool prev_valid; // not word broken
1020  BLOBNBOX *blob; // current blob
1021  // blobs
1022  BLOBNBOX_IT blob_it = row->blob_list();
1023  int32_t prev_right; // end of prev blob
1024  int32_t prev_centre; // centre of previous blob
1025  int32_t x_centre; // centre of this blob
1026  int32_t blob_width; // width of blob
1027  int32_t width_units; // no of widths in blob
1028  float width; // blob width
1029  TBOX blob_box; // bounding box
1030  TBOX joined_box; // of super blob
1031 
1032  gap_stats->clear();
1033  pitch_stats->clear();
1034  if (blob_it.empty()) {
1035  return false;
1036  }
1037  prev_valid = false;
1038  prev_centre = 0;
1039  prev_right = 0; // stop compiler warning
1040  joined_box = blob_it.data()->bounding_box();
1041  do {
1042  blob_it.forward();
1043  blob = blob_it.data();
1044  if (!blob->joined_to_prev()) {
1045  blob_box = blob->bounding_box();
1046  if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) ||
1047  blob->cblob() == nullptr) {
1048  joined_box += blob_box; // merge blobs
1049  } else {
1050  blob_width = joined_box.width();
1051  if (split_outsize) {
1052  width_units =
1053  static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5));
1054  if (width_units < 1) {
1055  width_units = 1;
1056  }
1057  width_units--;
1058  } else if (ignore_outsize) {
1059  width = static_cast<float>(blob_width) / initial_pitch;
1060  width_units =
1061  width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0
1062  : -1;
1063  } else {
1064  width_units = 0; // everything in
1065  }
1066  x_centre = static_cast<int32_t>(joined_box.left() +
1067  (blob_width - width_units * initial_pitch) / 2);
1068  if (prev_valid && width_units >= 0) {
1069  // if (width_units>0)
1070  // {
1071  // tprintf("wu=%d,
1072  // width=%d,
1073  // xc=%d, adding
1074  // %d\n",
1075  // width_units,blob_width,x_centre,x_centre-prev_centre);
1076  // }
1077  gap_stats->add(joined_box.left() - prev_right, 1);
1078  pitch_stats->add(x_centre - prev_centre, 1);
1079  }
1080  prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch);
1081  prev_right = joined_box.right();
1082  prev_valid = blob_box.left() - joined_box.right() < min_space;
1083  prev_valid = prev_valid && width_units >= 0;
1084  joined_box = blob_box;
1085  }
1086  }
1087  } while (!blob_it.at_first());
1088  return gap_stats->get_total() >= 3;
1089 }
1090 
1091 /**********************************************************************
1092  * tune_row_pitch
1093  *
1094  * Use a dp algorithm to fit the character cells and return the sd of
1095  * the cell size over the row.
1096  **********************************************************************/
1097 
1098 float tune_row_pitch( // find fp cells
1099  TO_ROW *row, // row to do
1100  STATS *projection, // vertical projection
1101  int16_t projection_left, // edge of projection
1102  int16_t projection_right, // edge of projection
1103  float space_size, // size of blank
1104  float &initial_pitch, // guess at pitch
1105  float &best_sp_sd, // space sd
1106  int16_t &best_mid_cuts, // no of cheap cuts
1107  ICOORDELT_LIST *best_cells, // row cells
1108  bool testing_on // inidividual words
1109 ) {
1110  int pitch_delta; // offset pitch
1111  int16_t mid_cuts; // cheap cuts
1112  float pitch_sd; // current sd
1113  float best_sd; // best result
1114  float best_pitch; // pitch for best result
1115  float initial_sd; // starting error
1116  float sp_sd; // space sd
1117  ICOORDELT_LIST test_cells; // row cells
1118  ICOORDELT_IT best_it; // start of best list
1119 
1121  return tune_row_pitch2(row, projection, projection_left, projection_right, space_size,
1122  initial_pitch, best_sp_sd,
1123  // space sd
1124  best_mid_cuts, best_cells, testing_on);
1125  }
1126  if (textord_disable_pitch_test) {
1127  best_sp_sd = initial_pitch;
1128  return initial_pitch;
1129  }
1130  initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1131  initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on);
1132  best_sd = initial_sd;
1133  best_pitch = initial_pitch;
1134  if (testing_on) {
1135  tprintf("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1136  }
1137  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1138  pitch_sd =
1139  compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1140  initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1141  if (testing_on) {
1142  tprintf("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd);
1143  }
1144  if (pitch_sd < best_sd) {
1145  best_sd = pitch_sd;
1146  best_mid_cuts = mid_cuts;
1147  best_sp_sd = sp_sd;
1148  best_pitch = initial_pitch + pitch_delta;
1149  best_cells->clear();
1150  best_it.set_to_list(best_cells);
1151  best_it.add_list_after(&test_cells);
1152  } else {
1153  test_cells.clear();
1154  }
1155  if (pitch_sd > initial_sd) {
1156  break; // getting worse
1157  }
1158  }
1159  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1160  pitch_sd =
1161  compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1162  initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1163  if (testing_on) {
1164  tprintf("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd);
1165  }
1166  if (pitch_sd < best_sd) {
1167  best_sd = pitch_sd;
1168  best_mid_cuts = mid_cuts;
1169  best_sp_sd = sp_sd;
1170  best_pitch = initial_pitch - pitch_delta;
1171  best_cells->clear();
1172  best_it.set_to_list(best_cells);
1173  best_it.add_list_after(&test_cells);
1174  } else {
1175  test_cells.clear();
1176  }
1177  if (pitch_sd > initial_sd) {
1178  break;
1179  }
1180  }
1181  initial_pitch = best_pitch;
1182 
1184  print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch);
1185  }
1186 
1187  return best_sd;
1188 }
1189 
1190 /**********************************************************************
1191  * tune_row_pitch2
1192  *
1193  * Use a dp algorithm to fit the character cells and return the sd of
1194  * the cell size over the row.
1195  **********************************************************************/
1196 
1197 float tune_row_pitch2( // find fp cells
1198  TO_ROW *row, // row to do
1199  STATS *projection, // vertical projection
1200  int16_t projection_left, // edge of projection
1201  int16_t projection_right, // edge of projection
1202  float space_size, // size of blank
1203  float &initial_pitch, // guess at pitch
1204  float &best_sp_sd, // space sd
1205  int16_t &best_mid_cuts, // no of cheap cuts
1206  ICOORDELT_LIST *best_cells, // row cells
1207  bool testing_on // inidividual words
1208 ) {
1209  int pitch_delta; // offset pitch
1210  int16_t pixel; // pixel coord
1211  int16_t best_pixel; // pixel coord
1212  int16_t best_delta; // best pitch
1213  int16_t best_pitch; // best pitch
1214  int16_t start; // of good range
1215  int16_t end; // of good range
1216  int32_t best_count; // lowest sum
1217  float best_sd; // best result
1218 
1219  best_sp_sd = initial_pitch;
1220 
1221  best_pitch = static_cast<int>(initial_pitch);
1222  if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
1223  return initial_pitch;
1224  }
1225  std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection
1226 
1227  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1228  sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta + 1);
1229  }
1230  for (pixel = projection_left; pixel <= projection_right; pixel++) {
1231  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1232  sum_proj[textord_pitch_range + pitch_delta].add(
1233  (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel));
1234  }
1235  }
1236  best_count = sum_proj[textord_pitch_range].pile_count(0);
1237  best_delta = 0;
1238  best_pixel = 0;
1239  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1240  for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1241  if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) {
1242  best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel);
1243  best_delta = pitch_delta;
1244  best_pixel = pixel;
1245  }
1246  }
1247  }
1248  if (testing_on) {
1249  tprintf("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta,
1250  best_count);
1251  }
1252  best_pitch += best_delta;
1253  initial_pitch = best_pitch;
1254  best_count++;
1255  best_count += best_count;
1256  for (start = best_pixel - 2;
1257  start > best_pixel - best_pitch &&
1258  sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count;
1259  start--) {
1260  ;
1261  }
1262  for (end = best_pixel + 2;
1263  end < best_pixel + best_pitch &&
1264  sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count;
1265  end++) {
1266  ;
1267  }
1268 
1269  best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1270  initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on,
1271  start, end);
1272  if (testing_on) {
1273  tprintf("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd);
1274  }
1275 
1277  print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch);
1278  }
1279 
1280  return best_sd;
1281 }
1282 
1283 /**********************************************************************
1284  * compute_pitch_sd
1285  *
1286  * Use a dp algorithm to fit the character cells and return the sd of
1287  * the cell size over the row.
1288  **********************************************************************/
1289 
1290 float compute_pitch_sd( // find fp cells
1291  TO_ROW *row, // row to do
1292  STATS *projection, // vertical projection
1293  int16_t projection_left, // edge
1294  int16_t projection_right, // edge
1295  float space_size, // size of blank
1296  float initial_pitch, // guess at pitch
1297  float &sp_sd, // space sd
1298  int16_t &mid_cuts, // no of free cuts
1299  ICOORDELT_LIST *row_cells, // list of chop pts
1300  bool testing_on, // inidividual words
1301  int16_t start, // start of good range
1302  int16_t end // end of good range
1303 ) {
1304  int16_t occupation; // no of cells in word.
1305  // blobs
1306  BLOBNBOX_IT blob_it = row->blob_list();
1307  BLOBNBOX_IT start_it; // start of word
1308  BLOBNBOX_IT plot_it; // for plotting
1309  int16_t blob_count; // no of blobs
1310  TBOX blob_box; // bounding box
1311  TBOX prev_box; // of super blob
1312  int32_t prev_right; // of word sync
1313  int scale_factor; // on scores for big words
1314  int32_t sp_count; // spaces
1315  FPSEGPT_LIST seg_list; // char cells
1316  FPSEGPT_IT seg_it; // iterator
1317  int16_t segpos; // position of segment
1318  int16_t cellpos; // previous cell boundary
1319  // iterator
1320  ICOORDELT_IT cell_it = row_cells;
1321  ICOORDELT *cell; // new cell
1322  double sqsum; // sum of squares
1323  double spsum; // of spaces
1324  double sp_var; // space error
1325  double word_sync; // result for word
1326  int32_t total_count; // total blobs
1327 
1328  if ((pitsync_linear_version & 3) > 1) {
1329  word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch,
1330  occupation, mid_cuts, row_cells, testing_on, start, end);
1331  sp_sd = occupation;
1332  return word_sync;
1333  }
1334  mid_cuts = 0;
1335  cellpos = 0;
1336  total_count = 0;
1337  sqsum = 0;
1338  sp_count = 0;
1339  spsum = 0;
1340  prev_right = -1;
1341  if (blob_it.empty()) {
1342  return space_size * 10;
1343  }
1344 #ifndef GRAPHICS_DISABLED
1345  if (testing_on && to_win != nullptr) {
1346  blob_box = blob_it.data()->bounding_box();
1347  projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1348  }
1349 #endif
1350  start_it = blob_it;
1351  blob_count = 0;
1352  blob_box = box_next(&blob_it); // first blob
1353  blob_it.mark_cycle_pt();
1354  do {
1355  for (; blob_count > 0; blob_count--) {
1356  box_next(&start_it);
1357  }
1358  do {
1359  prev_box = blob_box;
1360  blob_count++;
1361  blob_box = box_next(&blob_it);
1362  } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1363  plot_it = start_it;
1364  if (pitsync_linear_version & 3) {
1365  word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
1366  projection, projection_left, projection_right,
1367  row->xheight * textord_projection_scale, occupation, &seg_list,
1368  start, end);
1369  } else {
1370  word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
1371  projection, &seg_list);
1372  }
1373  if (testing_on) {
1374  tprintf("Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.right(), prev_box.top(),
1375  seg_list.length() - 1, word_sync);
1376  seg_it.set_to_list(&seg_list);
1377  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1378  if (seg_it.data()->faked) {
1379  tprintf("(F)");
1380  }
1381  tprintf("%d, ", seg_it.data()->position());
1382  // tprintf("C=%g, s=%g, sq=%g\n",
1383  // seg_it.data()->cost_function(),
1384  // seg_it.data()->sum(),
1385  // seg_it.data()->squares());
1386  }
1387  tprintf("\n");
1388  }
1389 #ifndef GRAPHICS_DISABLED
1390  if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1391  plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1392  }
1393 #endif
1394  seg_it.set_to_list(&seg_list);
1395  if (prev_right >= 0) {
1396  sp_var = seg_it.data()->position() - prev_right;
1397  sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1398  sp_var *= sp_var;
1399  spsum += sp_var;
1400  sp_count++;
1401  }
1402  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1403  segpos = seg_it.data()->position();
1404  if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) {
1405  // big gap
1406  while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) {
1407  cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0);
1408  cell_it.add_after_then_move(cell);
1409  cellpos += static_cast<int16_t>(initial_pitch);
1410  }
1411  // make new one
1412  cell = new ICOORDELT(segpos, 0);
1413  cell_it.add_after_then_move(cell);
1414  cellpos = segpos;
1415  } else if (segpos > cellpos - initial_pitch / 2) {
1416  cell = cell_it.data();
1417  // average positions
1418  cell->set_x((cellpos + segpos) / 2);
1419  cellpos = cell->x();
1420  }
1421  }
1422  seg_it.move_to_last();
1423  prev_right = seg_it.data()->position();
1425  scale_factor = (seg_list.length() - 2) / 2;
1426  if (scale_factor < 1) {
1427  scale_factor = 1;
1428  }
1429  } else {
1430  scale_factor = 1;
1431  }
1432  sqsum += word_sync * scale_factor;
1433  total_count += (seg_list.length() - 1) * scale_factor;
1434  seg_list.clear();
1435  } while (!blob_it.cycled_list());
1436  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1437  return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1438 }
1439 
1440 /**********************************************************************
1441  * compute_pitch_sd2
1442  *
1443  * Use a dp algorithm to fit the character cells and return the sd of
1444  * the cell size over the row.
1445  **********************************************************************/
1446 
1447 float compute_pitch_sd2( // find fp cells
1448  TO_ROW *row, // row to do
1449  STATS *projection, // vertical projection
1450  int16_t projection_left, // edge
1451  int16_t projection_right, // edge
1452  float initial_pitch, // guess at pitch
1453  int16_t &occupation, // no of occupied cells
1454  int16_t &mid_cuts, // no of free cuts
1455  ICOORDELT_LIST *row_cells, // list of chop pts
1456  bool testing_on, // inidividual words
1457  int16_t start, // start of good range
1458  int16_t end // end of good range
1459 ) {
1460  // blobs
1461  BLOBNBOX_IT blob_it = row->blob_list();
1462  BLOBNBOX_IT plot_it;
1463  int16_t blob_count; // no of blobs
1464  TBOX blob_box; // bounding box
1465  FPSEGPT_LIST seg_list; // char cells
1466  FPSEGPT_IT seg_it; // iterator
1467  int16_t segpos; // position of segment
1468  // iterator
1469  ICOORDELT_IT cell_it = row_cells;
1470  ICOORDELT *cell; // new cell
1471  double word_sync; // result for word
1472 
1473  mid_cuts = 0;
1474  if (blob_it.empty()) {
1475  occupation = 0;
1476  return initial_pitch * 10;
1477  }
1478 #ifndef GRAPHICS_DISABLED
1479  if (testing_on && to_win != nullptr) {
1480  projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1481  }
1482 #endif
1483  blob_count = 0;
1484  blob_it.mark_cycle_pt();
1485  do {
1486  // first blob
1487  blob_box = box_next(&blob_it);
1488  blob_count++;
1489  } while (!blob_it.cycled_list());
1490  plot_it = blob_it;
1491  word_sync = check_pitch_sync2(
1492  &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1493  projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, start, end);
1494  if (testing_on) {
1495  tprintf("Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.right(), blob_box.top(),
1496  seg_list.length() - 1, word_sync);
1497  seg_it.set_to_list(&seg_list);
1498  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1499  if (seg_it.data()->faked) {
1500  tprintf("(F)");
1501  }
1502  tprintf("%d, ", seg_it.data()->position());
1503  // tprintf("C=%g, s=%g, sq=%g\n",
1504  // seg_it.data()->cost_function(),
1505  // seg_it.data()->sum(),
1506  // seg_it.data()->squares());
1507  }
1508  tprintf("\n");
1509  }
1510 #ifndef GRAPHICS_DISABLED
1511  if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1512  plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1513  }
1514 #endif
1515  seg_it.set_to_list(&seg_list);
1516  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1517  segpos = seg_it.data()->position();
1518  // make new one
1519  cell = new ICOORDELT(segpos, 0);
1520  cell_it.add_after_then_move(cell);
1521  if (seg_it.at_last()) {
1522  mid_cuts = seg_it.data()->cheap_cuts();
1523  }
1524  }
1525  seg_list.clear();
1526  return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10;
1527 }
1528 
1529 /**********************************************************************
1530  * print_pitch_sd
1531  *
1532  * Use a dp algorithm to fit the character cells and return the sd of
1533  * the cell size over the row.
1534  **********************************************************************/
1535 
1536 void print_pitch_sd( // find fp cells
1537  TO_ROW *row, // row to do
1538  STATS *projection, // vertical projection
1539  int16_t projection_left, // edges //size of blank
1540  int16_t projection_right, float space_size,
1541  float initial_pitch // guess at pitch
1542 ) {
1543  const char *res2; // pitch result
1544  int16_t occupation; // used cells
1545  float sp_sd; // space sd
1546  // blobs
1547  BLOBNBOX_IT blob_it = row->blob_list();
1548  BLOBNBOX_IT start_it; // start of word
1549  BLOBNBOX_IT row_start; // start of row
1550  int16_t blob_count; // no of blobs
1551  int16_t total_blob_count; // total blobs in line
1552  TBOX blob_box; // bounding box
1553  TBOX prev_box; // of super blob
1554  int32_t prev_right; // of word sync
1555  int scale_factor; // on scores for big words
1556  int32_t sp_count; // spaces
1557  FPSEGPT_LIST seg_list; // char cells
1558  FPSEGPT_IT seg_it; // iterator
1559  double sqsum; // sum of squares
1560  double spsum; // of spaces
1561  double sp_var; // space error
1562  double word_sync; // result for word
1563  double total_count; // total cuts
1564 
1565  if (blob_it.empty()) {
1566  return;
1567  }
1568  row_start = blob_it;
1569  total_blob_count = 0;
1570 
1571  total_count = 0;
1572  sqsum = 0;
1573  sp_count = 0;
1574  spsum = 0;
1575  prev_right = -1;
1576  blob_it = row_start;
1577  start_it = blob_it;
1578  blob_count = 0;
1579  blob_box = box_next(&blob_it); // first blob
1580  blob_it.mark_cycle_pt();
1581  do {
1582  for (; blob_count > 0; blob_count--) {
1583  box_next(&start_it);
1584  }
1585  do {
1586  prev_box = blob_box;
1587  blob_count++;
1588  blob_box = box_next(&blob_it);
1589  } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1590  word_sync = check_pitch_sync2(
1591  &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1592  projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1593  total_blob_count += blob_count;
1594  seg_it.set_to_list(&seg_list);
1595  if (prev_right >= 0) {
1596  sp_var = seg_it.data()->position() - prev_right;
1597  sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1598  sp_var *= sp_var;
1599  spsum += sp_var;
1600  sp_count++;
1601  }
1602  seg_it.move_to_last();
1603  prev_right = seg_it.data()->position();
1605  scale_factor = (seg_list.length() - 2) / 2;
1606  if (scale_factor < 1) {
1607  scale_factor = 1;
1608  }
1609  } else {
1610  scale_factor = 1;
1611  }
1612  sqsum += word_sync * scale_factor;
1613  total_count += (seg_list.length() - 1) * scale_factor;
1614  seg_list.clear();
1615  } while (!blob_it.cycled_list());
1616  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1617  word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1618  tprintf("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd,
1619  word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P');
1620 
1621  start_it = row_start;
1622  blob_it = row_start;
1623  word_sync =
1624  check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2,
1625  projection, projection_left, projection_right,
1626  row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1627  if (occupation > 1) {
1628  word_sync /= occupation;
1629  }
1630  word_sync = sqrt(word_sync);
1631 
1632 #ifndef GRAPHICS_DISABLED
1633  if (textord_show_row_cuts && to_win != nullptr) {
1634  plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1635  }
1636 #endif
1637  seg_list.clear();
1638  if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1639  if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) {
1640  res2 = "DF";
1641  } else {
1642  res2 = "MF";
1643  }
1644  } else {
1645  res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1646  }
1647  tprintf(
1648  "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, "
1649  "all_caps=%d\n",
1650  word_sync, word_sync / initial_pitch,
1651  word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', occupation, res2,
1652  initial_pitch, row->fixed_pitch, row->all_caps);
1653 }
1654 
1655 /**********************************************************************
1656  * find_repeated_chars
1657  *
1658  * Extract marked leader blobs and put them
1659  * into words in advance of fixed pitch checking and word generation.
1660  **********************************************************************/
1661 void find_repeated_chars(TO_BLOCK *block, // Block to search.
1662  bool testing_on) { // Debug mode.
1663  POLY_BLOCK *pb = block->block->pdblk.poly_block();
1664  if (pb != nullptr && !pb->IsText()) {
1665  return; // Don't find repeated chars in non-text blocks.
1666  }
1667 
1668  TO_ROW *row;
1669  BLOBNBOX_IT box_it;
1670  BLOBNBOX_IT search_it; // forward search
1671  WERD *word; // new word
1672  TBOX word_box; // for plotting
1673  int blobcount, repeated_set;
1674 
1675  TO_ROW_IT row_it = block->get_rows();
1676  if (row_it.empty()) {
1677  return; // empty block
1678  }
1679  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1680  row = row_it.data();
1681  box_it.set_to_list(row->blob_list());
1682  if (box_it.empty()) {
1683  continue; // no blobs in this row
1684  }
1685  if (!row->rep_chars_marked()) {
1686  mark_repeated_chars(row);
1687  }
1688  if (row->num_repeated_sets() == 0) {
1689  continue; // nothing to do for this row
1690  }
1691  // new words
1692  WERD_IT word_it(&row->rep_words);
1693  do {
1694  if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) {
1695  blobcount = 1;
1696  repeated_set = box_it.data()->repeated_set();
1697  search_it = box_it;
1698  search_it.forward();
1699  while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) {
1700  blobcount++;
1701  search_it.forward();
1702  }
1703  // After the call to make_real_word() all the blobs from this
1704  // repeated set will be removed from the blob list. box_it will be
1705  // set to point to the blob after the end of the extracted sequence.
1706  word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
1707  if (!box_it.empty() && box_it.data()->joined_to_prev()) {
1708  tprintf("Bad box joined to prev at");
1709  box_it.data()->bounding_box().print();
1710  tprintf("After repeated word:");
1711  word->bounding_box().print();
1712  }
1713  ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
1714  word->set_flag(W_REP_CHAR, true);
1715  word->set_flag(W_DONT_CHOP, true);
1716  word_it.add_after_then_move(word);
1717  } else {
1718  box_it.forward();
1719  }
1720  } while (!box_it.at_first());
1721  }
1722 }
1723 
1724 /**********************************************************************
1725  * plot_fp_word
1726  *
1727  * Plot a block of words as if fixed pitch.
1728  **********************************************************************/
1729 
1730 #ifndef GRAPHICS_DISABLED
1731 void plot_fp_word( // draw block of words
1732  TO_BLOCK *block, // block to draw
1733  float pitch, // pitch to draw with
1734  float nonspace // for space threshold
1735 ) {
1736  TO_ROW *row; // current row
1737  TO_ROW_IT row_it = block->get_rows();
1738 
1739  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1740  row = row_it.data();
1741  row->min_space = static_cast<int32_t>((pitch + nonspace) / 2);
1742  row->max_nonspace = row->min_space;
1743  row->space_threshold = row->min_space;
1744  plot_word_decisions(to_win, static_cast<int16_t>(pitch), row);
1745  }
1746 }
1747 #endif
1748 
1749 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
#define double_VAR(name, val, comment)
Definition: params.h:365
#define MAX_ALLOWED_PITCH
Definition: topitch.cpp:53
#define BLOCK_STATS_CLUSTERS
Definition: topitch.cpp:52
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
@ W_REP_CHAR
repeated character
Definition: werd.h:40
int textord_dotmatrix_gap
Definition: tovars.cpp:28
void compute_fixed_pitch(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, bool testing_on)
Definition: topitch.cpp:75
bool try_block_fixed(TO_BLOCK *block, int32_t block_index)
Definition: topitch.cpp:503
double words_initial_upper
Definition: tovars.cpp:47
void compute_block_pitch(TO_BLOCK *block, FCOORD rotation, int32_t block_index, bool testing_on)
Definition: topitch.cpp:293
bool textord_blocksall_prop
Definition: tovars.cpp:27
void plot_fp_cells2(ScrollView *win, ScrollView::Color colour, TO_ROW *row, FPSEGPT_LIST *seg_list)
Definition: drawtord.cpp:353
double textord_wordstats_smooth_factor
Definition: tovars.cpp:31
double words_initial_lower
Definition: tovars.cpp:46
int textord_words_veto_power
Definition: tovars.cpp:43
bool fixed_pitch_row(TO_ROW *row, BLOCK *block, int32_t block_index)
Definition: topitch.cpp:932
void plot_fp_word(TO_BLOCK *block, float pitch, float nonspace)
Definition: topitch.cpp:1731
int textord_min_xheight
Definition: makerow.cpp:70
double textord_words_default_nonspace
Definition: tovars.cpp:36
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool textord_show_fixed_cuts
Definition: drawtord.cpp:35
float compute_pitch_sd2(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float initial_pitch, int16_t &occupation, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
Definition: topitch.cpp:1447
double words_default_fixed_space
Definition: tovars.cpp:49
int pitsync_linear_version
Definition: pitsync1.cpp:26
void mark_repeated_chars(TO_ROW *row)
Definition: makerow.cpp:2563
WERD * make_real_word(BLOBNBOX_IT *box_it, int32_t blobcount, bool bol, uint8_t blanks)
Definition: wordseg.cpp:559
float compute_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch, float &sp_sd, int16_t &mid_cuts, ICOORDELT_LIST *row_cells, bool testing_on, int16_t start, int16_t end)
Definition: topitch.cpp:1290
ScrollView * to_win
Definition: drawtord.cpp:37
int textord_pitch_range
Definition: tovars.cpp:30
int textord_debug_block
Definition: tovars.cpp:29
bool try_rows_fixed(TO_BLOCK *block, int32_t block_index, bool testing_on)
Definition: topitch.cpp:516
double textord_words_default_maxspace
Definition: tovars.cpp:33
void find_repeated_chars(TO_BLOCK *block, bool testing_on)
Definition: topitch.cpp:1661
double textord_projection_scale
Definition: topitch.cpp:49
@ PITCH_DUNNO
Definition: blobbox.h:48
@ PITCH_MAYBE_FIXED
Definition: blobbox.h:50
@ PITCH_DEF_FIXED
Definition: blobbox.h:49
@ PITCH_MAYBE_PROP
Definition: blobbox.h:52
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_FIXED
Definition: blobbox.h:53
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void print_pitch_sd(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float initial_pitch)
Definition: topitch.cpp:1536
bool textord_blockndoc_fixed
Definition: topitch.cpp:48
double check_pitch_sync2(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, int16_t projection_left, int16_t projection_right, float projection_scale, int16_t &occupation_count, FPSEGPT_LIST *seg_list, int16_t start, int16_t end)
Definition: pithsync.cpp:292
bool count_pitch_stats(TO_ROW *row, STATS *gap_stats, STATS *pitch_stats, float initial_pitch, float min_space, bool ignore_outsize, bool split_outsize, int32_t dm_gap)
Definition: topitch.cpp:1009
bool textord_pitch_scalebigwords
Definition: tovars.cpp:45
double textord_words_min_minspace
Definition: tovars.cpp:35
bool find_row_pitch(TO_ROW *row, int32_t maxwidth, int32_t dm_gap, TO_BLOCK *block, int32_t block_index, int32_t row_index, bool testing_on)
Definition: topitch.cpp:785
void fix_row_pitch(TO_ROW *bad_row, TO_BLOCK *bad_block, TO_BLOCK_LIST *blocks, int32_t row_target, int32_t block_target)
Definition: topitch.cpp:144
bool textord_blocksall_fixed
Definition: tovars.cpp:26
bool textord_debug_pitch_metric
Definition: topitch.cpp:45
double textord_words_maxspace
Definition: tovars.cpp:32
float tune_row_pitch2(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
Definition: topitch.cpp:1197
void print_block_counts(TO_BLOCK *block, int32_t block_index)
Definition: topitch.cpp:576
bool textord_debug_pitch_test
Definition: topitch.cpp:42
bool row_pitch_stats(TO_ROW *row, int32_t maxwidth, bool testing_on)
Definition: topitch.cpp:649
double textord_balance_factor
Definition: topitch.cpp:50
bool textord_show_row_cuts
Definition: topitch.cpp:46
bool try_doc_fixed(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient)
Definition: topitch.cpp:371
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:238
double words_default_prop_nonspace
Definition: tovars.cpp:48
bool textord_fast_pitch_test
Definition: topitch.cpp:44
double textord_fpiqr_ratio
Definition: tovars.cpp:53
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:47
double textord_words_pitchsd_threshold
Definition: tovars.cpp:40
float tune_row_pitch(TO_ROW *row, STATS *projection, int16_t projection_left, int16_t projection_right, float space_size, float &initial_pitch, float &best_sp_sd, int16_t &best_mid_cuts, ICOORDELT_LIST *best_cells, bool testing_on)
Definition: topitch.cpp:1098
void plot_row_cells(ScrollView *win, ScrollView::Color colour, TO_ROW *row, float xshift, ICOORDELT_LIST *cells)
Definition: drawtord.cpp:387
bool textord_show_initial_words
Definition: tovars.cpp:25
void count_block_votes(TO_BLOCK *block, int32_t &def_fixed, int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, int32_t &corr_prop, int32_t &dunno)
Definition: topitch.cpp:607
double words_default_fixed_limit
Definition: tovars.cpp:50
double check_pitch_sync(BLOBNBOX_IT *blob_it, int16_t blob_count, int16_t pitch, int16_t pitch_error, STATS *projection, FPSEGPT_LIST *seg_list)
Definition: pitsync1.cpp:138
double textord_max_pitch_iqr
Definition: tovars.cpp:54
double textord_words_def_prop
Definition: tovars.cpp:42
double textord_words_default_minspace
Definition: tovars.cpp:34
bool textord_show_page_cuts
Definition: topitch.cpp:47
double textord_spacesize_ratioprop
Definition: tovars.cpp:52
bool compute_rows_pitch(TO_BLOCK *block, int32_t block_index, bool testing_on)
Definition: topitch.cpp:330
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:638
double textord_pitch_rowsimilarity
Definition: tovars.cpp:44
const TBOX & bounding_box() const
Definition: blobbox.h:239
C_BLOB * cblob() const
Definition: blobbox.h:277
bool joined_to_prev() const
Definition: blobbox.h:265
bool rep_chars_marked() const
Definition: blobbox.h:637
QSPLINE baseline
Definition: blobbox.h:676
int32_t min_space
Definition: blobbox.h:669
ICOORDELT_LIST char_cells
Definition: blobbox.h:675
WERD_LIST rep_words
Definition: blobbox.h:674
int num_repeated_sets() const
Definition: blobbox.h:643
int32_t max_nonspace
Definition: blobbox.h:670
bool used_dm_model
Definition: blobbox.h:653
STATS projection
Definition: blobbox.h:677
float space_size
Definition: blobbox.h:673
float fixed_pitch
Definition: blobbox.h:657
int32_t space_threshold
Definition: blobbox.h:671
float intercept() const
Definition: blobbox.h:598
void compute_vertical_projection()
Definition: blobbox.cpp:799
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
PITCH_TYPE pitch_decision
Definition: blobbox.h:656
int16_t projection_left
Definition: blobbox.h:654
int16_t projection_right
Definition: blobbox.h:655
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
int32_t min_space
Definition: blobbox.h:796
int32_t max_nonspace
Definition: blobbox.h:797
PITCH_TYPE pitch_decision
Definition: blobbox.h:782
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
integer coordinate
Definition: points.h:36
void set_x(TDimension xin)
rewrite function
Definition: points.h:67
TDimension x() const
access function
Definition: points.h:58
bool IsText() const
Definition: polyblk.h:52
double y(double x) const
Definition: quspline.cpp:203
TDimension left() const
Definition: rect.h:82
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:597
int32_t pile_count(int32_t value) const
Definition: statistc.h:75
int32_t get_total() const
Definition: statistc.h:85
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:59
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:335
void smooth(int32_t factor)
Definition: statistc.cpp:302
double ile(double frac) const
Definition: statistc.cpp:173
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
TBOX bounding_box() const
Definition: werd.cpp:155
static void Update()
Definition: scrollview.cpp:713