tesseract  5.0.0
colpartition.cpp
Go to the documentation of this file.
1 // File: colpartition.cpp
3 // Description: Class to hold partitions of the page that correspond
4 // roughly to text lines.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "colpartition.h"
25 #include "colpartitiongrid.h"
26 #include "colpartitionset.h"
27 #include "detlinefit.h"
28 #include "dppoint.h"
29 #include "helpers.h" // for UpdateRange
30 #include "host.h" // for NearlyEqual
31 #include "imagefind.h"
32 #include "workingpartset.h"
33 
34 #include <algorithm>
35 
36 namespace tesseract {
37 
39 
40 // enum to refer to the entries in a neighbourhood of lines.
41 // Used by SmoothSpacings to test for blips with OKSpacingBlip.
49  PN_COUNT
50 };
51 
52 // Maximum change in spacing (in inches) to ignore.
53 const double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.
54 // Maximum fraction of line height used as an additional allowance
55 // for top spacing.
56 const double kMaxTopSpacingFraction = 0.25;
57 // What multiple of the largest line height should be used as an upper bound
58 // for whether lines are in the same text block?
59 const double kMaxSameBlockLineSpacing = 3;
60 // Maximum ratio of sizes for lines to be considered the same size.
61 const double kMaxSizeRatio = 1.5;
62 // Fraction of max of leader width and gap for max IQR of gaps.
63 const double kMaxLeaderGapFractionOfMax = 0.25;
64 // Fraction of min of leader width and gap for max IQR of gaps.
65 const double kMaxLeaderGapFractionOfMin = 0.5;
66 // Minimum number of blobs to be considered a leader.
67 const int kMinLeaderCount = 5;
68 // Minimum score for a STRONG_CHAIN textline.
69 const int kMinStrongTextValue = 6;
70 // Minimum score for a CHAIN textline.
71 const int kMinChainTextValue = 3;
72 // Minimum number of blobs for strong horizontal text lines.
74 // Minimum height (in image pixels) for strong horizontal text lines.
76 // Minimum aspect ratio for strong horizontal text lines.
78 // Maximum upper quartile error allowed on a baseline fit as a fraction
79 // of height.
80 const double kMaxBaselineError = 0.4375;
81 // Min coverage for a good baseline between vectors
82 const double kMinBaselineCoverage = 0.5;
83 // Max RMS color noise to compare colors.
84 const int kMaxRMSColorNoise = 128;
85 // Maximum distance to allow a partition color to be to use that partition
86 // in smoothing neighbouring types. This is a squared distance.
87 const int kMaxColorDistance = 900;
88 
89 // blob_type is the blob_region_type_ of the blobs in this partition.
90 // Vertical is the direction of logical vertical on the possibly skewed image.
92  : left_margin_(-INT32_MAX),
93  right_margin_(INT32_MAX),
94  median_bottom_(INT32_MAX),
95  median_top_(-INT32_MAX),
96  median_left_(INT32_MAX),
97  median_right_(-INT32_MAX),
98  blob_type_(blob_type),
99  vertical_(vertical) {
100  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
101 }
102 
103 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
104 // from a single TBOX.
105 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
106 // the ColPartition owns the BLOBNBOX!!!
107 // Call DeleteBoxes before deleting the ColPartition.
109  PolyBlockType block_type,
110  BlobRegionType blob_type,
111  BlobTextFlowType flow) {
112  auto *part = new ColPartition(blob_type, ICOORD(0, 1));
113  part->set_type(block_type);
114  part->set_flow(flow);
115  part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
116  part->set_left_margin(box.left());
117  part->set_right_margin(box.right());
118  part->SetBlobTypes();
119  part->ComputeLimits();
120  part->ClaimBoxes();
121  return part;
122 }
123 
124 // Constructs and returns a ColPartition with the given real BLOBNBOX,
125 // and sets it up to be a "big" partition (single-blob partition bigger
126 // than the surrounding text that may be a dropcap, two or more vertically
127 // touching characters, or some graphic element.
128 // If the given list is not nullptr, the partition is also added to the list.
130  ColPartition_LIST *big_part_list) {
131  box->set_owner(nullptr);
132  auto *single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
133  single->set_flow(BTFT_NONE);
134  single->AddBox(box);
135  single->ComputeLimits();
136  single->ClaimBoxes();
137  single->SetBlobTypes();
138  single->set_block_owned(true);
139  if (big_part_list != nullptr) {
140  ColPartition_IT part_it(big_part_list);
141  part_it.add_to_end(single);
142  }
143  return single;
144 }
145 
147  // Remove this as a partner of all partners, as we don't want them
148  // referring to a deleted object.
149  ColPartition_C_IT it(&upper_partners_);
150  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
151  it.data()->RemovePartner(false, this);
152  }
153  it.set_to_list(&lower_partners_);
154  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
155  it.data()->RemovePartner(true, this);
156  }
157 }
158 
159 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
160 // horizontal or vertical line, given a type and a bounding box.
162  const ICOORD &vertical, int left,
163  int bottom, int right, int top) {
164  auto *part = new ColPartition(blob_type, vertical);
165  part->bounding_box_ = TBOX(left, bottom, right, top);
166  part->median_bottom_ = bottom;
167  part->median_top_ = top;
168  part->median_height_ = top - bottom;
169  part->median_left_ = left;
170  part->median_right_ = right;
171  part->median_width_ = right - left;
172  part->left_key_ = part->BoxLeftKey();
173  part->right_key_ = part->BoxRightKey();
174  return part;
175 }
176 
177 // Adds the given box to the partition, updating the partition bounds.
178 // The list of boxes in the partition is updated, ensuring that no box is
179 // recorded twice, and the boxes are kept in increasing left position.
181  TBOX box = bbox->bounding_box();
182  // Update the partition limits.
183  if (boxes_.empty()) {
184  bounding_box_ = box;
185  } else {
186  bounding_box_ += box;
187  }
188 
189  if (IsVerticalType()) {
190  if (!last_add_was_vertical_) {
191  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
192  last_add_was_vertical_ = true;
193  }
194  boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
195  } else {
196  if (last_add_was_vertical_) {
197  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
198  last_add_was_vertical_ = false;
199  }
200  boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
201  }
202  if (!left_key_tab_) {
203  left_key_ = BoxLeftKey();
204  }
205  if (!right_key_tab_) {
206  right_key_ = BoxRightKey();
207  }
208  if (TabFind::WithinTestRegion(2, box.left(), box.bottom())) {
209  tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
210  box.left(), box.bottom(), box.right(), box.top(),
211  bounding_box_.left(), bounding_box_.right());
212  }
213 }
214 
215 // Removes the given box from the partition, updating the bounds.
217  BLOBNBOX_C_IT bb_it(&boxes_);
218  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
219  if (box == bb_it.data()) {
220  bb_it.extract();
221  ComputeLimits();
222  return;
223  }
224  }
225 }
226 
227 // Returns the tallest box in the partition, as measured perpendicular to the
228 // presumed flow of text.
230  BLOBNBOX *biggest = nullptr;
231  BLOBNBOX_C_IT bb_it(&boxes_);
232  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
233  BLOBNBOX *bbox = bb_it.data();
234  if (IsVerticalType()) {
235  if (biggest == nullptr ||
236  bbox->bounding_box().width() > biggest->bounding_box().width()) {
237  biggest = bbox;
238  }
239  } else {
240  if (biggest == nullptr ||
241  bbox->bounding_box().height() > biggest->bounding_box().height()) {
242  biggest = bbox;
243  }
244  }
245  }
246  return biggest;
247 }
248 
249 // Returns the bounding box excluding the given box.
251  TBOX result;
252  BLOBNBOX_C_IT bb_it(&boxes_);
253  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
254  if (box != bb_it.data()) {
255  result += bb_it.data()->bounding_box();
256  }
257  }
258  return result;
259 }
260 
261 // Claims the boxes in the boxes_list by marking them with a this owner
262 // pointer. If a box is already owned, then it must be owned by this.
264  BLOBNBOX_C_IT bb_it(&boxes_);
265  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
266  BLOBNBOX *bblob = bb_it.data();
267  ColPartition *other = bblob->owner();
268  if (other == nullptr) {
269  // Normal case: ownership is available.
270  bblob->set_owner(this);
271  } else {
272  ASSERT_HOST(other == this);
273  }
274  }
275 }
276 
277 // nullptr the owner of the blobs in this partition, so they can be deleted
278 // independently of the ColPartition.
280  BLOBNBOX_C_IT bb_it(&boxes_);
281  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
282  BLOBNBOX *bblob = bb_it.data();
283  ASSERT_HOST(bblob->owner() == this || bblob->owner() == nullptr);
284  bblob->set_owner(nullptr);
285  }
286 }
287 
288 // nullptr the owner of the blobs in this partition that are owned by this
289 // partition, so they can be deleted independently of the ColPartition.
290 // Any blobs that are not owned by this partition get to keep their owner
291 // without an assert failure.
293  BLOBNBOX_C_IT bb_it(&boxes_);
294  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
295  BLOBNBOX *bblob = bb_it.data();
296  if (bblob->owner() == this) {
297  bblob->set_owner(nullptr);
298  }
299  }
300 }
301 
302 // Nulls the owner of the blobs in this partition that are owned by this
303 // partition and not leader blobs, removing them from the boxes_ list, thus
304 // turning this partition back to a leader partition if it contains a leader,
305 // or otherwise leaving it empty. Returns true if any boxes remain.
307  BLOBNBOX_C_IT bb_it(&boxes_);
308  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
309  BLOBNBOX *bblob = bb_it.data();
310  if (bblob->flow() != BTFT_LEADER) {
311  if (bblob->owner() == this) {
312  bblob->set_owner(nullptr);
313  }
314  bb_it.extract();
315  }
316  }
317  if (bb_it.empty()) {
318  return false;
319  }
320  flow_ = BTFT_LEADER;
321  ComputeLimits();
322  return true;
323 }
324 
325 // Delete the boxes that this partition owns.
327  // Although the boxes_ list is a C_LIST, in some cases it owns the
328  // BLOBNBOXes, as the ColPartition takes ownership from the grid,
329  // and the BLOBNBOXes own the underlying C_BLOBs.
330  for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
331  BLOBNBOX *bblob = bb_it.extract();
332  // TODO: remove next line, currently still needed for resultiterator_test.
333  delete bblob->remove_cblob();
334  delete bblob;
335  }
336 }
337 
338 // Reflects the partition in the y-axis, assuming that its blobs have
339 // already been done. Corrects only a limited part of the members, since
340 // this function is assumed to be used shortly after initial creation, which
341 // is before a lot of the members are used.
343  BLOBNBOX_CLIST reversed_boxes;
344  BLOBNBOX_C_IT reversed_it(&reversed_boxes);
345  // Reverse the order of the boxes_.
346  BLOBNBOX_C_IT bb_it(&boxes_);
347  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
348  reversed_it.add_before_then_move(bb_it.extract());
349  }
350  bb_it.add_list_after(&reversed_boxes);
351  ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
352  int tmp = left_margin_;
353  left_margin_ = -right_margin_;
354  right_margin_ = -tmp;
355  ComputeLimits();
356 }
357 
358 // Returns true if this is a legal partition - meaning that the conditions
359 // left_margin <= bounding_box left
360 // left_key <= bounding box left key
361 // bounding box left <= bounding box right
362 // and likewise for right margin and key
363 // are all met.
365  if (bounding_box_.left() > bounding_box_.right()) {
366  if (textord_debug_bugs) {
367  tprintf("Bounding box invalid\n");
368  Print();
369  }
370  return false; // Bounding box invalid.
371  }
372  if (left_margin_ > bounding_box_.left() ||
373  right_margin_ < bounding_box_.right()) {
374  if (textord_debug_bugs) {
375  tprintf("Margins invalid\n");
376  Print();
377  }
378  return false; // Margins invalid.
379  }
380  if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
381  if (textord_debug_bugs) {
382  tprintf("Key inside box: %d v %d or %d v %d\n", left_key_, BoxLeftKey(),
383  right_key_, BoxRightKey());
384  Print();
385  }
386  return false; // Keys inside the box.
387  }
388  return true;
389 }
390 
391 // Returns true if the left and right edges are approximately equal.
392 bool ColPartition::MatchingColumns(const ColPartition &other) const {
393  int y = (MidY() + other.MidY()) / 2;
394  if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
395  LeftAtY(y) / kColumnWidthFactor, 1)) {
396  return false;
397  }
398  if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
399  RightAtY(y) / kColumnWidthFactor, 1)) {
400  return false;
401  }
402  return true;
403 }
404 
405 // Returns true if the colors match for two text partitions.
407  if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
408  other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise) {
409  return false; // Too noisy.
410  }
411 
412  // Colors must match for other to count.
413  double d_this1_o =
414  ImageFind::ColorDistanceFromLine(other.color1_, other.color2_, color1_);
415  double d_this2_o =
416  ImageFind::ColorDistanceFromLine(other.color1_, other.color2_, color2_);
417  double d_o1_this =
418  ImageFind::ColorDistanceFromLine(color1_, color2_, other.color1_);
419  double d_o2_this =
420  ImageFind::ColorDistanceFromLine(color1_, color2_, other.color2_);
421  // All 4 distances must be small enough.
422  return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
423  d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
424 }
425 
426 // Returns true if the sizes match for two text partitions,
427 // taking orientation into account. See also SizesSimilar.
428 bool ColPartition::MatchingSizes(const ColPartition &other) const {
429  if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT) {
430  return !TabFind::DifferentSizes(median_width_, other.median_width_);
431  } else {
432  return !TabFind::DifferentSizes(median_height_, other.median_height_);
433  }
434 }
435 
436 // Returns true if there is no tabstop violation in merging this and other.
438  if (bounding_box_.right() < other.bounding_box_.left() &&
439  bounding_box_.right() < other.LeftBlobRule()) {
440  return false;
441  }
442  if (other.bounding_box_.right() < bounding_box_.left() &&
443  other.bounding_box_.right() < LeftBlobRule()) {
444  return false;
445  }
446  if (bounding_box_.left() > other.bounding_box_.right() &&
447  bounding_box_.left() > other.RightBlobRule()) {
448  return false;
449  }
450  if (other.bounding_box_.left() > bounding_box_.right() &&
451  other.bounding_box_.left() > RightBlobRule()) {
452  return false;
453  }
454  return true;
455 }
456 
457 // Returns true if other has a similar stroke width to this.
459  double fractional_tolerance,
460  double constant_tolerance) const {
461  int match_count = 0;
462  int nonmatch_count = 0;
463  BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
464  BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST *>(&other.boxes_));
465  box_it.mark_cycle_pt();
466  other_it.mark_cycle_pt();
467  while (!box_it.cycled_list() && !other_it.cycled_list()) {
468  if (box_it.data()->MatchingStrokeWidth(
469  *other_it.data(), fractional_tolerance, constant_tolerance)) {
470  ++match_count;
471  } else {
472  ++nonmatch_count;
473  }
474  box_it.forward();
475  other_it.forward();
476  }
477  return match_count > nonmatch_count;
478 }
479 
480 // Returns true if base is an acceptable diacritic base char merge
481 // with this as the diacritic.
482 // Returns true if:
483 // (1) this is a ColPartition containing only diacritics, and
484 // (2) the base characters indicated on the diacritics all believably lie
485 // within the text line of the candidate ColPartition.
487  bool debug) const {
488  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
489  int min_top = INT32_MAX;
490  int max_bottom = -INT32_MAX;
491  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
492  BLOBNBOX *blob = it.data();
493  if (!blob->IsDiacritic()) {
494  if (debug) {
495  tprintf("Blob is not a diacritic:");
496  blob->bounding_box().print();
497  }
498  return false; // All blobs must have diacritic bases.
499  }
500  if (blob->base_char_top() < min_top) {
501  min_top = blob->base_char_top();
502  }
503  if (blob->base_char_bottom() > max_bottom) {
504  max_bottom = blob->base_char_bottom();
505  }
506  }
507  // If the intersection of all vertical ranges of all base characters
508  // overlaps the median range of this, then it is OK.
509  bool result =
510  min_top > candidate.median_bottom_ && max_bottom < candidate.median_top_;
511  if (debug) {
512  if (result) {
513  tprintf("OKDiacritic!\n");
514  } else {
515  tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n", max_bottom, min_top,
516  median_bottom_, median_top_);
517  }
518  }
519  return result;
520 }
521 
522 // Sets the sort key using either the tab vector, or the bounding box if
523 // the tab vector is nullptr. If the tab_vector lies inside the bounding_box,
524 // use the edge of the box as a key any way.
525 void ColPartition::SetLeftTab(const TabVector *tab_vector) {
526  if (tab_vector != nullptr) {
527  left_key_ = tab_vector->sort_key();
528  left_key_tab_ = left_key_ <= BoxLeftKey();
529  } else {
530  left_key_tab_ = false;
531  }
532  if (!left_key_tab_) {
533  left_key_ = BoxLeftKey();
534  }
535 }
536 
537 // As SetLeftTab, but with the right.
538 void ColPartition::SetRightTab(const TabVector *tab_vector) {
539  if (tab_vector != nullptr) {
540  right_key_ = tab_vector->sort_key();
541  right_key_tab_ = right_key_ >= BoxRightKey();
542  } else {
543  right_key_tab_ = false;
544  }
545  if (!right_key_tab_) {
546  right_key_ = BoxRightKey();
547  }
548 }
549 
550 // Copies the left/right tab from the src partition, but if take_box is
551 // true, copies the box instead and uses that as a key.
552 void ColPartition::CopyLeftTab(const ColPartition &src, bool take_box) {
553  left_key_tab_ = take_box ? false : src.left_key_tab_;
554  if (left_key_tab_) {
555  left_key_ = src.left_key_;
556  } else {
557  bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
558  left_key_ = BoxLeftKey();
559  }
560  if (left_margin_ > bounding_box_.left()) {
561  left_margin_ = src.left_margin_;
562  }
563 }
564 
565 // As CopyLeftTab, but with the right.
566 void ColPartition::CopyRightTab(const ColPartition &src, bool take_box) {
567  right_key_tab_ = take_box ? false : src.right_key_tab_;
568  if (right_key_tab_) {
569  right_key_ = src.right_key_;
570  } else {
571  bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
572  right_key_ = BoxRightKey();
573  }
574  if (right_margin_ < bounding_box_.right()) {
575  right_margin_ = src.right_margin_;
576  }
577 }
578 
579 // Returns the left rule line x coord of the leftmost blob.
581  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
582  return it.data()->left_rule();
583 }
584 // Returns the right rule line x coord of the rightmost blob.
586  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST *>(&boxes_));
587  it.move_to_last();
588  return it.data()->right_rule();
589 }
590 
593  return special_blobs_densities_[type];
594 }
595 
598  BLOBNBOX_C_IT blob_it(&boxes_);
599  int count = 0;
600  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
601  BLOBNBOX *blob = blob_it.data();
603  if (blob_type == type) {
604  count++;
605  }
606  }
607 
608  return count;
609 }
610 
612  const float density) {
614  special_blobs_densities_[type] = density;
615 }
616 
618  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
619  if (boxes_.empty()) {
620  return;
621  }
622 
623  BLOBNBOX_C_IT blob_it(&boxes_);
624  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
625  BLOBNBOX *blob = blob_it.data();
627  special_blobs_densities_[type]++;
628  }
629 
630  for (float &special_blobs_density : special_blobs_densities_) {
631  special_blobs_density /= boxes_.length();
632  }
633 }
634 
635 // Add a partner above if upper, otherwise below.
636 // Add them uniquely and keep the list sorted by box left.
637 // Partnerships are added symmetrically to partner and this.
638 void ColPartition::AddPartner(bool upper, ColPartition *partner) {
639  if (upper) {
640  partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true,
641  this);
642  upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
643  } else {
644  partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true,
645  this);
646  lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
647  }
648 }
649 
650 // Removes the partner from this, but does not remove this from partner.
651 // This asymmetric removal is so as not to mess up the iterator that is
652 // working on partner's partner list.
653 void ColPartition::RemovePartner(bool upper, ColPartition *partner) {
654  ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
655  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
656  if (it.data() == partner) {
657  it.extract();
658  break;
659  }
660  }
661 }
662 
663 // Returns the partner if the given partner is a singleton, otherwise nullptr.
665  ColPartition_CLIST *partners = upper ? &upper_partners_ : &lower_partners_;
666  if (!partners->singleton()) {
667  return nullptr;
668  }
669  ColPartition_C_IT it(partners);
670  return it.data();
671 }
672 
673 // Merge with the other partition and delete it.
675  // The result has to either own all of the blobs or none of them.
676  // Verify the flag is consistent.
677  ASSERT_HOST(owns_blobs() == other->owns_blobs());
678  // TODO(nbeato): check owns_blobs better. Right now owns_blobs
679  // should always be true when this is called. So there is no issues.
680  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
681  bounding_box_.bottom()) ||
682  TabFind::WithinTestRegion(2, other->bounding_box_.left(),
683  other->bounding_box_.bottom())) {
684  tprintf("Merging:");
685  Print();
686  other->Print();
687  }
688 
689  // Update the special_blobs_densities_.
690  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
691  for (int type = 0; type < BSTT_COUNT; ++type) {
692  unsigned w1 = boxes_.length();
693  unsigned w2 = other->boxes_.length();
694  float new_val = special_blobs_densities_[type] * w1 +
695  other->special_blobs_densities_[type] * w2;
696  if (!w1 || !w2) {
697  ASSERT_HOST((w1 + w2) > 0);
698  special_blobs_densities_[type] = new_val / (w1 + w2);
699  }
700  }
701 
702  // Merge the two sorted lists.
703  BLOBNBOX_C_IT it(&boxes_);
704  BLOBNBOX_C_IT it2(&other->boxes_);
705  for (; !it2.empty(); it2.forward()) {
706  BLOBNBOX *bbox2 = it2.extract();
707  ColPartition *prev_owner = bbox2->owner();
708  if (prev_owner != other && prev_owner != nullptr) {
709  // A blob on other's list is owned by someone else; let them have it.
710  continue;
711  }
712  ASSERT_HOST(prev_owner == other || prev_owner == nullptr);
713  if (prev_owner == other) {
714  bbox2->set_owner(this);
715  }
716  it.add_to_end(bbox2);
717  }
718  left_margin_ = std::min(left_margin_, other->left_margin_);
719  right_margin_ = std::max(right_margin_, other->right_margin_);
720  if (other->left_key_ < left_key_) {
721  left_key_ = other->left_key_;
722  left_key_tab_ = other->left_key_tab_;
723  }
724  if (other->right_key_ > right_key_) {
725  right_key_ = other->right_key_;
726  right_key_tab_ = other->right_key_tab_;
727  }
728  // Combine the flow and blob_type in a sensible way.
729  // Dominant flows stay.
730  if (!DominatesInMerge(flow_, other->flow_)) {
731  flow_ = other->flow_;
732  blob_type_ = other->blob_type_;
733  }
734  SetBlobTypes();
735  if (IsVerticalType()) {
736  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
737  last_add_was_vertical_ = true;
738  } else {
739  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
740  last_add_was_vertical_ = false;
741  }
742  ComputeLimits();
743  // Fix partner lists. other is going away, so remove it as a
744  // partner of all its partners and add this in its place.
745  for (int upper = 0; upper < 2; ++upper) {
746  ColPartition_CLIST partners;
747  ColPartition_C_IT part_it(&partners);
748  part_it.add_list_after(upper ? &other->upper_partners_
749  : &other->lower_partners_);
750  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
751  ColPartition *partner = part_it.extract();
752  partner->RemovePartner(!upper, other);
753  partner->RemovePartner(!upper, this);
754  partner->AddPartner(!upper, this);
755  }
756  }
757  delete other;
758  if (cb != nullptr) {
759  SetColumnGoodness(cb);
760  }
761 }
762 
763 // Merge1 and merge2 are candidates to be merged, yet their combined box
764 // overlaps this. Is that allowed?
765 // Returns true if the overlap between this and the merged pair of
766 // merge candidates is sufficiently trivial to be allowed.
767 // The merged box can graze the edge of this by the ok_box_overlap
768 // if that exceeds the margin to the median top and bottom.
769 // ok_box_overlap should be set by the caller appropriate to the sizes of
770 // the text involved, and is usually a fraction of the median size of merge1
771 // and/or merge2, or this.
772 // TODO(rays) Determine whether vertical text needs to be considered.
774  const ColPartition &merge2,
775  int ok_box_overlap, bool debug) {
776  // Vertical partitions are not allowed to be involved.
777  if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
778  if (debug) {
779  tprintf("Vertical partition\n");
780  }
781  return false;
782  }
783  // The merging partitions must strongly overlap each other.
784  if (!merge1.VSignificantCoreOverlap(merge2)) {
785  if (debug) {
786  tprintf("Voverlap %d (%d)\n", merge1.VCoreOverlap(merge2),
787  merge1.VSignificantCoreOverlap(merge2));
788  }
789  return false;
790  }
791  // The merged box must not overlap the median bounds of this.
792  TBOX merged_box(merge1.bounding_box());
793  merged_box += merge2.bounding_box();
794  if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
795  merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
796  merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
797  if (debug) {
798  tprintf("Excessive box overlap\n");
799  }
800  return false;
801  }
802  // Looks OK!
803  return true;
804 }
805 
806 // Find the blob at which to split this to minimize the overlap with the
807 // given box. Returns the first blob to go in the second partition.
809  if (boxes_.empty() || boxes_.singleton()) {
810  return nullptr;
811  }
812  BLOBNBOX_C_IT it(&boxes_);
813  TBOX left_box(it.data()->bounding_box());
814  for (it.forward(); !it.at_first(); it.forward()) {
815  BLOBNBOX *bbox = it.data();
816  left_box += bbox->bounding_box();
817  if (left_box.overlap(box)) {
818  return bbox;
819  }
820  }
821  return nullptr;
822 }
823 
824 // Split this partition keeping the first half in this and returning
825 // the second half.
826 // Splits by putting the split_blob and the blobs that follow
827 // in the second half, and the rest in the first half.
829  ColPartition *split_part = ShallowCopy();
830  split_part->set_owns_blobs(owns_blobs());
831  BLOBNBOX_C_IT it(&boxes_);
832  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
833  BLOBNBOX *bbox = it.data();
834  ColPartition *prev_owner = bbox->owner();
835  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
836  if (bbox == split_blob || !split_part->boxes_.empty()) {
837  split_part->AddBox(it.extract());
838  if (owns_blobs() && prev_owner != nullptr) {
839  bbox->set_owner(split_part);
840  }
841  }
842  }
843  ASSERT_HOST(!it.empty());
844  if (split_part->IsEmpty()) {
845  // Split part ended up with nothing. Possible if split_blob is not
846  // in the list of blobs.
847  delete split_part;
848  return nullptr;
849  }
850  right_key_tab_ = false;
851  split_part->left_key_tab_ = false;
852  ComputeLimits();
853  // TODO(nbeato) Merge Ray's CL like this:
854  // if (owns_blobs())
855  // SetBlobTextlineGoodness();
856  split_part->ComputeLimits();
857  // TODO(nbeato) Merge Ray's CL like this:
858  // if (split_part->owns_blobs())
859  // split_part->SetBlobTextlineGoodness();
860  return split_part;
861 }
862 
863 // Split this partition at the given x coordinate, returning the right
864 // half and keeping the left half in this.
866  if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right()) {
867  return nullptr; // There will be no change.
868  }
869  ColPartition *split_part = ShallowCopy();
870  split_part->set_owns_blobs(owns_blobs());
871  BLOBNBOX_C_IT it(&boxes_);
872  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
873  BLOBNBOX *bbox = it.data();
874  ColPartition *prev_owner = bbox->owner();
875  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == nullptr);
876  const TBOX &box = bbox->bounding_box();
877  if (box.left() >= split_x) {
878  split_part->AddBox(it.extract());
879  if (owns_blobs() && prev_owner != nullptr) {
880  bbox->set_owner(split_part);
881  }
882  }
883  }
884  if (it.empty()) {
885  // Possible if split-x passes through the first blob.
886  it.add_list_after(&split_part->boxes_);
887  }
888  ASSERT_HOST(!it.empty());
889  if (split_part->IsEmpty()) {
890  // Split part ended up with nothing. Possible if split_x passes
891  // through the last blob.
892  delete split_part;
893  return nullptr;
894  }
895  right_key_tab_ = false;
896  split_part->left_key_tab_ = false;
897  right_margin_ = split_x;
898  split_part->left_margin_ = split_x;
899  ComputeLimits();
900  split_part->ComputeLimits();
901  return split_part;
902 }
903 
904 // Recalculates all the coordinate limits of the partition.
906  bounding_box_ = TBOX(); // Clear it
907  BLOBNBOX_C_IT it(&boxes_);
908  BLOBNBOX *bbox = nullptr;
909  int non_leader_count = 0;
910  if (it.empty()) {
911  bounding_box_.set_left(left_margin_);
912  bounding_box_.set_right(right_margin_);
913  bounding_box_.set_bottom(0);
914  bounding_box_.set_top(0);
915  } else {
916  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
917  bbox = it.data();
918  bounding_box_ += bbox->bounding_box();
919  if (bbox->flow() != BTFT_LEADER) {
920  ++non_leader_count;
921  }
922  }
923  }
924  if (!left_key_tab_) {
925  left_key_ = BoxLeftKey();
926  }
927  if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
928  // TODO(rays) investigate the causes of these error messages, to find
929  // out if they are genuinely harmful, or just indicative of junk input.
930  tprintf("Computed left-illegal partition\n");
931  Print();
932  }
933  if (!right_key_tab_) {
934  right_key_ = BoxRightKey();
935  }
936  if (right_key_ < BoxRightKey() && textord_debug_bugs) {
937  tprintf("Computed right-illegal partition\n");
938  Print();
939  }
940  if (it.empty()) {
941  return;
942  }
943  if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
944  blob_type() == BRT_POLYIMAGE) {
945  median_top_ = bounding_box_.top();
946  median_bottom_ = bounding_box_.bottom();
947  median_height_ = bounding_box_.height();
948  median_left_ = bounding_box_.left();
949  median_right_ = bounding_box_.right();
950  median_width_ = bounding_box_.width();
951  } else {
952  STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
953  STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
954  STATS height_stats(0, bounding_box_.height() + 1);
955  STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
956  STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
957  STATS width_stats(0, bounding_box_.width() + 1);
958  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
959  bbox = it.data();
960  if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
961  const TBOX &box = bbox->bounding_box();
962  int area = box.area();
963  top_stats.add(box.top(), area);
964  bottom_stats.add(box.bottom(), area);
965  height_stats.add(box.height(), area);
966  left_stats.add(box.left(), area);
967  right_stats.add(box.right(), area);
968  width_stats.add(box.width(), area);
969  }
970  }
971  median_top_ = static_cast<int>(top_stats.median() + 0.5);
972  median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
973  median_height_ = static_cast<int>(height_stats.median() + 0.5);
974  median_left_ = static_cast<int>(left_stats.median() + 0.5);
975  median_right_ = static_cast<int>(right_stats.median() + 0.5);
976  median_width_ = static_cast<int>(width_stats.median() + 0.5);
977  }
978 
979  if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
980  tprintf("Made partition with bad right coords, %d < %d\n", right_margin_,
981  bounding_box_.right());
982  Print();
983  }
984  if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
985  tprintf("Made partition with bad left coords, %d > %d\n", left_margin_,
986  bounding_box_.left());
987  Print();
988  }
989  // Fix partner lists. The bounding box has changed and partners are stored
990  // in bounding box order, so remove and reinsert this as a partner
991  // of all its partners.
992  for (int upper = 0; upper < 2; ++upper) {
993  ColPartition_CLIST partners;
994  ColPartition_C_IT part_it(&partners);
995  part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
996  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
997  ColPartition *partner = part_it.extract();
998  partner->RemovePartner(!upper, this);
999  partner->AddPartner(!upper, this);
1000  }
1001  }
1002  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1003  bounding_box_.bottom())) {
1004  tprintf("Recomputed box for partition %p\n", this);
1005  Print();
1006  }
1007 }
1008 
1009 // Returns the number of boxes that overlap the given box.
1011  BLOBNBOX_C_IT it(&boxes_);
1012  int overlap_count = 0;
1013  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1014  BLOBNBOX *bbox = it.data();
1015  if (box.overlap(bbox->bounding_box())) {
1016  ++overlap_count;
1017  }
1018  }
1019  return overlap_count;
1020 }
1021 
1022 // Computes and sets the type_ and first_column_, last_column_ and column_set_.
1023 // resolution refers to the ppi resolution of the image.
1024 void ColPartition::SetPartitionType(int resolution, ColPartitionSet *columns) {
1025  int first_spanned_col = -1;
1026  ColumnSpanningType span_type = columns->SpanningType(
1027  resolution, bounding_box_.left(), bounding_box_.right(),
1028  std::min(bounding_box_.height(), bounding_box_.width()), MidY(),
1029  left_margin_, right_margin_, &first_column_, &last_column_,
1030  &first_spanned_col);
1031  column_set_ = columns;
1032  if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
1033  !IsLineType()) {
1034  // Unequal columns may indicate that the pullout spans one of the columns
1035  // it lies in, so force it to be allocated to just that column.
1036  if (first_spanned_col >= 0) {
1037  first_column_ = first_spanned_col;
1038  last_column_ = first_spanned_col;
1039  } else {
1040  if ((first_column_ & 1) == 0) {
1041  last_column_ = first_column_;
1042  } else if ((last_column_ & 1) == 0) {
1043  first_column_ = last_column_;
1044  } else {
1045  first_column_ = last_column_ = (first_column_ + last_column_) / 2;
1046  }
1047  }
1048  }
1049  type_ = PartitionType(span_type);
1050 }
1051 
1052 // Returns the PartitionType from the current BlobRegionType and a column
1053 // flow spanning type ColumnSpanningType, generated by
1054 // ColPartitionSet::SpanningType, that indicates how the partition sits
1055 // in the columns.
1057  if (flow == CST_NOISE) {
1058  if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
1059  blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT) {
1060  return PT_NOISE;
1061  }
1062  flow = CST_FLOWING;
1063  }
1064 
1065  switch (blob_type_) {
1066  case BRT_NOISE:
1067  return PT_NOISE;
1068  case BRT_HLINE:
1069  return PT_HORZ_LINE;
1070  case BRT_VLINE:
1071  return PT_VERT_LINE;
1072  case BRT_RECTIMAGE:
1073  case BRT_POLYIMAGE:
1074  switch (flow) {
1075  case CST_FLOWING:
1076  return PT_FLOWING_IMAGE;
1077  case CST_HEADING:
1078  return PT_HEADING_IMAGE;
1079  case CST_PULLOUT:
1080  return PT_PULLOUT_IMAGE;
1081  default:
1082  ASSERT_HOST(!"Undefined flow type for image!");
1083  }
1084  break;
1085  case BRT_VERT_TEXT:
1086  return PT_VERTICAL_TEXT;
1087  case BRT_TEXT:
1088  case BRT_UNKNOWN:
1089  default:
1090  switch (flow) {
1091  case CST_FLOWING:
1092  return PT_FLOWING_TEXT;
1093  case CST_HEADING:
1094  return PT_HEADING_TEXT;
1095  case CST_PULLOUT:
1096  return PT_PULLOUT_TEXT;
1097  default:
1098  ASSERT_HOST(!"Undefined flow type for text!");
1099  }
1100  }
1101  ASSERT_HOST(!"Should never get here!");
1102  return PT_NOISE;
1103 }
1104 
1105 // Returns the first and last column touched by this partition.
1106 // resolution refers to the ppi resolution of the image.
1107 void ColPartition::ColumnRange(int resolution, ColPartitionSet *columns,
1108  int *first_col, int *last_col) {
1109  int first_spanned_col = -1;
1110  ColumnSpanningType span_type = columns->SpanningType(
1111  resolution, bounding_box_.left(), bounding_box_.right(),
1112  std::min(bounding_box_.height(), bounding_box_.width()), MidY(),
1113  left_margin_, right_margin_, first_col, last_col, &first_spanned_col);
1114  type_ = PartitionType(span_type);
1115 }
1116 
1117 // Sets the internal flags good_width_ and good_column_.
1119  int y = MidY();
1120  int width = RightAtY(y) - LeftAtY(y);
1121  good_width_ = cb(width);
1122  good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
1123 }
1124 
1125 // Determines whether the blobs in this partition mostly represent
1126 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
1127 // Note that height is assumed to have been tested elsewhere, and that this
1128 // function will find most fixed-pitch text as leader without a height filter.
1129 // Leader detection is limited to sequences of identical width objects,
1130 // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
1132  bool result = false;
1133  // Gather statistics on the gaps between blobs and the widths of the blobs.
1134  int part_width = bounding_box_.width();
1135  STATS gap_stats(0, part_width);
1136  STATS width_stats(0, part_width);
1137  BLOBNBOX_C_IT it(&boxes_);
1138  BLOBNBOX *prev_blob = it.data();
1139  prev_blob->set_flow(BTFT_NEIGHBOURS);
1140  width_stats.add(prev_blob->bounding_box().width(), 1);
1141  int blob_count = 1;
1142  for (it.forward(); !it.at_first(); it.forward()) {
1143  BLOBNBOX *blob = it.data();
1144  int left = blob->bounding_box().left();
1145  int right = blob->bounding_box().right();
1146  gap_stats.add(left - prev_blob->bounding_box().right(), 1);
1147  width_stats.add(right - left, 1);
1148  blob->set_flow(BTFT_NEIGHBOURS);
1149  prev_blob = blob;
1150  ++blob_count;
1151  }
1152  double median_gap = gap_stats.median();
1153  double median_width = width_stats.median();
1154  double max_width = std::max(median_gap, median_width);
1155  double min_width = std::min(median_gap, median_width);
1156  double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
1157  if (textord_debug_tabfind >= 4) {
1158  tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n", gap_iqr, blob_count,
1159  max_width * kMaxLeaderGapFractionOfMax,
1160  min_width * kMaxLeaderGapFractionOfMin);
1161  }
1162  if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
1163  gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
1164  blob_count >= kMinLeaderCount) {
1165  // This is stable enough to be called a leader, so check the widths.
1166  // Since leader dashes can join, run a dp cutting algorithm and go
1167  // on the cost.
1168  int offset = static_cast<int>(ceil(gap_iqr * 2));
1169  int min_step = static_cast<int>(median_gap + median_width + 0.5);
1170  int max_step = min_step + offset;
1171  min_step -= offset;
1172  // Pad the buffer with min_step/2 on each end.
1173  int part_left = bounding_box_.left() - min_step / 2;
1174  part_width += min_step;
1175  auto *projection = new DPPoint[part_width];
1176  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1177  BLOBNBOX *blob = it.data();
1178  int left = blob->bounding_box().left();
1179  int right = blob->bounding_box().right();
1180  int height = blob->bounding_box().height();
1181  for (int x = left; x < right; ++x) {
1182  projection[left - part_left].AddLocalCost(height);
1183  }
1184  }
1185  DPPoint *best_end =
1186  DPPoint::Solve(min_step, max_step, false, &DPPoint::CostWithVariance,
1187  part_width, projection);
1188  if (best_end != nullptr && best_end->total_cost() < blob_count) {
1189  // Good enough. Call it a leader.
1190  result = true;
1191  bool modified_blob_list = false;
1192  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1193  BLOBNBOX *blob = it.data();
1194  // If the first or last blob is spaced too much, don't mark it.
1195  if (it.at_first()) {
1196  int gap = it.data_relative(1)->bounding_box().left() -
1197  blob->bounding_box().right();
1198  if (blob->bounding_box().width() + gap > max_step) {
1199  it.extract();
1200  modified_blob_list = true;
1201  continue;
1202  }
1203  }
1204  if (it.at_last()) {
1205  int gap = blob->bounding_box().left() -
1206  it.data_relative(-1)->bounding_box().right();
1207  if (blob->bounding_box().width() + gap > max_step) {
1208  it.extract();
1209  modified_blob_list = true;
1210  break;
1211  }
1212  }
1213  blob->set_region_type(BRT_TEXT);
1214  blob->set_flow(BTFT_LEADER);
1215  }
1216  if (modified_blob_list) {
1217  ComputeLimits();
1218  }
1219  blob_type_ = BRT_TEXT;
1220  flow_ = BTFT_LEADER;
1221  } else if (textord_debug_tabfind) {
1222  if (best_end == nullptr) {
1223  tprintf("No path\n");
1224  } else {
1225  tprintf("Total cost = %d vs allowed %d\n", best_end->total_cost(),
1226  blob_count);
1227  }
1228  }
1229  delete[] projection;
1230  }
1231  return result;
1232 }
1233 
1234 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
1235 // horizontal text, negative for vertical text, and near zero for non-text),
1236 // sets the blob_type_ and flow_ for this partition to indicate whether it
1237 // is strongly or weakly vertical or horizontal text, or non-text.
1238 // The function assumes that the blob neighbours are valid (from
1239 // StrokeWidth::SetNeighbours) and that those neighbours have their
1240 // region_type() set.
1242  int blob_count = 0; // Total # blobs.
1243  int good_blob_score_ = 0; // Total # good strokewidth neighbours.
1244  int noisy_count = 0; // Total # neighbours marked as noise.
1245  int hline_count = 0;
1246  int vline_count = 0;
1247  BLOBNBOX_C_IT it(&boxes_);
1248  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1249  BLOBNBOX *blob = it.data();
1250  ++blob_count;
1251  noisy_count += blob->NoisyNeighbours();
1252  good_blob_score_ += blob->GoodTextBlob();
1253  if (blob->region_type() == BRT_HLINE) {
1254  ++hline_count;
1255  }
1256  if (blob->region_type() == BRT_VLINE) {
1257  ++vline_count;
1258  }
1259  }
1260  flow_ = BTFT_NEIGHBOURS;
1261  blob_type_ = BRT_UNKNOWN;
1262  if (hline_count > vline_count) {
1263  flow_ = BTFT_NONE;
1264  blob_type_ = BRT_HLINE;
1265  } else if (vline_count > hline_count) {
1266  flow_ = BTFT_NONE;
1267  blob_type_ = BRT_VLINE;
1268  } else if (value < -1 || 1 < value) {
1269  int long_side;
1270  int short_side;
1271  if (value > 0) {
1272  long_side = bounding_box_.width();
1273  short_side = bounding_box_.height();
1274  blob_type_ = BRT_TEXT;
1275  } else {
1276  long_side = bounding_box_.height();
1277  short_side = bounding_box_.width();
1278  blob_type_ = BRT_VERT_TEXT;
1279  }
1280  // We will combine the old metrics using aspect ratio and blob counts
1281  // with the input value by allowing a strong indication to flip the
1282  // STRONG_CHAIN/CHAIN flow values.
1283  int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
1284  if (short_side > kHorzStrongTextlineHeight) {
1285  ++strong_score;
1286  }
1287  if (short_side * kHorzStrongTextlineAspect < long_side) {
1288  ++strong_score;
1289  }
1290  if (abs(value) >= kMinStrongTextValue) {
1291  flow_ = BTFT_STRONG_CHAIN;
1292  } else if (abs(value) >= kMinChainTextValue) {
1293  flow_ = BTFT_CHAIN;
1294  } else {
1295  flow_ = BTFT_NEIGHBOURS;
1296  }
1297  // Upgrade chain to strong chain if the other indicators are good
1298  if (flow_ == BTFT_CHAIN && strong_score == 3) {
1299  flow_ = BTFT_STRONG_CHAIN;
1300  }
1301  // Downgrade strong vertical text to chain if the indicators are bad.
1302  if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2) {
1303  flow_ = BTFT_CHAIN;
1304  }
1305  }
1306  if (flow_ == BTFT_NEIGHBOURS) {
1307  // Check for noisy neighbours.
1308  if (noisy_count >= blob_count) {
1309  flow_ = BTFT_NONTEXT;
1310  blob_type_ = BRT_NOISE;
1311  }
1312  }
1313  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1314  bounding_box_.bottom())) {
1315  tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1316  blob_count, noisy_count, good_blob_score_);
1317  tprintf(" Projection value=%d, flow=%d, blob_type=%d\n", value, flow_,
1318  blob_type_);
1319  Print();
1320  }
1321  SetBlobTypes();
1322 }
1323 
1324 // Sets all blobs with the partition blob type and flow, but never overwrite
1325 // leader blobs, as we need to be able to identify them later.
1327  if (!owns_blobs()) {
1328  return;
1329  }
1330  BLOBNBOX_C_IT it(&boxes_);
1331  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1332  BLOBNBOX *blob = it.data();
1333  if (blob->flow() != BTFT_LEADER) {
1334  blob->set_flow(flow_);
1335  }
1336  blob->set_region_type(blob_type_);
1337  ASSERT_HOST(blob->owner() == nullptr || blob->owner() == this);
1338  }
1339 }
1340 
1341 // Returns true if a decent baseline can be fitted through the blobs.
1342 // Works for both horizontal and vertical text.
1344  // Approximation of the baseline.
1345  DetLineFit linepoints;
1346  // Calculation of the mean height on this line segment. Note that these
1347  // variable names apply to the context of a horizontal line, and work
1348  // analogously, rather than literally in the case of a vertical line.
1349  int total_height = 0;
1350  int coverage = 0;
1351  int height_count = 0;
1352  int width = 0;
1353  BLOBNBOX_C_IT it(&boxes_);
1354  TBOX box(it.data()->bounding_box());
1355  // Accumulate points representing the baseline at the middle of each blob,
1356  // but add an additional point for each end of the line. This makes it
1357  // harder to fit a severe skew angle, as it is most likely not right.
1358  if (IsVerticalType()) {
1359  // For a vertical line, use the right side as the baseline.
1360  ICOORD first_pt(box.right(), box.bottom());
1361  // Use the bottom-right of the first (bottom) box, the top-right of the
1362  // last, and the middle-right of all others.
1363  linepoints.Add(first_pt);
1364  for (it.forward(); !it.at_last(); it.forward()) {
1365  BLOBNBOX *blob = it.data();
1366  box = blob->bounding_box();
1367  ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1368  linepoints.Add(box_pt);
1369  total_height += box.width();
1370  coverage += box.height();
1371  ++height_count;
1372  }
1373  box = it.data()->bounding_box();
1374  ICOORD last_pt(box.right(), box.top());
1375  linepoints.Add(last_pt);
1376  width = last_pt.y() - first_pt.y();
1377 
1378  } else {
1379  // Horizontal lines use the bottom as the baseline.
1380  TBOX box(it.data()->bounding_box());
1381  // Use the bottom-left of the first box, the the bottom-right of the last,
1382  // and the middle of all others.
1383  ICOORD first_pt(box.left(), box.bottom());
1384  linepoints.Add(first_pt);
1385  for (it.forward(); !it.at_last(); it.forward()) {
1386  BLOBNBOX *blob = it.data();
1387  box = blob->bounding_box();
1388  ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1389  linepoints.Add(box_pt);
1390  total_height += box.height();
1391  coverage += box.width();
1392  ++height_count;
1393  }
1394  box = it.data()->bounding_box();
1395  ICOORD last_pt(box.right(), box.bottom());
1396  linepoints.Add(last_pt);
1397  width = last_pt.x() - first_pt.x();
1398  }
1399  // Maximum median error allowed to be a good text line.
1400  if (height_count == 0) {
1401  return false;
1402  }
1403  double max_error = kMaxBaselineError * total_height / height_count;
1404  ICOORD start_pt, end_pt;
1405  double error = linepoints.Fit(&start_pt, &end_pt);
1406  return error < max_error && coverage >= kMinBaselineCoverage * width;
1407 }
1408 
1409 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
1410 // otherwise starts a new one in the appropriate column, ending the previous.
1411 void ColPartition::AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright,
1412  int resolution,
1413  ColPartition_LIST *used_parts,
1414  WorkingPartSet_LIST *working_sets) {
1415  if (block_owned_) {
1416  return; // Done it already.
1417  }
1418  block_owned_ = true;
1419  WorkingPartSet_IT it(working_sets);
1420  // If there is an upper partner use its working_set_ directly.
1421  ColPartition *partner = SingletonPartner(true);
1422  if (partner != nullptr && partner->working_set_ != nullptr) {
1423  working_set_ = partner->working_set_;
1424  working_set_->AddPartition(this);
1425  return;
1426  }
1427  if (partner != nullptr && textord_debug_bugs) {
1428  tprintf("Partition with partner has no working set!:");
1429  Print();
1430  partner->Print();
1431  }
1432  // Search for the column that the left edge fits in.
1433  WorkingPartSet *work_set = nullptr;
1434  it.move_to_first();
1435  int col_index = 0;
1436  for (it.mark_cycle_pt(); !it.cycled_list() && col_index != first_column_;
1437  it.forward(), ++col_index) {
1438  ;
1439  }
1440  if (textord_debug_tabfind >= 2) {
1441  tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
1442  Print();
1443  }
1444  if (it.cycled_list() && textord_debug_bugs) {
1445  tprintf("Target column=%d, only had %d\n", first_column_, col_index);
1446  }
1447  ASSERT_HOST(!it.cycled_list());
1448  work_set = it.data();
1449  // If last_column_ != first_column, then we need to scoop up all blocks
1450  // between here and the last_column_ and put back in work_set.
1451  if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
1452  // Find the column that the right edge falls in.
1453  BLOCK_LIST completed_blocks;
1454  TO_BLOCK_LIST to_blocks;
1455  for (; !it.cycled_list() && col_index <= last_column_;
1456  it.forward(), ++col_index) {
1457  WorkingPartSet *end_set = it.data();
1458  end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
1459  &completed_blocks, &to_blocks);
1460  }
1461  work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1462  }
1463  working_set_ = work_set;
1464  work_set->AddPartition(this);
1465 }
1466 
1467 // From the given block_parts list, builds one or more BLOCKs and
1468 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
1469 // Created blocks are appended to the end of completed_blocks and to_blocks.
1470 // The used partitions are put onto used_parts, as they may still be referred
1471 // to in the partition grid. bleft, tright and resolution are the bounds
1472 // and resolution of the original image.
1473 void ColPartition::LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright,
1474  int resolution,
1475  ColPartition_LIST *block_parts,
1476  ColPartition_LIST *used_parts,
1477  BLOCK_LIST *completed_blocks,
1478  TO_BLOCK_LIST *to_blocks) {
1479  int page_height = tright.y() - bleft.y();
1480  // Compute the initial spacing stats.
1481  ColPartition_IT it(block_parts);
1482  int part_count = 0;
1483  int max_line_height = 0;
1484 
1485  // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
1486  // because their line spacing with their neighbors maybe smaller and their
1487  // height may be slightly larger.
1488 
1489  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1490  ColPartition *part = it.data();
1491  ASSERT_HOST(!part->boxes()->empty());
1492  STATS side_steps(0, part->bounding_box().height());
1493  if (part->bounding_box().height() > max_line_height) {
1494  max_line_height = part->bounding_box().height();
1495  }
1496  BLOBNBOX_C_IT blob_it(part->boxes());
1497  int prev_bottom = blob_it.data()->bounding_box().bottom();
1498  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1499  BLOBNBOX *blob = blob_it.data();
1500  int bottom = blob->bounding_box().bottom();
1501  int step = bottom - prev_bottom;
1502  if (step < 0) {
1503  step = -step;
1504  }
1505  side_steps.add(step, 1);
1506  prev_bottom = bottom;
1507  }
1508  part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
1509  if (!it.at_last()) {
1510  ColPartition *next_part = it.data_relative(1);
1511  part->set_bottom_spacing(part->median_bottom() -
1512  next_part->median_bottom());
1513  part->set_top_spacing(part->median_top() - next_part->median_top());
1514  } else {
1515  part->set_bottom_spacing(page_height);
1516  part->set_top_spacing(page_height);
1517  }
1518  if (textord_debug_tabfind) {
1519  part->Print();
1520  tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1521  side_steps.median(), part->top_spacing(), part->bottom_spacing());
1522  }
1523  ++part_count;
1524  }
1525  if (part_count == 0) {
1526  return;
1527  }
1528 
1529  SmoothSpacings(resolution, page_height, block_parts);
1530 
1531  // Move the partitions into individual block lists and make the blocks.
1532  BLOCK_IT block_it(completed_blocks);
1533  TO_BLOCK_IT to_block_it(to_blocks);
1534  ColPartition_LIST spacing_parts;
1535  ColPartition_IT sp_block_it(&spacing_parts);
1536  int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
1537  for (it.mark_cycle_pt(); !it.empty();) {
1538  ColPartition *part = it.extract();
1539  sp_block_it.add_to_end(part);
1540  it.forward();
1541  if (it.empty() || part->bottom_spacing() > same_block_threshold ||
1542  !part->SpacingsEqual(*it.data(), resolution)) {
1543  // There is a spacing boundary. Check to see if it.data() belongs
1544  // better in the current block or the next one.
1545  if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
1546  ColPartition *next_part = it.data();
1547  // If there is a size match one-way, then the middle line goes with
1548  // its matched size, otherwise it goes with the smallest spacing.
1549  ColPartition *third_part = it.at_last() ? nullptr : it.data_relative(1);
1550  if (textord_debug_tabfind) {
1551  tprintf(
1552  "Spacings unequal: upper:%d/%d, lower:%d/%d,"
1553  " sizes %d %d %d\n",
1554  part->top_spacing(), part->bottom_spacing(),
1555  next_part->top_spacing(), next_part->bottom_spacing(),
1556  part->median_height(), next_part->median_height(),
1557  third_part != nullptr ? third_part->median_height() : 0);
1558  }
1559  // We can only consider adding the next line to the block if the sizes
1560  // match and the lines are close enough for their size.
1561  if (part->SizesSimilar(*next_part) &&
1562  next_part->median_height() * kMaxSameBlockLineSpacing >
1563  part->bottom_spacing() &&
1565  part->top_spacing()) {
1566  // Even now, we can only add it as long as the third line doesn't
1567  // match in the same way and have a smaller bottom spacing.
1568  if (third_part == nullptr || !next_part->SizesSimilar(*third_part) ||
1569  third_part->median_height() * kMaxSameBlockLineSpacing <=
1570  next_part->bottom_spacing() ||
1571  next_part->median_height() * kMaxSameBlockLineSpacing <=
1572  next_part->top_spacing() ||
1573  next_part->bottom_spacing() > part->bottom_spacing()) {
1574  // Add to the current block.
1575  sp_block_it.add_to_end(it.extract());
1576  it.forward();
1577  if (textord_debug_tabfind) {
1578  tprintf("Added line to current block.\n");
1579  }
1580  }
1581  }
1582  }
1583  TO_BLOCK *to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
1584  if (to_block != nullptr) {
1585  to_block_it.add_to_end(to_block);
1586  block_it.add_to_end(to_block->block);
1587  }
1588  sp_block_it.set_to_list(&spacing_parts);
1589  } else {
1590  if (textord_debug_tabfind && !it.empty()) {
1591  ColPartition *next_part = it.data();
1592  tprintf("Spacings equal: upper:%d/%d, lower:%d/%d, median:%d/%d\n",
1593  part->top_spacing(), part->bottom_spacing(),
1594  next_part->top_spacing(), next_part->bottom_spacing(),
1595  part->median_height(), next_part->median_height());
1596  }
1597  }
1598  }
1599 }
1600 
1601 // Helper function to clip the input pos to the given bleft, tright bounds.
1602 static void ClipCoord(const ICOORD &bleft, const ICOORD &tright, ICOORD *pos) {
1603  if (pos->x() < bleft.x()) {
1604  pos->set_x(bleft.x());
1605  }
1606  if (pos->x() > tright.x()) {
1607  pos->set_x(tright.x());
1608  }
1609  if (pos->y() < bleft.y()) {
1610  pos->set_y(bleft.y());
1611  }
1612  if (pos->y() > tright.y()) {
1613  pos->set_y(tright.y());
1614  }
1615 }
1616 
1617 // Helper moves the blobs from the given list of block_parts into the block
1618 // itself. Sets up the block for (old) textline formation correctly for
1619 // vertical and horizontal text. The partitions are moved to used_parts
1620 // afterwards, as they cannot be deleted yet.
1621 static TO_BLOCK *MoveBlobsToBlock(bool vertical_text, int line_spacing,
1622  BLOCK *block, ColPartition_LIST *block_parts,
1623  ColPartition_LIST *used_parts) {
1624  // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
1625  // Move all the parts to a done list as they are no longer needed, except
1626  // that have have to continue to exist until the part grid is deleted.
1627  // Compute the median blob size as we go, as the block needs to know.
1628  TBOX block_box(block->pdblk.bounding_box());
1629  STATS sizes(0, std::max(block_box.width(), block_box.height()));
1630  bool text_type = block->pdblk.poly_block()->IsText();
1631  ColPartition_IT it(block_parts);
1632  auto *to_block = new TO_BLOCK(block);
1633  BLOBNBOX_IT blob_it(&to_block->blobs);
1634  ColPartition_IT used_it(used_parts);
1635  for (it.move_to_first(); !it.empty(); it.forward()) {
1636  ColPartition *part = it.extract();
1637  // Transfer blobs from all regions to the output blocks.
1638  // Blobs for non-text regions will be used to define the polygonal
1639  // bounds of the region.
1640  for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty(); bb_it.forward()) {
1641  BLOBNBOX *bblob = bb_it.extract();
1642  if (bblob->owner() != part) {
1643  tprintf("Ownership incorrect for blob:");
1644  bblob->bounding_box().print();
1645  tprintf("Part=");
1646  part->Print();
1647  if (bblob->owner() == nullptr) {
1648  tprintf("Not owned\n");
1649  } else {
1650  tprintf("Owner part:");
1651  bblob->owner()->Print();
1652  }
1653  }
1654  ASSERT_HOST(bblob->owner() == part);
1655  // Assert failure here is caused by arbitrarily changing the partition
1656  // type without also changing the blob type, such as in
1657  // InsertSmallBlobsAsUnknowns.
1658  ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
1659  C_OUTLINE_LIST *outlines = bblob->cblob()->out_list();
1660  C_OUTLINE_IT ol_it(outlines);
1661  ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1662  if (vertical_text) {
1663  sizes.add(bblob->bounding_box().width(), 1);
1664  } else {
1665  sizes.add(bblob->bounding_box().height(), 1);
1666  }
1667  blob_it.add_after_then_move(bblob);
1668  }
1669  used_it.add_to_end(part);
1670  }
1671  if (text_type && blob_it.empty()) {
1672  delete block;
1673  delete to_block;
1674  return nullptr;
1675  }
1676  to_block->line_size = sizes.median();
1677  if (vertical_text) {
1678  int block_width = block->pdblk.bounding_box().width();
1679  if (block_width < line_spacing) {
1680  line_spacing = block_width;
1681  }
1682  to_block->line_spacing = static_cast<float>(line_spacing);
1683  to_block->max_blob_size = static_cast<float>(block_width + 1);
1684  } else {
1685  int block_height = block->pdblk.bounding_box().height();
1686  if (block_height < line_spacing) {
1687  line_spacing = block_height;
1688  }
1689  to_block->line_spacing = static_cast<float>(line_spacing);
1690  to_block->max_blob_size = static_cast<float>(block_height + 1);
1691  }
1692  return to_block;
1693 }
1694 
1695 // Constructs a block from the given list of partitions.
1696 // Arguments are as LineSpacingBlocks above.
1697 TO_BLOCK *ColPartition::MakeBlock(const ICOORD &bleft, const ICOORD &tright,
1698  ColPartition_LIST *block_parts,
1699  ColPartition_LIST *used_parts) {
1700  if (block_parts->empty()) {
1701  return nullptr; // Nothing to do.
1702  }
1703  // If the block_parts are not in reading order, then it will make an invalid
1704  // block polygon and bounding_box, so sort by bounding box now just to make
1705  // sure.
1706  block_parts->sort(&ColPartition::SortByBBox);
1707  ColPartition_IT it(block_parts);
1708  ColPartition *part = it.data();
1709  PolyBlockType type = part->type();
1710  if (type == PT_VERTICAL_TEXT) {
1711  return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
1712  }
1713  // LineSpacingBlocks has handed us a collection of evenly spaced lines and
1714  // put the average spacing in each partition, so we can just take the
1715  // linespacing from the first partition.
1716  int line_spacing = part->bottom_spacing();
1717  if (line_spacing < part->median_height()) {
1718  line_spacing = part->bounding_box().height();
1719  }
1720  ICOORDELT_LIST vertices;
1721  ICOORDELT_IT vert_it(&vertices);
1722  ICOORD start, end;
1723  int min_x = INT32_MAX;
1724  int max_x = -INT32_MAX;
1725  int min_y = INT32_MAX;
1726  int max_y = -INT32_MAX;
1727  int iteration = 0;
1728  do {
1729  if (iteration == 0) {
1730  ColPartition::LeftEdgeRun(&it, &start, &end);
1731  } else {
1732  ColPartition::RightEdgeRun(&it, &start, &end);
1733  }
1734  ClipCoord(bleft, tright, &start);
1735  ClipCoord(bleft, tright, &end);
1736  vert_it.add_after_then_move(new ICOORDELT(start));
1737  vert_it.add_after_then_move(new ICOORDELT(end));
1738  UpdateRange(start.x(), &min_x, &max_x);
1739  UpdateRange(end.x(), &min_x, &max_x);
1740  UpdateRange(start.y(), &min_y, &max_y);
1741  UpdateRange(end.y(), &min_y, &max_y);
1742  if ((iteration == 0 && it.at_first()) || (iteration == 1 && it.at_last())) {
1743  ++iteration;
1744  it.move_to_last();
1745  }
1746  } while (iteration < 2);
1747  if (textord_debug_tabfind) {
1748  tprintf("Making block at (%d,%d)->(%d,%d)\n", min_x, min_y, max_x, max_y);
1749  }
1750  auto *block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
1751  block->pdblk.set_poly_block(new POLY_BLOCK(&vertices, type));
1752  return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
1753 }
1754 
1755 // Constructs a block from the given list of vertical text partitions.
1756 // Currently only creates rectangular blocks.
1758  const ICOORD &tright,
1759  ColPartition_LIST *block_parts,
1760  ColPartition_LIST *used_parts) {
1761  if (block_parts->empty()) {
1762  return nullptr; // Nothing to do.
1763  }
1764  ColPartition_IT it(block_parts);
1765  ColPartition *part = it.data();
1766  TBOX block_box = part->bounding_box();
1767  int line_spacing = block_box.width();
1768  PolyBlockType type = it.data()->type();
1769  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1770  block_box += it.data()->bounding_box();
1771  }
1772  if (textord_debug_tabfind) {
1773  tprintf("Making block at:");
1774  block_box.print();
1775  }
1776  auto *block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
1777  block_box.right(), block_box.top());
1778  block->pdblk.set_poly_block(new POLY_BLOCK(block_box, type));
1779  return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
1780 }
1781 
1782 // Makes a TO_ROW matching this and moves all the blobs to it, transferring
1783 // ownership to to returned TO_ROW.
1785  BLOBNBOX_C_IT blob_it(&boxes_);
1786  TO_ROW *row = nullptr;
1787  int line_size = IsVerticalType() ? median_width_ : median_height_;
1788  // Add all the blobs to a single TO_ROW.
1789  for (; !blob_it.empty(); blob_it.forward()) {
1790  BLOBNBOX *blob = blob_it.extract();
1791  // blob->compute_bounding_box();
1792  int top = blob->bounding_box().top();
1793  int bottom = blob->bounding_box().bottom();
1794  if (row == nullptr) {
1795  row =
1796  new TO_ROW(blob, static_cast<float>(top), static_cast<float>(bottom),
1797  static_cast<float>(line_size));
1798  } else {
1799  row->add_blob(blob, static_cast<float>(top), static_cast<float>(bottom),
1800  static_cast<float>(line_size));
1801  }
1802  }
1803  return row;
1804 }
1805 
1806 // Returns a copy of everything except the list of boxes. The resulting
1807 // ColPartition is only suitable for keeping in a column candidate list.
1809  auto *part = new ColPartition(blob_type_, vertical_);
1810  part->left_margin_ = left_margin_;
1811  part->right_margin_ = right_margin_;
1812  part->bounding_box_ = bounding_box_;
1813  memcpy(part->special_blobs_densities_, special_blobs_densities_,
1814  sizeof(special_blobs_densities_));
1815  part->median_bottom_ = median_bottom_;
1816  part->median_top_ = median_top_;
1817  part->median_height_ = median_height_;
1818  part->median_left_ = median_left_;
1819  part->median_right_ = median_right_;
1820  part->median_width_ = median_width_;
1821  part->good_width_ = good_width_;
1822  part->good_column_ = good_column_;
1823  part->left_key_tab_ = left_key_tab_;
1824  part->right_key_tab_ = right_key_tab_;
1825  part->type_ = type_;
1826  part->flow_ = flow_;
1827  part->left_key_ = left_key_;
1828  part->right_key_ = right_key_;
1829  part->first_column_ = first_column_;
1830  part->last_column_ = last_column_;
1831  part->owns_blobs_ = false;
1832  return part;
1833 }
1834 
1836  ColPartition *copy = ShallowCopy();
1837  copy->set_owns_blobs(false);
1838  BLOBNBOX_C_IT inserter(copy->boxes());
1839  BLOBNBOX_C_IT traverser(boxes());
1840  for (traverser.mark_cycle_pt(); !traverser.cycled_list();
1841  traverser.forward()) {
1842  inserter.add_after_then_move(traverser.data());
1843  }
1844  return copy;
1845 }
1846 
1847 #ifndef GRAPHICS_DISABLED
1848 // Provides a color for BBGrid to draw the rectangle.
1849 // Must be kept in sync with PolyBlockType.
1851  if (type_ == PT_UNKNOWN) {
1852  return BLOBNBOX::TextlineColor(blob_type_, flow_);
1853  }
1854  return POLY_BLOCK::ColorForPolyBlockType(type_);
1855 }
1856 #endif // !GRAPHICS_DISABLED
1857 
1858 // Keep in sync with BlobRegionType.
1859 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
1860 
1861 // Prints debug information on this.
1862 void ColPartition::Print() const {
1863  int y = MidY();
1864  tprintf(
1865  "ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1866  " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1867  " ts=%d bs=%d ls=%d rs=%d\n",
1868  boxes_.empty() ? 'E' : ' ', left_margin_, left_key_tab_ ? 'T' : 'B',
1869  LeftAtY(y), bounding_box_.left(), median_left_, bounding_box_.bottom(),
1870  median_bottom_, bounding_box_.right(), RightAtY(y),
1871  right_key_tab_ ? 'T' : 'B', right_margin_, median_right_,
1872  bounding_box_.top(), median_top_, good_width_, good_column_, type_,
1873  kBlobTypes[blob_type_], flow_, first_column_, last_column_,
1874  boxes_.length(), space_above_, space_below_, space_to_left_,
1875  space_to_right_);
1876 }
1877 
1878 // Prints debug information on the colors.
1880  tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n", color1_[COLOR_RED],
1881  color1_[COLOR_GREEN], color1_[COLOR_BLUE], color1_[L_ALPHA_CHANNEL],
1882  color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1883 }
1884 
1885 // Sets the types of all partitions in the run to be the max of the types.
1886 void ColPartition::SmoothPartnerRun(int working_set_count) {
1887  STATS left_stats(0, working_set_count);
1888  STATS right_stats(0, working_set_count);
1889  PolyBlockType max_type = type_;
1890  ColPartition *partner;
1891  for (partner = SingletonPartner(false); partner != nullptr;
1892  partner = partner->SingletonPartner(false)) {
1893  if (partner->type_ > max_type) {
1894  max_type = partner->type_;
1895  }
1896  if (column_set_ == partner->column_set_) {
1897  left_stats.add(partner->first_column_, 1);
1898  right_stats.add(partner->last_column_, 1);
1899  }
1900  }
1901  type_ = max_type;
1902  // TODO(rays) Either establish that it isn't necessary to set the columns,
1903  // or find a way to do it that does not cause an assert failure in
1904  // AddToWorkingSet.
1905 #if 0
1906  first_column_ = left_stats.mode();
1907  last_column_ = right_stats.mode();
1908  if (last_column_ < first_column_)
1909  last_column_ = first_column_;
1910 #endif
1911 
1912  for (partner = SingletonPartner(false); partner != nullptr;
1913  partner = partner->SingletonPartner(false)) {
1914  partner->type_ = max_type;
1915 #if 0 // See TODO above
1916  if (column_set_ == partner->column_set_) {
1917  partner->first_column_ = first_column_;
1918  partner->last_column_ = last_column_;
1919  }
1920 #endif
1921  }
1922 }
1923 
1924 // ======= Scenario common to all Refine*Partners* functions =======
1925 // ColPartitions are aiming to represent textlines, or horizontal slices
1926 // of images, and we are trying to form bi-directional (upper/lower) chains
1927 // of UNIQUE partner ColPartitions that can be made into blocks.
1928 // The ColPartitions have previously been typed (see SetPartitionType)
1929 // according to a combination of the content type and
1930 // how they lie on the columns. We want to chain text into
1931 // groups of a single type, but image ColPartitions may have been typed
1932 // differently in different parts of the image, due to being non-rectangular.
1933 //
1934 // We previously ran a search for upper and lower partners, but there may
1935 // be more than one, and they may be of mixed types, so now we wish to
1936 // refine the partners down to at most one.
1937 // A heading may have multiple partners:
1938 // ===============================
1939 // ======== ========== =========
1940 // ======== ========== =========
1941 // but it should be a different type.
1942 // A regular flowing text line may have multiple partners:
1943 // ================== ===================
1944 // ======= ================= ===========
1945 // This could be the start of a pull-out, or it might all be in a single
1946 // column and might be caused by tightly spaced text, bold words, bullets,
1947 // funny punctuation etc, all of which can cause textlines to be split into
1948 // multiple ColPartitions. Pullouts and figure captions should now be different
1949 // types so we can more aggressively merge groups of partners that all sit
1950 // in a single column.
1951 //
1952 // Cleans up the partners of the given type so that there is at most
1953 // one partner. This makes block creation simpler.
1954 // If get_desperate is true, goes to more desperate merge methods
1955 // to merge flowing text before breaking partnerships.
1956 void ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,
1957  ColPartitionGrid *grid) {
1958  if (TypesSimilar(type_, type)) {
1959  RefinePartnersInternal(true, get_desperate, grid);
1960  RefinePartnersInternal(false, get_desperate, grid);
1961  } else if (type == PT_COUNT) {
1962  // This is the final pass. Make sure only the correctly typed
1963  // partners surivive, however many there are.
1964  RefinePartnersByType(true, &upper_partners_);
1965  RefinePartnersByType(false, &lower_partners_);
1966  // It is possible for a merge to have given a partition multiple
1967  // partners again, so the last resort is to use overlap which is
1968  // guaranteed to leave at most one partner left.
1969  if (!upper_partners_.empty() && !upper_partners_.singleton()) {
1970  RefinePartnersByOverlap(true, &upper_partners_);
1971  }
1972  if (!lower_partners_.empty() && !lower_partners_.singleton()) {
1973  RefinePartnersByOverlap(false, &lower_partners_);
1974  }
1975  }
1976 }
1977 
1979 
1980 // Cleans up the partners above if upper is true, else below.
1981 // If get_desperate is true, goes to more desperate merge methods
1982 // to merge flowing text before breaking partnerships.
1983 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
1984  ColPartitionGrid *grid) {
1985  ColPartition_CLIST *partners = upper ? &upper_partners_ : &lower_partners_;
1986  if (!partners->empty() && !partners->singleton()) {
1987  RefinePartnersByType(upper, partners);
1988  if (!partners->empty() && !partners->singleton()) {
1989  // Check for transitive partnerships and break the cycle.
1990  RefinePartnerShortcuts(upper, partners);
1991  if (!partners->empty() && !partners->singleton()) {
1992  // Types didn't fix it. Flowing text keeps the one with the longest
1993  // sequence of singleton matching partners. All others max overlap.
1994  if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
1995  RefineTextPartnersByMerge(upper, false, partners, grid);
1996  if (!partners->empty() && !partners->singleton()) {
1997  RefineTextPartnersByMerge(upper, true, partners, grid);
1998  }
1999  }
2000  // The last resort is to use overlap.
2001  if (!partners->empty() && !partners->singleton()) {
2002  RefinePartnersByOverlap(upper, partners);
2003  }
2004  }
2005  }
2006  }
2007 }
2008 
2009 // Cleans up the partners above if upper is true, else below.
2010 // Restricts the partners to only desirable types. For text and BRT_HLINE this
2011 // means the same type_ , and for image types it means any image type.
2012 void ColPartition::RefinePartnersByType(bool upper,
2013  ColPartition_CLIST *partners) {
2014  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2015  bounding_box_.bottom());
2016  if (debug) {
2017  tprintf("Refining %d %s partners by type for:\n", partners->length(),
2018  upper ? "Upper" : "Lower");
2019  Print();
2020  }
2021  ColPartition_C_IT it(partners);
2022  // Purify text by type.
2023  if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
2024  // Keep only partners matching type_.
2025  // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
2026  // text types if it is the only partner.
2027  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2028  ColPartition *partner = it.data();
2029  if (!TypesSimilar(type_, partner->type_)) {
2030  if (debug) {
2031  tprintf("Removing partner:");
2032  partner->Print();
2033  }
2034  partner->RemovePartner(!upper, this);
2035  it.extract();
2036  } else if (debug) {
2037  tprintf("Keeping partner:");
2038  partner->Print();
2039  }
2040  }
2041  } else {
2042  // Only polyimages are allowed to have partners of any kind!
2043  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2044  ColPartition *partner = it.data();
2045  if (partner->blob_type() != BRT_POLYIMAGE ||
2046  blob_type() != BRT_POLYIMAGE) {
2047  if (debug) {
2048  tprintf("Removing partner:");
2049  partner->Print();
2050  }
2051  partner->RemovePartner(!upper, this);
2052  it.extract();
2053  } else if (debug) {
2054  tprintf("Keeping partner:");
2055  partner->Print();
2056  }
2057  }
2058  }
2059 }
2060 
2061 // Cleans up the partners above if upper is true, else below.
2062 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
2063 // Gets rid of this<->b, leaving a clean chain.
2064 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
2065 // this has multiple partners.
2066 void ColPartition::RefinePartnerShortcuts(bool upper,
2067  ColPartition_CLIST *partners) {
2068  bool done_any = false;
2069  do {
2070  done_any = false;
2071  ColPartition_C_IT it(partners);
2072  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2073  ColPartition *a = it.data();
2074  // Check for a match between all of a's partners (it1/b1) and all
2075  // of this's partners (it2/b2).
2076  ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
2077  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
2078  ColPartition *b1 = it1.data();
2079  if (b1 == this) {
2080  done_any = true;
2081  it.extract();
2082  a->RemovePartner(!upper, this);
2083  break;
2084  }
2085  ColPartition_C_IT it2(partners);
2086  for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
2087  ColPartition *b2 = it2.data();
2088  if (b1 == b2) {
2089  // Jackpot! b2 should not be a partner of this.
2090  it2.extract();
2091  b2->RemovePartner(!upper, this);
2092  done_any = true;
2093  // That potentially invalidated all the iterators, so break out
2094  // and start again.
2095  break;
2096  }
2097  }
2098  if (done_any) {
2099  break;
2100  }
2101  }
2102  if (done_any) {
2103  break;
2104  }
2105  }
2106  } while (done_any && !partners->empty() && !partners->singleton());
2107 }
2108 
2109 // Cleans up the partners above if upper is true, else below.
2110 // If multiple text partners can be merged, (with each other, NOT with this),
2111 // then do so.
2112 // If desperate is true, then an increase in overlap with the merge is
2113 // allowed. If the overlap increases, then the desperately_merged_ flag
2114 // is set, indicating that the textlines probably need to be regenerated
2115 // by aggressive line fitting/splitting, as there are probably vertically
2116 // joined blobs that cross textlines.
2117 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
2118  ColPartition_CLIST *partners,
2119  ColPartitionGrid *grid) {
2120  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2121  bounding_box_.bottom());
2122  if (debug) {
2123  tprintf("Refining %d %s partners by merge for:\n", partners->length(),
2124  upper ? "Upper" : "Lower");
2125  Print();
2126  }
2127  while (!partners->empty() && !partners->singleton()) {
2128  // Absorb will mess up the iterators, so we have to merge one partition
2129  // at a time and rebuild the iterators each time.
2130  ColPartition_C_IT it(partners);
2131  ColPartition *part = it.data();
2132  // Gather a list of merge candidates, from the list of partners, that
2133  // are all in the same single column. See general scenario comment above.
2134  ColPartition_CLIST candidates;
2135  ColPartition_C_IT cand_it(&candidates);
2136  for (it.forward(); !it.at_first(); it.forward()) {
2137  ColPartition *candidate = it.data();
2138  if (part->first_column_ == candidate->last_column_ &&
2139  part->last_column_ == candidate->first_column_) {
2140  cand_it.add_after_then_move(it.data());
2141  }
2142  }
2143  int overlap_increase;
2144  ColPartition *candidate = grid->BestMergeCandidate(
2145  part, &candidates, debug, nullptr, &overlap_increase);
2146  if (candidate != nullptr && (overlap_increase <= 0 || desperate)) {
2147  if (debug) {
2148  tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2149  part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2150  overlap_increase);
2151  }
2152  // Remove before merge and re-insert to keep the integrity of the grid.
2153  grid->RemoveBBox(candidate);
2154  grid->RemoveBBox(part);
2155  part->Absorb(candidate, nullptr);
2156  // We modified the box of part, so re-insert it into the grid.
2157  grid->InsertBBox(true, true, part);
2158  if (overlap_increase > 0) {
2159  part->desperately_merged_ = true;
2160  }
2161  } else {
2162  break; // Can't merge.
2163  }
2164  }
2165 }
2166 
2167 // Cleans up the partners above if upper is true, else below.
2168 // Keep the partner with the biggest overlap.
2169 void ColPartition::RefinePartnersByOverlap(bool upper,
2170  ColPartition_CLIST *partners) {
2171  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2172  bounding_box_.bottom());
2173  if (debug) {
2174  tprintf("Refining %d %s partners by overlap for:\n", partners->length(),
2175  upper ? "Upper" : "Lower");
2176  Print();
2177  }
2178  ColPartition_C_IT it(partners);
2179  ColPartition *best_partner = it.data();
2180  // Find the partner with the best overlap.
2181  int best_overlap = 0;
2182  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2183  ColPartition *partner = it.data();
2184  int overlap =
2185  std::min(bounding_box_.right(), partner->bounding_box_.right()) -
2186  std::max(bounding_box_.left(), partner->bounding_box_.left());
2187  if (overlap > best_overlap) {
2188  best_overlap = overlap;
2189  best_partner = partner;
2190  }
2191  }
2192  // Keep only the best partner.
2193  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2194  ColPartition *partner = it.data();
2195  if (partner != best_partner) {
2196  if (debug) {
2197  tprintf("Removing partner:");
2198  partner->Print();
2199  }
2200  partner->RemovePartner(!upper, this);
2201  it.extract();
2202  }
2203  }
2204 }
2205 
2206 // Return true if bbox belongs better in this than other.
2207 bool ColPartition::ThisPartitionBetter(BLOBNBOX *bbox,
2208  const ColPartition &other) {
2209  const TBOX &box = bbox->bounding_box();
2210  // Margins take priority.
2211  int left = box.left();
2212  int right = box.right();
2213  if (left < left_margin_ || right > right_margin_) {
2214  return false;
2215  }
2216  if (left < other.left_margin_ || right > other.right_margin_) {
2217  return true;
2218  }
2219  int top = box.top();
2220  int bottom = box.bottom();
2221  int this_overlap =
2222  std::min(top, median_top_) - std::max(bottom, median_bottom_);
2223  int other_overlap =
2224  std::min(top, other.median_top_) - std::max(bottom, other.median_bottom_);
2225  int this_miss = median_top_ - median_bottom_ - this_overlap;
2226  int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2227  if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
2228  tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2229  box.left(), box.bottom(), box.right(), box.top(), this_overlap,
2230  other_overlap, this_miss, other_miss, median_top_,
2231  other.median_top_);
2232  }
2233  if (this_miss < other_miss) {
2234  return true;
2235  }
2236  if (this_miss > other_miss) {
2237  return false;
2238  }
2239  if (this_overlap > other_overlap) {
2240  return true;
2241  }
2242  if (this_overlap < other_overlap) {
2243  return false;
2244  }
2245  return median_top_ >= other.median_top_;
2246 }
2247 
2248 // Returns the median line-spacing between the current position and the end
2249 // of the list.
2250 // The iterator is passed by value so the iteration does not modify the
2251 // caller's iterator.
2252 static int MedianSpacing(int page_height, ColPartition_IT it) {
2253  STATS stats(0, page_height);
2254  while (!it.cycled_list()) {
2255  ColPartition *part = it.data();
2256  it.forward();
2257  stats.add(part->bottom_spacing(), 1);
2258  stats.add(part->top_spacing(), 1);
2259  }
2260  return static_cast<int>(stats.median() + 0.5);
2261 }
2262 
2263 // Returns true if this column partition is in the same column as
2264 // part. This function will only work after the SetPartitionType function
2265 // has been called on both column partitions. This is useful for
2266 // doing a SideSearch when you want things in the same page column.
2267 //
2268 // Currently called by the table detection code to identify if potential table
2269 // partitions exist in the same column.
2271  // Overlap does not occur when last < part.first or first > part.last.
2272  // In other words, one is completely to the side of the other.
2273  // This is just DeMorgan's law applied to that so the function returns true.
2274  return (last_column_ >= part.first_column_) &&
2275  (first_column_ <= part.last_column_);
2276 }
2277 
2278 // Smoothes the spacings in the list into groups of equal linespacing.
2279 // resolution is the resolution of the original image, used as a basis
2280 // for thresholds in change of spacing. page_height is in pixels.
2281 void ColPartition::SmoothSpacings(int resolution, int page_height,
2282  ColPartition_LIST *parts) {
2283  // The task would be trivial if we didn't have to allow for blips -
2284  // occasional offsets in spacing caused by anomalous text, such as all
2285  // caps, groups of descenders, joined words, Arabic etc.
2286  // The neighbourhood stores a consecutive group of partitions so that
2287  // blips can be detected correctly, yet conservatively enough to not
2288  // mistake genuine spacing changes for blips. See example below.
2289  ColPartition *neighbourhood[PN_COUNT];
2290  ColPartition_IT it(parts);
2291  it.mark_cycle_pt();
2292  // Although we know nothing about the spacings is this list, the median is
2293  // used as an approximation to allow blips.
2294  // If parts of this block aren't spaced to the median, then we can't
2295  // accept blips in those parts, but we'll recalculate it each time we
2296  // split the block, so the median becomes more likely to match all the text.
2297  int median_space = MedianSpacing(page_height, it);
2298  ColPartition_IT start_it(it);
2299  ColPartition_IT end_it(it);
2300  for (int i = 0; i < PN_COUNT; ++i) {
2301  if (i < PN_UPPER || it.cycled_list()) {
2302  neighbourhood[i] = nullptr;
2303  } else {
2304  if (i == PN_LOWER) {
2305  end_it = it;
2306  }
2307  neighbourhood[i] = it.data();
2308  it.forward();
2309  }
2310  }
2311  while (neighbourhood[PN_UPPER] != nullptr) {
2312  // Test for end of a group. Normally SpacingsEqual is true within a group,
2313  // but in the case of a blip, it will be false. Here is an example:
2314  // Line enum Spacing below (spacing between tops of lines)
2315  // 1 ABOVE2 20
2316  // 2 ABOVE1 20
2317  // 3 UPPER 15
2318  // 4 LOWER 25
2319  // 5 BELOW1 20
2320  // 6 BELOW2 20
2321  // Line 4 is all in caps (regular caps), so the spacing between line 3
2322  // and line 4 (looking at the tops) is smaller than normal, and the
2323  // spacing between line 4 and line 5 is larger than normal, but the
2324  // two of them add to twice the normal spacing.
2325  // The following if has to accept unequal spacings 3 times to pass the
2326  // blip (20/15, 15/25 and 25/20)
2327  // When the blip is in the middle, OKSpacingBlip tests that one of
2328  // ABOVE1 and BELOW1 matches the median.
2329  // The first time, everything is shifted down 1, so we present
2330  // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
2331  // The last time, everything is shifted up 1, so we present OKSpacingBlip
2332  // with neighbourhood-1 and check that PN_LOWER matches the median.
2333  if (neighbourhood[PN_LOWER] == nullptr ||
2334  (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2335  resolution) &&
2336  (neighbourhood[PN_UPPER] == nullptr ||
2337  neighbourhood[PN_LOWER] == nullptr ||
2338  !OKSpacingBlip(resolution, median_space, neighbourhood, 0)) &&
2339  (neighbourhood[PN_UPPER - 1] == nullptr ||
2340  neighbourhood[PN_LOWER - 1] == nullptr ||
2341  !OKSpacingBlip(resolution, median_space, neighbourhood, -1) ||
2342  !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2343  (neighbourhood[PN_UPPER + 1] == nullptr ||
2344  neighbourhood[PN_LOWER + 1] == nullptr ||
2345  !OKSpacingBlip(resolution, median_space, neighbourhood, 1) ||
2346  !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2347  // The group has ended. PN_UPPER is the last member.
2348  // Compute the mean spacing over the group.
2349  ColPartition_IT sum_it(start_it);
2350  ColPartition *last_part = neighbourhood[PN_UPPER];
2351  double total_bottom = 0.0;
2352  double total_top = 0.0;
2353  int total_count = 0;
2354  ColPartition *upper = sum_it.data();
2355  // We do not process last_part, as its spacing is different.
2356  while (upper != last_part) {
2357  total_bottom += upper->bottom_spacing();
2358  total_top += upper->top_spacing();
2359  ++total_count;
2360  sum_it.forward();
2361  upper = sum_it.data();
2362  }
2363  if (total_count > 0) {
2364  // There were at least 2 lines, so set them all to the mean.
2365  int top_spacing = static_cast<int>(total_top / total_count + 0.5);
2366  int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
2367  if (textord_debug_tabfind) {
2368  tprintf("Spacing run ended. Cause:");
2369  if (neighbourhood[PN_LOWER] == nullptr) {
2370  tprintf("No more lines\n");
2371  } else {
2372  tprintf("Spacing change. Spacings:\n");
2373  for (int i = 0; i < PN_COUNT; ++i) {
2374  if (neighbourhood[i] == nullptr) {
2375  tprintf("NULL");
2376  if (i > 0 && neighbourhood[i - 1] != nullptr) {
2377  if (neighbourhood[i - 1]->SingletonPartner(false) !=
2378  nullptr) {
2379  tprintf(" Lower partner:");
2380  neighbourhood[i - 1]->SingletonPartner(false)->Print();
2381  } else {
2382  tprintf(" nullptr lower partner:\n");
2383  }
2384  } else {
2385  tprintf("\n");
2386  }
2387  } else {
2388  tprintf("Top = %d, bottom = %d\n",
2389  neighbourhood[i]->top_spacing(),
2390  neighbourhood[i]->bottom_spacing());
2391  }
2392  }
2393  }
2394  tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
2395  }
2396  sum_it = start_it;
2397  upper = sum_it.data();
2398  while (upper != last_part) {
2399  upper->set_top_spacing(top_spacing);
2400  upper->set_bottom_spacing(bottom_spacing);
2401  if (textord_debug_tabfind) {
2402  tprintf("Setting mean on:");
2403  upper->Print();
2404  }
2405  sum_it.forward();
2406  upper = sum_it.data();
2407  }
2408  }
2409  // PN_LOWER starts the next group and end_it is the next start_it.
2410  start_it = end_it;
2411  // Recalculate the median spacing to maximize the chances of detecting
2412  // spacing blips.
2413  median_space = MedianSpacing(page_height, end_it);
2414  }
2415  // Shuffle pointers.
2416  for (int j = 1; j < PN_COUNT; ++j) {
2417  neighbourhood[j - 1] = neighbourhood[j];
2418  }
2419  if (it.cycled_list()) {
2420  neighbourhood[PN_COUNT - 1] = nullptr;
2421  } else {
2422  neighbourhood[PN_COUNT - 1] = it.data();
2423  it.forward();
2424  }
2425  end_it.forward();
2426  }
2427 }
2428 
2429 // Returns true if the parts array of pointers to partitions matches the
2430 // condition for a spacing blip. See SmoothSpacings for what this means
2431 // and how it is used.
2432 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
2433  ColPartition **parts, int offset) {
2434  // The blip is OK if upper and lower sum to an OK value and at least
2435  // one of above1 and below1 is equal to the median.
2436  parts += offset;
2437  return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER], median_spacing,
2438  resolution) &&
2439  ((parts[PN_ABOVE1] != nullptr &&
2440  parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2441  (parts[PN_BELOW1] != nullptr &&
2442  parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
2443 }
2444 
2445 // Returns true if both the top and bottom spacings of this match the given
2446 // spacing to within suitable margins dictated by the image resolution.
2447 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
2448  int bottom_error = BottomSpacingMargin(resolution);
2449  int top_error = TopSpacingMargin(resolution);
2450  return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
2451  NearlyEqual(top_spacing_, spacing, top_error);
2452 }
2453 
2454 // Returns true if both the top and bottom spacings of this and other
2455 // match to within suitable margins dictated by the image resolution.
2456 bool ColPartition::SpacingsEqual(const ColPartition &other,
2457  int resolution) const {
2458  int bottom_error = std::max(BottomSpacingMargin(resolution),
2459  other.BottomSpacingMargin(resolution));
2460  int top_error = std::max(TopSpacingMargin(resolution),
2461  other.TopSpacingMargin(resolution));
2462  return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2463  (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2464  NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
2465  bottom_error));
2466 }
2467 
2468 // Returns true if the sum spacing of this and other match the given
2469 // spacing (or twice the given spacing) to within a suitable margin dictated
2470 // by the image resolution.
2471 bool ColPartition::SummedSpacingOK(const ColPartition &other, int spacing,
2472  int resolution) const {
2473  int bottom_error = std::max(BottomSpacingMargin(resolution),
2474  other.BottomSpacingMargin(resolution));
2475  int top_error = std::max(TopSpacingMargin(resolution),
2476  other.TopSpacingMargin(resolution));
2477  int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2478  int top_total = top_spacing_ + other.top_spacing_;
2479  return (NearlyEqual(spacing, bottom_total, bottom_error) &&
2480  NearlyEqual(spacing, top_total, top_error)) ||
2481  (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
2482  NearlyEqual(spacing * 2, top_total, top_error));
2483 }
2484 
2485 // Returns a suitable spacing margin that can be applied to bottoms of
2486 // text lines, based on the resolution and the stored side_step_.
2487 int ColPartition::BottomSpacingMargin(int resolution) const {
2488  return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
2489 }
2490 
2491 // Returns a suitable spacing margin that can be applied to tops of
2492 // text lines, based on the resolution and the stored side_step_.
2493 int ColPartition::TopSpacingMargin(int resolution) const {
2494  return static_cast<int>(kMaxTopSpacingFraction * median_height_ + 0.5) +
2495  BottomSpacingMargin(resolution);
2496 }
2497 
2498 // Returns true if the median text sizes of this and other agree to within
2499 // a reasonable multiplicative factor.
2500 bool ColPartition::SizesSimilar(const ColPartition &other) const {
2501  return median_height_ <= other.median_height_ * kMaxSizeRatio &&
2502  other.median_height_ <= median_height_ * kMaxSizeRatio;
2503 }
2504 
2505 // Helper updates margin_left and margin_right, being the bounds of the left
2506 // margin of part of a block. Returns false and does not update the bounds if
2507 // this partition has a disjoint margin with the established margin.
2508 static bool UpdateLeftMargin(const ColPartition &part, int *margin_left,
2509  int *margin_right) {
2510  const TBOX &part_box = part.bounding_box();
2511  int top = part_box.top();
2512  int bottom = part_box.bottom();
2513  int tl_key = part.SortKey(part.left_margin(), top);
2514  int tr_key = part.SortKey(part_box.left(), top);
2515  int bl_key = part.SortKey(part.left_margin(), bottom);
2516  int br_key = part.SortKey(part_box.left(), bottom);
2517  int left_key = std::max(tl_key, bl_key);
2518  int right_key = std::min(tr_key, br_key);
2519  if (left_key <= *margin_right && right_key >= *margin_left) {
2520  // This part is good - let's keep it.
2521  *margin_right = std::min(*margin_right, right_key);
2522  *margin_left = std::max(*margin_left, left_key);
2523  return true;
2524  }
2525  return false;
2526 }
2527 
2528 // Computes and returns in start, end a line segment formed from a
2529 // forwards-iterated group of left edges of partitions that satisfy the
2530 // condition that the intersection of the left margins is non-empty, ie the
2531 // rightmost left margin is to the left of the leftmost left bounding box edge.
2532 // On return the iterator is set to the start of the next run.
2533 void ColPartition::LeftEdgeRun(ColPartition_IT *part_it, ICOORD *start,
2534  ICOORD *end) {
2535  ColPartition *part = part_it->data();
2536  ColPartition *start_part = part;
2537  int start_y = part->bounding_box_.top();
2538  if (!part_it->at_first()) {
2539  int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
2540  if (prev_bottom < start_y) {
2541  start_y = prev_bottom;
2542  } else if (prev_bottom > start_y) {
2543  start_y = (start_y + prev_bottom) / 2;
2544  }
2545  }
2546  int end_y = part->bounding_box_.bottom();
2547  int margin_right = INT32_MAX;
2548  int margin_left = -INT32_MAX;
2549  UpdateLeftMargin(*part, &margin_left, &margin_right);
2550  do {
2551  part_it->forward();
2552  part = part_it->data();
2553  } while (!part_it->at_first() &&
2554  UpdateLeftMargin(*part, &margin_left, &margin_right));
2555  // The run ended. If we were pushed inwards, compute the next run and
2556  // extend it backwards into the run we just calculated to find the end of
2557  // this run that provides a tight box.
2558  int next_margin_right = INT32_MAX;
2559  int next_margin_left = -INT32_MAX;
2560  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2561  if (next_margin_left > margin_right) {
2562  ColPartition_IT next_it(*part_it);
2563  do {
2564  next_it.forward();
2565  part = next_it.data();
2566  } while (!next_it.at_first() &&
2567  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2568  // Now extend the next run backwards into the original run to get the
2569  // tightest fit.
2570  do {
2571  part_it->backward();
2572  part = part_it->data();
2573  } while (part != start_part &&
2574  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2575  part_it->forward();
2576  }
2577  // Now calculate the end_y.
2578  part = part_it->data_relative(-1);
2579  end_y = part->bounding_box_.bottom();
2580  if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y) {
2581  end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2582  }
2583  start->set_y(start_y);
2584  start->set_x(part->XAtY(margin_right, start_y));
2585  end->set_y(end_y);
2586  end->set_x(part->XAtY(margin_right, end_y));
2587  if (textord_debug_tabfind && !part_it->at_first()) {
2588  tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2589  start_y, end_y, part->XAtY(margin_left, end_y), end->x(),
2590  part->left_margin_, part->bounding_box_.left());
2591  }
2592 }
2593 
2594 // Helper updates margin_left and margin_right, being the bounds of the right
2595 // margin of part of a block. Returns false and does not update the bounds if
2596 // this partition has a disjoint margin with the established margin.
2597 static bool UpdateRightMargin(const ColPartition &part, int *margin_left,
2598  int *margin_right) {
2599  const TBOX &part_box = part.bounding_box();
2600  int top = part_box.top();
2601  int bottom = part_box.bottom();
2602  int tl_key = part.SortKey(part_box.right(), top);
2603  int tr_key = part.SortKey(part.right_margin(), top);
2604  int bl_key = part.SortKey(part_box.right(), bottom);
2605  int br_key = part.SortKey(part.right_margin(), bottom);
2606  int left_key = std::max(tl_key, bl_key);
2607  int right_key = std::min(tr_key, br_key);
2608  if (left_key <= *margin_right && right_key >= *margin_left) {
2609  // This part is good - let's keep it.
2610  *margin_right = std::min(*margin_right, right_key);
2611  *margin_left = std::max(*margin_left, left_key);
2612  return true;
2613  }
2614  return false;
2615 }
2616 
2617 // Computes and returns in start, end a line segment formed from a
2618 // backwards-iterated group of right edges of partitions that satisfy the
2619 // condition that the intersection of the right margins is non-empty, ie the
2620 // leftmost right margin is to the right of the rightmost right bounding box
2621 // edge.
2622 // On return the iterator is set to the start of the next run.
2623 void ColPartition::RightEdgeRun(ColPartition_IT *part_it, ICOORD *start,
2624  ICOORD *end) {
2625  ColPartition *part = part_it->data();
2626  ColPartition *start_part = part;
2627  int start_y = part->bounding_box_.bottom();
2628  if (!part_it->at_last()) {
2629  int next_y = part_it->data_relative(1)->bounding_box_.top();
2630  if (next_y > start_y) {
2631  start_y = next_y;
2632  } else if (next_y < start_y) {
2633  start_y = (start_y + next_y) / 2;
2634  }
2635  }
2636  int end_y = part->bounding_box_.top();
2637  int margin_right = INT32_MAX;
2638  int margin_left = -INT32_MAX;
2639  UpdateRightMargin(*part, &margin_left, &margin_right);
2640  do {
2641  part_it->backward();
2642  part = part_it->data();
2643  } while (!part_it->at_last() &&
2644  UpdateRightMargin(*part, &margin_left, &margin_right));
2645  // The run ended. If we were pushed inwards, compute the next run and
2646  // extend it backwards to find the end of this run for a tight box.
2647  int next_margin_right = INT32_MAX;
2648  int next_margin_left = -INT32_MAX;
2649  UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2650  if (next_margin_right < margin_left) {
2651  ColPartition_IT next_it(*part_it);
2652  do {
2653  next_it.backward();
2654  part = next_it.data();
2655  } while (!next_it.at_last() &&
2656  UpdateRightMargin(*part, &next_margin_left, &next_margin_right));
2657  // Now extend the next run forwards into the original run to get the
2658  // tightest fit.
2659  do {
2660  part_it->forward();
2661  part = part_it->data();
2662  } while (part != start_part &&
2663  UpdateRightMargin(*part, &next_margin_left, &next_margin_right));
2664  part_it->backward();
2665  }
2666  // Now calculate the end_y.
2667  part = part_it->data_relative(1);
2668  end_y = part->bounding_box().top();
2669  if (!part_it->at_last() && part_it->data()->bounding_box_.bottom() > end_y) {
2670  end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2671  }
2672  start->set_y(start_y);
2673  start->set_x(part->XAtY(margin_left, start_y));
2674  end->set_y(end_y);
2675  end->set_x(part->XAtY(margin_left, end_y));
2676  if (textord_debug_tabfind && !part_it->at_last()) {
2677  tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2678  start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
2679  part->bounding_box_.right(), part->right_margin_);
2680  }
2681 }
2682 
2683 } // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ TBOX
const double kMaxLeaderGapFractionOfMin
const int kColumnWidthFactor
Definition: tabfind.h:41
BlobRegionType
Definition: blobbox.h:74
@ BRT_TEXT
Definition: blobbox.h:82
@ BRT_COUNT
Definition: blobbox.h:84
@ BRT_HLINE
Definition: blobbox.h:76
@ BRT_NOISE
Definition: blobbox.h:75
@ BRT_VLINE
Definition: blobbox.h:77
@ BRT_POLYIMAGE
Definition: blobbox.h:79
@ BRT_VERT_TEXT
Definition: blobbox.h:81
@ BRT_UNKNOWN
Definition: blobbox.h:80
@ BRT_RECTIMAGE
Definition: blobbox.h:78
const int kMinChainTextValue
const double kMaxSizeRatio
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:51
const int kHorzStrongTextlineCount
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kMaxColorDistance
std::function< bool(int)> WidthCallback
Definition: tabfind.h:35
const int kHorzStrongTextlineHeight
const double kMinBaselineCoverage
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:125
const double kMaxBaselineError
BlobSpecialTextType
Definition: blobbox.h:92
@ BSTT_COUNT
Definition: blobbox.h:99
const int kHorzStrongTextlineAspect
int textord_debug_tabfind
Definition: alignedblob.cpp:29
const int kMinLeaderCount
BlobTextFlowType
Definition: blobbox.h:110
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:115
@ BTFT_NONE
Definition: blobbox.h:111
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:117
@ BTFT_NEIGHBOURS
Definition: blobbox.h:113
@ BTFT_NONTEXT
Definition: blobbox.h:112
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:122
const double kMaxSameBlockLineSpacing
const double kMaxTopSpacingFraction
int textord_debug_bugs
Definition: alignedblob.cpp:30
const int kMaxRMSColorNoise
const int kMinStrongTextValue
const double kMaxSpacingDrift
@ PT_VERTICAL_TEXT
Definition: publictypes.h:61
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:65
@ PT_HEADING_IMAGE
Definition: publictypes.h:64
@ PT_HORZ_LINE
Definition: publictypes.h:66
@ PT_FLOWING_IMAGE
Definition: publictypes.h:63
@ PT_VERT_LINE
Definition: publictypes.h:67
@ PT_PULLOUT_TEXT
Definition: publictypes.h:57
@ PT_HEADING_TEXT
Definition: publictypes.h:56
@ PT_FLOWING_TEXT
Definition: publictypes.h:55
const double kMaxLeaderGapFractionOfMax
int base_char_bottom() const
Definition: blobbox.h:401
int base_char_top() const
Definition: blobbox.h:398
int GoodTextBlob() const
Definition: blobbox.cpp:226
bool IsDiacritic() const
Definition: blobbox.h:395
BlobRegionType region_type() const
Definition: blobbox.h:298
int NoisyNeighbours() const
Definition: blobbox.cpp:238
const TBOX & bounding_box() const
Definition: blobbox.h:239
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:313
C_BLOB * remove_cblob()
Definition: blobbox.h:280
tesseract::ColPartition * owner() const
Definition: blobbox.h:367
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:370
BlobTextFlowType flow() const
Definition: blobbox.h:310
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:304
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:301
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:442
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:734
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:50
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:73
int total_cost() const
Definition: dppoint.h:72
static DPPoint * Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size, DPPoint *points)
Definition: dppoint.cpp:31
int64_t CostWithVariance(const DPPoint *prev)
Definition: dppoint.cpp:70
integer coordinate
Definition: points.h:36
void set_x(TDimension xin)
rewrite function
Definition: points.h:67
TDimension y() const
access_function
Definition: points.h:62
void set_y(TDimension yin)
rewrite function
Definition: points.h:71
TDimension x() const
access function
Definition: points.h:58
static ScrollView::Color ColorForPolyBlockType(PolyBlockType type)
Returns a color to draw the given type.
Definition: polyblk.cpp:389
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
void set_right(int x)
Definition: rect.h:92
void set_left(int x)
Definition: rect.h:85
TDimension top() const
Definition: rect.h:68
void set_bottom(int y)
Definition: rect.h:78
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
bool overlap(const TBOX &box) const
Definition: rect.h:363
TDimension bottom() const
Definition: rect.h:75
int32_t area() const
Definition: rect.h:134
void set_top(int y)
Definition: rect.h:71
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:242
int32_t mode() const
Definition: statistc.cpp:113
double ile(double frac) const
Definition: statistc.cpp:173
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
static bool WithinTestRegion(int detail_level, int x, int y)
bool IsImageType() const
Definition: colpartition.h:429
void SetSpecialBlobsDensity(const BlobSpecialTextType type, const float density)
BlobTextFlowType flow() const
Definition: colpartition.h:153
static void LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts, BLOCK_LIST *completed_blocks, TO_BLOCK_LIST *to_blocks)
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
float SpecialBlobsDensity(const BlobSpecialTextType type) const
static ColPartition * FakePartition(const TBOX &box, PolyBlockType block_type, BlobRegionType blob_type, BlobTextFlowType flow)
static TO_BLOCK * MakeBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
TBOX BoundsWithoutBox(BLOBNBOX *box)
int SpecialBlobsCount(const BlobSpecialTextType type)
PolyBlockType type() const
Definition: colpartition.h:180
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
void set_side_step(int step)
Definition: colpartition.h:216
ColPartition * CopyButDontOwnBlobs()
ColPartition * SplitAt(int split_x)
void AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, WorkingPartSet_LIST *working_set)
void SetColumnGoodness(const WidthCallback &cb)
void RefinePartners(PolyBlockType type, bool get_desperate, ColPartitionGrid *grid)
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:186
void SetRightTab(const TabVector *tab_vector)
static ColPartition * MakeLinePartition(BlobRegionType blob_type, const ICOORD &vertical, int left, int bottom, int right, int top)
int bottom_spacing() const
Definition: colpartition.h:219
void SetRegionAndFlowTypesFromProjectionValue(int value)
bool IsPulloutType() const
Definition: colpartition.h:437
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:390
BlobRegionType blob_type() const
Definition: colpartition.h:147
void ColumnRange(int resolution, ColPartitionSet *columns, int *first_col, int *last_col)
void AddBox(BLOBNBOX *box)
void SmoothPartnerRun(int working_set_count)
BLOBNBOX * OverlapSplitBlob(const TBOX &box)
int VCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:375
int XAtY(int sort_key, int y) const
Definition: colpartition.h:320
static TO_BLOCK * MakeVerticalTextBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
bool IsVerticalType() const
Definition: colpartition.h:441
void SetLeftTab(const TabVector *tab_vector)
ScrollView::Color BoxColor() const
int CountOverlappingBoxes(const TBOX &box)
bool MatchingTextColor(const ColPartition &other) const
bool OKDiacriticMerge(const ColPartition &candidate, bool debug) const
bool OKMergeOverlap(const ColPartition &merge1, const ColPartition &merge2, int ok_box_overlap, bool debug)
void set_owns_blobs(bool owns_blobs)
Definition: colpartition.h:294
bool MatchingSizes(const ColPartition &other) const
int LeftAtY(int y) const
Definition: colpartition.h:340
void RemovePartner(bool upper, ColPartition *partner)
static bool TypesSimilar(PolyBlockType type1, PolyBlockType type2)
Definition: colpartition.h:418
ColPartition * ShallowCopy() const
PolyBlockType PartitionType(ColumnSpanningType flow) const
bool IsInSameColumnAs(const ColPartition &part) const
void CopyRightTab(const ColPartition &src, bool take_box)
ColPartition * SplitAtBlob(BLOBNBOX *split_blob)
ColPartition * SingletonPartner(bool upper)
bool ConfirmNoTabViolation(const ColPartition &other) const
void set_bottom_spacing(int spacing)
Definition: colpartition.h:222
const TBOX & bounding_box() const
Definition: colpartition.h:108
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_top_spacing(int spacing)
Definition: colpartition.h:228
int RightAtY(int y) const
Definition: colpartition.h:344
void Absorb(ColPartition *other, const WidthCallback &cb)
static int SortByBBox(const void *p1, const void *p2)
Definition: colpartition.h:712
bool MatchingColumns(const ColPartition &other) const
void RemoveBox(BLOBNBOX *box)
void AddPartner(bool upper, ColPartition *partner)
void CopyLeftTab(const ColPartition &src, bool take_box)
ColumnSpanningType SpanningType(int resolution, int left, int right, int height, int y, int left_margin, int right_margin, int *first_col, int *last_col, int *first_spanned_col)
static double ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2, const uint8_t *point)
Definition: imagefind.cpp:372
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
int sort_key() const
Definition: tabvector.h:150
void AddPartition(ColPartition *part)
void ExtractCompletedBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
void InsertCompletedBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)