tesseract  5.0.0
blobbox.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: blobbox.h (Formerly blobnbox.h)
3  * Description: Code for the textord blob class.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef BLOBBOX_H
20 #define BLOBBOX_H
21 
22 #include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
23 #include "elst2.h" // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
24 #include "errcode.h" // for ASSERT_HOST
25 #include "ocrblock.h" // for BLOCK
26 #include "params.h" // for DoubleParam, double_VAR_H
27 #include "pdblock.h" // for PDBLK
28 #include "points.h" // for FCOORD, ICOORD, ICOORDELT_LIST
29 #include "quspline.h" // for QSPLINE
30 #include "rect.h" // for TBOX
31 #include "scrollview.h" // for ScrollView, ScrollView::Color
32 #include "statistc.h" // for STATS
33 #include "stepblob.h" // for C_BLOB
34 #include "tprintf.h" // for tprintf
35 #include "werd.h" // for WERD_LIST
36 
37 #include <cinttypes> // for PRId32
38 #include <cmath> // for std::sqrt
39 #include <cstdint> // for int16_t, int32_t
40 
41 struct Pix;
42 
43 namespace tesseract {
44 
45 class C_OUTLINE;
46 
// How strongly a fixed-pitch / proportional-pitch decision is held.
enum PITCH_TYPE {
  PITCH_DUNNO,       // insufficient data
  PITCH_DEF_FIXED,   // definitely fixed
  PITCH_MAYBE_FIXED, // could be
  PITCH_DEF_PROP,
  PITCH_MAYBE_PROP,
  PITCH_CORR_FIXED,
  PITCH_CORR_PROP
};
56 
// The possible tab-stop types of each side of a BLOBNBOX.
// The ordering is important, as it is used for deleting dead-ends in the
// search. ALIGNED, CONFIRMED and VLINE should remain greater than the
// non-aligned, unset, or deleted members.
enum TabType {
  TT_NONE,          // Not a tab.
  TT_DELETED,       // Not a tab after detailed analysis.
  TT_MAYBE_RAGGED,  // Initial designation of a tab-stop candidate.
  TT_MAYBE_ALIGNED, // Initial designation of a tab-stop candidate.
  TT_CONFIRMED,     // Aligned with neighbours.
  TT_VLINE          // Detected as a vertical line.
};
69 
// The possible region types of a BLOBNBOX.
// Note: keep all the text types > BRT_UNKNOWN and all the image types less.
// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
// *Type static functions below.
enum BlobRegionType {
  BRT_NOISE,     // Neither text nor image.
  BRT_HLINE,     // Horizontal separator line.
  BRT_VLINE,     // Vertical separator line.
  BRT_RECTIMAGE, // Rectangular image.
  BRT_POLYIMAGE, // Non-rectangular image.
  BRT_UNKNOWN,   // Not determined yet.
  BRT_VERT_TEXT, // Vertical alignment, not necessarily vertically oriented.
  BRT_TEXT,      // Convincing text.

  BRT_COUNT // Number of possibilities.
};
86 
// enum for elements of arrays that refer to neighbours.
// NOTE: keep in this order, so ^2 can be used to flip direction.
// NOTE(review): the enumerator order was restored so that each direction and
// its opposite differ by 2 — confirm against the upstream header.
enum BlobNeighbourDir { BND_LEFT, BND_BELOW, BND_RIGHT, BND_ABOVE, BND_COUNT };
90 
// enum for special type of text characters, such as math symbol or italic.
enum BlobSpecialTextType {
  BSTT_NONE,    // No special.
  BSTT_ITALIC,  // Italic style.
  BSTT_DIGIT,   // Digit symbols.
  BSTT_MATH,    // Mathematical symbols (not including digit).
  BSTT_UNCLEAR, // Characters with low recognition rate.
  BSTT_SKIP,    // Characters that we skip labeling (usually too small).
  BSTT_COUNT
};
101 
103  return static_cast<BlobNeighbourDir>(dir ^ 2);
104 }
105 
// BlobTextFlowType indicates the quality of neighbouring information
// related to a chain of connected components, either horizontally or
// vertically. Also used by ColPartition for the collection of blobs
// within, which should all have the same value in most cases.
enum BlobTextFlowType {
  BTFT_NONE,          // No text flow set yet.
  BTFT_NONTEXT,       // Flow too poor to be likely text.
  BTFT_NEIGHBOURS,    // Neighbours support flow in this direction.
  BTFT_CHAIN,         // There is a weak chain of text in this direction.
  BTFT_STRONG_CHAIN,  // There is a strong chain of text in this direction.
  BTFT_TEXT_ON_IMAGE, // There is a strong chain of text on an image.
  BTFT_LEADER,        // Leader dots/dashes etc.
  BTFT_COUNT
};
120 
121 // Returns true if type1 dominates type2 in a merge. Mostly determined by the
122 // ordering of the enum, LEADER is weak and dominates nothing.
123 // The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
124 // this cannot be true if t1 == t2, so the result is undefined.
126  // LEADER always loses.
127  if (type1 == BTFT_LEADER) {
128  return false;
129  }
130  if (type2 == BTFT_LEADER) {
131  return true;
132  }
133  // With those out of the way, the ordering of the enum determines the result.
134  return type1 >= type2;
135 }
136 
137 class ColPartition;
138 
139 class BLOBNBOX;
140 ELISTIZEH(BLOBNBOX)
141 class BLOBNBOX : public ELIST_LINK {
142 public:
144  ReInit();
145  }
146  explicit BLOBNBOX(C_BLOB *srcblob) {
147  box = srcblob->bounding_box();
148  ReInit();
149  cblob_ptr = srcblob;
150  area = static_cast<int>(srcblob->area());
151  }
153  if (owns_cblob_) {
154  delete cblob_ptr;
155  }
156  }
157 
158  static void clear_blobnboxes(BLOBNBOX_LIST *boxes) {
159  BLOBNBOX_IT it = boxes;
160  // A BLOBNBOX generally doesn't own its blobs, so if they do, you
161  // have to delete them explicitly.
162  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
163  BLOBNBOX *box = it.data();
164  // TODO: remove next line, currently still needed for resultiterator_test.
165  delete box->remove_cblob();
166  }
167  }
168 
169  static BLOBNBOX *RealBlob(C_OUTLINE *outline) {
170  auto *blob = new C_BLOB(outline);
171  return new BLOBNBOX(blob);
172  }
173 
174  // Rotates the box and the underlying blob.
175  void rotate(FCOORD rotation);
176 
177  // Methods that act on the box without touching the underlying blob.
178  // Reflect the box in the y-axis, leaving the underlying blob untouched.
179  void reflect_box_in_y_axis();
180  // Rotates the box by the angle given by rotation.
181  // If the blob is a diacritic, then only small rotations for skew
182  // correction can be applied.
183  void rotate_box(FCOORD rotation);
184  // Moves just the box by the given vector.
186  if (IsDiacritic()) {
187  box.move(v);
188  base_char_top_ += v.y();
189  base_char_bottom_ += v.y();
190  } else {
191  box.move(v);
192  set_diacritic_box(box);
193  }
194  }
195  void merge(BLOBNBOX *nextblob);
196  void really_merge(BLOBNBOX *other);
197  void chop( // fake chop blob
198  BLOBNBOX_IT *start_it, // location of this
199  BLOBNBOX_IT *blob_it, // iterator
200  FCOORD rotation, // for landscape
201  float xheight); // line height
202 
203  void NeighbourGaps(int gaps[BND_COUNT]) const;
204  void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const;
205  void CleanNeighbours();
206  // Returns positive if there is at least one side neighbour that has a
207  // similar stroke width and is not on the other side of a rule line.
208  int GoodTextBlob() const;
209  // Returns the number of side neighbours that are of type BRT_NOISE.
210  int NoisyNeighbours() const;
211 
212  // Returns true if the blob is noise and has no owner.
213  bool DeletableNoise() const {
214  return owner() == nullptr && region_type() == BRT_NOISE;
215  }
216 
217  // Returns true, and sets vert_possible/horz_possible if the blob has some
218  // feature that makes it individually appear to flow one way.
219  // eg if it has a high aspect ratio, yet has a complex shape, such as a
220  // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
221  bool DefiniteIndividualFlow();
222 
223  // Returns true if there is no tabstop violation in merging this and other.
224  bool ConfirmNoTabViolation(const BLOBNBOX &other) const;
225 
226  // Returns true if other has a similar stroke width to this.
227  bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,
228  double constant_tolerance) const;
229 
230  // Returns a bounding box of the outline contained within the
231  // given horizontal range.
232  TBOX BoundsWithinLimits(int left, int right);
233 
234  // Estimates and stores the baseline position based on the shape of the
235  // outline.
236  void EstimateBaselinePosition();
237 
238  // Simple accessors.
239  const TBOX &bounding_box() const {
240  return box;
241  }
242  // Set the bounding box. Use with caution.
243  // Normally use compute_bounding_box instead.
244  void set_bounding_box(const TBOX &new_box) {
245  box = new_box;
246  base_char_top_ = box.top();
247  base_char_bottom_ = box.bottom();
248  }
250  box = cblob_ptr->bounding_box();
251  base_char_top_ = box.top();
252  base_char_bottom_ = box.bottom();
253  baseline_y_ = box.bottom();
254  }
255  const TBOX &reduced_box() const {
256  return red_box;
257  }
258  void set_reduced_box(TBOX new_box) {
259  red_box = new_box;
260  reduced = true;
261  }
262  int32_t enclosed_area() const {
263  return area;
264  }
265  bool joined_to_prev() const {
266  return joined;
267  }
268  bool red_box_set() const {
269  return reduced;
270  }
271  int repeated_set() const {
272  return repeated_set_;
273  }
274  void set_repeated_set(int set_id) {
275  repeated_set_ = set_id;
276  }
277  C_BLOB *cblob() const {
278  return cblob_ptr;
279  }
281  auto blob = cblob_ptr;
282  cblob_ptr = nullptr;
283  owns_cblob_ = false;
284  return blob;
285  }
287  return left_tab_type_;
288  }
289  void set_left_tab_type(TabType new_type) {
290  left_tab_type_ = new_type;
291  }
293  return right_tab_type_;
294  }
295  void set_right_tab_type(TabType new_type) {
296  right_tab_type_ = new_type;
297  }
299  return region_type_;
300  }
302  region_type_ = new_type;
303  }
305  return spt_type_;
306  }
308  spt_type_ = new_type;
309  }
311  return flow_;
312  }
314  flow_ = value;
315  }
316  bool vert_possible() const {
317  return vert_possible_;
318  }
319  void set_vert_possible(bool value) {
320  vert_possible_ = value;
321  }
322  bool horz_possible() const {
323  return horz_possible_;
324  }
325  void set_horz_possible(bool value) {
326  horz_possible_ = value;
327  }
328  int left_rule() const {
329  return left_rule_;
330  }
331  void set_left_rule(int new_left) {
332  left_rule_ = new_left;
333  }
334  int right_rule() const {
335  return right_rule_;
336  }
337  void set_right_rule(int new_right) {
338  right_rule_ = new_right;
339  }
340  int left_crossing_rule() const {
341  return left_crossing_rule_;
342  }
343  void set_left_crossing_rule(int new_left) {
344  left_crossing_rule_ = new_left;
345  }
346  int right_crossing_rule() const {
347  return right_crossing_rule_;
348  }
349  void set_right_crossing_rule(int new_right) {
350  right_crossing_rule_ = new_right;
351  }
352  float horz_stroke_width() const {
353  return horz_stroke_width_;
354  }
355  void set_horz_stroke_width(float width) {
356  horz_stroke_width_ = width;
357  }
358  float vert_stroke_width() const {
359  return vert_stroke_width_;
360  }
361  void set_vert_stroke_width(float width) {
362  vert_stroke_width_ = width;
363  }
364  float area_stroke_width() const {
365  return area_stroke_width_;
366  }
368  return owner_;
369  }
371  owner_ = new_owner;
372  }
373  bool leader_on_left() const {
374  return leader_on_left_;
375  }
376  void set_leader_on_left(bool flag) {
377  leader_on_left_ = flag;
378  }
379  bool leader_on_right() const {
380  return leader_on_right_;
381  }
382  void set_leader_on_right(bool flag) {
383  leader_on_right_ = flag;
384  }
386  return neighbours_[n];
387  }
389  return good_stroke_neighbours_[n];
390  }
391  void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good) {
392  neighbours_[n] = neighbour;
393  good_stroke_neighbours_[n] = good;
394  }
395  bool IsDiacritic() const {
396  return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
397  }
398  int base_char_top() const {
399  return base_char_top_;
400  }
401  int base_char_bottom() const {
402  return base_char_bottom_;
403  }
404  int baseline_position() const {
405  return baseline_y_;
406  }
407  int line_crossings() const {
408  return line_crossings_;
409  }
410  void set_line_crossings(int value) {
411  line_crossings_ = value;
412  }
413  void set_diacritic_box(const TBOX &diacritic_box) {
414  base_char_top_ = diacritic_box.top();
415  base_char_bottom_ = diacritic_box.bottom();
416  }
418  return base_char_blob_;
419  }
421  base_char_blob_ = blob;
422  }
423  void set_owns_cblob(bool value) {
424  owns_cblob_ = value;
425  }
426 
427  bool UniquelyVertical() const {
428  return vert_possible_ && !horz_possible_;
429  }
430  bool UniquelyHorizontal() const {
431  return horz_possible_ && !vert_possible_;
432  }
433 
434  // Returns true if the region type is text.
435  static bool IsTextType(BlobRegionType type) {
436  return type == BRT_TEXT || type == BRT_VERT_TEXT;
437  }
438  // Returns true if the region type is image.
439  static bool IsImageType(BlobRegionType type) {
440  return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
441  }
442  // Returns true if the region type is line.
443  static bool IsLineType(BlobRegionType type) {
444  return type == BRT_HLINE || type == BRT_VLINE;
445  }
446  // Returns true if the region type cannot be merged.
447  static bool UnMergeableType(BlobRegionType type) {
448  return IsLineType(type) || IsImageType(type);
449  }
450  // Helper to call CleanNeighbours on all blobs on the list.
451  static void CleanNeighbours(BLOBNBOX_LIST *blobs);
452  // Helper to delete all the deletable blobs on the list.
453  static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs);
454  // Helper to compute edge offsets for all the blobs on the list.
455  // See coutln.h for an explanation of edge offsets.
456  static void ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs);
457 
458 #ifndef GRAPHICS_DISABLED
459  // Helper to draw all the blobs on the list in the given body_colour,
460  // with child outlines in the child_colour.
461  static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
462  ScrollView::Color child_colour, ScrollView *win);
463  // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
464  // given list in the given body_colour, with child outlines in the
465  // child_colour.
466  static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
467  ScrollView::Color child_colour, ScrollView *win);
468 
469  static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type);
470 
471  // Keep in sync with BlobRegionType.
472  ScrollView::Color BoxColor() const;
473 
474  void plot(ScrollView *window, // window to draw in
475  ScrollView::Color blob_colour, // for outer bits
476  ScrollView::Color child_colour); // for holes
477 #endif
478 
479  // Initializes members set by StrokeWidth and beyond, without discarding
480  // stored area and strokewidth values, which are expensive to calculate.
481  void ReInit() {
482  joined = false;
483  reduced = false;
484  repeated_set_ = 0;
485  left_tab_type_ = TT_NONE;
486  right_tab_type_ = TT_NONE;
487  region_type_ = BRT_UNKNOWN;
488  flow_ = BTFT_NONE;
489  spt_type_ = BSTT_SKIP;
490  left_rule_ = 0;
491  right_rule_ = 0;
492  left_crossing_rule_ = 0;
493  right_crossing_rule_ = 0;
494  if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr && cblob()->perimeter() != 0) {
495  area_stroke_width_ = 2.0f * area / cblob()->perimeter();
496  }
497  owner_ = nullptr;
498  base_char_top_ = box.top();
499  base_char_bottom_ = box.bottom();
500  baseline_y_ = box.bottom();
501  line_crossings_ = 0;
502  base_char_blob_ = nullptr;
503  horz_possible_ = false;
504  vert_possible_ = false;
505  leader_on_left_ = false;
506  leader_on_right_ = false;
507  ClearNeighbours();
508  }
509 
511  for (int n = 0; n < BND_COUNT; ++n) {
512  neighbours_[n] = nullptr;
513  good_stroke_neighbours_[n] = false;
514  }
515  }
516 
517 private:
518  C_BLOB *cblob_ptr = nullptr; // edgestep blob
519  TBOX box; // bounding box
520  TBOX red_box; // bounding box
521  int32_t area = 0; // enclosed area
522  int32_t repeated_set_ = 0; // id of the set of repeated blobs
523  TabType left_tab_type_ = TT_NONE; // Indicates tab-stop assessment
524  TabType right_tab_type_ = TT_NONE; // Indicates tab-stop assessment
525  BlobRegionType region_type_ = BRT_UNKNOWN; // Type of region this blob belongs to
526  BlobTextFlowType flow_ = BTFT_NONE; // Quality of text flow.
527  BlobSpecialTextType spt_type_; // Special text type.
528  bool joined = false; // joined to prev
529  bool reduced = false; // reduced box set
530  int16_t left_rule_ = 0; // x-coord of nearest but not crossing rule line
531  int16_t right_rule_ = 0; // x-coord of nearest but not crossing rule line
532  int16_t left_crossing_rule_; // x-coord of nearest or crossing rule line
533  int16_t right_crossing_rule_; // x-coord of nearest or crossing rule line
534  int16_t base_char_top_; // y-coord of top/bottom of diacritic base,
535  int16_t base_char_bottom_; // if it exists else top/bottom of this blob.
536  int16_t baseline_y_; // Estimate of baseline position.
537  int32_t line_crossings_; // Number of line intersections touched.
538  BLOBNBOX *base_char_blob_; // The blob that was the base char.
539  tesseract::ColPartition *owner_; // Who will delete me when I am not needed
540  BLOBNBOX *neighbours_[BND_COUNT];
541  float horz_stroke_width_ = 0.0f; // Median horizontal stroke width
542  float vert_stroke_width_ = 0.0f; // Median vertical stroke width
543  float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.
544  bool good_stroke_neighbours_[BND_COUNT];
545  bool horz_possible_; // Could be part of horizontal flow.
546  bool vert_possible_; // Could be part of vertical flow.
547  bool leader_on_left_; // There is a leader to the left.
548  bool leader_on_right_; // There is a leader to the right.
549  // Iff true, then the destructor should delete the cblob_ptr.
550  // TODO(rays) migrate all uses to correctly setting this flag instead of
551  // deleting the C_BLOB before deleting the BLOBNBOX.
552  bool owns_cblob_ = false;
553 };
554 
555 class TO_ROW : public ELIST2_LINK {
556 public:
557  static const int kErrorWeight = 3;
558 
559  TO_ROW() {
560  clear();
561  } // empty
562  TO_ROW( // constructor
563  BLOBNBOX *blob, // from first blob
564  float top, // of row //target height
565  float bottom, float row_size);
566 
567  void print() const;
568  float max_y() const { // access function
569  return y_max;
570  }
571  float min_y() const {
572  return y_min;
573  }
574  float mean_y() const {
575  return (y_min + y_max) / 2.0f;
576  }
577  float initial_min_y() const {
578  return initial_y_min;
579  }
580  float line_m() const { // access to line fit
581  return m;
582  }
583  float line_c() const {
584  return c;
585  }
586  float line_error() const {
587  return error;
588  }
589  float parallel_c() const {
590  return para_c;
591  }
592  float parallel_error() const {
593  return para_error;
594  }
595  float believability() const { // baseline goodness
596  return credibility;
597  }
598  float intercept() const { // real parallel_c
599  return y_origin;
600  }
601  void add_blob( // put in row
602  BLOBNBOX *blob, // blob to add
603  float top, // of row //target height
604  float bottom, float row_size);
605  void insert_blob( // put in row in order
606  BLOBNBOX *blob);
607 
608  BLOBNBOX_LIST *blob_list() { // get list
609  return &blobs;
610  }
611 
612  void set_line( // set line spec
613  float new_m, // line to set
614  float new_c, float new_error) {
615  m = new_m;
616  c = new_c;
617  error = new_error;
618  }
619  void set_parallel_line( // set fixed gradient line
620  float gradient, // page gradient
621  float new_c, float new_error) {
622  para_c = new_c;
623  para_error = new_error;
624  credibility = blobs.length() - kErrorWeight * new_error;
625  y_origin = new_c / std::sqrt(1 + gradient * gradient);
626  // real intercept
627  }
628  void set_limits( // set min,max
629  float new_min, // bottom and
630  float new_max) { // top of row
631  y_min = new_min;
632  y_max = new_max;
633  }
635  // get projection
636 
637  bool rep_chars_marked() const {
638  return num_repeated_sets_ != -1;
639  }
641  num_repeated_sets_ = -1;
642  }
643  int num_repeated_sets() const {
644  return num_repeated_sets_;
645  }
646  void set_num_repeated_sets(int num_sets) {
647  num_repeated_sets_ = num_sets;
648  }
649 
650  // true when dead
651  bool merged = false;
652  bool all_caps; // had no ascenders
653  bool used_dm_model; // in guessing pitch
654  int16_t projection_left; // start of projection
655  int16_t projection_right; // start of projection
656  PITCH_TYPE pitch_decision; // how strong is decision
657  float fixed_pitch; // pitch or 0
658  float fp_space; // sp if fixed pitch
659  float fp_nonsp; // nonsp if fixed pitch
660  float pr_space; // sp if prop
661  float pr_nonsp; // non sp if prop
662  float spacing; // to "next" row
663  float xheight; // of line
664  int xheight_evidence; // number of blobs of height xheight
665  float ascrise; // ascenders
666  float descdrop; // descenders
667  float body_size; // of CJK characters. Assumed to be
668  // xheight+ascrise for non-CJK text.
669  int32_t min_space; // min size for real space
670  int32_t max_nonspace; // max size of non-space
671  int32_t space_threshold; // space vs nonspace
672  float kern_size; // average non-space
673  float space_size; // average space
674  WERD_LIST rep_words; // repeated chars
675  ICOORDELT_LIST char_cells; // fixed pitch cells
676  QSPLINE baseline; // curved baseline
677  STATS projection; // vertical projection
678 
679 private:
680  void clear(); // clear all values to reasonable defaults
681 
682  BLOBNBOX_LIST blobs; // blobs in row
683  float y_min; // coords
684  float y_max;
685  float initial_y_min;
686  float m, c; // line spec
687  float error; // line error
688  float para_c; // constrained fit
689  float para_error;
690  float y_origin; // rotated para_c;
691  float credibility; // baseline believability
692  int num_repeated_sets_; // number of sets of repeated blobs
693  // set to -1 if we have not searched
694  // for repeated blobs in this row yet
695 };
696 
698 class TESS_API TO_BLOCK : public ELIST_LINK {
699 public:
700  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
701  clear();
702  } // empty
703  TO_BLOCK( // constructor
704  BLOCK *src_block); // real block
705  ~TO_BLOCK();
706 
707  void clear(); // clear all scalar members.
708 
709  TO_ROW_LIST *get_rows() { // access function
710  return &row_list;
711  }
712 
713  // Rotate all the blobnbox lists and the underlying block. Then update the
714  // median size statistic from the blobs list.
715  void rotate(const FCOORD &rotation) {
716  BLOBNBOX_LIST *blobnbox_list[] = {&blobs, &underlines, &noise_blobs,
717  &small_blobs, &large_blobs, nullptr};
718  for (BLOBNBOX_LIST **list = blobnbox_list; *list != nullptr; ++list) {
719  BLOBNBOX_IT it(*list);
720  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
721  it.data()->rotate(rotation);
722  }
723  }
724  // Rotate the block
725  ASSERT_HOST(block->pdblk.poly_block() != nullptr);
726  block->rotate(rotation);
727  // Update the median size statistic from the blobs list.
728  STATS widths(0, block->pdblk.bounding_box().width());
729  STATS heights(0, block->pdblk.bounding_box().height());
730  BLOBNBOX_IT blob_it(&blobs);
731  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
732  widths.add(blob_it.data()->bounding_box().width(), 1);
733  heights.add(blob_it.data()->bounding_box().height(), 1);
734  }
735  block->set_median_size(static_cast<int>(widths.median() + 0.5),
736  static_cast<int>(heights.median() + 0.5));
737  }
738 
739  void print_rows() { // debug info
740  TO_ROW_IT row_it = &row_list;
741  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
742  auto row = row_it.data();
743  tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
744  static_cast<double>(row->min_y()),
745  static_cast<double>(row->max_y()),
746  static_cast<double>(row->parallel_c()),
747  row->blob_list()->length());
748  }
749  }
750 
751  // Reorganizes the blob lists with a different definition of small, medium
752  // and large, compared to the original definition.
753  // Height is still the primary filter key, but medium width blobs of small
754  // height become medium, and very wide blobs of small height stay small.
755  void ReSetAndReFilterBlobs();
756 
757  // Deletes noise blobs from all lists where not owned by a ColPartition.
758  void DeleteUnownedNoise();
759 
760  // Computes and stores the edge offsets on each blob for use in feature
761  // extraction, using greyscale if the supplied grey and thresholds pixes
762  // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
763  // edge step outlines.
764  // Thresholds must either be the same size as grey or an integer down-scale
765  // of grey.
766  // See coutln.h for an explanation of edge offsets.
767  void ComputeEdgeOffsets(Image thresholds, Image grey);
768 
769 #ifndef GRAPHICS_DISABLED
770  // Draw the noise blobs from all lists in red.
771  void plot_noise_blobs(ScrollView *to_win);
772  // Draw the blobs on on the various lists in the block in different colors.
773  void plot_graded_blobs(ScrollView *to_win);
774 #endif
775 
776  BLOBNBOX_LIST blobs; // medium size
777  BLOBNBOX_LIST underlines; // underline blobs
778  BLOBNBOX_LIST noise_blobs; // very small
779  BLOBNBOX_LIST small_blobs; // fairly small
780  BLOBNBOX_LIST large_blobs; // big blobs
781  BLOCK *block; // real block
782  PITCH_TYPE pitch_decision; // how strong is decision
783  float line_spacing; // estimate
784  // line_size is a lower-bound estimate of the font size in pixels of
785  // the text in the block (with ascenders and descenders), being a small
786  // (1.25) multiple of the median height of filtered blobs.
787  // In most cases the font size will be bigger, but it will be closer
788  // if the text is allcaps, or in a no-x-height script.
789  float line_size; // estimate
790  float max_blob_size; // line assignment limit
791  float baseline_offset; // phase shift
792  float xheight; // median blob size
793  float fixed_pitch; // pitch or 0
794  float kern_size; // average non-space
795  float space_size; // average space
796  int32_t min_space; // min definite space
797  int32_t max_nonspace; // max definite
798  float fp_space; // sp if fixed pitch
799  float fp_nonsp; // nonsp if fixed pitch
800  float pr_space; // sp if prop
801  float pr_nonsp; // non sp if prop
802  TO_ROW *key_row; // starting row
803 
804 private:
805  TO_ROW_LIST row_list; // temporary rows
806 };
807 
809 void find_cblob_limits( // get y limits
810  C_BLOB *blob, // blob to search
811  float leftx, // x limits
812  float rightx,
813  FCOORD rotation, // for landscape
814  float &ymin, // output y limits
815  float &ymax);
816 void find_cblob_vlimits( // get y limits
817  C_BLOB *blob, // blob to search
818  float leftx, // x limits
819  float rightx,
820  float &ymin, // output y limits
821  float &ymax);
822 void find_cblob_hlimits( // get x limits
823  C_BLOB *blob, // blob to search
824  float bottomy, // y limits
825  float topy,
826  float &xmin, // output x limits
827  float &xymax);
828 C_BLOB *crotate_cblob( // rotate it
829  C_BLOB *blob, // blob to search
830  FCOORD rotation // for landscape
831 );
832 TBOX box_next( // get bounding box
833  BLOBNBOX_IT *it // iterator to blobds
834 );
835 TBOX box_next_pre_chopped( // get bounding box
836  BLOBNBOX_IT *it // iterator to blobds
837 );
838 void vertical_cblob_projection( // project outlines
839  C_BLOB *blob, // blob to project
840  STATS *stats // output
841 );
842 void vertical_coutline_projection( // project outlines
843  C_OUTLINE *outline, // outline to project
844  STATS *stats // output
845 );
846 #ifndef GRAPHICS_DISABLED
847 void plot_blob_list(ScrollView *win, // window to draw in
848  BLOBNBOX_LIST *list, // blob list
849  ScrollView::Color body_colour, // colour to draw
850  ScrollView::Color child_colour); // colour of child
851 #endif // !GRAPHICS_DISABLED
852 
853 } // namespace tesseract
854 
855 #endif
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
#define ELIST2IZEH(CLASSNAME)
Definition: elst2.h:822
#define ASSERT_HOST(x)
Definition: errcode.h:59
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
Definition: blobbox.cpp:614
BlobRegionType
Definition: blobbox.h:74
@ BRT_TEXT
Definition: blobbox.h:82
@ BRT_COUNT
Definition: blobbox.h:84
@ BRT_HLINE
Definition: blobbox.h:76
@ BRT_NOISE
Definition: blobbox.h:75
@ BRT_VLINE
Definition: blobbox.h:77
@ BRT_POLYIMAGE
Definition: blobbox.h:79
@ BRT_VERT_TEXT
Definition: blobbox.h:81
@ BRT_UNKNOWN
Definition: blobbox.h:80
@ BRT_RECTIMAGE
Definition: blobbox.h:78
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void vertical_cblob_projection(C_BLOB *blob, STATS *stats)
Definition: blobbox.cpp:871
ScrollView * to_win
Definition: drawtord.cpp:37
void find_cblob_limits(C_BLOB *blob, float leftx, float rightx, FCOORD rotation, float &ymin, float &ymax)
Definition: blobbox.cpp:504
PITCH_TYPE
Definition: blobbox.h:47
@ PITCH_DUNNO
Definition: blobbox.h:48
@ PITCH_MAYBE_FIXED
Definition: blobbox.h:50
@ PITCH_DEF_FIXED
Definition: blobbox.h:49
@ PITCH_MAYBE_PROP
Definition: blobbox.h:52
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_FIXED
Definition: blobbox.h:53
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void plot_blob_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:1071
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:125
BlobSpecialTextType
Definition: blobbox.h:92
@ BSTT_SKIP
Definition: blobbox.h:98
@ BSTT_MATH
Definition: blobbox.h:96
@ BSTT_UNCLEAR
Definition: blobbox.h:97
@ BSTT_DIGIT
Definition: blobbox.h:95
@ BSTT_ITALIC
Definition: blobbox.h:94
@ BSTT_NONE
Definition: blobbox.h:93
@ BSTT_COUNT
Definition: blobbox.h:99
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:579
void vertical_coutline_projection(C_OUTLINE *outline, STATS *stats)
Definition: blobbox.cpp:890
BlobTextFlowType
Definition: blobbox.h:110
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:115
@ BTFT_NONE
Definition: blobbox.h:111
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:117
@ BTFT_TEXT_ON_IMAGE
Definition: blobbox.h:116
@ BTFT_COUNT
Definition: blobbox.h:118
@ BTFT_NEIGHBOURS
Definition: blobbox.h:113
@ BTFT_NONTEXT
Definition: blobbox.h:112
void find_cblob_vlimits(C_BLOB *blob, float leftx, float rightx, float &ymin, float &ymax)
Definition: blobbox.cpp:543
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:667
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:102
@ TT_MAYBE_RAGGED
Definition: blobbox.h:64
@ TT_VLINE
Definition: blobbox.h:67
@ TT_MAYBE_ALIGNED
Definition: blobbox.h:65
@ TT_CONFIRMED
Definition: blobbox.h:66
@ TT_DELETED
Definition: blobbox.h:63
@ TT_NONE
Definition: blobbox.h:62
BlobNeighbourDir
Definition: blobbox.h:89
@ BND_LEFT
Definition: blobbox.h:89
@ BND_RIGHT
Definition: blobbox.h:89
@ BND_BELOW
Definition: blobbox.h:89
@ BND_ABOVE
Definition: blobbox.h:89
@ BND_COUNT
Definition: blobbox.h:89
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:638
int base_char_bottom() const
Definition: blobbox.h:401
int base_char_top() const
Definition: blobbox.h:398
float vert_stroke_width() const
Definition: blobbox.h:358
void set_vert_possible(bool value)
Definition: blobbox.h:319
static bool IsImageType(BlobRegionType type)
Definition: blobbox.h:439
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:388
bool leader_on_right() const
Definition: blobbox.h:379
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:385
bool DeletableNoise() const
Definition: blobbox.h:213
static bool IsTextType(BlobRegionType type)
Definition: blobbox.h:435
bool red_box_set() const
Definition: blobbox.h:268
const TBOX & reduced_box() const
Definition: blobbox.h:255
bool UniquelyHorizontal() const
Definition: blobbox.h:430
bool IsDiacritic() const
Definition: blobbox.h:395
BLOBNBOX(C_BLOB *srcblob)
Definition: blobbox.h:146
void set_left_tab_type(TabType new_type)
Definition: blobbox.h:289
int repeated_set() const
Definition: blobbox.h:271
BlobRegionType region_type() const
Definition: blobbox.h:298
void set_horz_possible(bool value)
Definition: blobbox.h:325
bool UniquelyVertical() const
Definition: blobbox.h:427
void set_left_rule(int new_left)
Definition: blobbox.h:331
int left_rule() const
Definition: blobbox.h:328
TabType left_tab_type() const
Definition: blobbox.h:286
const TBOX & bounding_box() const
Definition: blobbox.h:239
int32_t enclosed_area() const
Definition: blobbox.h:262
void set_right_tab_type(TabType new_type)
Definition: blobbox.h:295
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:313
static BLOBNBOX * RealBlob(C_OUTLINE *outline)
Definition: blobbox.h:169
int left_crossing_rule() const
Definition: blobbox.h:340
void set_repeated_set(int set_id)
Definition: blobbox.h:274
void set_horz_stroke_width(float width)
Definition: blobbox.h:355
C_BLOB * remove_cblob()
Definition: blobbox.h:280
void set_leader_on_right(bool flag)
Definition: blobbox.h:382
void set_left_crossing_rule(int new_left)
Definition: blobbox.h:343
void set_line_crossings(int value)
Definition: blobbox.h:410
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:420
static bool IsLineType(BlobRegionType type)
Definition: blobbox.h:443
tesseract::ColPartition * owner() const
Definition: blobbox.h:367
void set_vert_stroke_width(float width)
Definition: blobbox.h:361
bool vert_possible() const
Definition: blobbox.h:316
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:391
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:307
int line_crossings() const
Definition: blobbox.h:407
static void clear_blobnboxes(BLOBNBOX_LIST *boxes)
Definition: blobbox.h:158
int baseline_position() const
Definition: blobbox.h:404
C_BLOB * cblob() const
Definition: blobbox.h:277
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:413
TabType right_tab_type() const
Definition: blobbox.h:292
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:258
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:370
BlobTextFlowType flow() const
Definition: blobbox.h:310
void set_bounding_box(const TBOX &new_box)
Definition: blobbox.h:244
void translate_box(ICOORD v)
Definition: blobbox.h:185
void set_right_crossing_rule(int new_right)
Definition: blobbox.h:349
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:417
float area_stroke_width() const
Definition: blobbox.h:364
void compute_bounding_box()
Definition: blobbox.h:249
int right_rule() const
Definition: blobbox.h:334
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:304
bool leader_on_left() const
Definition: blobbox.h:373
void ClearNeighbours()
Definition: blobbox.h:510
void set_leader_on_left(bool flag)
Definition: blobbox.h:376
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:301
float horz_stroke_width() const
Definition: blobbox.h:352
bool horz_possible() const
Definition: blobbox.h:322
void set_right_rule(int new_right)
Definition: blobbox.h:337
void set_owns_cblob(bool value)
Definition: blobbox.h:423
int right_crossing_rule() const
Definition: blobbox.h:346
bool joined_to_prev() const
Definition: blobbox.h:265
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:447
float max_y() const
Definition: blobbox.h:568
bool rep_chars_marked() const
Definition: blobbox.h:637
static const int kErrorWeight
Definition: blobbox.h:557
int xheight_evidence
Definition: blobbox.h:664
QSPLINE baseline
Definition: blobbox.h:676
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:734
int32_t min_space
Definition: blobbox.h:669
float believability() const
Definition: blobbox.h:595
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:646
float initial_min_y() const
Definition: blobbox.h:577
ICOORDELT_LIST char_cells
Definition: blobbox.h:675
void print() const
Definition: blobbox.cpp:718
float line_error() const
Definition: blobbox.h:586
WERD_LIST rep_words
Definition: blobbox.h:674
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:612
int num_repeated_sets() const
Definition: blobbox.h:643
int32_t max_nonspace
Definition: blobbox.h:670
void clear_rep_chars_marked()
Definition: blobbox.h:640
bool used_dm_model
Definition: blobbox.h:653
STATS projection
Definition: blobbox.h:677
float min_y() const
Definition: blobbox.h:571
float space_size
Definition: blobbox.h:673
float fixed_pitch
Definition: blobbox.h:657
void set_limits(float new_min, float new_max)
Definition: blobbox.h:628
float line_m() const
Definition: blobbox.h:580
int32_t space_threshold
Definition: blobbox.h:671
float intercept() const
Definition: blobbox.h:598
float parallel_c() const
Definition: blobbox.h:589
float parallel_error() const
Definition: blobbox.h:592
void compute_vertical_projection()
Definition: blobbox.cpp:799
void insert_blob(BLOBNBOX *blob)
Definition: blobbox.cpp:773
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
PITCH_TYPE pitch_decision
Definition: blobbox.h:656
float mean_y() const
Definition: blobbox.h:574
int16_t projection_left
Definition: blobbox.h:654
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:619
float line_c() const
Definition: blobbox.h:583
int16_t projection_right
Definition: blobbox.h:655
void rotate(const FCOORD &rotation)
Definition: blobbox.h:715
BLOBNBOX_LIST underlines
Definition: blobbox.h:777
float baseline_offset
Definition: blobbox.h:791
BLOBNBOX_LIST blobs
Definition: blobbox.h:776
TO_ROW * key_row
Definition: blobbox.h:802
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:779
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
int32_t min_space
Definition: blobbox.h:796
int32_t max_nonspace
Definition: blobbox.h:797
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:780
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:778
PITCH_TYPE pitch_decision
Definition: blobbox.h:782
integer coordinate
Definition: points.h:36
TDimension y() const
access_function
Definition: points.h:62
TDimension top() const
Definition: rect.h:68
TDimension bottom() const
Definition: rect.h:75
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:242
TBOX bounding_box() const
Definition: stepblob.cpp:250
int32_t area()
Definition: stepblob.cpp:268
#define TESS_API
Definition: export.h:34