tesseract  5.0.0
strokewidth.cpp
Go to the documentation of this file.
1 // File: strokewidth.cpp
3 // Description: Subclass of BBGrid to find uniformity of strokewidth.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config_auto.h"
21 #endif
22 
23 #include "strokewidth.h"
24 
25 #include <algorithm>
26 #include <cmath>
27 
28 #include "blobbox.h"
29 #include "colpartition.h"
30 #include "colpartitiongrid.h"
31 #include "helpers.h" // for IntCastRounded
32 #include "imagefind.h"
33 #include "linlsq.h"
34 #include "statistc.h"
35 #include "tabfind.h"
36 #include "textlineprojection.h"
37 #include "tordmain.h" // For SetBlobStrokeWidth.
38 
39 namespace tesseract {
40 
// Debug switch for this file. Non-zero enables the ScrollView displays made
// below; FindTextlineFlowDirection additionally displays on every call when
// the value is greater than 1.
41 #ifndef GRAPHICS_DISABLED
42 static INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths (ScrollView)");
43 #else
44 static INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
45 #endif
// When set, the cleanup code below calls exit(0) once the widths window has
// been waited on.
46 static BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
47 
// NOTE(review): the comments describing the next two tolerances are absent
// from this listing. Judging by the CJK variants that follow, they are
// presumably the fractional and absolute stroke-width matching tolerances for
// non-CJK text - confirm against the real file.
49 const double kStrokeWidthFractionTolerance = 0.125;
54 const double kStrokeWidthTolerance = 1.5;
55 // Same but for CJK we are a bit more generous.
56 const double kStrokeWidthFractionCJK = 0.25;
57 const double kStrokeWidthCJK = 2.0;
58 // Radius in grid cells of search for broken CJK. Doesn't need to be very
59 // large as the grid size should be about the size of a character anyway.
60 const int kCJKRadius = 2;
61 // Max distance fraction of size to join close but broken CJK characters.
62 const double kCJKBrokenDistanceFraction = 0.25;
63 // Max number of components in a broken CJK character.
64 const int kCJKMaxComponents = 8;
65 // Max aspect ratio of CJK broken characters when put back together.
66 const double kCJKAspectRatio = 1.25;
67 // Max increase in aspect ratio of CJK broken characters when merged.
68 const double kCJKAspectRatioIncrease = 1.0625;
69 // Max multiple of the grid size that will be used in computing median CJKsize.
70 const int kMaxCJKSizeRatio = 5;
71 // Min fraction of blobs broken CJK to iterate and run it again.
72 const double kBrokenCJKIterationFraction = 0.125;
73 // Multiple of gridsize as x-padding for a search box for diacritic base
74 // characters.
75 const double kDiacriticXPadRatio = 7.0;
76 // Multiple of gridsize as y-padding for a search box for diacritic base
77 // characters.
78 const double kDiacriticYPadRatio = 1.75;
79 // Min multiple of diacritic height that a neighbour must be to be a
80 // convincing base character.
81 const double kMinDiacriticSizeRatio = 1.0625;
82 // Max multiple of a textline's median height as a threshold for the sum of
83 // a diacritic's farthest x and y distances (gap + size).
84 const double kMaxDiacriticDistanceRatio = 1.25;
85 // Max x-gap between a diacritic and its base char as a fraction of the height
86 // of the base char (allowing other blobs to fill the gap.)
// NOTE(review): the constant that the comment above describes is not present
// in this listing (the embedded numbering skips 87).
88 // Ratio between longest side of a line and longest side of a character.
89 // (neighbor_min > blob_min * kLineTrapShortest &&
90 // neighbor_max < blob_max / kLineTrapLongest)
91 // => neighbor is a grapheme and blob is a line.
92 const int kLineTrapLongest = 4;
93 // Ratio between shortest side of a line and shortest side of a character.
94 const int kLineTrapShortest = 2;
95 // Max aspect ratio of the total box before CountNeighbourGaps
96 // decides immediately based on the aspect ratio.
97 const int kMostlyOneDirRatio = 3;
98 // Aspect ratio for a blob to be considered as line residue.
99 const double kLineResidueAspectRatio = 8.0;
100 // Padding ratio for line residue search box.
101 const int kLineResiduePadRatio = 3;
102 // Min multiple of neighbour size for a line residue to be genuine.
103 const double kLineResidueSizeRatio = 1.75;
104 // Aspect ratio filter for OSD.
105 const float kSizeRatioToReject = 2.0;
106 // Expansion factor for search box for good neighbours.
107 const double kNeighbourSearchFactor = 2.5;
108 // Factor of increase of overlap when adding diacritics to make an image noisy.
109 const double kNoiseOverlapGrowthFactor = 4.0;
110 // Fraction of the image size to add overlap when adding diacritics for an
111 // image to qualify as noisy.
112 const double kNoiseOverlapAreaFactor = 1.0 / 512;
113 
114 StrokeWidth::StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
115  : BlobGrid(gridsize, bleft, tright)
116  , nontext_map_(nullptr)
117  , projection_(nullptr)
118  , denorm_(nullptr)
119  , grid_box_(bleft, tright)
120  , rerotation_(1.0f, 0.0f) {
121 }
122 
// NOTE(review): the line that opens this definition is absent from this
// listing (the embedded numbering skips 123). Judging by the member deletes
// below, this is presumably the StrokeWidth destructor - confirm against the
// real file before editing.
// Releases the debug windows owned by this object (graphics builds only).
124 #ifndef GRAPHICS_DISABLED
125  if (widths_win_ != nullptr) {
 // The object returned by AwaitEvent(SVET_DESTROY) is deleted immediately;
 // presumably the call blocks until the window is closed - confirm.
126  delete widths_win_->AwaitEvent(SVET_DESTROY);
 // With textord_tabfind_only_strokewidths set the process exits here, so
 // none of the deletes below run in that case.
127  if (textord_tabfind_only_strokewidths) {
128  exit(0);
129  }
130  delete widths_win_;
131  }
 // The remaining windows are deleted unconditionally (delete on a null
 // pointer is a no-op).
132  delete leaders_win_;
133  delete initial_widths_win_;
134  delete chains_win_;
135  delete textlines_win_;
136  delete smoothed_win_;
137  delete diacritics_win_;
138 #endif
139 }
140 
141 // Sets the neighbours member of the medium-sized blobs in the block.
142 // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
143 // blobs and sets pointers to the good neighbours.
// NOTE(review): the signature line is absent from this listing (the embedded
// numbering skips 144). The body reads a `block` whose `blobs` list it
// inserts and iterates - confirm the exact signature against the real file.
145  // Run a preliminary strokewidth neighbour detection on the medium blobs.
 // The block's blobs are put into the grid first so that SetNeighbours can
 // be run on each of them.
146  InsertBlobList(&block->blobs);
147  BLOBNBOX_IT blob_it(&block->blobs);
148  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
 // Both boolean arguments are false here (the leader pass elsewhere in this
 // file passes true as the first one).
149  SetNeighbours(false, false, blob_it.data());
150  }
 // The grid is emptied again before returning.
151  Clear();
152 }
153 
154 // Sets the neighbour/textline writing direction members of the medium
155 // and large blobs with optional repair of broken CJK characters first.
156 // Repair of broken CJK is needed here because broken CJK characters
157 // can fool the textline direction detection algorithm.
// NOTE(review): the first line of the signature is absent from this listing
// (the embedded numbering skips 158). The body uses `pageseg_mode` and
// `cjk_merge`, which are presumably the leading parameters - confirm.
159  TO_BLOCK *input_block) {
160  // Setup the grid with the remaining (non-noise) blobs.
161  InsertBlobs(input_block);
162  // Repair broken CJK characters if needed.
 // FixBrokenCJK is re-run for as long as it returns true; it is never called
 // when cjk_merge is false.
163  while (cjk_merge && FixBrokenCJK(input_block)) {
164  ;
165  }
166  // Grade blobs by inspection of neighbours.
 // The second argument (display_if_debugging) is false for this call.
167  FindTextlineFlowDirection(pageseg_mode, false);
168  // Clear the grid ready for rotation or leader finding.
169  Clear();
170 }
171 
172 // Helper to collect and count horizontal and vertical blobs from a list.
173 static void CollectHorizVertBlobs(BLOBNBOX_LIST *input_blobs, int *num_vertical_blobs,
174  int *num_horizontal_blobs, BLOBNBOX_CLIST *vertical_blobs,
175  BLOBNBOX_CLIST *horizontal_blobs,
176  BLOBNBOX_CLIST *nondescript_blobs) {
177  BLOBNBOX_C_IT v_it(vertical_blobs);
178  BLOBNBOX_C_IT h_it(horizontal_blobs);
179  BLOBNBOX_C_IT n_it(nondescript_blobs);
180  BLOBNBOX_IT blob_it(input_blobs);
181  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
182  BLOBNBOX *blob = blob_it.data();
183  const TBOX &box = blob->bounding_box();
184  float y_x = static_cast<float>(box.height()) / box.width();
185  float x_y = 1.0f / y_x;
186  // Select a >= 1.0 ratio
187  float ratio = x_y > y_x ? x_y : y_x;
188  // If the aspect ratio is small and we want them for osd, save the blob.
189  bool ok_blob = ratio <= kSizeRatioToReject;
190  if (blob->UniquelyVertical()) {
191  ++*num_vertical_blobs;
192  if (ok_blob) {
193  v_it.add_after_then_move(blob);
194  }
195  } else if (blob->UniquelyHorizontal()) {
196  ++*num_horizontal_blobs;
197  if (ok_blob) {
198  h_it.add_after_then_move(blob);
199  }
200  } else if (ok_blob) {
201  n_it.add_after_then_move(blob);
202  }
203  }
204 }
205 
206 // Types all the blobs as vertical or horizontal text or unknown and
207 // returns true if the majority are vertical.
208 // If the blobs are rotated, it is necessary to call CorrectForRotation
209 // after rotating everything, otherwise the work done here will be enough.
210 // If osd_blobs is not null, a list of blobs from the dominant textline
211 // direction are returned for use in orientation and script detection.
212 bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block,
213  BLOBNBOX_CLIST *osd_blobs) {
214  int vertical_boxes = 0;
215  int horizontal_boxes = 0;
216  // Count vertical normal and large blobs.
217  BLOBNBOX_CLIST vertical_blobs;
218  BLOBNBOX_CLIST horizontal_blobs;
219  BLOBNBOX_CLIST nondescript_blobs;
220  CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes, &vertical_blobs,
221  &horizontal_blobs, &nondescript_blobs);
222  CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes, &vertical_blobs,
223  &horizontal_blobs, &nondescript_blobs);
224  if (textord_debug_tabfind) {
225  tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n", horizontal_boxes,
226  vertical_boxes, horizontal_blobs.length(), vertical_blobs.length(),
227  nondescript_blobs.length());
228  }
229  if (osd_blobs != nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
230  // Only nondescript blobs available, so return those.
231  BLOBNBOX_C_IT osd_it(osd_blobs);
232  osd_it.add_list_after(&nondescript_blobs);
233  return false;
234  }
235  int min_vert_boxes =
236  static_cast<int>((vertical_boxes + horizontal_boxes) * find_vertical_text_ratio);
237  if (vertical_boxes >= min_vert_boxes) {
238  if (osd_blobs != nullptr) {
239  BLOBNBOX_C_IT osd_it(osd_blobs);
240  osd_it.add_list_after(&vertical_blobs);
241  }
242  return true;
243  } else {
244  if (osd_blobs != nullptr) {
245  BLOBNBOX_C_IT osd_it(osd_blobs);
246  osd_it.add_list_after(&horizontal_blobs);
247  }
248  return false;
249  }
250 }
251 
252 // Corrects the data structures for the given rotation.
253 void StrokeWidth::CorrectForRotation(const FCOORD &rotation, ColPartitionGrid *part_grid) {
254  Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
255  grid_box_ = TBOX(bleft(), tright());
256  rerotation_.set_x(rotation.x());
257  rerotation_.set_y(-rotation.y());
258 }
259 
260 // Finds leader partitions and inserts them into the given part_grid.
// NOTE(review): the signature line is absent from this listing (the embedded
// numbering skips 261). The body uses `block` and `part_grid` - confirm the
// exact signature against the real file.
262  Clear();
263  // Find and isolate leaders in the noise list.
264  ColPartition_LIST leader_parts;
265  FindLeadersAndMarkNoise(block, &leader_parts);
266  // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
267  InsertBlobList(&block->blobs);
268  // Mark blobs that have leader neighbours.
 // Each iteration extracts the current partition from leader_parts, so the
 // loop runs until that list is empty; the extracted partition is then
 // inserted into part_grid.
269  for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
270  ColPartition *part = it.extract();
271  part->ClaimBoxes();
272  MarkLeaderNeighbours(part, LR_LEFT);
273  MarkLeaderNeighbours(part, LR_RIGHT);
274  part_grid->InsertBBox(true, true, part);
275  }
276 }
277 
278 // Finds and marks noise those blobs that look like bits of vertical lines
279 // that would otherwise screw up layout analysis.
280 void StrokeWidth::RemoveLineResidue(ColPartition_LIST *big_part_list) {
281  BlobGridSearch gsearch(this);
282  BLOBNBOX *bbox;
283  // For every vertical line-like bbox in the grid, search its neighbours
284  // to find the tallest, and if the original box is taller by sufficient
285  // margin, then call it line residue and delete it.
286  gsearch.StartFullSearch();
287  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
288  TBOX box = bbox->bounding_box();
289  if (box.height() < box.width() * kLineResidueAspectRatio) {
290  continue;
291  }
292  // Set up a rectangle search around the blob to find the size of its
293  // neighbours.
294  int padding = box.height() * kLineResiduePadRatio;
295  TBOX search_box = box;
296  search_box.pad(padding, padding);
297  bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());
298  // Find the largest object in the search box not equal to bbox.
299  BlobGridSearch rsearch(this);
300  int max_height = 0;
301  BLOBNBOX *n;
302  rsearch.StartRectSearch(search_box);
303  while ((n = rsearch.NextRectSearch()) != nullptr) {
304  if (n == bbox) {
305  continue;
306  }
307  TBOX nbox = n->bounding_box();
308  if (nbox.height() > max_height) {
309  max_height = nbox.height();
310  }
311  }
312  if (debug) {
313  tprintf("Max neighbour size=%d for candidate line box at:", max_height);
314  box.print();
315  }
316  if (max_height * kLineResidueSizeRatio < box.height()) {
317 #ifndef GRAPHICS_DISABLED
318  if (leaders_win_ != nullptr) {
319  // We are debugging, so display deleted in pink blobs in the same
320  // window that we use to display leader detection.
321  leaders_win_->Pen(ScrollView::PINK);
322  leaders_win_->Rectangle(box.left(), box.bottom(), box.right(), box.top());
323  }
324 #endif // !GRAPHICS_DISABLED
325  ColPartition::MakeBigPartition(bbox, big_part_list);
326  }
327  }
328 }
329 
330 // Types all the blobs as vertical text or horizontal text or unknown and
331 // puts them into initial ColPartitions in the supplied part_grid.
332 // rerotation determines how to get back to the image coordinates from the
333 // blob coordinates (since they may have been rotated for vertical text).
334 // block is the single block for the whole page or rectangle to be OCRed.
335 // nontext_pix (full-size), is a binary mask used to prevent merges across
336 // photo/text boundaries. It is not kept beyond this function.
337 // denorm provides a mapping back to the image from the current blob
338 // coordinate space.
339 // projection provides a measure of textline density over the image and
340 // provides functions to assist with diacritic detection. It should be a
341 // pointer to a new TextlineProjection, and will be setup here.
342 // part_grid is the output grid of textline partitions.
343 // Large blobs that cause overlap are put in separate partitions and added
344 // to the big_parts list.
345 void StrokeWidth::GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation,
346  TO_BLOCK *block, Image nontext_pix, const DENORM *denorm,
347  bool cjk_script, TextlineProjection *projection,
348  BLOBNBOX_LIST *diacritic_blobs,
349  ColPartitionGrid *part_grid,
350  ColPartition_LIST *big_parts) {
351  nontext_map_ = nontext_pix;
352  projection_ = projection;
353  denorm_ = denorm;
354  // Clear and re Insert to take advantage of the tab stops in the blobs.
355  Clear();
356  // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
357  InsertBlobs(block);
358 
359  // Run FixBrokenCJK() again if the page is CJK.
360  if (cjk_script) {
361  FixBrokenCJK(block);
362  }
363  FindTextlineFlowDirection(pageseg_mode, false);
364  projection_->ConstructProjection(block, rerotation, nontext_map_);
365 #ifndef GRAPHICS_DISABLED
366  if (textord_tabfind_show_strokewidths) {
367  ScrollView *line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
368  projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
369  projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
370  }
371 #endif
372  projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
373  projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
374  // Clear and re Insert to take advantage of the removed diacritics.
375  Clear();
376  InsertBlobs(block);
377  FCOORD skew;
378  FindTextlineFlowDirection(pageseg_mode, true);
379  PartitionFindResult r = FindInitialPartitions(pageseg_mode, rerotation, true, block,
380  diacritic_blobs, part_grid, big_parts, &skew);
381  if (r == PFR_NOISE) {
382  tprintf("Detected %d diacritics\n", diacritic_blobs->length());
383  // Noise was found, and removed.
384  Clear();
385  InsertBlobs(block);
386  FindTextlineFlowDirection(pageseg_mode, true);
387  r = FindInitialPartitions(pageseg_mode, rerotation, false, block, diacritic_blobs, part_grid,
388  big_parts, &skew);
389  }
390  nontext_map_ = nullptr;
391  projection_ = nullptr;
392  denorm_ = nullptr;
393 }
394 
395 static void PrintBoxWidths(BLOBNBOX *neighbour) {
396  const TBOX &nbox = neighbour->bounding_box();
397  tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n", nbox.left(),
398  nbox.bottom(), nbox.right(), nbox.top(), neighbour->horz_stroke_width(),
399  neighbour->vert_stroke_width(),
400  2.0 * neighbour->cblob()->area() / neighbour->cblob()->perimeter());
401 }
402 
// Click handler: prints debug information for a blob in this grid whose
// bounding box contains the clicked point (x, y).
// NOTE(review): this listing is missing lines inside this function (the
// embedded numbering skips 405 and 434-435), so the tprintf call below is
// shown without its final arguments and closing parenthesis. Do not edit the
// code from this view.
404 void StrokeWidth::HandleClick(int x, int y) {
406  // Run a radial search for blobs that overlap.
407  BlobGridSearch radsearch(this);
408  radsearch.StartRadSearch(x, y, 1);
409  BLOBNBOX *neighbour;
410  FCOORD click(static_cast<float>(x), static_cast<float>(y));
411  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
412  TBOX nbox = neighbour->bounding_box();
 // Only a blob that contains the click and has a cblob is reported.
413  if (nbox.contains(click) && neighbour->cblob() != nullptr) {
414  PrintBoxWidths(neighbour);
 // Then the same report for each of the four neighbours that is set.
415  if (neighbour->neighbour(BND_LEFT) != nullptr) {
416  PrintBoxWidths(neighbour->neighbour(BND_LEFT));
417  }
418  if (neighbour->neighbour(BND_RIGHT) != nullptr) {
419  PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
420  }
421  if (neighbour->neighbour(BND_ABOVE) != nullptr) {
422  PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
423  }
424  if (neighbour->neighbour(BND_BELOW) != nullptr) {
425  PrintBoxWidths(neighbour->neighbour(BND_BELOW));
426  }
427  int gaps[BND_COUNT];
428  neighbour->NeighbourGaps(gaps);
 // The format expects ten integers; only six arguments are visible here
 // because of the missing lines noted above.
429  tprintf(
430  "Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
431  "Good= %d %d %d %d\n",
432  gaps[BND_LEFT], gaps[BND_RIGHT], gaps[BND_ABOVE], gaps[BND_BELOW],
433  neighbour->horz_possible(), neighbour->vert_possible(),
 // Stop after the first blob that matched.
436  break;
437  }
438  }
439 }
440 
441 // Detects and marks leader dots/dashes.
442 // Leaders are horizontal chains of small or noise blobs that look
443 // monospace according to ColPartition::MarkAsLeaderIfMonospaced().
444 // Detected leaders become the only occupants of the block->small_blobs list.
445 // Non-leader small blobs get moved to the blobs list.
446 // Non-leader noise blobs remain singletons in the noise list.
447 // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
448 // block is the single block for the whole page or rectangle to be OCRed.
449 // leader_parts is the output.
450 void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK *block, ColPartition_LIST *leader_parts) {
451  InsertBlobList(&block->small_blobs);
452  InsertBlobList(&block->noise_blobs);
453  BlobGridSearch gsearch(this);
454  BLOBNBOX *bbox;
455  // For every bbox in the grid, set its neighbours.
456  gsearch.StartFullSearch();
457  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
458  SetNeighbours(true, false, bbox);
459  }
460  ColPartition_IT part_it(leader_parts);
461  gsearch.StartFullSearch();
462  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
463  if (bbox->flow() == BTFT_NONE) {
464  if (bbox->neighbour(BND_RIGHT) == nullptr && bbox->neighbour(BND_LEFT) == nullptr) {
465  continue;
466  }
467  // Put all the linked blobs into a ColPartition.
468  auto *part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
469  BLOBNBOX *blob;
470  for (blob = bbox; blob != nullptr && blob->flow() == BTFT_NONE;
471  blob = blob->neighbour(BND_RIGHT)) {
472  part->AddBox(blob);
473  }
474  for (blob = bbox->neighbour(BND_LEFT); blob != nullptr && blob->flow() == BTFT_NONE;
475  blob = blob->neighbour(BND_LEFT)) {
476  part->AddBox(blob);
477  }
478  if (part->MarkAsLeaderIfMonospaced()) {
479  part_it.add_after_then_move(part);
480  } else {
481  delete part;
482  }
483  }
484  }
485 #ifndef GRAPHICS_DISABLED
486  if (textord_tabfind_show_strokewidths) {
487  leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
488  }
489 #endif
490  // Move any non-leaders from the small to the blobs list, as they are
491  // most likely dashes or broken characters.
492  BLOBNBOX_IT blob_it(&block->blobs);
493  BLOBNBOX_IT small_it(&block->small_blobs);
494  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
495  BLOBNBOX *blob = small_it.data();
496  if (blob->flow() != BTFT_LEADER) {
497  if (blob->flow() == BTFT_NEIGHBOURS) {
498  blob->set_flow(BTFT_NONE);
499  }
500  blob->ClearNeighbours();
501  blob_it.add_to_end(small_it.extract());
502  }
503  }
504  // Move leaders from the noise list to the small list, leaving the small
505  // list exclusively leaders, so they don't get processed further,
506  // and the remaining small blobs all in the noise list.
507  BLOBNBOX_IT noise_it(&block->noise_blobs);
508  for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
509  BLOBNBOX *blob = noise_it.data();
510  if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
511  small_it.add_to_end(noise_it.extract());
512  } else if (blob->flow() == BTFT_NEIGHBOURS) {
513  blob->set_flow(BTFT_NONE);
514  blob->ClearNeighbours();
515  }
516  }
517  // Clear the grid as we don't want the small stuff hanging around in it.
518  Clear();
519 }
520 
523 void StrokeWidth::InsertBlobs(TO_BLOCK *block) {
524  InsertBlobList(&block->blobs);
525  InsertBlobList(&block->large_blobs);
526 }
527 
528 // Checks the left or right side of the given leader partition and sets the
529 // (opposite) leader_on_right or leader_on_left flags for blobs
530 // that are next to the given side of the given leader partition.
531 void StrokeWidth::MarkLeaderNeighbours(const ColPartition *part, LeftOrRight side) {
532  const TBOX &part_box = part->bounding_box();
533  BlobGridSearch blobsearch(this);
534  // Search to the side of the leader for the nearest neighbour.
535  BLOBNBOX *best_blob = nullptr;
536  int best_gap = 0;
537  blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left() : part_box.right(),
538  part_box.bottom(), part_box.top());
539  BLOBNBOX *blob;
540  while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != nullptr) {
541  const TBOX &blob_box = blob->bounding_box();
542  if (!blob_box.y_overlap(part_box)) {
543  continue;
544  }
545  int x_gap = blob_box.x_gap(part_box);
546  if (x_gap > 2 * gridsize()) {
547  break;
548  } else if (best_blob == nullptr || x_gap < best_gap) {
549  best_blob = blob;
550  best_gap = x_gap;
551  }
552  }
553  if (best_blob != nullptr) {
554  if (side == LR_LEFT) {
555  best_blob->set_leader_on_right(true);
556  } else {
557  best_blob->set_leader_on_left(true);
558  }
559 #ifndef GRAPHICS_DISABLED
560  if (leaders_win_ != nullptr) {
561  leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
562  const TBOX &blob_box = best_blob->bounding_box();
563  leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top());
564  }
565 #endif // !GRAPHICS_DISABLED
566  }
567 }
568 
569 // Helper to compute the UQ of the square-ish CJK characters.
570 static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST *blobs) {
571  STATS sizes(0, gridsize * kMaxCJKSizeRatio);
572  BLOBNBOX_IT it(blobs);
573  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
574  BLOBNBOX *blob = it.data();
575  int width = blob->bounding_box().width();
576  int height = blob->bounding_box().height();
577  if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio) {
578  sizes.add(height, 1);
579  }
580  }
581  return static_cast<int>(sizes.ile(0.75f) + 0.5);
582 }
583 
584 // Fix broken CJK characters, using the fake joined blobs mechanism.
585 // Blobs are really merged, ie the master takes all the outlines and the
586 // others are deleted.
587 // Returns true if sufficient blobs are merged that it may be worth running
588 // again, due to a better estimate of character size.
589 bool StrokeWidth::FixBrokenCJK(TO_BLOCK *block) {
590  BLOBNBOX_LIST *blobs = &block->blobs;
591  int median_height = UpperQuartileCJKSize(gridsize(), blobs);
592  int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
593  int max_height = static_cast<int>(median_height * kCJKAspectRatio);
594  int num_fixed = 0;
595  BLOBNBOX_IT blob_it(blobs);
596 
597  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
598  BLOBNBOX *blob = blob_it.data();
599  if (blob->cblob() == nullptr || blob->cblob()->out_list()->empty()) {
600  continue;
601  }
602  TBOX bbox = blob->bounding_box();
603  bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(), bbox.bottom());
604  if (debug) {
605  tprintf("Checking for Broken CJK (max size=%d):", max_height);
606  bbox.print();
607  }
608  // Generate a list of blobs that overlap or are near enough to merge.
609  BLOBNBOX_CLIST overlapped_blobs;
610  AccumulateOverlaps(blob, debug, max_height, max_dist, &bbox, &overlapped_blobs);
611  if (!overlapped_blobs.empty()) {
612  // There are overlapping blobs, so qualify them as being satisfactory
613  // before removing them from the grid and replacing them with the union.
614  // The final box must be roughly square.
615  if (bbox.width() > bbox.height() * kCJKAspectRatio ||
616  bbox.height() > bbox.width() * kCJKAspectRatio) {
617  if (debug) {
618  tprintf("Bad final aspectratio:");
619  bbox.print();
620  }
621  continue;
622  }
623  // There can't be too many blobs to merge.
624  if (overlapped_blobs.length() >= kCJKMaxComponents) {
625  if (debug) {
626  tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
627  }
628  continue;
629  }
630  // The strokewidths must match amongst the join candidates.
631  BLOBNBOX_C_IT n_it(&overlapped_blobs);
632  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
633  BLOBNBOX *neighbour = nullptr;
634  neighbour = n_it.data();
635  if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK, kStrokeWidthCJK)) {
636  break;
637  }
638  }
639  if (!n_it.cycled_list()) {
640  if (debug) {
641  tprintf("Bad stroke widths:");
642  PrintBoxWidths(blob);
643  }
644  continue; // Not good enough.
645  }
646 
647  // Merge all the candidates into blob.
648  // We must remove blob from the grid and reinsert it after merging
649  // to maintain the integrity of the grid.
650  RemoveBBox(blob);
651  // Everything else will be calculated later.
652  for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
653  BLOBNBOX *neighbour = n_it.data();
654  RemoveBBox(neighbour);
655  // Mark empty blob for deletion.
656  neighbour->set_region_type(BRT_NOISE);
657  blob->really_merge(neighbour);
658  if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
659  blob->rotate_box(rerotation_);
660  }
661  }
662  InsertBBox(true, true, blob);
663  ++num_fixed;
664  if (debug) {
665  tprintf("Done! Final box:");
666  bbox.print();
667  }
668  }
669  }
670  // Count remaining blobs.
671  int num_remaining = 0;
672  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
673  BLOBNBOX *blob = blob_it.data();
674  if (blob->cblob() != nullptr && !blob->cblob()->out_list()->empty()) {
675  ++num_remaining;
676  }
677  }
678  // Permanently delete all the marked blobs after first removing all
679  // references in the neighbour members.
680  block->DeleteUnownedNoise();
681  return num_fixed > num_remaining * kBrokenCJKIterationFraction;
682 }
683 
684 // Helper function to determine whether it is reasonable to merge the
685 // bbox and the nbox for repairing broken CJK.
686 // The distance apart must not exceed max_dist, the combined size must
687 // not exceed max_size, and the aspect ratio must either improve or at
688 // least not get worse by much.
689 static bool AcceptableCJKMerge(const TBOX &bbox, const TBOX &nbox, bool debug, int max_size,
690  int max_dist, int *x_gap, int *y_gap) {
691  *x_gap = bbox.x_gap(nbox);
692  *y_gap = bbox.y_gap(nbox);
693  TBOX merged(nbox);
694  merged += bbox;
695  if (debug) {
696  tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
697  merged.print();
698  }
699  if (*x_gap <= max_dist && *y_gap <= max_dist && merged.width() <= max_size &&
700  merged.height() <= max_size) {
701  // Close enough to call overlapping. Check aspect ratios.
702  double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
703  if (old_ratio < 1.0) {
704  old_ratio = 1.0 / old_ratio;
705  }
706  double new_ratio = static_cast<double>(merged.width()) / merged.height();
707  if (new_ratio < 1.0) {
708  new_ratio = 1.0 / new_ratio;
709  }
710  if (new_ratio <= old_ratio * kCJKAspectRatioIncrease) {
711  return true;
712  }
713  }
714  return false;
715 }
716 
717 // Collect blobs that overlap or are within max_dist of the input bbox.
718 // Return them in the list of blobs and expand the bbox to be the union
719 // of all the boxes. not_this is excluded from the search, as are blobs
720 // that cause the merged box to exceed max_size in either dimension.
// If the final bbox overlaps any recorded nearest non-merged blob, the
// blobs list is emptied (shallow_clear) before returning; *bbox is left
// expanded in that case.
721 void StrokeWidth::AccumulateOverlaps(const BLOBNBOX *not_this, bool debug, int max_size,
722  int max_dist, TBOX *bbox, BLOBNBOX_CLIST *blobs) {
723  // While searching, nearests holds the nearest failed blob in each
724  // direction. When we have a nearest in each of the 4 directions, then
725  // the search is over, and at this point the final bbox must not overlap
726  // any of the nearests.
727  BLOBNBOX *nearests[BND_COUNT];
728  for (auto &nearest : nearests) {
729  nearest = nullptr;
730  }
 // The radial search is centred on the middle of the input bbox.
731  int x = (bbox->left() + bbox->right()) / 2;
732  int y = (bbox->bottom() + bbox->top()) / 2;
733  // Run a radial search for blobs that overlap or are sufficiently close.
734  BlobGridSearch radsearch(this);
735  radsearch.StartRadSearch(x, y, kCJKRadius);
736  BLOBNBOX *neighbour;
737  while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
738  if (neighbour == not_this) {
739  continue;
740  }
741  TBOX nbox = neighbour->bounding_box();
 // x_gap and y_gap are always written by AcceptableCJKMerge, so they are
 // valid in the else-if branches below even when the merge is rejected.
742  int x_gap, y_gap;
743  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, &x_gap, &y_gap)) {
744  // Close enough to call overlapping. Merge boxes.
745  *bbox += nbox;
746  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
747  if (debug) {
748  tprintf("Added:");
749  nbox.print();
750  }
751  // Since we merged, search the nearests, as some might now be mergeable.
752  for (int dir = 0; dir < BND_COUNT; ++dir) {
753  if (nearests[dir] == nullptr) {
754  continue;
755  }
756  nbox = nearests[dir]->bounding_box();
757  if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, &x_gap, &y_gap)) {
758  // Close enough to call overlapping. Merge boxes.
759  *bbox += nbox;
760  blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
761  if (debug) {
762  tprintf("Added:");
763  nbox.print();
764  }
765  nearests[dir] = nullptr;
 // Setting dir to -1 makes the loop's ++dir resume at 0, because the
 // enlarged bbox may now accept nearests that were rejected earlier.
766  dir = -1; // Restart the search.
767  }
768  }
769  } else if (x_gap < 0 && x_gap <= y_gap) {
770  // A vertical neighbour. Record the nearest.
771  BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
772  if (nearests[dir] == nullptr || y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
773  nearests[dir] = neighbour;
774  }
775  } else if (y_gap < 0 && y_gap <= x_gap) {
776  // A horizontal neighbour. Record the nearest.
777  BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
778  if (nearests[dir] == nullptr || x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
779  nearests[dir] = neighbour;
780  }
781  }
782  // If all nearests are non-null, then we have finished.
783  if (nearests[BND_LEFT] && nearests[BND_RIGHT] && nearests[BND_ABOVE] && nearests[BND_BELOW]) {
784  break;
785  }
786  }
787  // Final overlap with a nearest is not allowed.
788  for (auto &nearest : nearests) {
789  if (nearest == nullptr) {
790  continue;
791  }
792  const TBOX &nbox = nearest->bounding_box();
793  if (debug) {
794  tprintf("Testing for overlap with:");
795  nbox.print();
796  }
797  if (bbox->overlap(nbox)) {
 // Reject the whole merge: the list is emptied without deleting the blobs.
798  blobs->shallow_clear();
799  if (debug) {
800  tprintf("Final box overlaps nearest\n");
801  }
802  return;
803  }
804  }
805 }
806 
807 // For each blob in this grid, Finds the textline direction to be horizontal
808 // or vertical according to distance to neighbours and 1st and 2nd order
809 // neighbours. Non-text tends to end up without a definite direction.
810 // Result is setting of the neighbours and vert_possible/horz_possible
811 // flags in the BLOBNBOXes currently in this grid.
812 // This function is called more than once if page orientation is uncertain,
813 // so display_if_debugging is true on the final call to display the results.
814 void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode, bool display_if_debugging) {
815  BlobGridSearch gsearch(this);
816  BLOBNBOX *bbox;
817  // For every bbox in the grid, set its neighbours.
818  gsearch.StartFullSearch();
819  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
820  SetNeighbours(false, display_if_debugging, bbox);
821  }
822  // Where vertical or horizontal wins by a big margin, clarify it.
823  gsearch.StartFullSearch();
824  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
825  SimplifyObviousNeighbours(bbox);
826  }
827  // Now try to make the blobs only vertical or horizontal using neighbours.
828  gsearch.StartFullSearch();
829  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
830  if (FindingVerticalOnly(pageseg_mode)) {
831  bbox->set_vert_possible(true);
832  bbox->set_horz_possible(false);
833  } else if (FindingHorizontalOnly(pageseg_mode)) {
834  bbox->set_vert_possible(false);
835  bbox->set_horz_possible(true);
836  } else {
837  SetNeighbourFlows(bbox);
838  }
839  }
840 #ifndef GRAPHICS_DISABLED
841  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
842  textord_tabfind_show_strokewidths > 1) {
843  initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
844  }
845 #endif
846  // Improve flow direction with neighbours.
847  gsearch.StartFullSearch();
848  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
849  SmoothNeighbourTypes(pageseg_mode, false, bbox);
850  }
851  // Now allow reset of firm values to fix renegades.
852  gsearch.StartFullSearch();
853  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
854  SmoothNeighbourTypes(pageseg_mode, true, bbox);
855  }
856  // Repeat.
857  gsearch.StartFullSearch();
858  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
859  SmoothNeighbourTypes(pageseg_mode, true, bbox);
860  }
861 #ifndef GRAPHICS_DISABLED
862  if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
863  textord_tabfind_show_strokewidths > 1) {
864  widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
865  }
866 #endif
867 }
868 
869 // Sets the neighbours and good_stroke_neighbours members of the blob by
870 // searching close on all 4 sides.
871 // When finding leader dots/dashes, there is a slightly different rule for
872 // what makes a good neighbour.
873 void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap, BLOBNBOX *blob) {
874  int line_trap_count = 0;
875  for (int dir = 0; dir < BND_COUNT; ++dir) {
876  auto bnd = static_cast<BlobNeighbourDir>(dir);
877  line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
878  }
879  if (line_trap_count > 0 && activate_line_trap) {
880  // It looks like a line so isolate it by clearing its neighbours.
881  blob->ClearNeighbours();
882  const TBOX &box = blob->bounding_box();
883  blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
884  }
885 }
886 
887 // Sets the good_stroke_neighbours member of the blob if it has a
888 // GoodNeighbour on the given side.
889 // Also sets the neighbour in the blob, whether or not a good one is found.
890 // Returns the number of blobs in the nearby search area that would lead us to
891 // believe that this blob is a line separator.
892 // Leaders get extra special lenient treatment.
893 int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders, BLOBNBOX *blob) {
894  // Search for neighbours that overlap vertically.
895  TBOX blob_box = blob->bounding_box();
896  bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(), blob_box.bottom());
897  if (debug) {
898  tprintf("FGN in dir %d for blob:", dir);
899  blob_box.print();
900  }
901  int top = blob_box.top();
902  int bottom = blob_box.bottom();
903  int left = blob_box.left();
904  int right = blob_box.right();
905  int width = right - left;
906  int height = top - bottom;
907 
908  // A trap to detect lines tests for the min dimension of neighbours
909  // being larger than a multiple of the min dimension of the line
910  // and the larger dimension being smaller than a fraction of the max
911  // dimension of the line.
912  int line_trap_max = std::max(width, height) / kLineTrapLongest;
913  int line_trap_min = std::min(width, height) * kLineTrapShortest;
914  int line_trap_count = 0;
915 
916  int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT) ? height / 2 : width / 2;
917  int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT) ? height / 3 : width / 3;
918  if (leaders) {
919  min_good_overlap = min_decent_overlap = 1;
920  }
921 
922  int search_pad =
923  static_cast<int>(sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
924  if (gridsize() > search_pad) {
925  search_pad = gridsize();
926  }
927  TBOX search_box = blob_box;
928  // Pad the search in the appropriate direction.
929  switch (dir) {
930  case BND_LEFT:
931  search_box.set_left(search_box.left() - search_pad);
932  break;
933  case BND_RIGHT:
934  search_box.set_right(search_box.right() + search_pad);
935  break;
936  case BND_BELOW:
937  search_box.set_bottom(search_box.bottom() - search_pad);
938  break;
939  case BND_ABOVE:
940  search_box.set_top(search_box.top() + search_pad);
941  break;
942  case BND_COUNT:
943  return 0;
944  }
945 
946  BlobGridSearch rectsearch(this);
947  rectsearch.StartRectSearch(search_box);
948  BLOBNBOX *best_neighbour = nullptr;
949  double best_goodness = 0.0;
950  bool best_is_good = false;
951  BLOBNBOX *neighbour;
952  while ((neighbour = rectsearch.NextRectSearch()) != nullptr) {
953  TBOX nbox = neighbour->bounding_box();
954  if (neighbour == blob) {
955  continue;
956  }
957  int mid_x = (nbox.left() + nbox.right()) / 2;
958  if (mid_x < blob->left_rule() || mid_x > blob->right_rule()) {
959  continue; // In a different column.
960  }
961  if (debug) {
962  tprintf("Neighbour at:");
963  nbox.print();
964  }
965 
966  // Last-minute line detector. There is a small upper limit to the line
967  // width accepted by the morphological line detector.
968  int n_width = nbox.width();
969  int n_height = nbox.height();
970  if (std::min(n_width, n_height) > line_trap_min &&
971  std::max(n_width, n_height) < line_trap_max) {
972  ++line_trap_count;
973  }
974  // Heavily joined text, such as Arabic may have very different sizes when
975  // looking at the maxes, but the heights may be almost identical, so check
976  // for a difference in height if looking sideways or width vertically.
977  if (TabFind::VeryDifferentSizes(std::max(n_width, n_height), std::max(width, height)) &&
978  (((dir == BND_LEFT || dir == BND_RIGHT) && TabFind::DifferentSizes(n_height, height)) ||
979  ((dir == BND_BELOW || dir == BND_ABOVE) && TabFind::DifferentSizes(n_width, width)))) {
980  if (debug) {
981  tprintf("Bad size\n");
982  }
983  continue; // Could be a different font size or non-text.
984  }
985  // Amount of vertical overlap between the blobs.
986  int overlap;
987  // If the overlap is along the short side of the neighbour, and it
988  // is fully overlapped, then perp_overlap holds the length of the long
989  // side of the neighbour. A measure to include hyphens and dashes as
990  // legitimate neighbours.
991  int perp_overlap;
992  int gap;
993  if (dir == BND_LEFT || dir == BND_RIGHT) {
994  overlap = std::min(static_cast<int>(nbox.top()), top) -
995  std::max(static_cast<int>(nbox.bottom()), bottom);
996  if (overlap == nbox.height() && nbox.width() > nbox.height()) {
997  perp_overlap = nbox.width();
998  } else {
999  perp_overlap = overlap;
1000  }
1001  gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
1002  if (gap <= 0) {
1003  if (debug) {
1004  tprintf("On wrong side\n");
1005  }
1006  continue; // On the wrong side.
1007  }
1008  gap -= n_width;
1009  } else {
1010  overlap = std::min(static_cast<int>(nbox.right()), right) -
1011  std::max(static_cast<int>(nbox.left()), left);
1012  if (overlap == nbox.width() && nbox.height() > nbox.width()) {
1013  perp_overlap = nbox.height();
1014  } else {
1015  perp_overlap = overlap;
1016  }
1017  gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
1018  if (gap <= 0) {
1019  if (debug) {
1020  tprintf("On wrong side\n");
1021  }
1022  continue; // On the wrong side.
1023  }
1024  gap -= n_height;
1025  }
1026  if (-gap > overlap) {
1027  if (debug) {
1028  tprintf("Overlaps wrong way\n");
1029  }
1030  continue; // Overlaps the wrong way.
1031  }
1032  if (perp_overlap < min_decent_overlap) {
1033  if (debug) {
1034  tprintf("Doesn't overlap enough\n");
1035  }
1036  continue; // Doesn't overlap enough.
1037  }
1038  bool bad_sizes =
1039  TabFind::DifferentSizes(height, n_height) && TabFind::DifferentSizes(width, n_width);
1040  bool is_good =
1041  overlap >= min_good_overlap && !bad_sizes &&
1042  blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionTolerance, kStrokeWidthTolerance);
1043  // Best is a fuzzy combination of gap, overlap and is good.
1044  // Basically if you make one thing twice as good without making
1045  // anything else twice as bad, then it is better.
1046  if (gap < 1) {
1047  gap = 1;
1048  }
1049  double goodness = (1.0 + is_good) * overlap / gap;
1050  if (debug) {
1051  tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n", goodness, best_goodness,
1052  is_good, overlap, gap);
1053  }
1054  if (goodness > best_goodness) {
1055  best_neighbour = neighbour;
1056  best_goodness = goodness;
1057  best_is_good = is_good;
1058  }
1059  }
1060  blob->set_neighbour(dir, best_neighbour, best_is_good);
1061  return line_trap_count;
1062 }
1063 
1064 // Helper to get a list of 1st-order neighbours.
1065 static void ListNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {
1066  for (int dir = 0; dir < BND_COUNT; ++dir) {
1067  auto bnd = static_cast<BlobNeighbourDir>(dir);
1068  BLOBNBOX *neighbour = blob->neighbour(bnd);
1069  if (neighbour != nullptr) {
1070  neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1071  }
1072  }
1073 }
1074 
1075 // Helper to get a list of 1st and 2nd order neighbours.
1076 static void List2ndNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {
1077  ListNeighbours(blob, neighbours);
1078  for (int dir = 0; dir < BND_COUNT; ++dir) {
1079  auto bnd = static_cast<BlobNeighbourDir>(dir);
1080  BLOBNBOX *neighbour = blob->neighbour(bnd);
1081  if (neighbour != nullptr) {
1082  ListNeighbours(neighbour, neighbours);
1083  }
1084  }
1085 }
1086 
1087 // Helper to get a list of 1st, 2nd and 3rd order neighbours.
1088 static void List3rdNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {
1089  List2ndNeighbours(blob, neighbours);
1090  for (int dir = 0; dir < BND_COUNT; ++dir) {
1091  auto bnd = static_cast<BlobNeighbourDir>(dir);
1092  BLOBNBOX *neighbour = blob->neighbour(bnd);
1093  if (neighbour != nullptr) {
1094  List2ndNeighbours(neighbour, neighbours);
1095  }
1096  }
1097 }
1098 
1099 // Helper to count the evidence for verticalness or horizontalness
1100 // in a list of neighbours.
1101 static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST *neighbours, int *pure_h_count,
1102  int *pure_v_count) {
1103  if (neighbours->length() <= kMostlyOneDirRatio) {
1104  return;
1105  }
1106  BLOBNBOX_C_IT it(neighbours);
1107  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1108  BLOBNBOX *blob = it.data();
1109  int h_min, h_max, v_min, v_max;
1110  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1111  if (debug) {
1112  tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1113  }
1114  if (h_max < v_min || blob->leader_on_left() || blob->leader_on_right()) {
1115  // Horizontal gaps are clear winners. Count a pure horizontal.
1116  ++*pure_h_count;
1117  if (debug) {
1118  tprintf("Horz at:");
1119  }
1120  } else if (v_max < h_min) {
1121  // Vertical gaps are clear winners. Clear a pure vertical.
1122  ++*pure_v_count;
1123  if (debug) {
1124  tprintf("Vert at:");
1125  }
1126  } else {
1127  if (debug) {
1128  tprintf("Neither at:");
1129  }
1130  }
1131  if (debug) {
1132  blob->bounding_box().print();
1133  }
1134  }
1135 }
1136 
1137 // Makes the blob to be only horizontal or vertical where evidence
1138 // is clear based on gaps of 2nd order neighbours, or definite individual
1139 // blobs.
1140 void StrokeWidth::SetNeighbourFlows(BLOBNBOX *blob) {
1141  if (blob->DefiniteIndividualFlow()) {
1142  return;
1143  }
1144  bool debug =
1145  AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), blob->bounding_box().bottom());
1146  if (debug) {
1147  tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:", blob->flow(), blob->region_type());
1148  blob->bounding_box().print();
1149  }
1150  BLOBNBOX_CLIST neighbours;
1151  List3rdNeighbours(blob, &neighbours);
1152  // The number of pure horizontal and vertical neighbours.
1153  int pure_h_count = 0;
1154  int pure_v_count = 0;
1155  CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1156  if (debug) {
1157  HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);
1158  tprintf("SetFlows: h_count=%d, v_count=%d\n", pure_h_count, pure_v_count);
1159  }
1160  if (!neighbours.empty()) {
1161  blob->set_vert_possible(true);
1162  blob->set_horz_possible(true);
1163  if (pure_h_count > 2 * pure_v_count) {
1164  // Horizontal gaps are clear winners. Clear vertical neighbours.
1165  blob->set_vert_possible(false);
1166  } else if (pure_v_count > 2 * pure_h_count) {
1167  // Vertical gaps are clear winners. Clear horizontal neighbours.
1168  blob->set_horz_possible(false);
1169  }
1170  } else {
1171  // Lonely blob. Can't tell its flow direction.
1172  blob->set_vert_possible(false);
1173  blob->set_horz_possible(false);
1174  }
1175 }
1176 
1177 // Helper to count the number of horizontal and vertical blobs in a list.
1178 static void CountNeighbourTypes(BLOBNBOX_CLIST *neighbours, int *pure_h_count, int *pure_v_count) {
1179  BLOBNBOX_C_IT it(neighbours);
1180  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1181  BLOBNBOX *blob = it.data();
1182  if (blob->UniquelyHorizontal()) {
1183  ++*pure_h_count;
1184  }
1185  if (blob->UniquelyVertical()) {
1186  ++*pure_v_count;
1187  }
1188  }
1189 }
1190 
1191 // Nullify the neighbours in the wrong directions where the direction
1192 // is clear-cut based on a distance margin. Good for isolating vertical
1193 // text from neighbouring horizontal text.
1194 void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX *blob) {
1195  // Case 1: We have text that is likely several characters, blurry and joined
1196  // together.
1197  if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1198  blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1199  // The blob is complex (not stick-like).
1200  if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1201  // Horizontal conjoined text.
1202  blob->set_neighbour(BND_ABOVE, nullptr, false);
1203  blob->set_neighbour(BND_BELOW, nullptr, false);
1204  return;
1205  }
1206  if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1207  // Vertical conjoined text.
1208  blob->set_neighbour(BND_LEFT, nullptr, false);
1209  blob->set_neighbour(BND_RIGHT, nullptr, false);
1210  return;
1211  }
1212  }
1213 
1214  // Case 2: This blob is likely a single character.
1215  int margin = gridsize() / 2;
1216  int h_min, h_max, v_min, v_max;
1217  blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1218  if ((h_max + margin < v_min && h_max < margin / 2) || blob->leader_on_left() ||
1219  blob->leader_on_right()) {
1220  // Horizontal gaps are clear winners. Clear vertical neighbours.
1221  blob->set_neighbour(BND_ABOVE, nullptr, false);
1222  blob->set_neighbour(BND_BELOW, nullptr, false);
1223  } else if (v_max + margin < h_min && v_max < margin / 2) {
1224  // Vertical gaps are clear winners. Clear horizontal neighbours.
1225  blob->set_neighbour(BND_LEFT, nullptr, false);
1226  blob->set_neighbour(BND_RIGHT, nullptr, false);
1227  }
1228 }
1229 
1230 // Smoothes the vertical/horizontal type of the blob based on the
1231 // 2nd-order neighbours. If reset_all is true, then all blobs are
1232 // changed. Otherwise, only ambiguous blobs are processed.
1233 void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all, BLOBNBOX *blob) {
1234  if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1235  // There are both horizontal and vertical so try to fix it.
1236  BLOBNBOX_CLIST neighbours;
1237  List2ndNeighbours(blob, &neighbours);
1238  // The number of pure horizontal and vertical neighbours.
1239  int pure_h_count = 0;
1240  int pure_v_count = 0;
1241  CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1242  if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1243  blob->bounding_box().bottom())) {
1244  HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);
1245  tprintf("pure_h=%d, pure_v=%d\n", pure_h_count, pure_v_count);
1246  }
1247  if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1248  // Horizontal gaps are clear winners. Clear vertical neighbours.
1249  blob->set_vert_possible(false);
1250  blob->set_horz_possible(true);
1251  } else if (pure_v_count > pure_h_count && !FindingHorizontalOnly(pageseg_mode)) {
1252  // Vertical gaps are clear winners. Clear horizontal neighbours.
1253  blob->set_horz_possible(false);
1254  blob->set_vert_possible(true);
1255  }
1256  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1257  blob->bounding_box().bottom())) {
1258  HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);
1259  tprintf("Clean on pass 3!\n");
1260  }
1261 }
1262 
1263 // Partition creation. Accumulates vertical and horizontal text chains,
1264 // puts the remaining blobs in as unknowns, and then merges/splits to
1265 // minimize overlap and smoothes the types with neighbours and the color
1266 // image if provided. rerotation is used to rotate the coordinate space
1267 // back to the nontext_map_ image.
1268 // If find_problems is true, detects possible noise pollution by the amount
1269 // of partition overlap that is created by the diacritics. If excessive, the
1270 // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1271 // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1272 // the components, saves the skew_angle and returns PFR_SKEW.] If the return
1273 // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1274 // called again after cleaning up the partly done work.
1275 PartitionFindResult StrokeWidth::FindInitialPartitions(
1276  PageSegMode pageseg_mode, const FCOORD &rerotation, bool find_problems, TO_BLOCK *block,
1277  BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts,
1278  FCOORD *skew_angle) {
1279  if (!FindingHorizontalOnly(pageseg_mode)) {
1280  FindVerticalTextChains(part_grid);
1281  }
1282  if (!FindingVerticalOnly(pageseg_mode)) {
1283  FindHorizontalTextChains(part_grid);
1284  }
1285 #ifndef GRAPHICS_DISABLED
1286  if (textord_tabfind_show_strokewidths) {
1287  chains_win_ = MakeWindow(0, 400, "Initial text chains");
1288  part_grid->DisplayBoxes(chains_win_);
1289  projection_->DisplayProjection();
1290  }
1291 #endif
1292  if (find_problems) {
1293  // TODO(rays) Do something to find skew, set skew_angle and return if there
1294  // is some.
1295  }
1296  part_grid->SplitOverlappingPartitions(big_parts);
1297  EasyMerges(part_grid);
1298  RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1299  TBOX grid_box(bleft(), tright());
1300  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, rerotation)) {
1301  ;
1302  }
1303  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)) {
1304  ;
1305  }
1306  int pre_overlap = part_grid->ComputeTotalOverlap(nullptr);
1307  TestDiacritics(part_grid, block);
1308  MergeDiacritics(block, part_grid);
1309  if (find_problems && diacritic_blobs != nullptr &&
1310  DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid, diacritic_blobs)) {
1311  return PFR_NOISE;
1312  }
1313 #ifndef GRAPHICS_DISABLED
1314  if (textord_tabfind_show_strokewidths) {
1315  textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1316  part_grid->DisplayBoxes(textlines_win_);
1317  diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1318  }
1319 #endif
1320  PartitionRemainingBlobs(pageseg_mode, part_grid);
1321  part_grid->SplitOverlappingPartitions(big_parts);
1322  EasyMerges(part_grid);
1323  while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, rerotation)) {
1324  ;
1325  }
1326  while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)) {
1327  ;
1328  }
1329  // Now eliminate strong stuff in a sea of the opposite.
1330  while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_, grid_box, rerotation)) {
1331  ;
1332  }
1333 #ifndef GRAPHICS_DISABLED
1334  if (textord_tabfind_show_strokewidths) {
1335  smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1336  part_grid->DisplayBoxes(smoothed_win_);
1337  }
1338 #endif
1339  return PFR_OK;
1340 }
1341 
1342 // Detects noise by a significant increase in partition overlap from
1343 // pre_overlap to now, and removes noise from the union of all the overlapping
1344 // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1345 // was found and removed.
1346 bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX &grid_box, TO_BLOCK *block,
1347  ColPartitionGrid *part_grid,
1348  BLOBNBOX_LIST *diacritic_blobs) {
1349  ColPartitionGrid *noise_grid = nullptr;
1350  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1351  if (pre_overlap == 0) {
1352  pre_overlap = 1;
1353  }
1354  BLOBNBOX_IT diacritic_it(diacritic_blobs);
1355  if (noise_grid != nullptr) {
1356  if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1357  post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1358  // This is noisy enough to fix.
1359 #ifndef GRAPHICS_DISABLED
1360  if (textord_tabfind_show_strokewidths) {
1361  ScrollView *noise_win = MakeWindow(1000, 500, "Noise Areas");
1362  noise_grid->DisplayBoxes(noise_win);
1363  }
1364 #endif
1365  part_grid->DeleteNonLeaderParts();
1366  BLOBNBOX_IT blob_it(&block->noise_blobs);
1367  ColPartitionGridSearch rsearch(noise_grid);
1368  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1369  BLOBNBOX *blob = blob_it.data();
1370  blob->ClearNeighbours();
1371  if (!blob->IsDiacritic() || blob->owner() != nullptr) {
1372  continue; // Not a noise candidate.
1373  }
1374  TBOX search_box(blob->bounding_box());
1375  search_box.pad(gridsize(), gridsize());
1376  rsearch.StartRectSearch(search_box);
1377  ColPartition *part = rsearch.NextRectSearch();
1378  if (part != nullptr) {
1379  // Consider blob as possible noise.
1380  blob->set_owns_cblob(true);
1381  blob->compute_bounding_box();
1382  diacritic_it.add_after_then_move(blob_it.extract());
1383  }
1384  }
1385  noise_grid->DeleteParts();
1386  delete noise_grid;
1387  return true;
1388  }
1389  noise_grid->DeleteParts();
1390  delete noise_grid;
1391  }
1392  return false;
1393 }
1394 
1395 // Helper verifies that blob's neighbour in direction dir is good to add to a
1396 // vertical text chain by returning the neighbour if it is not null, not owned,
1397 // and not uniquely horizontal, as well as its neighbour in the opposite
1398 // direction is blob.
1399 static BLOBNBOX *MutualUnusedVNeighbour(const BLOBNBOX *blob, BlobNeighbourDir dir) {
1400  BLOBNBOX *next_blob = blob->neighbour(dir);
1401  if (next_blob == nullptr || next_blob->owner() != nullptr || next_blob->UniquelyHorizontal()) {
1402  return nullptr;
1403  }
1404  if (next_blob->neighbour(DirOtherWay(dir)) == blob) {
1405  return next_blob;
1406  }
1407  return nullptr;
1408 }
1409 
1410 // Finds vertical chains of text-like blobs and puts them in ColPartitions.
1411 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid *part_grid) {
1412  // A PageSegMode that forces vertical textlines with the current rotation.
1413  PageSegMode pageseg_mode =
1414  rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1415  BlobGridSearch gsearch(this);
1416  BLOBNBOX *bbox;
1417  gsearch.StartFullSearch();
1418  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1419  // Only process boxes that have no horizontal hope and have not yet
1420  // been included in a chain.
1421  BLOBNBOX *blob;
1422  if (bbox->owner() == nullptr && bbox->UniquelyVertical() &&
1423  (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != nullptr) {
1424  // Put all the linked blobs into a ColPartition.
1425  auto *part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1426  part->AddBox(bbox);
1427  while (blob != nullptr) {
1428  part->AddBox(blob);
1429  blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1430  }
1431  blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1432  while (blob != nullptr) {
1433  part->AddBox(blob);
1434  blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1435  }
1436  CompletePartition(pageseg_mode, part, part_grid);
1437  }
1438  }
1439 }
1440 
1441 // Helper verifies that blob's neighbour in direction dir is good to add to a
1442 // horizontal text chain by returning the neighbour if it is not null, not
1443 // owned, and not uniquely vertical, as well as its neighbour in the opposite
1444 // direction is blob.
1445 static BLOBNBOX *MutualUnusedHNeighbour(const BLOBNBOX *blob, BlobNeighbourDir dir) {
1446  BLOBNBOX *next_blob = blob->neighbour(dir);
1447  if (next_blob == nullptr || next_blob->owner() != nullptr || next_blob->UniquelyVertical()) {
1448  return nullptr;
1449  }
1450  if (next_blob->neighbour(DirOtherWay(dir)) == blob) {
1451  return next_blob;
1452  }
1453  return nullptr;
1454 }
1455 
1456 // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1457 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid *part_grid) {
1458  // A PageSegMode that forces horizontal textlines with the current rotation.
1459  PageSegMode pageseg_mode =
1460  rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1461  BlobGridSearch gsearch(this);
1462  BLOBNBOX *bbox;
1463  gsearch.StartFullSearch();
1464  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1465  BLOBNBOX *blob;
1466  if (bbox->owner() == nullptr && bbox->UniquelyHorizontal() &&
1467  (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != nullptr) {
1468  // Put all the linked blobs into a ColPartition.
1469  auto *part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1470  part->AddBox(bbox);
1471  while (blob != nullptr) {
1472  part->AddBox(blob);
1473  blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1474  }
1475  blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1476  while (blob != nullptr) {
1477  part->AddBox(blob);
1478  blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1479  }
1480  CompletePartition(pageseg_mode, part, part_grid);
1481  }
1482  }
1483 }
1484 
1485 // Finds diacritics and saves their base character in the blob.
1486 // The objective is to move all diacritics to the noise_blobs list, so
1487 // they don't mess up early textline finding/merging, or force splits
1488 // on textlines that overlap a bit. Blobs that become diacritics must be
1489 // either part of no ColPartition (nullptr owner) or in a small partition in
1490 // which ALL the blobs are diacritics, in which case the partition is
1491 // exploded (deleted) back to its blobs.
1492 void StrokeWidth::TestDiacritics(ColPartitionGrid *part_grid, TO_BLOCK *block) {
1493  BlobGrid small_grid(gridsize(), bleft(), tright());
1494  small_grid.InsertBlobList(&block->noise_blobs);
1495  small_grid.InsertBlobList(&block->blobs);
1496  int medium_diacritics = 0;
1497  int small_diacritics = 0;
1498  BLOBNBOX_IT small_it(&block->noise_blobs);
1499  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1500  BLOBNBOX *blob = small_it.data();
1501  if (blob->owner() == nullptr && !blob->IsDiacritic() && DiacriticBlob(&small_grid, blob)) {
1502  ++small_diacritics;
1503  }
1504  }
1505  BLOBNBOX_IT blob_it(&block->blobs);
1506  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1507  BLOBNBOX *blob = blob_it.data();
1508  if (blob->IsDiacritic()) {
1509  small_it.add_to_end(blob_it.extract());
1510  continue; // Already a diacritic.
1511  }
1512  ColPartition *part = blob->owner();
1513  if (part == nullptr && DiacriticBlob(&small_grid, blob)) {
1514  ++medium_diacritics;
1515  RemoveBBox(blob);
1516  small_it.add_to_end(blob_it.extract());
1517  } else if (part != nullptr && !part->block_owned() && part->boxes_count() < 3) {
1518  // We allow blobs in small partitions to become diacritics if ALL the
1519  // blobs in the partition qualify as we can then cleanly delete the
1520  // partition, turn all the blobs in it to diacritics and they can be
1521  // merged into the base character partition more easily than merging
1522  // the partitions.
1523  BLOBNBOX_C_IT box_it(part->boxes());
1524  for (box_it.mark_cycle_pt();
1525  !box_it.cycled_list() && DiacriticBlob(&small_grid, box_it.data()); box_it.forward()) {
1526  ;
1527  }
1528  if (box_it.cycled_list()) {
1529  // They are all good.
1530  while (!box_it.empty()) {
1531  // Liberate the blob from its partition so it can be treated
1532  // as a diacritic and merged explicitly with the base part.
1533  // The blob is really owned by the block. The partition "owner"
1534  // is nulled to allow the blob to get merged with its base character
1535  // partition.
1536  BLOBNBOX *box = box_it.extract();
1537  box->set_owner(nullptr);
1538  box_it.forward();
1539  ++medium_diacritics;
1540  // We remove the blob from the grid so it isn't found by subsequent
1541  // searches where we might not want to include diacritics.
1542  RemoveBBox(box);
1543  }
1544  // We only move the one blob to the small list here, but the others
1545  // all get moved by the test at the top of the loop.
1546  small_it.add_to_end(blob_it.extract());
1547  part_grid->RemoveBBox(part);
1548  delete part;
1549  }
1550  } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1551  blob->bounding_box().bottom())) {
1552  tprintf("Blob not available to be a diacritic at:");
1553  blob->bounding_box().print();
1554  }
1555  }
1556  if (textord_tabfind_show_strokewidths) {
1557  tprintf("Found %d small diacritics, %d medium\n", small_diacritics, medium_diacritics);
1558  }
1559 }
1560 
1561 // Searches this grid for an appropriately close and sized neighbour of the
1562 // given [small] blob. If such a blob is found, the diacritic base is saved
1563 // in the blob and true is returned.
1564 // The small_grid is a secondary grid that contains the small/noise objects
1565 // that are not in this grid, but may be useful for determining a connection
1566 // between blob and its potential base character. (See DiacriticXGapFilled.)
1567 bool StrokeWidth::DiacriticBlob(BlobGrid *small_grid, BLOBNBOX *blob) {
1568  if (BLOBNBOX::UnMergeableType(blob->region_type()) || blob->region_type() == BRT_VERT_TEXT) {
1569  return false;
1570  }
1571  TBOX small_box(blob->bounding_box());
1572  bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(), small_box.bottom());
1573  if (debug) {
1574  tprintf("Testing blob for diacriticness at:");
1575  small_box.print();
1576  }
1577  int x = (small_box.left() + small_box.right()) / 2;
1578  int y = (small_box.bottom() + small_box.top()) / 2;
1579  int grid_x, grid_y;
1580  GridCoords(x, y, &grid_x, &grid_y);
1581  int height = small_box.height();
1582  // Setup a rectangle search to find its nearest base-character neighbour.
1583  // We keep 2 different best candidates:
1584  // best_x_overlap is a category of base characters that have an overlap in x
1585  // (like a acute) in which we look for the least y-gap, computed using the
1586  // projection to favor base characters in the same textline.
1587  // best_y_overlap is a category of base characters that have no x overlap,
1588  // (nominally a y-overlap is preferrecd but not essential) in which we
1589  // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1590  // a lower weight to catch quotes at the end of a textline.
1591  // NOTE that x-gap and y-gap are measured from the nearest side of the base
1592  // character to the FARTHEST side of the diacritic to allow small diacritics
1593  // to be a reasonable distance away, but not big diacritics.
1594  BLOBNBOX *best_x_overlap = nullptr;
1595  BLOBNBOX *best_y_overlap = nullptr;
1596  int best_total_dist = 0;
1597  int best_y_gap = 0;
1598  TBOX best_xbox;
1599  // TODO(rays) the search box could be setup using the projection as a guide.
1600  TBOX search_box(small_box);
1601  int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
1602  int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
1603  search_box.pad(x_pad, y_pad);
1604  BlobGridSearch rsearch(this);
1605  rsearch.SetUniqueMode(true);
1606  int min_height = height * kMinDiacriticSizeRatio;
1607  rsearch.StartRectSearch(search_box);
1608  BLOBNBOX *neighbour;
1609  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1610  if (BLOBNBOX::UnMergeableType(neighbour->region_type()) || neighbour == blob ||
1611  neighbour->owner() == blob->owner()) {
1612  continue;
1613  }
1614  TBOX nbox = neighbour->bounding_box();
1615  if (neighbour->owner() == nullptr || neighbour->owner()->IsVerticalType() ||
1616  (neighbour->flow() != BTFT_CHAIN && neighbour->flow() != BTFT_STRONG_CHAIN)) {
1617  if (debug) {
1618  tprintf("Neighbour not strong enough:");
1619  nbox.print();
1620  }
1621  continue; // Diacritics must be attached to strong text.
1622  }
1623  if (nbox.height() < min_height) {
1624  if (debug) {
1625  tprintf("Neighbour not big enough:");
1626  nbox.print();
1627  }
1628  continue; // Too small to be the base character.
1629  }
1630  int x_gap = small_box.x_gap(nbox);
1631  int y_gap = small_box.y_gap(nbox);
1632  int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox, true, denorm_, debug);
1633  if (debug) {
1634  tprintf("xgap=%d, y=%d, total dist=%d\n", x_gap, y_gap, total_distance);
1635  }
1636  if (total_distance > neighbour->owner()->median_height() * kMaxDiacriticDistanceRatio) {
1637  if (debug) {
1638  tprintf("Neighbour with median size %d too far away:", neighbour->owner()->median_height());
1639  neighbour->bounding_box().print();
1640  }
1641  continue; // Diacritics must not be too distant.
1642  }
1643  if (x_gap <= 0) {
1644  if (debug) {
1645  tprintf("Computing reduced box for :");
1646  nbox.print();
1647  }
1648  int left = small_box.left() - small_box.width();
1649  int right = small_box.right() + small_box.width();
1650  nbox = neighbour->BoundsWithinLimits(left, right);
1651  y_gap = small_box.y_gap(nbox);
1652  if (best_x_overlap == nullptr || y_gap < best_y_gap) {
1653  best_x_overlap = neighbour;
1654  best_xbox = nbox;
1655  best_y_gap = y_gap;
1656  if (debug) {
1657  tprintf("New best:");
1658  nbox.print();
1659  }
1660  } else if (debug) {
1661  tprintf("Shrunken box doesn't win:");
1662  nbox.print();
1663  }
1664  } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1665  if (best_y_overlap == nullptr || total_distance < best_total_dist) {
1666  if (debug) {
1667  tprintf("New best y overlap:");
1668  nbox.print();
1669  }
1670  best_y_overlap = neighbour;
1671  best_total_dist = total_distance;
1672  } else if (debug) {
1673  tprintf("New y overlap box doesn't win:");
1674  nbox.print();
1675  }
1676  } else if (debug) {
1677  tprintf("Neighbour wrong side of a tab:");
1678  nbox.print();
1679  }
1680  }
1681  if (best_x_overlap != nullptr &&
1682  (best_y_overlap == nullptr || best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1683  blob->set_diacritic_box(best_xbox);
1684  blob->set_base_char_blob(best_x_overlap);
1685  if (debug) {
1686  tprintf("DiacriticBlob OK! (x-overlap:");
1687  small_box.print();
1688  best_xbox.print();
1689  }
1690  return true;
1691  }
1692  if (best_y_overlap != nullptr &&
1693  DiacriticXGapFilled(small_grid, small_box, best_y_overlap->bounding_box()) &&
1694  NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1695  blob->set_diacritic_box(best_y_overlap->bounding_box());
1696  blob->set_base_char_blob(best_y_overlap);
1697  if (debug) {
1698  tprintf("DiacriticBlob OK! (y-overlap:");
1699  small_box.print();
1700  best_y_overlap->bounding_box().print();
1701  }
1702  return true;
1703  }
1704  if (debug) {
1705  tprintf("DiacriticBlob fails:");
1706  small_box.print();
1707  tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1708  if (best_y_overlap != nullptr) {
1709  tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1710  DiacriticXGapFilled(small_grid, small_box, best_y_overlap->bounding_box()),
1711  NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1712  }
1713  }
1714  return false;
1715 }
1716 
1717 // Returns true if there is no gap between the base char and the diacritic
1718 // bigger than a fraction of the height of the base char:
1719 // Eg: line end.....'
1720 // The quote is a long way from the end of the line, yet it needs to be a
1721 // diacritic. To determine that the quote is not part of an image, or
1722 // a different text block, we check for other marks in the gap between
1723 // the base char and the diacritic.
1724 // '<--Diacritic
1725 // |---------|
1726 // | |<-toobig-gap->
1727 // | Base |<ok gap>
1728 // |---------| x<-----Dot occupying gap
1729 // The grid is const really.
1730 bool StrokeWidth::DiacriticXGapFilled(BlobGrid *grid, const TBOX &diacritic_box,
1731  const TBOX &base_box) {
1732  // Since most gaps are small, use an iterative algorithm to search the gap.
1733  int max_gap = IntCastRounded(base_box.height() * kMaxDiacriticGapToBaseCharHeight);
1734  TBOX occupied_box(base_box);
1735  int diacritic_gap;
1736  while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1737  TBOX search_box(occupied_box);
1738  if (diacritic_box.left() > search_box.right()) {
1739  // We are looking right.
1740  search_box.set_left(search_box.right());
1741  search_box.set_right(search_box.left() + max_gap);
1742  } else {
1743  // We are looking left.
1744  search_box.set_right(search_box.left());
1745  search_box.set_left(search_box.left() - max_gap);
1746  }
1747  BlobGridSearch rsearch(grid);
1748  rsearch.StartRectSearch(search_box);
1749  BLOBNBOX *neighbour;
1750  while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1751  const TBOX &nbox = neighbour->bounding_box();
1752  if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1753  if (nbox.left() < occupied_box.left()) {
1754  occupied_box.set_left(nbox.left());
1755  }
1756  if (nbox.right() > occupied_box.right()) {
1757  occupied_box.set_right(nbox.right());
1758  }
1759  break;
1760  }
1761  }
1762  if (neighbour == nullptr) {
1763  return false; // Found a big gap.
1764  }
1765  }
1766  return true; // The gap was filled.
1767 }
1768 
1769 // Merges diacritics with the ColPartition of the base character blob.
1770 void StrokeWidth::MergeDiacritics(TO_BLOCK *block, ColPartitionGrid *part_grid) {
1771  BLOBNBOX_IT small_it(&block->noise_blobs);
1772  for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1773  BLOBNBOX *blob = small_it.data();
1774  if (blob->base_char_blob() != nullptr) {
1775  ColPartition *part = blob->base_char_blob()->owner();
1776  // The base character must be owned by a partition and that partition
1777  // must not be on the big_parts list (not block owned).
1778  if (part != nullptr && !part->block_owned() && blob->owner() == nullptr &&
1779  blob->IsDiacritic()) {
1780  // The partition has to be removed from the grid and reinserted
1781  // because its bounding box may change.
1782  part_grid->RemoveBBox(part);
1783  part->AddBox(blob);
1784  blob->set_region_type(part->blob_type());
1785  blob->set_flow(part->flow());
1786  blob->set_owner(part);
1787  part_grid->InsertBBox(true, true, part);
1788  }
1789  // Set all base chars to nullptr before any blobs get deleted.
1790  blob->set_base_char_blob(nullptr);
1791  }
1792  }
1793 }
1794 
1795 // Any blobs on the large_blobs list of block that are still unowned by a
1796 // ColPartition, are probably drop-cap or vertically touching so the blobs
1797 // are removed to the big_parts list and treated separately.
1798 void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK *block, ColPartitionGrid *part_grid,
1799  ColPartition_LIST *big_parts) {
1800  BLOBNBOX_IT large_it(&block->large_blobs);
1801  for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1802  BLOBNBOX *blob = large_it.data();
1803  ColPartition *big_part = blob->owner();
1804  if (big_part == nullptr) {
1805  // Large blobs should have gone into partitions by now if they are
1806  // genuine characters, so move any unowned ones out to the big parts
1807  // list. This will include drop caps and vertically touching characters.
1808  ColPartition::MakeBigPartition(blob, big_parts);
1809  }
1810  }
1811 }
1812 
1813 // All remaining unused blobs are put in individual ColPartitions.
1814 void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode, ColPartitionGrid *part_grid) {
1815  BlobGridSearch gsearch(this);
1816  BLOBNBOX *bbox;
1817  int prev_grid_x = -1;
1818  int prev_grid_y = -1;
1819  BLOBNBOX_CLIST cell_list;
1820  BLOBNBOX_C_IT cell_it(&cell_list);
1821  bool cell_all_noise = true;
1822  gsearch.StartFullSearch();
1823  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1824  int grid_x = gsearch.GridX();
1825  int grid_y = gsearch.GridY();
1826  if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1827  // New cell. Process old cell.
1828  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid, &cell_list);
1829  cell_it.set_to_list(&cell_list);
1830  prev_grid_x = grid_x;
1831  prev_grid_y = grid_y;
1832  cell_all_noise = true;
1833  }
1834  if (bbox->owner() == nullptr) {
1835  cell_it.add_to_end(bbox);
1836  if (bbox->flow() != BTFT_NONTEXT) {
1837  cell_all_noise = false;
1838  }
1839  } else {
1840  cell_all_noise = false;
1841  }
1842  }
1843  MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid, &cell_list);
1844 }
1845 
1846 // If combine, put all blobs in the cell_list into a single partition, otherwise
1847 // put each one into its own partition.
1848 void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode, bool combine,
1849  ColPartitionGrid *part_grid,
1850  BLOBNBOX_CLIST *cell_list) {
1851  if (cell_list->empty()) {
1852  return;
1853  }
1854  BLOBNBOX_C_IT cell_it(cell_list);
1855  if (combine) {
1856  BLOBNBOX *bbox = cell_it.extract();
1857  auto *part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1858  part->AddBox(bbox);
1859  part->set_flow(bbox->flow());
1860  for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1861  part->AddBox(cell_it.extract());
1862  }
1863  CompletePartition(pageseg_mode, part, part_grid);
1864  } else {
1865  for (; !cell_it.empty(); cell_it.forward()) {
1866  BLOBNBOX *bbox = cell_it.extract();
1867  auto *part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1868  part->set_flow(bbox->flow());
1869  part->AddBox(bbox);
1870  CompletePartition(pageseg_mode, part, part_grid);
1871  }
1872  }
1873 }
1874 
1875 // Helper function to finish setting up a ColPartition and insert into
1876 // part_grid.
1877 void StrokeWidth::CompletePartition(PageSegMode pageseg_mode, ColPartition *part,
1878  ColPartitionGrid *part_grid) {
1879  part->ComputeLimits();
1880  TBOX box = part->bounding_box();
1881  bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());
1882  int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1883  // Override value if pageseg_mode disagrees.
1884  if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1885  value = part->boxes_count() == 1 ? 0 : -2;
1886  } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1887  value = part->boxes_count() == 1 ? 0 : 2;
1888  }
1889  part->SetRegionAndFlowTypesFromProjectionValue(value);
1890  part->ClaimBoxes();
1891  part_grid->InsertBBox(true, true, part);
1892 }
1893 
1894 // Merge partitions where the merge appears harmless.
1895 // As this
1896 void StrokeWidth::EasyMerges(ColPartitionGrid *part_grid) {
1897  using namespace std::placeholders; // for _1, _2
1898  part_grid->Merges(std::bind(&StrokeWidth::OrientationSearchBox, this, _1, _2),
1899  std::bind(&StrokeWidth::ConfirmEasyMerge, this, _1, _2));
1900 }
1901 
1902 // Compute a search box based on the orientation of the partition.
1903 // Returns true if a suitable box can be calculated.
1904 // Callback for EasyMerges.
1905 bool StrokeWidth::OrientationSearchBox(ColPartition *part, TBOX *box) {
1906  if (part->IsVerticalType()) {
1907  box->set_top(box->top() + box->width());
1908  box->set_bottom(box->bottom() - box->width());
1909  } else {
1910  box->set_left(box->left() - box->height());
1911  box->set_right(box->right() + box->height());
1912  }
1913  return true;
1914 }
1915 
1916 // Merge confirmation callback for EasyMerges.
1917 bool StrokeWidth::ConfirmEasyMerge(const ColPartition *p1, const ColPartition *p2) {
1918  ASSERT_HOST(p1 != nullptr && p2 != nullptr);
1919  ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1920  if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1921  (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT)) {
1922  return false; // Don't merge confirmed image with text.
1923  }
1924  if ((p1->IsVerticalType() || p2->IsVerticalType()) && p1->HCoreOverlap(*p2) <= 0 &&
1925  ((!p1->IsSingleton() && !p2->IsSingleton()) ||
1926  !p1->bounding_box().major_overlap(p2->bounding_box()))) {
1927  return false; // Overlap must be in the text line.
1928  }
1929  if ((p1->IsHorizontalType() || p2->IsHorizontalType()) && p1->VCoreOverlap(*p2) <= 0 &&
1930  ((!p1->IsSingleton() && !p2->IsSingleton()) ||
1931  (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1932  !p1->OKDiacriticMerge(*p2, false) && !p2->OKDiacriticMerge(*p1, false)))) {
1933  return false; // Overlap must be in the text line.
1934  }
1935  if (!p1->ConfirmNoTabViolation(*p2)) {
1936  return false;
1937  }
1938  if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT) {
1939  return true;
1940  }
1941  return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1942 }
1943 
1944 // Returns true if there is no significant noise in between the boxes.
1945 bool StrokeWidth::NoNoiseInBetween(const TBOX &box1, const TBOX &box2) const {
1946  return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_, nontext_map_);
1947 }
1948 
1949 #ifndef GRAPHICS_DISABLED
1950 
1954 ScrollView *StrokeWidth::DisplayGoodBlobs(const char *window_name, int x, int y) {
1955  auto window = MakeWindow(x, y, window_name);
1956  // For every blob in the grid, display it.
1957  window->Brush(ScrollView::NONE);
1958 
1959  // For every bbox in the grid, display it.
1960  BlobGridSearch gsearch(this);
1961  gsearch.StartFullSearch();
1962  BLOBNBOX *bbox;
1963  while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1964  const TBOX &box = bbox->bounding_box();
1965  int left_x = box.left();
1966  int right_x = box.right();
1967  int top_y = box.top();
1968  int bottom_y = box.bottom();
1969  int goodness = bbox->GoodTextBlob();
1970  BlobRegionType blob_type = bbox->region_type();
1971  if (bbox->UniquelyVertical()) {
1972  blob_type = BRT_VERT_TEXT;
1973  }
1974  if (bbox->UniquelyHorizontal()) {
1975  blob_type = BRT_TEXT;
1976  }
1977  BlobTextFlowType flow = bbox->flow();
1978  if (flow == BTFT_NONE) {
1979  if (goodness == 0) {
1980  flow = BTFT_NEIGHBOURS;
1981  } else if (goodness == 1) {
1982  flow = BTFT_CHAIN;
1983  } else {
1984  flow = BTFT_STRONG_CHAIN;
1985  }
1986  }
1987  window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1988  window->Rectangle(left_x, bottom_y, right_x, top_y);
1989  }
1990  window->Update();
1991  return window;
1992 }
1993 
1994 static void DrawDiacriticJoiner(const BLOBNBOX *blob, ScrollView *window) {
1995  const TBOX &blob_box(blob->bounding_box());
1996  int top = std::max(static_cast<int>(blob_box.top()), blob->base_char_top());
1997  int bottom = std::min(static_cast<int>(blob_box.bottom()), blob->base_char_bottom());
1998  int x = (blob_box.left() + blob_box.right()) / 2;
1999  window->Line(x, top, x, bottom);
2000 }
2001 
2002 // Displays blobs colored according to whether or not they are diacritics.
2003 ScrollView *StrokeWidth::DisplayDiacritics(const char *window_name, int x, int y, TO_BLOCK *block) {
2004  auto window = MakeWindow(x, y, window_name);
2005  // For every blob in the grid, display it.
2006  window->Brush(ScrollView::NONE);
2007 
2008  BLOBNBOX_IT it(&block->blobs);
2009  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2010  BLOBNBOX *blob = it.data();
2011  if (blob->IsDiacritic()) {
2012  window->Pen(ScrollView::GREEN);
2013  DrawDiacriticJoiner(blob, window);
2014  } else {
2015  window->Pen(blob->BoxColor());
2016  }
2017  const TBOX &box = blob->bounding_box();
2018  window->Rectangle(box.left(), box.bottom(), box.right(), box.top());
2019  }
2020  it.set_to_list(&block->noise_blobs);
2021  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2022  BLOBNBOX *blob = it.data();
2023  if (blob->IsDiacritic()) {
2024  window->Pen(ScrollView::GREEN);
2025  DrawDiacriticJoiner(blob, window);
2026  } else {
2027  window->Pen(ScrollView::WHITE);
2028  }
2029  const TBOX &box = blob->bounding_box();
2030  window->Rectangle(box.left(), box.bottom(), box.right(), box.top());
2031  }
2032  window->Update();
2033  return window;
2034 }
2035 
2036 #endif // !GRAPHICS_DISABLED
2037 
2038 } // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
#define INT_VAR(name, val, comment)
Definition: params.h:356
@ TBOX
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:84
const int kLineResiduePadRatio
BlobRegionType
Definition: blobbox.h:74
@ BRT_TEXT
Definition: blobbox.h:82
@ BRT_HLINE
Definition: blobbox.h:76
@ BRT_NOISE
Definition: blobbox.h:75
@ BRT_VLINE
Definition: blobbox.h:77
@ BRT_VERT_TEXT
Definition: blobbox.h:81
@ BRT_UNKNOWN
Definition: blobbox.h:80
PartitionFindResult
Definition: strokewidth.h:42
const double kNoiseOverlapAreaFactor
const int kCJKMaxComponents
Definition: strokewidth.cpp:64
@ PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:166
@ PSM_SINGLE_COLUMN
Assume a single column of text of variable sizes.
Definition: publictypes.h:165
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:81
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:62
const int kLineTrapLongest
Definition: strokewidth.cpp:92
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const double kCJKAspectRatio
Definition: strokewidth.cpp:66
int IntCastRounded(double x)
Definition: helpers.h:175
@ SVET_DESTROY
Definition: scrollview.h:53
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:54
const double kNoiseOverlapGrowthFactor
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:68
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919
const int kCJKRadius
Definition: strokewidth.cpp:60
const double kNeighbourSearchFactor
const double kLineResidueAspectRatio
Definition: strokewidth.cpp:99
int textord_debug_tabfind
Definition: alignedblob.cpp:29
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:72
BlobTextFlowType
Definition: blobbox.h:110
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:115
@ BTFT_NONE
Definition: blobbox.h:111
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:117
@ BTFT_NEIGHBOURS
Definition: blobbox.h:113
@ BTFT_NONTEXT
Definition: blobbox.h:112
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:70
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:75
const double kLineResidueSizeRatio
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:87
const int kLineTrapShortest
Definition: strokewidth.cpp:94
const double kStrokeWidthFractionTolerance
Definition: strokewidth.cpp:49
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:56
const double kStrokeWidthCJK
Definition: strokewidth.cpp:57
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:78
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:102
BlobNeighbourDir
Definition: blobbox.h:89
@ BND_LEFT
Definition: blobbox.h:89
@ BND_RIGHT
Definition: blobbox.h:89
@ BND_BELOW
Definition: blobbox.h:89
@ BND_ABOVE
Definition: blobbox.h:89
@ BND_COUNT
Definition: blobbox.h:89
const float kSizeRatioToReject
Definition: osdetect.cpp:41
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:97
GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > BlobGridSearch
Definition: blobgrid.h:30
float vert_stroke_width() const
Definition: blobbox.h:358
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:388
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:385
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:178
bool UniquelyHorizontal() const
Definition: blobbox.h:430
bool UniquelyVertical() const
Definition: blobbox.h:427
const TBOX & bounding_box() const
Definition: blobbox.h:239
bool vert_possible() const
Definition: blobbox.h:316
C_BLOB * cblob() const
Definition: blobbox.h:277
BlobTextFlowType flow() const
Definition: blobbox.h:310
float horz_stroke_width() const
Definition: blobbox.h:352
bool horz_possible() const
Definition: blobbox.h:322
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:442
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:447
BLOBNBOX_LIST blobs
Definition: blobbox.h:776
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:779
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:780
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:778
integer coordinate
Definition: points.h:36
void set_y(float yin)
rewrite function
Definition: points.h:217
void set_x(float xin)
rewrite function
Definition: points.h:213
float y() const
Definition: points.h:209
float x() const
Definition: points.h:206
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void pad(int xpad, int ypad)
Definition: rect.h:144
bool contains(const FCOORD pt) const
Definition: rect.h:344
int32_t perimeter()
Definition: stepblob.cpp:285
int32_t area()
Definition: stepblob.cpp:268
static bool WithinTestRegion(int detail_level, int x, int y)
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:735
BBC * NextRectSearch()
Definition: bbgrid.h:896
void StartFullSearch()
Definition: bbgrid.h:701
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:884
BBC * NextFullSearch()
Definition: bbgrid.h:711
BBC * NextRadSearch()
Definition: bbgrid.h:749
int gridsize() const
Definition: bbgrid.h:63
const ICOORD & bleft() const
Definition: bbgrid.h:72
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:53
const ICOORD & tright() const
Definition: bbgrid.h:75
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:488
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:691
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633
BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: blobgrid.cpp:24
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:35
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Image pix)
Definition: imagefind.cpp:587
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
void HandleClick(int x, int y) override
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void RemoveLineResidue(ColPartition_LIST *big_part_list)
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Image nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:413
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Image nontext_map)
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
void Pen(Color color)
Definition: scrollview.cpp:723
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:589
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:445