tesseract  5.0.0
tablefind.cpp
Go to the documentation of this file.
1 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config_auto.h"
21 #endif
22 
23 #include <algorithm>
24 #include <cmath>
25 #include <utility>
26 #include "tablefind.h"
27 
28 #include <allheaders.h>
29 
30 #include "colpartitionset.h"
31 #include "tablerecog.h"
32 
33 namespace tesseract {
34 
35 // These numbers are used to calculate the global median stats.
36 // They just set an upper bound on the stats objects.
37 // Maximum vertical spacing between neighbor partitions.
38 const int kMaxVerticalSpacing = 500;
39 // Maximum width of a blob in a partition.
40 const int kMaxBlobWidth = 500;
41 
42 // Minimum whitespace size to split a partition (measured as a multiple
43 // of a partition's median width).
44 const double kSplitPartitionSize = 2.0;
45 // To insert text, the partition must satisfy these size constraints
46 // in AllowTextPartition(). The idea is to filter noise partitions
47 // determined by the size compared to the global medians.
48 // TODO(nbeato): Need to find good numbers again.
49 const double kAllowTextHeight = 0.5;
50 const double kAllowTextWidth = 0.6;
51 const double kAllowTextArea = 0.8;
52 // The same thing applies to blobs (to filter noise).
53 // TODO(nbeato): These numbers are a shot in the dark...
54 // height and width are 0.5 * gridsize() in colfind.cpp
55 // area is a rough guess for the size of a period.
56 const double kAllowBlobHeight = 0.3;
57 const double kAllowBlobWidth = 0.4;
58 const double kAllowBlobArea = 0.05;
59 
60 // Minimum number of components in a text partition. A partition having fewer
61 // components than that is more likely a data partition and is a candidate
62 // table cell.
63 const int kMinBoxesInTextPartition = 10;
64 
65 // Maximum number of components that a data partition can have
66 const int kMaxBoxesInDataPartition = 20;
67 
68 // Maximum allowed gap in a text partitions as a multiple of its median size.
69 const double kMaxGapInTextPartition = 4.0;
70 
71 // Minimum value that the maximum gap in a text partition should have as a
72 // factor of its median size.
73 const double kMinMaxGapInTextPartition = 0.5;
74 
75 // The amount of overlap that is "normal" for adjacent blobs in a text
76 // partition. This is used to calculate gap between overlapping blobs.
77 const double kMaxBlobOverlapFactor = 4.0;
78 
79 // Maximum x-height a table partition can have as a multiple of global
80 // median x-height
81 const double kMaxTableCellXheight = 2.0;
82 
83 // Maximum line spacing between a table column header and column contents
84 // for merging the two (as a multiple of the partition's median_height).
86 
87 // Minimum ratio of num_table_partitions to num_text_partitions in a column
88 // block to be called it a table column
89 const double kTableColumnThreshold = 3.0;
90 
91 // Search for horizontal ruling lines within the vertical margin as a
92 // multiple of grid size
93 // const int kRulingVerticalMargin = 3;
94 
95 // Minimum overlap that a colpartition must have with a table region
96 // to become part of that table
97 const double kMinOverlapWithTable = 0.6;
98 
99 // Maximum side space (distance from column boundary) that a typical
100 // text-line in flowing text should have as a multiple of its x-height
101 // (Median size).
102 const int kSideSpaceMargin = 10;
103 
104 // Fraction of the peak of x-projection of a table region to set the
105 // threshold for the x-projection histogram
106 const double kSmallTableProjectionThreshold = 0.35;
107 const double kLargeTableProjectionThreshold = 0.45;
108 // Minimum number of rows required to look for more rows in the projection.
109 const int kLargeTableRowCount = 6;
110 
111 // Minimum number of rows in a table
112 const int kMinRowsInTable = 3;
113 
114 // The amount of padding (multiplied by global_median_xheight_ during use)
115 // that is vertically added to the search adjacent leader search during
116 // ColPartition marking.
118 
119 // Used when filtering false positives. When finding the last line
120 // of a paragraph (typically left-aligned), the previous line should have
121 // its center to the right of the last line by this scaled amount.
123 
124 // The maximum amount of whitespace allowed left of a paragraph ending.
125 // Do not filter a ColPartition with more than this space left of it.
127 
128 // Used when filtering false positives. The last line of a paragraph
129 // should be preceded by a line that is predominantly text. This is the
130 // ratio of text to whitespace (to the right of the text) that is required
131 // for the previous line to be a text.
133 
134 // When counting table columns, this is the required gap between two columns
135 // (it is multiplied by global_median_xheight_).
136 const double kMaxXProjectionGapFactor = 2.0;
137 
138 // Used for similarity in partitions using stroke width. Values copied
139 // from ColFind.cpp in Ray's CL.
141 const double kStrokeWidthConstantTolerance = 2.0;
142 
143 #ifndef GRAPHICS_DISABLED
144 static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)");
145 static BOOL_VAR(textord_tablefind_show_mark, false,
146  "Debug table marking steps in detail (ScrollView)");
147 static BOOL_VAR(textord_tablefind_show_stats, false,
148  "Show page stats used in table finding (ScrollView)");
149 #endif
150 static BOOL_VAR(textord_tablefind_recognize_tables, false,
151  "Enables the table recognizer for table layout and filtering.");
152 
153 // Templated helper function used to create destructor callbacks for the
154 // BBGrid::ClearGridData() method.
155 template <typename T>
156 void DeleteObject(T *object) {
157  delete object;
158 }
159 
161  : resolution_(0),
162  global_median_xheight_(0),
163  global_median_blob_width_(0),
164  global_median_ledding_(0),
165  left_to_right_language_(true) {}
166 
168  // ColPartitions and ColSegments created by this class for storage in grids
169  // need to be deleted explicitly.
170  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175 }
176 
178  left_to_right_language_ = order;
179 }
180 
181 void TableFinder::Init(int grid_size, const ICOORD &bottom_left,
182  const ICOORD &top_right) {
183  // Initialize clean partitions list and grid
184  clean_part_grid_.Init(grid_size, bottom_left, top_right);
185  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187  col_seg_grid_.Init(grid_size, bottom_left, top_right);
188  table_grid_.Init(grid_size, bottom_left, top_right);
189 }
190 
191 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192 // insert leaders and rulers into the leader_and_ruling_grid_
194  TO_BLOCK *block) {
195  // Calculate stats. This lets us filter partitions in AllowTextPartition()
196  // and filter blobs in AllowBlob().
197  SetGlobalSpacings(grid);
198 
199  // Iterate the ColPartitions in the grid.
200  ColPartitionGridSearch gsearch(grid);
201  gsearch.SetUniqueMode(true);
202  gsearch.StartFullSearch();
203  ColPartition *part = nullptr;
204  while ((part = gsearch.NextFullSearch()) != nullptr) {
205  // Reject partitions with nothing useful inside of them.
206  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207  continue;
208  }
209  ColPartition *clean_part = part->ShallowCopy();
210  ColPartition *leader_part = nullptr;
211  if (part->IsLineType()) {
212  InsertRulingPartition(clean_part);
213  continue;
214  }
215  // Insert all non-text partitions to clean_parts
216  if (!part->IsTextType()) {
217  InsertImagePartition(clean_part);
218  continue;
219  }
220  // Insert text colpartitions after removing noisy components from them
221  // The leaders are split into a separate grid.
222  BLOBNBOX_CLIST *part_boxes = part->boxes();
223  BLOBNBOX_C_IT pit(part_boxes);
224  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225  BLOBNBOX *pblob = pit.data();
226  // Bad blobs... happens in UNLV set.
227  // news.3G1, page 17 (around x=6)
228  if (!AllowBlob(*pblob)) {
229  continue;
230  }
231  if (pblob->flow() == BTFT_LEADER) {
232  if (leader_part == nullptr) {
233  leader_part = part->ShallowCopy();
234  leader_part->set_flow(BTFT_LEADER);
235  }
236  leader_part->AddBox(pblob);
237  } else if (pblob->region_type() != BRT_NOISE) {
238  clean_part->AddBox(pblob);
239  }
240  }
241  clean_part->ComputeLimits();
242  ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243  InsertTextPartition(clean_part);
245  if (leader_part != nullptr) {
246  // TODO(nbeato): Note that ComputeLimits does not update the column
247  // information. So the leader may appear to span more columns than it
248  // really does later on when IsInSameColumnAs gets called to test
249  // for adjacent leaders.
250  leader_part->ComputeLimits();
251  InsertLeaderPartition(leader_part);
252  }
253  }
254 
255  // Make the partition partners better for upper and lower neighbors.
258 }
259 
260 // High level function to perform table detection
262  ColPartitionSet **all_columns,
263  WidthCallback width_cb, const FCOORD &reskew) {
264  // initialize spacing, neighbors, and columns
265  InitializePartitions(all_columns);
266 
267 #ifndef GRAPHICS_DISABLED
268  if (textord_show_tables) {
269  ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
275 
276  table_win = MakeWindow(100, 300, "Fragmented Text");
278  }
279 #endif // !GRAPHICS_DISABLED
280 
281  // mark, filter, and smooth candidate table partitions
283 
284  // Make single-column blocks from good_columns_ partitions. col_segments are
285  // moved to a grid later which takes the ownership
286  ColSegment_LIST column_blocks;
287  GetColumnBlocks(all_columns, &column_blocks);
288  // Set the ratio of candidate table partitions in each column
289  SetColumnsType(&column_blocks);
290 
291  // Move column segments to col_seg_grid_
292  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293 
294  // Detect split in column layout that might have occurred due to the
295  // presence of a table. In such a case, merge the corresponding columns.
297 
298  // Group horizontally overlapping table partitions into table columns.
299  // table_columns created here get deleted at the end of this method.
300  ColSegment_LIST table_columns;
301  GetTableColumns(&table_columns);
302 
303  // Within each column, mark the range table regions occupy based on the
304  // table columns detected. table_regions are moved to a grid later which
305  // takes the ownership
306  ColSegment_LIST table_regions;
307  GetTableRegions(&table_columns, &table_regions);
308 
309 #ifndef GRAPHICS_DISABLED
310  if (textord_tablefind_show_mark) {
311  ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314  }
315 #endif // !GRAPHICS_DISABLED
316 
317  // Merge table regions across columns for tables spanning multiple
318  // columns
319  MoveColSegmentsToGrid(&table_regions, &table_grid_);
321 
322  // Adjust table boundaries by including nearby horizontal lines and left
323  // out column headers
326 
327  if (textord_tablefind_recognize_tables) {
328  // Remove false alarms consisting of a single column
330 
331 #ifndef GRAPHICS_DISABLED
332  if (textord_show_tables) {
333  ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
335  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336  table_grid_.DisplayBoxes(table_win);
337  }
338 #endif // !GRAPHICS_DISABLED
339 
340  // Find table grid structure and reject tables that are malformed.
341  RecognizeTables();
343  RecognizeTables();
344 
345 #ifndef GRAPHICS_DISABLED
346  if (textord_show_tables) {
347  ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
350  table_grid_.DisplayBoxes(table_win);
351  }
352 #endif // !GRAPHICS_DISABLED
353  } else {
354  // Remove false alarms consisting of a single column
355  // TODO(nbeato): verify this is a NOP after structured table rejection.
356  // Right now it isn't. If the recognize function is doing what it is
357  // supposed to do, this function is obsolete.
359 
360 #ifndef GRAPHICS_DISABLED
361  if (textord_show_tables) {
362  ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
365  table_grid_.DisplayBoxes(table_win);
366  }
367 #endif // !GRAPHICS_DISABLED
368  }
369 
370  // Merge all colpartitions in table regions to make them a single
371  // colpartition and revert types of isolated table cells not
372  // assigned to any table to their original types.
373  MakeTableBlocks(grid, all_columns, width_cb);
374 }
375 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
376 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
377 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
378 // dimensions instead of duplicated memory.
380  return clean_part_grid_.gridsize();
381 }
383  return clean_part_grid_.gridwidth();
384 }
386  return clean_part_grid_.gridheight();
387 }
388 const ICOORD &TableFinder::bleft() const {
389  return clean_part_grid_.bleft();
390 }
391 const ICOORD &TableFinder::tright() const {
392  return clean_part_grid_.tright();
393 }
394 
396  ASSERT_HOST(part != nullptr);
397  if (AllowTextPartition(*part)) {
398  clean_part_grid_.InsertBBox(true, true, part);
399  } else {
400  delete part;
401  }
402 }
404  ASSERT_HOST(part != nullptr);
405  if (AllowTextPartition(*part)) {
406  fragmented_text_grid_.InsertBBox(true, true, part);
407  } else {
408  delete part;
409  }
410 }
412  ASSERT_HOST(part != nullptr);
413  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414  leader_and_ruling_grid_.InsertBBox(true, true, part);
415  } else {
416  delete part;
417  }
418 }
420  leader_and_ruling_grid_.InsertBBox(true, true, part);
421 }
423  // NOTE: If images are placed into a different grid in the future,
424  // the function SetPartitionSpacings needs to be updated. It should
425  // be the only thing that cares about image partitions.
426  clean_part_grid_.InsertBBox(true, true, part);
427 }
428 
429 // Splits a partition into its "words". The splits happen
430 // at locations with wide inter-blob spacing. This is useful
431 // because it allows the table recognize to "cut through" the
432 // text lines on the page. The assumption is that a table
433 // will have several lines with similar overlapping whitespace
434 // whereas text will not have this type of property.
435 // Note: The code Assumes that blobs are sorted by the left side x!
436 // This will not work (as well) if the blobs are sorted by center/right.
438  ASSERT_HOST(part != nullptr);
439  // Bye bye empty partitions!
440  if (part->boxes()->empty()) {
441  delete part;
442  return;
443  }
444 
445  // The AllowBlob function prevents this.
446  ASSERT_HOST(part->median_width() > 0);
447  const double kThreshold = part->median_width() * kSplitPartitionSize;
448 
449  ColPartition *right_part = part;
450  bool found_split = true;
451  while (found_split) {
452  found_split = false;
453  BLOBNBOX_C_IT box_it(right_part->boxes());
454  // Blobs are sorted left side first. If blobs overlap,
455  // the previous blob may have a "more right" right side.
456  // Account for this by always keeping the largest "right"
457  // so far.
458  int previous_right = INT32_MIN;
459 
460  // Look for the next split in the partition.
461  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462  const TBOX &box = box_it.data()->bounding_box();
463  if (previous_right != INT32_MIN &&
464  box.left() - previous_right > kThreshold) {
465  // We have a split position. Split the partition in two pieces.
466  // Insert the left piece in the grid and keep processing the right.
467  int mid_x = (box.left() + previous_right) / 2;
468  ColPartition *left_part = right_part;
469  right_part = left_part->SplitAt(mid_x);
470 
472  found_split = true;
473  break;
474  }
475 
476  // The right side of the previous blobs.
477  previous_right = std::max(previous_right, static_cast<int>(box.right()));
478  }
479  }
480  // When a split is not found, the right part is minimized
481  // as much as possible, so process it.
482  InsertFragmentedTextPartition(right_part);
483 }
484 
485 // Some simple criteria to filter out now. We want to make sure the
486 // average blob size in the partition is consistent with the
487 // global page stats.
488 // The area metric will almost always pass for multi-blob partitions.
489 // It is useful when filtering out noise caused by an isolated blob.
491  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493  const int median_area = global_median_xheight_ * global_median_blob_width_;
494  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495  // Keep comparisons strictly greater to disallow 0!
496  return part.median_height() > kHeightRequired &&
497  part.median_width() > kWidthRequired &&
498  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499 }
500 
501 // Same as above, applied to blobs. Keep in mind that
502 // leaders, commas, and periods are important in tables.
503 bool TableFinder::AllowBlob(const BLOBNBOX &blob) const {
504  const TBOX &box = blob.bounding_box();
505  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507  const int median_area = global_median_xheight_ * global_median_blob_width_;
508  const double kAreaRequired = median_area * kAllowBlobArea;
509  // Keep comparisons strictly greater to disallow 0!
510  return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511  box.area() > kAreaRequired;
512 }
513 
514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
515 // The only downside is that window messages will be caught by
516 // clean_part_grid_ instead of a useful object. This is a temporary solution
517 // for the debug windows created by the TableFinder.
518 #ifndef GRAPHICS_DISABLED
519 ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) {
520  return clean_part_grid_.MakeWindow(x, y, window_name);
521 }
522 #endif
523 
524 // Make single-column blocks from good_columns_ partitions.
526  ColSegment_LIST *column_blocks) {
527  for (int i = 0; i < gridheight(); ++i) {
528  ColPartitionSet *columns = all_columns[i];
529  if (columns != nullptr) {
530  ColSegment_LIST new_blocks;
531  // Get boxes from the current vertical position on the grid
532  columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533  &new_blocks);
534  // Merge the new_blocks boxes into column_blocks if they are well-aligned
535  GroupColumnBlocks(&new_blocks, column_blocks);
536  }
537  }
538 }
539 
540 // Merge column segments into the current list if they are well aligned.
541 void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks,
542  ColSegment_LIST *column_blocks) {
543  ColSegment_IT src_it(new_blocks);
544  ColSegment_IT dest_it(column_blocks);
545  // iterate through the source list
546  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547  ColSegment *src_seg = src_it.data();
548  const TBOX &src_box = src_seg->bounding_box();
549  bool match_found = false;
550  // iterate through the destination list to find a matching column block
551  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552  ColSegment *dest_seg = dest_it.data();
553  TBOX dest_box = dest_seg->bounding_box();
554  if (ConsecutiveBoxes(src_box, dest_box)) {
555  // If matching block is found, insert the current block into it
556  // and delete the source block.
557  dest_seg->InsertBox(src_box);
558  match_found = true;
559  delete src_it.extract();
560  break;
561  }
562  }
563  // If no match is found, just append the source block to column_blocks
564  if (!match_found) {
565  dest_it.add_after_then_move(src_it.extract());
566  }
567  }
568 }
569 
570 // are the two boxes immediate neighbors along the vertical direction
571 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
572  int x_margin = 20;
573  int y_margin = 5;
574  return (abs(b1.left() - b2.left()) < x_margin) &&
575  (abs(b1.right() - b2.right()) < x_margin) &&
576  (abs(b1.top() - b2.bottom()) < y_margin ||
577  abs(b2.top() - b1.bottom()) < y_margin);
578 }
579 
580 // Set up info for clean_part_grid_ partitions to be valid during detection
581 // code.
583  FindNeighbors();
584  SetPartitionSpacings(&clean_part_grid_, all_columns);
586 }
587 
588 // Set left, right and top, bottom spacings of each colpartition.
590  ColPartitionSet **all_columns) {
591  // Iterate the ColPartitions in the grid.
592  ColPartitionGridSearch gsearch(grid);
593  gsearch.StartFullSearch();
594  ColPartition *part = nullptr;
595  while ((part = gsearch.NextFullSearch()) != nullptr) {
596  ColPartitionSet *columns = all_columns[gsearch.GridY()];
597  TBOX box = part->bounding_box();
598  int y = part->MidY();
599  ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600  ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601  // set distance from left column as space to the left
602  if (left_column) {
603  int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604  part->set_space_to_left(left_space);
605  }
606  // set distance from right column as space to the right
607  if (right_column) {
608  int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609  part->set_space_to_right(right_space);
610  }
611 
612  // Look for images that may be closer.
613  // NOTE: used to be part_grid_, might cause issues now
614  ColPartitionGridSearch hsearch(grid);
615  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616  ColPartition *neighbor = nullptr;
617  while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618  if (neighbor->type() == PT_PULLOUT_IMAGE ||
619  neighbor->type() == PT_FLOWING_IMAGE ||
620  neighbor->type() == PT_HEADING_IMAGE) {
621  int right = neighbor->bounding_box().right();
622  if (right < box.left()) {
623  int space = std::min(box.left() - right, part->space_to_left());
624  part->set_space_to_left(space);
625  }
626  }
627  }
628  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629  neighbor = nullptr;
630  while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631  if (neighbor->type() == PT_PULLOUT_IMAGE ||
632  neighbor->type() == PT_FLOWING_IMAGE ||
633  neighbor->type() == PT_HEADING_IMAGE) {
634  int left = neighbor->bounding_box().left();
635  if (left > box.right()) {
636  int space = std::min(left - box.right(), part->space_to_right());
637  part->set_space_to_right(space);
638  }
639  }
640  }
641 
642  ColPartition *upper_part = part->SingletonPartner(true);
643  if (upper_part) {
644  int space =
645  std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646  part->bounding_box().bottom()));
647  part->set_space_above(space);
648  } else {
649  // TODO(nbeato): What constitutes a good value?
650  // 0 is the default value when not set, explicitly noting it needs to
651  // be something else.
652  part->set_space_above(INT32_MAX);
653  }
654 
655  ColPartition *lower_part = part->SingletonPartner(false);
656  if (lower_part) {
657  int space =
658  std::max(0, static_cast<int>(part->bounding_box().bottom() -
659  lower_part->bounding_box().bottom()));
660  part->set_space_below(space);
661  } else {
662  // TODO(nbeato): What constitutes a good value?
663  // 0 is the default value when not set, explicitly noting it needs to
664  // be something else.
665  part->set_space_below(INT32_MAX);
666  }
667  }
668 }
669 
670 // Set spacing and closest neighbors above and below a given colpartition.
672  TBOX box = part->bounding_box();
673  int top_range =
674  std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675  int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676  static_cast<int>(bleft().y()));
677  box.set_top(top_range);
678  box.set_bottom(bottom_range);
679 
680  TBOX part_box = part->bounding_box();
681  // Start a rect search
684  rectsearch.StartRectSearch(box);
685  ColPartition *neighbor;
686  int min_space_above = kMaxVerticalSpacing;
687  int min_space_below = kMaxVerticalSpacing;
688  ColPartition *above_neighbor = nullptr;
689  ColPartition *below_neighbor = nullptr;
690  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
691  if (neighbor == part) {
692  continue;
693  }
694  TBOX neighbor_box = neighbor->bounding_box();
695  if (neighbor_box.major_x_overlap(part_box)) {
696  int gap = abs(part->median_bottom() - neighbor->median_bottom());
697  // If neighbor is below current partition
698  if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
699  min_space_below = gap;
700  below_neighbor = neighbor;
701  } // If neighbor is above current partition
702  else if (part_box.top() < neighbor_box.bottom() &&
703  gap < min_space_above) {
704  min_space_above = gap;
705  above_neighbor = neighbor;
706  }
707  }
708  }
709  part->set_space_above(min_space_above);
710  part->set_space_below(min_space_below);
711  part->set_nearest_neighbor_above(above_neighbor);
712  part->set_nearest_neighbor_below(below_neighbor);
713 }
714 
715 // Set global spacing and x-height estimates
717  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
718  STATS width_stats(0, kMaxBlobWidth + 1);
719  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
720  // Iterate the ColPartitions in the grid.
721  ColPartitionGridSearch gsearch(grid);
722  gsearch.SetUniqueMode(true);
723  gsearch.StartFullSearch();
724  ColPartition *part = nullptr;
725  while ((part = gsearch.NextFullSearch()) != nullptr) {
726  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
727  // ComputeLimits needs to get called somewhere outside of TableFinder
728  // to make sure the partitions are properly initialized.
729  // When this is called, SmoothPartitionPartners dies in an assert after
730  // table find runs. Alternative solution.
731  // part->ComputeLimits();
732  if (part->IsTextType()) {
733  // xheight_stats.add(part->median_height(), part->boxes_count());
734  // width_stats.add(part->median_width(), part->boxes_count());
735 
736  // This loop can be removed when above issues are fixed.
737  // Replace it with the 2 lines commented out above.
738  BLOBNBOX_C_IT it(part->boxes());
739  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740  xheight_stats.add(it.data()->bounding_box().height(), 1);
741  width_stats.add(it.data()->bounding_box().width(), 1);
742  }
743 
744  ledding_stats.add(part->space_above(), 1);
745  ledding_stats.add(part->space_below(), 1);
746  }
747  }
748  // Set estimates based on median of statistics obtained
749  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
750  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
751  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
752 #ifndef GRAPHICS_DISABLED
753  if (textord_tablefind_show_stats) {
754  const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
755  ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
756  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
757  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
758  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
759  }
760 #endif // !GRAPHICS_DISABLED
761 }
762 
764  global_median_xheight_ = xheight;
765 }
768 }
770  global_median_ledding_ = ledding;
771 }
772 
775  gsearch.StartFullSearch();
776  ColPartition *part = nullptr;
777  while ((part = gsearch.NextFullSearch()) != nullptr) {
778  // TODO(nbeato): Rename this function, meaning is different now.
779  // IT is finding nearest neighbors its own way
780  // SetVerticalSpacing(part);
781 
782  ColPartition *upper = part->SingletonPartner(true);
783  if (upper) {
784  part->set_nearest_neighbor_above(upper);
785  }
786 
787  ColPartition *lower = part->SingletonPartner(false);
788  if (lower) {
789  part->set_nearest_neighbor_below(lower);
790  }
791  }
792 }
793 
794 // High level interface. Input is an unmarked ColPartitionGrid
795 // (namely, clean_part_grid_). Partitions are identified using local
796 // information and filter/smoothed. The function exit should contain
797 // a good sampling of the table partitions.
800 #ifndef GRAPHICS_DISABLED
801  if (textord_tablefind_show_mark) {
802  ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
806  }
807 #endif
809 #ifndef GRAPHICS_DISABLED
810  if (textord_tablefind_show_mark) {
811  ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
815  }
816 #endif
818 #ifndef GRAPHICS_DISABLED
819  if (textord_tablefind_show_mark) {
820  ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
824  }
825 #endif
827 #ifndef GRAPHICS_DISABLED
828  if (textord_tablefind_show_mark || textord_show_tables) {
829  ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
833  }
834 #endif
835 }
836 
837 // These types of partitions are marked as table partitions:
838 // 1- Partitions that have at lease one large gap between words
839 // 2- Partitions that consist of only one word (no significant gap
840 // between components)
841 // 3- Partitions that vertically overlap with other partitions within the
842 // same column.
843 // 4- Partitions with leaders before/after them.
845  // Iterate the ColPartitions in the grid.
848  gsearch.StartFullSearch();
849  ColPartition *part = nullptr;
850  while ((part = gsearch.NextFullSearch()) != nullptr) {
851  if (!part->IsTextType()) { // Only consider text partitions
852  continue;
853  }
854  // Only consider partitions in dominant font size or smaller
856  continue;
857  }
858  // Mark partitions with a large gap, or no significant gap as
859  // table partitions.
860  // Comments: It produces several false alarms at:
861  // - last line of a paragraph (fixed)
862  // - single word section headings
863  // - page headers and footers
864  // - numbered equations
865  // - line drawing regions
866  // TODO(faisal): detect and fix above-mentioned cases
867  if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
868  part->set_table_type();
869  }
870  }
871 }
872 
873 // Check if the partition has at least one large gap between words or no
874 // significant gap at all
876  // Should only get text partitions.
877  ASSERT_HOST(part->IsTextType());
878  // Blob access
879  BLOBNBOX_CLIST *part_boxes = part->boxes();
880  BLOBNBOX_C_IT it(part_boxes);
881  // Check if this is a relatively small partition (such as a single word)
882  if (part->bounding_box().width() <
884  part_boxes->length() < kMinBoxesInTextPartition) {
885  return true;
886  }
887 
888  // Variables used to compute inter-blob spacing.
889  int current_x0 = -1;
890  int current_x1 = -1;
891  int previous_x1 = -1;
892  // Stores the maximum gap detected.
893  int largest_partition_gap_found = -1;
894  // Text partition gap limits. If this is text (and not a table),
895  // there should be at least one gap larger than min_gap and no gap
896  // larger than max_gap.
897  const double max_gap = kMaxGapInTextPartition * part->median_height();
898  const double min_gap = kMinMaxGapInTextPartition * part->median_height();
899 
900  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
901  BLOBNBOX *blob = it.data();
902  current_x0 = blob->bounding_box().left();
903  current_x1 = blob->bounding_box().right();
904  if (previous_x1 != -1) {
905  int gap = current_x0 - previous_x1;
906 
907  // TODO(nbeato): Boxes may overlap? Huh?
908  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
909  // on the top right of the page are filtered out with this line.
910  // Note 2: Iterating over blobs in a partition, so we are looking for
911  // spacing between the words.
912  if (gap < 0) {
913  // More likely case, the blobs slightly overlap. This can happen
914  // with diacritics (accents) or broken alphabet symbols (characters).
915  // Merge boxes together by taking max of right sides.
916  if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
917  previous_x1 = std::max(previous_x1, current_x1);
918  continue;
919  }
920  // Extreme case, blobs overlap significantly in the same partition...
921  // This should not happen often (if at all), but it does.
922  // TODO(nbeato): investigate cases when this happens.
923  else {
924  // The behavior before was to completely ignore this case.
925  }
926  }
927 
928  // If a large enough gap is found, mark it as a table cell (return true)
929  if (gap > max_gap) {
930  return true;
931  }
932  if (gap > largest_partition_gap_found) {
933  largest_partition_gap_found = gap;
934  }
935  }
936  previous_x1 = current_x1;
937  }
938  // Since no large gap was found, return false if the partition is too
939  // long to be a data cell
940  if (part->bounding_box().width() >
942  part_boxes->length() > kMaxBoxesInDataPartition) {
943  return false;
944  }
945 
946  // A partition may be a single blob. In this case, it's an isolated symbol
947  // or non-text (such as a ruling or image).
948  // Detect these as table partitions? Shouldn't this be case by case?
949  // The behavior before was to ignore this, making max_partition_gap < 0
950  // and implicitly return true. Just making it explicit.
951  if (largest_partition_gap_found == -1) {
952  return true;
953  }
954 
955  // return true if the maximum gap found is smaller than the minimum allowed
956  // max_gap in a text partition. This indicates that there is no significant
957  // space in the partition, hence it is likely a single word.
958  return largest_partition_gap_found < min_gap;
959 }
960 
961 // A criteria for possible tables is that a table may have leaders
962 // between data cells. An aggressive solution to find such tables is to
963 // explicitly mark partitions that have adjacent leaders.
964 // Note that this includes overlapping leaders. However, it does not
965 // include leaders in different columns on the page.
966 // Possible false-positive will include lists, such as a table of contents.
967 // As these arise, the aggressive nature of this search may need to be
968 // trimmed down.
970  if (part.flow() == BTFT_LEADER) {
971  return true;
972  }
973  // Search range is left and right bounded by an offset of the
974  // median xheight. This offset is to allow some tolerance to the
975  // the leaders on the page in the event that the alignment is still
976  // a bit off.
977  const TBOX &box = part.bounding_box();
978  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
979  const int top = box.top() + search_size;
980  const int bottom = box.bottom() - search_size;
982  for (int direction = 0; direction < 2; ++direction) {
983  bool right_to_left = (direction == 0);
984  int x = right_to_left ? box.right() : box.left();
985  hsearch.StartSideSearch(x, bottom, top);
986  ColPartition *leader = nullptr;
987  while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
988  // The leader could be a horizontal ruling in the grid.
989  // Make sure it is actually a leader.
990  if (leader->flow() != BTFT_LEADER) {
991  continue;
992  }
993  // This should not happen, they are in different grids.
994  ASSERT_HOST(&part != leader);
995  // Make sure the leader shares a page column with the partition,
996  // otherwise we are spreading across columns.
997  if (!part.IsInSameColumnAs(*leader)) {
998  break;
999  }
1000  // There should be a significant vertical overlap
1001  if (!leader->VSignificantCoreOverlap(part)) {
1002  continue;
1003  }
1004  // Leader passed all tests, so it is adjacent.
1005  return true;
1006  }
1007  }
1008  // No leaders are adjacent to the given partition.
1009  return false;
1010 }
1011 
1012 // Filter individual text partitions marked as table partitions
1013 // consisting of paragraph endings, small section headings, and
1014 // headers and footers.
1018  // TODO(nbeato): Fully justified text as non-table?
1019 }
1020 
1022  // Detect last line of paragraph
1023  // Iterate the ColPartitions in the grid.
1025  gsearch.StartFullSearch();
1026  ColPartition *part = nullptr;
1027  while ((part = gsearch.NextFullSearch()) != nullptr) {
1028  if (part->type() != PT_TABLE) {
1029  continue; // Consider only table partitions
1030  }
1031 
1032  // Paragraph ending should have flowing text above it.
1033  ColPartition *upper_part = part->nearest_neighbor_above();
1034  if (!upper_part) {
1035  continue;
1036  }
1037  if (upper_part->type() != PT_FLOWING_TEXT) {
1038  continue;
1039  }
1040  if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1041  continue;
1042  }
1043  // Check if its the last line of a paragraph.
1044  // In most cases, a paragraph ending should be left-aligned to text line
1045  // above it. Sometimes, it could be a 2 line paragraph, in which case
1046  // the line above it is indented.
1047  // To account for that, check if the partition center is to
1048  // the left of the one above it.
1049  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1050  int upper_mid = (upper_part->bounding_box().left() +
1051  upper_part->bounding_box().right()) /
1052  2;
1053  int current_spacing = 0; // spacing of the current line to margin
1054  int upper_spacing = 0; // spacing of the previous line to the margin
1056  // Left to right languages, use mid - left to figure out the distance
1057  // the middle is from the left margin.
1058  int left = std::min(part->bounding_box().left(),
1059  upper_part->bounding_box().left());
1060  current_spacing = mid - left;
1061  upper_spacing = upper_mid - left;
1062  } else {
1063  // Right to left languages, use right - mid to figure out the distance
1064  // the middle is from the right margin.
1065  int right = std::max(part->bounding_box().right(),
1066  upper_part->bounding_box().right());
1067  current_spacing = right - mid;
1068  upper_spacing = right - upper_mid;
1069  }
1070  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1071  continue;
1072  }
1073 
1074  // Paragraphs should have similar fonts.
1075  if (!part->MatchingSizes(*upper_part) ||
1078  continue;
1079  }
1080 
1081  // The last line of a paragraph should be left aligned.
1082  // TODO(nbeato): This would be untrue if the text was right aligned.
1083  // How often is that?
1084  if (part->space_to_left() >
1086  continue;
1087  }
1088  // The line above it should be right aligned (assuming justified format).
1089  // Since we can't assume justified text, we compare whitespace to text.
1090  // The above line should have majority spanning text (or the current
1091  // line could have fit on the previous line). So compare
1092  // whitespace to text.
1093  if (upper_part->bounding_box().width() <
1095  upper_part->space_to_right()) {
1096  continue;
1097  }
1098 
1099  // Ledding above the line should be less than ledding below
1100  if (part->space_above() >= part->space_below() ||
1101  part->space_above() > 2 * global_median_ledding_) {
1102  continue;
1103  }
1104 
1105  // If all checks failed, it is probably text.
1106  part->clear_table_type();
1107  }
1108 }
1109 
1111  // Consider top-most text colpartition as header and bottom most as footer
1112  ColPartition *header = nullptr;
1113  ColPartition *footer = nullptr;
1114  int max_top = INT32_MIN;
1115  int min_bottom = INT32_MAX;
1117  gsearch.StartFullSearch();
1118  ColPartition *part = nullptr;
1119  while ((part = gsearch.NextFullSearch()) != nullptr) {
1120  if (!part->IsTextType()) {
1121  continue; // Consider only text partitions
1122  }
1123  int top = part->bounding_box().top();
1124  int bottom = part->bounding_box().bottom();
1125  if (top > max_top) {
1126  max_top = top;
1127  header = part;
1128  }
1129  if (bottom < min_bottom) {
1130  min_bottom = bottom;
1131  footer = part;
1132  }
1133  }
1134  if (header) {
1135  header->clear_table_type();
1136  }
1137  if (footer) {
1138  footer->clear_table_type();
1139  }
1140 }
1141 
1142 // Mark all ColPartitions as table cells that have a table cell above
1143 // and below them
1144 // TODO(faisal): This is too aggressive at the moment. The method needs to
1145 // consider spacing and alignment as well. Detection of false alarm table cells
1146 // should also be done as part of it.
1148  // Iterate the ColPartitions in the grid.
1150  gsearch.StartFullSearch();
1151  ColPartition *part = nullptr;
1152  while ((part = gsearch.NextFullSearch()) != nullptr) {
1153  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1154  continue; // Consider only text partitions
1155  }
1156  ColPartition *upper_part = part->nearest_neighbor_above();
1157  ColPartition *lower_part = part->nearest_neighbor_below();
1158  if (!upper_part || !lower_part) {
1159  continue;
1160  }
1161  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1162  part->set_table_type();
1163  }
1164  }
1165 
1166  // Pass 2, do the opposite. If both the upper and lower neighbors
1167  // exist and are not tables, this probably shouldn't be a table.
1168  gsearch.StartFullSearch();
1169  part = nullptr;
1170  while ((part = gsearch.NextFullSearch()) != nullptr) {
1171  if (part->type() != PT_TABLE) {
1172  continue; // Consider only text partitions
1173  }
1174  ColPartition *upper_part = part->nearest_neighbor_above();
1175  ColPartition *lower_part = part->nearest_neighbor_below();
1176 
1177  // table can't be by itself
1178  if ((upper_part && upper_part->type() != PT_TABLE) &&
1179  (lower_part && lower_part->type() != PT_TABLE)) {
1180  part->clear_table_type();
1181  }
1182  }
1183 }
1184 
1185 // Set the type of a column segment based on the ratio of table to text cells
1186 void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) {
1187  ColSegment_IT it(column_blocks);
1188  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1189  ColSegment *seg = it.data();
1190  TBOX box = seg->bounding_box();
1191  int num_table_cells = 0;
1192  int num_text_cells = 0;
1194  &clean_part_grid_);
1195  rsearch.SetUniqueMode(true);
1196  rsearch.StartRectSearch(box);
1197  ColPartition *part = nullptr;
1198  while ((part = rsearch.NextRectSearch()) != nullptr) {
1199  if (part->type() == PT_TABLE) {
1200  num_table_cells++;
1201  } else if (part->type() == PT_FLOWING_TEXT) {
1202  num_text_cells++;
1203  }
1204  }
1205  // If a column block has no text or table partition in it, it is not needed
1206  // for table detection.
1207  if (!num_table_cells && !num_text_cells) {
1208  delete it.extract();
1209  } else {
1210  seg->set_num_table_cells(num_table_cells);
1211  seg->set_num_text_cells(num_text_cells);
1212  // set column type based on the ratio of table to text cells
1213  seg->set_type();
1214  }
1215  }
1216 }
1217 
1218 // Move column blocks to grid
1219 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1220  ColSegmentGrid *col_seg_grid) {
1221  ColSegment_IT it(segments);
1222  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1223  ColSegment *seg = it.extract();
1224  col_seg_grid->InsertBBox(true, true, seg);
1225  }
1226 }
1227 
1228 // Merge column blocks if a split is detected due to the presence of a
1229 // table. A text block is considered split if it has multiple
1230 // neighboring blocks above/below it, and at least one of the
1231 // neighboring blocks is of table type (has a high density of table
1232 // partitions). In this case neighboring blocks in the direction
1233 // (above/below) of the table block are merged with the text block.
1234 
1235 // Comment: This method does not handle split due to a full page table
1236 // since table columns in this case do not have a text column on which
1237 // split decision can be based.
1239  int margin = gridsize();
1240 
1241  // Iterate the Column Blocks in the grid.
1243  &col_seg_grid_);
1244  gsearch.StartFullSearch();
1245  ColSegment *seg;
1246  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1247  if (seg->type() != COL_TEXT) {
1248  continue; // only consider text blocks for split detection
1249  }
1250  bool neighbor_found = false;
1251  bool modified = false; // Modified at least once
1252  // keep expanding current box as long as neighboring table columns
1253  // are found above or below it.
1254  do {
1255  TBOX box = seg->bounding_box();
1256  // slightly expand the search region vertically
1257  int top_range =
1258  std::min(box.top() + margin, static_cast<int>(tright().y()));
1259  int bottom_range =
1260  std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1261  box.set_top(top_range);
1262  box.set_bottom(bottom_range);
1263  neighbor_found = false;
1265  &col_seg_grid_);
1266  rectsearch.StartRectSearch(box);
1267  ColSegment *neighbor = nullptr;
1268  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1269  if (neighbor == seg) {
1270  continue;
1271  }
1272  const TBOX &neighbor_box = neighbor->bounding_box();
1273  // If the neighbor box significantly overlaps with the current
1274  // box (due to the expansion of the current box in the
1275  // previous iteration of this loop), remove the neighbor box
1276  // and expand the current box to include it.
1277  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1278  seg->InsertBox(neighbor_box);
1279  modified = true;
1280  rectsearch.RemoveBBox();
1281  gsearch.RepositionIterator();
1282  delete neighbor;
1283  continue;
1284  }
1285  // Only expand if the neighbor box is of table type
1286  if (neighbor->type() != COL_TABLE) {
1287  continue;
1288  }
1289  // Insert the neighbor box into the current column block
1290  if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1291  seg->InsertBox(neighbor_box);
1292  neighbor_found = true;
1293  modified = true;
1294  rectsearch.RemoveBBox();
1295  gsearch.RepositionIterator();
1296  delete neighbor;
1297  }
1298  }
1299  } while (neighbor_found);
1300  if (modified) {
1301  // Because the box has changed, it has to be removed first.
1302  gsearch.RemoveBBox();
1303  col_seg_grid_.InsertBBox(true, true, seg);
1304  gsearch.RepositionIterator();
1305  }
1306  }
1307 }
1308 
1309 // Group horizontally overlapping table partitions into table columns.
1310 // TODO(faisal): This is too aggressive at the moment. The method should
1311 // consider more attributes to group table partitions together. Some common
1312 // errors are:
1313 // 1- page number is merged with a table column above it even
1314 // if there is a large vertical gap between them.
1315 // 2- column headers go on to catch one of the columns arbitrarily
1316 // 3- an isolated noise blob near page top or bottom merges with the table
1317 // column below/above it
1318 // 4- cells from two vertically adjacent tables merge together to make a
1319 // single column resulting in merging of the two tables
1320 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1321  ColSegment_IT it(table_columns);
1322  // Iterate the ColPartitions in the grid.
1324  &clean_part_grid_);
1325  gsearch.StartFullSearch();
1326  ColPartition *part;
1327  while ((part = gsearch.NextFullSearch()) != nullptr) {
1328  if (part->inside_table_column() || part->type() != PT_TABLE) {
1329  continue; // prevent a partition to be assigned to multiple columns
1330  }
1331  const TBOX &box = part->bounding_box();
1332  auto *col = new ColSegment();
1333  col->InsertBox(box);
1334  part->set_inside_table_column(true);
1335  // Start a search below the current cell to find bottom neighbours
1336  // Note: a full search will always process things above it first, so
1337  // this should be starting at the highest cell and working its way down.
1339  &clean_part_grid_);
1340  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1341  ColPartition *neighbor = nullptr;
1342  bool found_neighbours = false;
1343  while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1344  // only consider neighbors not assigned to any column yet
1345  if (neighbor->inside_table_column()) {
1346  continue;
1347  }
1348  // Horizontal lines should not break the flow
1349  if (neighbor->IsHorizontalLine()) {
1350  continue;
1351  }
1352  // presence of a non-table neighbor marks the end of current
1353  // table column
1354  if (neighbor->type() != PT_TABLE) {
1355  break;
1356  }
1357  // add the neighbor partition to the table column
1358  const TBOX &neighbor_box = neighbor->bounding_box();
1359  col->InsertBox(neighbor_box);
1360  neighbor->set_inside_table_column(true);
1361  found_neighbours = true;
1362  }
1363  if (found_neighbours) {
1364  it.add_after_then_move(col);
1365  } else {
1366  part->set_inside_table_column(false);
1367  delete col;
1368  }
1369  }
1370 }
1371 
1372 // Mark regions in a column that are x-bounded by the column boundaries and
1373 // y-bounded by the table columns' projection on the y-axis as table regions
1374 void TableFinder::GetTableRegions(ColSegment_LIST *table_columns,
1375  ColSegment_LIST *table_regions) {
1376  ColSegment_IT cit(table_columns);
1377  ColSegment_IT rit(table_regions);
1378  // Iterate through column blocks
1380  &col_seg_grid_);
1381  gsearch.StartFullSearch();
1382  ColSegment *part;
1383  int page_height = tright().y() - bleft().y();
1384  ASSERT_HOST(page_height > 0);
1385  // create a bool array to hold projection on y-axis
1386  bool *table_region = new bool[page_height];
1387  while ((part = gsearch.NextFullSearch()) != nullptr) {
1388  const TBOX &part_box = part->bounding_box();
1389  // reset the projection array
1390  for (int i = 0; i < page_height; i++) {
1391  table_region[i] = false;
1392  }
1393  // iterate through all table columns to find regions in the current
1394  // page column block
1395  cit.move_to_first();
1396  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397  TBOX col_box = cit.data()->bounding_box();
1398  // find intersection region of table column and page column
1399  TBOX intersection_box = col_box.intersection(part_box);
1400  // project table column on the y-axis
1401  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1402  table_region[i - bleft().y()] = true;
1403  }
1404  }
1405  // set x-limits of table regions to page column width
1406  TBOX current_table_box;
1407  current_table_box.set_left(part_box.left());
1408  current_table_box.set_right(part_box.right());
1409  // go through the y-axis projection to find runs of table
1410  // regions. Each run makes one table region.
1411  for (int i = 1; i < page_height; i++) {
1412  // detect start of a table region
1413  if (!table_region[i - 1] && table_region[i]) {
1414  current_table_box.set_bottom(i + bleft().y());
1415  }
1416  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1417  // detect end of a table region
1418  if (table_region[i - 1] && !table_region[i]) {
1419  current_table_box.set_top(i + bleft().y());
1420  if (!current_table_box.null_box()) {
1421  auto *seg = new ColSegment();
1422  seg->InsertBox(current_table_box);
1423  rit.add_after_then_move(seg);
1424  }
1425  }
1426  }
1427  }
1428  delete[] table_region;
1429 }
1430 
1431 // Merge table regions corresponding to tables spanning multiple columns if
1432 // there is a colpartition (horizontal ruling line or normal text) that
1433 // touches both regions.
1434 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1435 // tables with aligned ruling lines. In this case, line finder returns a
1436 // single line and hence the tables get merged together
1438  // Iterate the table regions in the grid.
1440  &table_grid_);
1441  gsearch.StartFullSearch();
1442  ColSegment *seg = nullptr;
1443  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1444  bool neighbor_found = false;
1445  bool modified = false; // Modified at least once
1446  do {
1447  // Start a rectangle search x-bounded by the image and y by the table
1448  const TBOX &box = seg->bounding_box();
1449  TBOX search_region(box);
1450  search_region.set_left(bleft().x());
1451  search_region.set_right(tright().x());
1452  neighbor_found = false;
1454  &table_grid_);
1455  rectsearch.StartRectSearch(search_region);
1456  ColSegment *neighbor = nullptr;
1457  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1458  if (neighbor == seg) {
1459  continue;
1460  }
1461  const TBOX &neighbor_box = neighbor->bounding_box();
1462  // Check if a neighbor box has a large overlap with the table
1463  // region. This may happen as a result of merging two table
1464  // regions in the previous iteration.
1465  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1466  seg->InsertBox(neighbor_box);
1467  rectsearch.RemoveBBox();
1468  gsearch.RepositionIterator();
1469  delete neighbor;
1470  modified = true;
1471  continue;
1472  }
1473  // Check if two table regions belong together based on a common
1474  // horizontal ruling line
1475  if (BelongToOneTable(box, neighbor_box)) {
1476  seg->InsertBox(neighbor_box);
1477  neighbor_found = true;
1478  modified = true;
1479  rectsearch.RemoveBBox();
1480  gsearch.RepositionIterator();
1481  delete neighbor;
1482  }
1483  }
1484  } while (neighbor_found);
1485  if (modified) {
1486  // Because the box has changed, it has to be removed first.
1487  gsearch.RemoveBBox();
1488  table_grid_.InsertBBox(true, true, seg);
1489  gsearch.RepositionIterator();
1490  }
1491  }
1492 }
1493 
1494 // Decide if two table regions belong to one table based on a common
1495 // horizontal ruling line or another colpartition
1496 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1497  // Check the obvious case. Most likely not true because overlapping boxes
1498  // should already be merged, but seems like a good thing to do in case things
1499  // change.
1500  if (box1.overlap(box2)) {
1501  return true;
1502  }
1503  // Check for ColPartitions spanning both table regions
1504  TBOX bbox = box1.bounding_union(box2);
1505  // Start a rect search on bbox
1507  &clean_part_grid_);
1508  rectsearch.StartRectSearch(bbox);
1509  ColPartition *part = nullptr;
1510  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1511  const TBOX &part_box = part->bounding_box();
1512  // return true if a colpartition spanning both table regions is found
1513  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1514  !part->IsImageType()) {
1515  return true;
1516  }
1517  }
1518  return false;
1519 }
1520 
1521 // Adjust table boundaries by:
1522 // - building a tight bounding box around all ColPartitions contained in it.
1523 // - expanding table boundaries to include all colpartitions that overlap the
1524 // table by more than half of their area
1525 // - expanding table boundaries to include nearby horizontal rule lines
1526 // - expanding table vertically to include left out column headers
1527 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1528 // makes following errors:
1529 // 1- horizontal lines consisting of underlines are included in the table if
1530 // they are close enough
1531 // 2- horizontal lines originating from noise tend to get merged with a table
1532 // near the top of the page
1533 // 3- the criteria for including horizontal lines is very generous. Many times
1534 // horizontal lines separating headers and footers get merged with a
1535 // single-column table in a multi-column page thereby including text
1536 // from the neighboring column inside the table
1537 // 4- the criteria for including left out column headers also tends to
1538 // occasionally include text-lines above the tables, typically from
1539 // table caption
1541  // Iterate the table regions in the grid
1542  ColSegment_CLIST adjusted_tables;
1543  ColSegment_C_IT it(&adjusted_tables);
1545  gsearch.StartFullSearch();
1546  ColSegment *table = nullptr;
1547  while ((table = gsearch.NextFullSearch()) != nullptr) {
1548  const TBOX &table_box = table->bounding_box();
1549  TBOX grown_box = table_box;
1550  GrowTableBox(table_box, &grown_box);
1551  // To prevent a table from expanding again, do not insert the
1552  // modified box back to the grid. Instead move it to a list and
1553  // and remove it from the grid. The list is moved later back to the grid.
1554  if (!grown_box.null_box()) {
1555  auto *col = new ColSegment();
1556  col->InsertBox(grown_box);
1557  it.add_after_then_move(col);
1558  }
1559  gsearch.RemoveBBox();
1560  delete table;
1561  }
1562  // clear table grid to move final tables in it
1563  // TODO(nbeato): table_grid_ should already be empty. The above loop
1564  // removed everything. Maybe just assert it is empty?
1565  table_grid_.Clear();
1566  it.move_to_first();
1567  // move back final tables to table_grid_
1568  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1569  ColSegment *seg = it.extract();
1570  table_grid_.InsertBBox(true, true, seg);
1571  }
1572 }
1573 
1574 void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) {
1575  // TODO(nbeato): The growing code is a bit excessive right now.
1576  // By removing these lines, the partitions considered need
1577  // to have some overlap or be special cases. These lines could
1578  // be added again once a check is put in place to make sure that
1579  // growing tables don't stomp on a lot of non-table partitions.
1580 
1581  // search for horizontal ruling lines within the vertical margin
1582  // int vertical_margin = kRulingVerticalMargin * gridsize();
1583  TBOX search_box = table_box;
1584  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1585  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1586  // search_box.set_top(top);
1587  // search_box.set_bottom(bottom);
1588 
1589  GrowTableToIncludePartials(table_box, search_box, result_box);
1590  GrowTableToIncludeLines(table_box, search_box, result_box);
1591  IncludeLeftOutColumnHeaders(result_box);
1592 }
1593 
1594 // Grow a table by increasing the size of the box to include
1595 // partitions with significant overlap with the table.
1597  const TBOX &search_range,
1598  TBOX *result_box) {
1599  // Rulings are in a different grid, so search 2 grids for rulings, text,
1600  // and table partitions that are not entirely within the new box.
1601  for (int i = 0; i < 2; ++i) {
1602  ColPartitionGrid *grid =
1604  ColPartitionGridSearch rectsearch(grid);
1605  rectsearch.StartRectSearch(search_range);
1606  ColPartition *part = nullptr;
1607  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1608  // Only include text and table types.
1609  if (part->IsImageType()) {
1610  continue;
1611  }
1612  const TBOX &part_box = part->bounding_box();
1613  // Include partition in the table if more than half of it
1614  // is covered by the table
1615  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1616  *result_box = result_box->bounding_union(part_box);
1617  continue;
1618  }
1619  }
1620  }
1621 }
1622 
1623 // Grow a table by expanding to the extents of significantly
1624 // overlapping lines.
1626  const TBOX &search_range,
1627  TBOX *result_box) {
1629  rsearch.SetUniqueMode(true);
1630  rsearch.StartRectSearch(search_range);
1631  ColPartition *part = nullptr;
1632  while ((part = rsearch.NextRectSearch()) != nullptr) {
1633  // TODO(nbeato) This should also do vertical, but column
1634  // boundaries are breaking things. This function needs to be
1635  // updated to allow vertical lines as well.
1636  if (!part->IsLineType()) {
1637  continue;
1638  }
1639  // Avoid the following function call if the result of the
1640  // function is irrelevant.
1641  const TBOX &part_box = part->bounding_box();
1642  if (result_box->contains(part_box)) {
1643  continue;
1644  }
1645  // Include a partially overlapping horizontal line only if the
1646  // extra ColPartitions that will be included due to expansion
1647  // have large side spacing w.r.t. columns containing them.
1648  if (HLineBelongsToTable(*part, table_box)) {
1649  *result_box = result_box->bounding_union(part_box);
1650  }
1651  // TODO(nbeato): Vertical
1652  }
1653 }
1654 
1655 // Checks whether the horizontal line belong to the table by looking at the
1656 // side spacing of extra ColParitions that will be included in the table
1657 // due to expansion
1659  const TBOX &table_box) {
1660  if (!part.IsHorizontalLine()) {
1661  return false;
1662  }
1663  const TBOX &part_box = part.bounding_box();
1664  if (!part_box.major_x_overlap(table_box)) {
1665  return false;
1666  }
1667  // Do not consider top-most horizontal line since it usually
1668  // originates from noise.
1669  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1670  // have neighbors solved.
1671  // if (!part.nearest_neighbor_above())
1672  // return false;
1673  const TBOX bbox = part_box.bounding_union(table_box);
1674  // In the "unioned table" box (the table extents expanded by the line),
1675  // keep track of how many partitions have significant padding to the left
1676  // and right. If more than half of the partitions covered by the new table
1677  // have significant spacing, the line belongs to the table and the table
1678  // grows to include all of the partitions.
1679  int num_extra_partitions = 0;
1680  int extra_space_to_right = 0;
1681  int extra_space_to_left = 0;
1682  // Rulings are in a different grid, so search 2 grids for rulings, text,
1683  // and table partitions that are introduced by the new box.
1684  for (int i = 0; i < 2; ++i) {
1685  ColPartitionGrid *grid =
1687  // Start a rect search on bbox
1688  ColPartitionGridSearch rectsearch(grid);
1689  rectsearch.SetUniqueMode(true);
1690  rectsearch.StartRectSearch(bbox);
1691  ColPartition *extra_part = nullptr;
1692  while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1693  // ColPartition already in table
1694  const TBOX &extra_part_box = extra_part->bounding_box();
1695  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1696  continue;
1697  }
1698  // Non-text ColPartitions do not contribute
1699  if (extra_part->IsImageType()) {
1700  continue;
1701  }
1702  // Consider this partition.
1703  num_extra_partitions++;
1704  // presence of a table cell is a strong hint, so just increment the scores
1705  // without looking at the spacing.
1706  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1707  extra_space_to_right++;
1708  extra_space_to_left++;
1709  continue;
1710  }
1711  int space_threshold = kSideSpaceMargin * part.median_height();
1712  if (extra_part->space_to_right() > space_threshold) {
1713  extra_space_to_right++;
1714  }
1715  if (extra_part->space_to_left() > space_threshold) {
1716  extra_space_to_left++;
1717  }
1718  }
1719  }
1720  // tprintf("%d %d %d\n",
1721  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1722  return (extra_space_to_right > num_extra_partitions / 2) ||
1723  (extra_space_to_left > num_extra_partitions / 2);
1724 }
1725 
1726 // Look for isolated column headers above the given table box and
1727 // include them in the table
1729  // Start a search above the current table to look for column headers
1731  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1732  table_box->top());
1733  ColPartition *neighbor = nullptr;
1734  ColPartition *previous_neighbor = nullptr;
1735  while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1736  // Max distance to find a table heading.
1737  const int max_distance =
1739  int table_top = table_box->top();
1740  const TBOX &box = neighbor->bounding_box();
1741  // Do not continue if the next box is way above
1742  if (box.bottom() - table_top > max_distance) {
1743  break;
1744  }
1745  // Unconditionally include partitions of type TABLE or LINE
1746  // TODO(faisal): add some reasonable conditions here
1747  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1748  table_box->set_top(box.top());
1749  previous_neighbor = nullptr;
1750  continue;
1751  }
1752  // If there are two text partitions, one above the other, without a table
1753  // cell on their left or right side, consider them a barrier and quit
1754  if (previous_neighbor == nullptr) {
1755  previous_neighbor = neighbor;
1756  } else {
1757  const TBOX &previous_box = previous_neighbor->bounding_box();
1758  if (!box.major_y_overlap(previous_box)) {
1759  break;
1760  }
1761  }
1762  }
1763 }
1764 
1765 // Remove false alarms consisting of a single column based on their
1766 // projection on the x-axis. Projection of a real table on the x-axis
1767 // should have at least one zero-valley larger than the global median
1768 // x-height of the page.
1770  int page_width = tright().x() - bleft().x();
1771  ASSERT_HOST(page_width > 0);
1772  // create an integer array to hold projection on x-axis
1773  int *table_xprojection = new int[page_width];
1774  // Iterate through all tables in the table grid
1776  &table_grid_);
1777  table_search.StartFullSearch();
1778  ColSegment *table;
1779  while ((table = table_search.NextFullSearch()) != nullptr) {
1780  TBOX table_box = table->bounding_box();
1781  // reset the projection array
1782  for (int i = 0; i < page_width; i++) {
1783  table_xprojection[i] = 0;
1784  }
1785  // Start a rect search on table_box
1787  &clean_part_grid_);
1788  rectsearch.SetUniqueMode(true);
1789  rectsearch.StartRectSearch(table_box);
1790  ColPartition *part;
1791  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1792  if (!part->IsTextType()) {
1793  continue; // Do not consider non-text partitions
1794  }
1795  if (part->flow() == BTFT_LEADER) {
1796  continue; // Assume leaders are in tables
1797  }
1798  TBOX part_box = part->bounding_box();
1799  // Do not consider partitions partially covered by the table
1800  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1801  continue;
1802  }
1803  BLOBNBOX_CLIST *part_boxes = part->boxes();
1804  BLOBNBOX_C_IT pit(part_boxes);
1805 
1806  // Make sure overlapping blobs don't artificially inflate the number
1807  // of rows in the table. This happens frequently with things such as
1808  // decimals and split characters. Do this by assuming the column
1809  // partition is sorted mostly left to right and just clip
1810  // bounding boxes by the previous box's extent.
1811  int next_position_to_write = 0;
1812 
1813  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1814  BLOBNBOX *pblob = pit.data();
1815  // ignore blob height for the purpose of projection since we
1816  // are only interested in finding valleys
1817  int xstart = pblob->bounding_box().left();
1818  int xend = pblob->bounding_box().right();
1819 
1820  xstart = std::max(xstart, next_position_to_write);
1821  for (int i = xstart; i < xend; i++) {
1822  table_xprojection[i - bleft().x()]++;
1823  }
1824  next_position_to_write = xend;
1825  }
1826  }
1827  // Find largest valley between two reasonable peaks in the table
1828  if (!GapInXProjection(table_xprojection, page_width)) {
1829  table_search.RemoveBBox();
1830  delete table;
1831  }
1832  }
1833  delete[] table_xprojection;
1834 }
1835 
1836 // Return true if at least one gap larger than the global x-height
1837 // exists in the horizontal projection
1838 bool TableFinder::GapInXProjection(int *xprojection, int length) {
1839  // Find peak value of the histogram
1840  int peak_value = 0;
1841  for (int i = 0; i < length; i++) {
1842  if (xprojection[i] > peak_value) {
1843  peak_value = xprojection[i];
1844  }
1845  }
1846  // Peak value represents the maximum number of horizontally
1847  // overlapping colpartitions, so this can be considered as the
1848  // number of rows in the table
1849  if (peak_value < kMinRowsInTable) {
1850  return false;
1851  }
1852  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1853  if (peak_value >= kLargeTableRowCount) {
1854  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1855  }
1856  // Threshold the histogram
1857  for (int i = 0; i < length; i++) {
1858  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1859  }
1860  // Find the largest run of zeros between two ones
1861  int largest_gap = 0;
1862  int run_start = -1;
1863  for (int i = 1; i < length; i++) {
1864  // detect start of a run of zeros
1865  if (xprojection[i - 1] && !xprojection[i]) {
1866  run_start = i;
1867  }
1868  // detect end of a run of zeros and update the value of largest gap
1869  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1870  int gap = i - run_start;
1871  if (gap > largest_gap) {
1872  largest_gap = gap;
1873  }
1874  run_start = -1;
1875  }
1876  }
1877  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1878 }
1879 
1880 // Given the location of a table "guess", try to overlay a cellular
1881 // grid in the location, adjusting the boundaries.
1882 // TODO(nbeato): Falsely introduces:
1883 // -headers/footers (not any worse, too much overlap destroys cells)
1884 // -page numbers (not worse, included because maximize margins)
1885 // -equations (nicely fit into a celluar grid, but more sparsely)
1886 // -figures (random text box, also sparse)
1887 // -small left-aligned text areas with overlapping positioned whitespace
1888 // (rejected before)
1889 // Overall, this just needs some more work.
1891 #ifndef GRAPHICS_DISABLED
1892  ScrollView *table_win = nullptr;
1893  if (textord_show_tables) {
1894  table_win = MakeWindow(0, 0, "Table Structure");
1897  // table_grid_.DisplayBoxes(table_win);
1898  }
1899 #endif
1900 
1901  TableRecognizer recognizer;
1902  recognizer.Init();
1904  recognizer.set_text_grid(&fragmented_text_grid_);
1905  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1906  recognizer.set_min_height(1.5 * gridheight());
1907  // Loop over all of the tables and try to fit them.
1908  // Store the good tables here.
1909  ColSegment_CLIST good_tables;
1910  ColSegment_C_IT good_it(&good_tables);
1911 
1913  gsearch.StartFullSearch();
1914  ColSegment *found_table = nullptr;
1915  while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1916  gsearch.RemoveBBox();
1917 
1918  // The goal is to make the tables persistent in a list.
1919  // When that happens, this will move into the search loop.
1920  const TBOX &found_box = found_table->bounding_box();
1921  StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1922 
1923  // Process a table. Good tables are inserted into the grid again later on
1924  // We can't change boxes in the grid while it is running a search.
1925  if (table_structure != nullptr) {
1926 #ifndef GRAPHICS_DISABLED
1927  if (textord_show_tables) {
1928  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1929  }
1930 #endif
1931  found_table->set_bounding_box(table_structure->bounding_box());
1932  delete table_structure;
1933  good_it.add_after_then_move(found_table);
1934  } else {
1935  delete found_table;
1936  }
1937  }
1938  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1939 
1940  // At this point, the grid is empty. We can safely insert the good tables
1941  // back into grid.
1942  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1943  table_grid_.InsertBBox(true, true, good_it.extract());
1944  }
1945 }
1946 
1947 #ifndef GRAPHICS_DISABLED
1948 
1949 // Displays the column segments in some window.
1950 void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments,
1951  ScrollView::Color color) {
1952  win->Pen(color);
1953  win->Brush(ScrollView::NONE);
1954  ColSegment_IT it(segments);
1955  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1956  ColSegment *col = it.data();
1957  const TBOX &box = col->bounding_box();
1958  int left_x = box.left();
1959  int right_x = box.right();
1960  int top_y = box.top();
1961  int bottom_y = box.bottom();
1962  win->Rectangle(left_x, bottom_y, right_x, top_y);
1963  }
1964  win->UpdateWindow();
1965 }
1966 
1967 // Displays the colpartitions using a new coloring on an existing window.
1968 // Note: This method is only for debug purpose during development and
1969 // would not be part of checked in code
1971  ScrollView::Color default_color,
1972  ScrollView::Color table_color) {
1973  ScrollView::Color color = default_color;
1974  // Iterate the ColPartitions in the grid.
1976  gsearch.StartFullSearch();
1977  ColPartition *part = nullptr;
1978  while ((part = gsearch.NextFullSearch()) != nullptr) {
1979  color = default_color;
1980  if (part->type() == PT_TABLE) {
1981  color = table_color;
1982  }
1983 
1984  const TBOX &box = part->bounding_box();
1985  int left_x = box.left();
1986  int right_x = box.right();
1987  int top_y = box.top();
1988  int bottom_y = box.bottom();
1989  win->Brush(ScrollView::NONE);
1990  win->Pen(color);
1991  win->Rectangle(left_x, bottom_y, right_x, top_y);
1992  }
1993  win->UpdateWindow();
1994 }
1995 
1997  ScrollView::Color default_color) {
1998  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1999 }
2000 
2002  ColPartitionGrid *grid,
2003  ScrollView::Color color) {
2004  // Iterate the ColPartitions in the grid.
2006  gsearch.StartFullSearch();
2007  ColPartition *part = nullptr;
2008  while ((part = gsearch.NextFullSearch()) != nullptr) {
2009  const TBOX &box = part->bounding_box();
2010  int left_x = box.left();
2011  int right_x = box.right();
2012  int top_y = box.top();
2013  int bottom_y = box.bottom();
2014 
2015  ColPartition *upper_part = part->nearest_neighbor_above();
2016  if (upper_part) {
2017  const TBOX &upper_box = upper_part->bounding_box();
2018  int mid_x = (left_x + right_x) / 2;
2019  int mid_y = (top_y + bottom_y) / 2;
2020  int other_x = (upper_box.left() + upper_box.right()) / 2;
2021  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2022  win->Brush(ScrollView::NONE);
2023  win->Pen(color);
2024  win->Line(mid_x, mid_y, other_x, other_y);
2025  }
2026  ColPartition *lower_part = part->nearest_neighbor_below();
2027  if (lower_part) {
2028  const TBOX &lower_box = lower_part->bounding_box();
2029  int mid_x = (left_x + right_x) / 2;
2030  int mid_y = (top_y + bottom_y) / 2;
2031  int other_x = (lower_box.left() + lower_box.right()) / 2;
2032  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2033  win->Brush(ScrollView::NONE);
2034  win->Pen(color);
2035  win->Line(mid_x, mid_y, other_x, other_y);
2036  }
2037  }
2038  win->UpdateWindow();
2039 }
2040 
2041 #endif
2042 
2043 // Merge all colpartitions in table regions to make them a single
2044 // colpartition and revert types of isolated table cells not
2045 // assigned to any table to their original types.
2047  ColPartitionSet **all_columns,
2048  const WidthCallback &width_cb) {
2049  // Since we have table blocks already, remove table tags from all
2050  // colpartitions
2052  gsearch.StartFullSearch();
2053  ColPartition *part = nullptr;
2054 
2055  while ((part = gsearch.NextFullSearch()) != nullptr) {
2056  if (part->type() == PT_TABLE) {
2057  part->clear_table_type();
2058  }
2059  }
2060  // Now make a single colpartition out of each table block and remove
2061  // all colpartitions contained within a table
2063  &table_grid_);
2064  table_search.StartFullSearch();
2065  ColSegment *table;
2066  while ((table = table_search.NextFullSearch()) != nullptr) {
2067  const TBOX &table_box = table->bounding_box();
2068  // Start a rect search on table_box
2070  grid);
2071  rectsearch.StartRectSearch(table_box);
2072  ColPartition *part;
2073  ColPartition *table_partition = nullptr;
2074  while ((part = rectsearch.NextRectSearch()) != nullptr) {
2075  // Do not consider image partitions
2076  if (!part->IsTextType()) {
2077  continue;
2078  }
2079  TBOX part_box = part->bounding_box();
2080  // Include partition in the table if more than half of it
2081  // is covered by the table
2082  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2083  rectsearch.RemoveBBox();
2084  if (table_partition) {
2085  table_partition->Absorb(part, width_cb);
2086  } else {
2087  table_partition = part;
2088  }
2089  }
2090  }
2091  // Insert table colpartition back to part_grid_
2092  if (table_partition) {
2093  // To match the columns used when transforming to blocks, the new table
2094  // partition must have its first and last column set at the grid y that
2095  // corresponds to its bottom.
2096  const TBOX &table_box = table_partition->bounding_box();
2097  int grid_x, grid_y;
2098  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2099  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2100  table_partition->set_table_type();
2101  table_partition->set_blob_type(BRT_TEXT);
2102  table_partition->set_flow(BTFT_CHAIN);
2103  table_partition->SetBlobTypes();
2104  grid->InsertBBox(true, true, table_partition);
2105  }
2106  }
2107 }
2108 
2112  : ELIST_LINK(),
2113  num_table_cells_(0),
2114  num_text_cells_(0),
2115  type_(COL_UNKNOWN) {}
2116 
2117 // Provides a color for BBGrid to draw the rectangle.
2119  const ScrollView::Color kBoxColors[PT_COUNT] = {
2124  };
2125  return kBoxColors[type_];
2126 }
2127 
2128 // Insert a box into this column segment
2129 void ColSegment::InsertBox(const TBOX &other) {
2130  bounding_box_ = bounding_box_.bounding_union(other);
2131 }
2132 
2133 // Set column segment type based on the ratio of text and table partitions
2134 // in it.
2136  if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) {
2137  type_ = COL_TABLE;
2138  } else if (num_text_cells_ > num_table_cells_) {
2139  type_ = COL_TEXT;
2140  } else {
2141  type_ = COL_MIXED;
2142  }
2143 }
2144 
2145 } // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
const int kMaxVerticalSpacing
Definition: tablefind.cpp:38
const double kAllowBlobHeight
Definition: tablefind.cpp:56
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
const double kMaxTableCellXheight
Definition: tablefind.cpp:81
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:126
@ BRT_TEXT
Definition: blobbox.h:82
@ BRT_NOISE
Definition: blobbox.h:75
@ COL_TEXT
Definition: tablefind.h:29
@ COL_MIXED
Definition: tablefind.h:29
@ COL_TABLE
Definition: tablefind.h:29
@ COL_UNKNOWN
Definition: tablefind.h:29
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:73
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:107
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:63
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:140
const int kMinRowsInTable
Definition: tablefind.cpp:112
std::function< bool(int)> WidthCallback
Definition: tabfind.h:35
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:132
const double kMaxGapInTextPartition
Definition: tablefind.cpp:69
void DeleteObject(T *object)
Definition: tablefind.cpp:156
const double kTableColumnThreshold
Definition: tablefind.cpp:89
const double kAllowBlobArea
Definition: tablefind.cpp:58
const double kAllowTextArea
Definition: tablefind.cpp:51
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:117
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:141
const int kMaxBlobWidth
Definition: tablefind.cpp:40
const double kAllowTextWidth
Definition: tablefind.cpp:50
const double kAllowTextHeight
Definition: tablefind.cpp:49
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:117
const double kAllowBlobWidth
Definition: tablefind.cpp:57
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:85
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:66
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:122
const int kLargeTableRowCount
Definition: tablefind.cpp:109
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:106
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:65
@ PT_HEADING_IMAGE
Definition: publictypes.h:64
@ PT_FLOWING_IMAGE
Definition: publictypes.h:63
@ PT_FLOWING_TEXT
Definition: publictypes.h:55
const int kSideSpaceMargin
Definition: tablefind.cpp:102
const double kSplitPartitionSize
Definition: tablefind.cpp:44
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:77
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:136
BlobRegionType region_type() const
Definition: blobbox.h:298
const TBOX & bounding_box() const
Definition: blobbox.h:239
BlobTextFlowType flow() const
Definition: blobbox.h:310
integer coordinate
Definition: points.h:36
TDimension y() const
access_function
Definition: points.h:62
TDimension x() const
access function
Definition: points.h:58
TDimension left() const
Definition: rect.h:82
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:445
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
void set_right(int x)
Definition: rect.h:92
void set_left(int x)
Definition: rect.h:85
TDimension top() const
Definition: rect.h:68
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:128
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:84
bool null_box() const
Definition: rect.h:60
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:419
void set_bottom(int y)
Definition: rect.h:78
TDimension right() const
Definition: rect.h:89
bool overlap(const TBOX &box) const
Definition: rect.h:363
TDimension bottom() const
Definition: rect.h:75
bool contains(const FCOORD pt) const
Definition: rect.h:344
int32_t area() const
Definition: rect.h:134
double overlap_fraction(const TBOX &box) const
Definition: rect.h:396
void set_top(int y)
Definition: rect.h:71
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:597
double median() const
Definition: statistc.cpp:242
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:837
void SetUniqueMode(bool mode)
Definition: bbgrid.h:249
int GridY() const
Definition: bbgrid.h:241
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:802
BBC * NextRectSearch()
Definition: bbgrid.h:896
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:788
void StartFullSearch()
Definition: bbgrid.h:701
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:849
void RepositionIterator()
Definition: bbgrid.h:954
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:884
BBC * NextFullSearch()
Definition: bbgrid.h:711
int gridsize() const
Definition: bbgrid.h:63
int gridheight() const
Definition: bbgrid.h:69
int gridwidth() const
Definition: bbgrid.h:66
const ICOORD & bleft() const
Definition: bbgrid.h:72
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:53
const ICOORD & tright() const
Definition: bbgrid.h:75
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:649
void Clear()
Definition: bbgrid.h:497
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:488
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:506
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633
bool IsImageType() const
Definition: colpartition.h:429
void set_space_to_left(int space)
Definition: colpartition.h:276
BlobTextFlowType flow() const
Definition: colpartition.h:153
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
bool IsHorizontalLine() const
Definition: colpartition.h:459
PolyBlockType type() const
Definition: colpartition.h:180
ColPartition * CopyButDontOwnBlobs()
ColPartition * SplitAt(int split_x)
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:186
ColPartition * nearest_neighbor_below() const
Definition: colpartition.h:255
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:390
void set_nearest_neighbor_above(ColPartition *part)
Definition: colpartition.h:252
BlobRegionType blob_type() const
Definition: colpartition.h:147
void AddBox(BLOBNBOX *box)
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:150
void set_nearest_neighbor_below(ColPartition *part)
Definition: colpartition.h:258
void set_space_above(int space)
Definition: colpartition.h:264
void set_inside_table_column(bool val)
Definition: colpartition.h:246
bool MatchingSizes(const ColPartition &other) const
int LeftAtY(int y) const
Definition: colpartition.h:340
void set_space_to_right(int space)
Definition: colpartition.h:282
ColPartition * ShallowCopy() const
bool IsInSameColumnAs(const ColPartition &part) const
int space_to_right() const
Definition: colpartition.h:279
ColPartition * nearest_neighbor_above() const
Definition: colpartition.h:249
ColPartition * SingletonPartner(bool upper)
const TBOX & bounding_box() const
Definition: colpartition.h:108
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_space_below(int space)
Definition: colpartition.h:270
int RightAtY(int y) const
Definition: colpartition.h:344
void Absorb(ColPartition *other, const WidthCallback &cb)
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:156
void RefinePartitionPartners(bool get_desperate)
ColPartition * ColumnContaining(int x, int y)
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void InsertBox(const TBOX &other)
Definition: tablefind.cpp:2129
void set_bounding_box(const TBOX &other)
Definition: tablefind.h:65
void set_num_table_cells(int n)
Definition: tablefind.h:74
void set_num_text_cells(int n)
Definition: tablefind.h:83
ColSegType type() const
Definition: tablefind.h:87
ScrollView::Color BoxColor() const
Definition: tablefind.cpp:2118
const TBOX & bounding_box() const
Definition: tablefind.h:45
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1950
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1496
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1574
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:403
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1728
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
Definition: tablefind.cpp:181
const ICOORD & bleft() const
Definition: tablefind.cpp:388
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1625
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1320
const ICOORD & tright() const
Definition: tablefind.cpp:391
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:437
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:541
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
Definition: tablefind.cpp:2046
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:571
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:716
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1374
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:875
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:582
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:969
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1658
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:525
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:766
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:397
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1219
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:411
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1838
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:763
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:403
int gridheight() const
Definition: tablefind.cpp:385
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
Definition: tablefind.cpp:193
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:589
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:769
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:419
void set_left_to_right_language(bool order)
Definition: tablefind.cpp:177
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:503
ColSegmentGrid table_grid_
Definition: tablefind.h:405
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1186
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1596
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:401
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:395
void SetVerticalSpacing(ColPartition *part)
Definition: tablefind.cpp:671
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb, const FCOORD &reskew)
Definition: tablefind.cpp:261
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:2001
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1970
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:844
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:395
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:422
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:490
const TBOX & bounding_box() const
Definition: tablerecog.cpp:104
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:297
void set_max_text_height(int height)
Definition: tablerecog.cpp:737
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:728
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:725
void set_min_height(int height)
Definition: tablerecog.cpp:731
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:741
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:511
void Pen(Color color)
Definition: scrollview.cpp:723
void Brush(Color color)
Definition: scrollview.cpp:729
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:589