tesseract  5.0.0
tesseract::TableFinder Class Reference

#include <tablefind.h>

Inheritance diagram for tesseract::TableFinder:
tesseract::TestableTableFinder

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORD & bleft () const
 
const ICOORD & tright () const
 
ScrollView * MakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 121 of file tablefind.h.

Constructor & Destructor Documentation

◆ TableFinder()

tesseract::TableFinder::TableFinder ( )

Definition at line 160 of file tablefind.cpp.

◆ ~TableFinder()

tesseract::TableFinder::~TableFinder ( )

Definition at line 167 of file tablefind.cpp.

167  {
168  // ColPartitions and ColSegments created by this class for storage in grids
169  // need to be deleted explicitly.
170  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175 }
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:506
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:397
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:403
ColSegmentGrid table_grid_
Definition: tablefind.h:405
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:401
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:395

Member Function Documentation

◆ AdjustTableBoundaries()

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1540 of file tablefind.cpp.

1540  {
1541  // Iterate the table regions in the grid
1542  ColSegment_CLIST adjusted_tables;
1543  ColSegment_C_IT it(&adjusted_tables);
1544  ColSegmentGridSearch gsearch(&table_grid_);
1545  gsearch.StartFullSearch();
1546  ColSegment *table = nullptr;
1547  while ((table = gsearch.NextFullSearch()) != nullptr) {
1548  const TBOX &table_box = table->bounding_box();
1549  TBOX grown_box = table_box;
1550  GrowTableBox(table_box, &grown_box);
1551  // To prevent a table from expanding again, do not insert the
1552  // modified box back to the grid. Instead move it to a list and
1553  // remove it from the grid. The list is moved later back to the grid.
1554  if (!grown_box.null_box()) {
1555  auto *col = new ColSegment();
1556  col->InsertBox(grown_box);
1557  it.add_after_then_move(col);
1558  }
1559  gsearch.RemoveBBox();
1560  delete table;
1561  }
1562  // clear table grid to move final tables in it
1563  // TODO(nbeato): table_grid_ should already be empty. The above loop
1564  // removed everything. Maybe just assert it is empty?
1565  table_grid_.Clear();
1566  it.move_to_first();
1567  // move back final tables to table_grid_
1568  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1569  ColSegment *seg = it.extract();
1570  table_grid_.InsertBBox(true, true, seg);
1571  }
1572 }
@ TBOX
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:111
void Clear()
Definition: bbgrid.h:497
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1574

◆ AllowBlob()

bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX & blob) const
protected

Definition at line 503 of file tablefind.cpp.

503  {
504  const TBOX &box = blob.bounding_box();
505  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507  const int median_area = global_median_xheight_ * global_median_blob_width_;
508  const double kAreaRequired = median_area * kAllowBlobArea;
509  // Keep comparisons strictly greater to disallow 0!
510  return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511  box.area() > kAreaRequired;
512 }
const double kAllowBlobHeight
Definition: tablefind.cpp:56
const double kAllowBlobArea
Definition: tablefind.cpp:58
const double kAllowBlobWidth
Definition: tablefind.cpp:57

◆ AllowTextPartition()

bool tesseract::TableFinder::AllowTextPartition ( const ColPartition & part) const
protected

Definition at line 490 of file tablefind.cpp.

490  {
491  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493  const int median_area = global_median_xheight_ * global_median_blob_width_;
494  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495  // Keep comparisons strictly greater to disallow 0!
496  return part.median_height() > kHeightRequired &&
497  part.median_width() > kWidthRequired &&
498  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499 }
const double kAllowTextArea
Definition: tablefind.cpp:51
const double kAllowTextWidth
Definition: tablefind.cpp:50
const double kAllowTextHeight
Definition: tablefind.cpp:49

◆ BelongToOneTable()

bool tesseract::TableFinder::BelongToOneTable ( const TBOX & box1,
const TBOX & box2 
)
protected

Definition at line 1496 of file tablefind.cpp.

1496  {
1497  // Check the obvious case. Most likely not true because overlapping boxes
1498  // should already be merged, but seems like a good thing to do in case things
1499  // change.
1500  if (box1.overlap(box2)) {
1501  return true;
1502  }
1503  // Check for ColPartitions spanning both table regions
1504  TBOX bbox = box1.bounding_union(box2);
1505  // Start a rect search on bbox
1506  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1507  &clean_part_grid_);
1508  rectsearch.StartRectSearch(bbox);
1509  ColPartition *part = nullptr;
1510  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1511  const TBOX &part_box = part->bounding_box();
1512  // return true if a colpartition spanning both table regions is found
1513  if (part_box.overlap(box1) && part_box.overlap(box2) &&
1514  !part->IsImageType()) {
1515  return true;
1516  }
1517  }
1518  return false;
1519 }

◆ bleft()

const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 388 of file tablefind.cpp.

388  {
389  return clean_part_grid_.bleft();
390 }
const ICOORD & bleft() const
Definition: bbgrid.h:72

◆ ConsecutiveBoxes()

bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX & b1,
const TBOX & b2 
)
protected

Definition at line 571 of file tablefind.cpp.

571  {
572  int x_margin = 20;
573  int y_margin = 5;
574  return (abs(b1.left() - b2.left()) < x_margin) &&
575  (abs(b1.right() - b2.right()) < x_margin) &&
576  (abs(b1.top() - b2.bottom()) < y_margin ||
577  abs(b2.top() - b1.bottom()) < y_margin);
578 }

◆ DeleteSingleColumnTables()

void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1769 of file tablefind.cpp.

1769  {
1770  int page_width = tright().x() - bleft().x();
1771  ASSERT_HOST(page_width > 0);
1772  // create an integer array to hold projection on x-axis
1773  int *table_xprojection = new int[page_width];
1774  // Iterate through all tables in the table grid
1775  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
1776  &table_grid_);
1777  table_search.StartFullSearch();
1778  ColSegment *table;
1779  while ((table = table_search.NextFullSearch()) != nullptr) {
1780  TBOX table_box = table->bounding_box();
1781  // reset the projection array
1782  for (int i = 0; i < page_width; i++) {
1783  table_xprojection[i] = 0;
1784  }
1785  // Start a rect search on table_box
1786  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1787  &clean_part_grid_);
1788  rectsearch.SetUniqueMode(true);
1789  rectsearch.StartRectSearch(table_box);
1790  ColPartition *part;
1791  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1792  if (!part->IsTextType()) {
1793  continue; // Do not consider non-text partitions
1794  }
1795  if (part->flow() == BTFT_LEADER) {
1796  continue; // Assume leaders are in tables
1797  }
1798  TBOX part_box = part->bounding_box();
1799  // Do not consider partitions partially covered by the table
1800  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1801  continue;
1802  }
1803  BLOBNBOX_CLIST *part_boxes = part->boxes();
1804  BLOBNBOX_C_IT pit(part_boxes);
1805 
1806  // Make sure overlapping blobs don't artificially inflate the number
1807  // of rows in the table. This happens frequently with things such as
1808  // decimals and split characters. Do this by assuming the column
1809  // partition is sorted mostly left to right and just clip
1810  // bounding boxes by the previous box's extent.
1811  int next_position_to_write = 0;
1812 
1813  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1814  BLOBNBOX *pblob = pit.data();
1815  // ignore blob height for the purpose of projection since we
1816  // are only interested in finding valleys
1817  int xstart = pblob->bounding_box().left();
1818  int xend = pblob->bounding_box().right();
1819 
1820  xstart = std::max(xstart, next_position_to_write);
1821  for (int i = xstart; i < xend; i++) {
1822  table_xprojection[i - bleft().x()]++;
1823  }
1824  next_position_to_write = xend;
1825  }
1826  }
1827  // Find largest valley between two reasonable peaks in the table
1828  if (!GapInXProjection(table_xprojection, page_width)) {
1829  table_search.RemoveBBox();
1830  delete table;
1831  }
1832  }
1833  delete[] table_xprojection;
1834 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
@ BTFT_LEADER
Definition: blobbox.h:117
TDimension x() const
access function
Definition: points.h:58
const ICOORD & bleft() const
Definition: tablefind.cpp:388
const ICOORD & tright() const
Definition: tablefind.cpp:391
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1838

◆ DisplayColPartitionConnections()

void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 2001 of file tablefind.cpp.

2003  {
2004  // Iterate the ColPartitions in the grid.
2005  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2006  gsearch.StartFullSearch();
2007  ColPartition *part = nullptr;
2008  while ((part = gsearch.NextFullSearch()) != nullptr) {
2009  const TBOX &box = part->bounding_box();
2010  int left_x = box.left();
2011  int right_x = box.right();
2012  int top_y = box.top();
2013  int bottom_y = box.bottom();
2014 
2015  ColPartition *upper_part = part->nearest_neighbor_above();
2016  if (upper_part) {
2017  const TBOX &upper_box = upper_part->bounding_box();
2018  int mid_x = (left_x + right_x) / 2;
2019  int mid_y = (top_y + bottom_y) / 2;
2020  int other_x = (upper_box.left() + upper_box.right()) / 2;
2021  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2022  win->Brush(ScrollView::NONE);
2023  win->Pen(color);
2024  win->Line(mid_x, mid_y, other_x, other_y);
2025  }
2026  ColPartition *lower_part = part->nearest_neighbor_below();
2027  if (lower_part) {
2028  const TBOX &lower_box = lower_part->bounding_box();
2029  int mid_x = (left_x + right_x) / 2;
2030  int mid_y = (top_y + bottom_y) / 2;
2031  int other_x = (lower_box.left() + lower_box.right()) / 2;
2032  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2033  win->Brush(ScrollView::NONE);
2034  win->Pen(color);
2035  win->Line(mid_x, mid_y, other_x, other_y);
2036  }
2037  }
2038  win->UpdateWindow();
2039 }

◆ DisplayColPartitions() [1/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1996 of file tablefind.cpp.

1997  {
1998  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1999 }
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1970

◆ DisplayColPartitions() [2/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView * win,
ColPartitionGrid * grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1970 of file tablefind.cpp.

1972  {
1973  ScrollView::Color color = default_color;
1974  // Iterate the ColPartitions in the grid.
1975  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
1976  gsearch.StartFullSearch();
1977  ColPartition *part = nullptr;
1978  while ((part = gsearch.NextFullSearch()) != nullptr) {
1979  color = default_color;
1980  if (part->type() == PT_TABLE) {
1981  color = table_color;
1982  }
1983 
1984  const TBOX &box = part->bounding_box();
1985  int left_x = box.left();
1986  int right_x = box.right();
1987  int top_y = box.top();
1988  int bottom_y = box.bottom();
1989  win->Brush(ScrollView::NONE);
1990  win->Pen(color);
1991  win->Rectangle(left_x, bottom_y, right_x, top_y);
1992  }
1993  win->UpdateWindow();
1994 }

◆ DisplayColSegments()

void tesseract::TableFinder::DisplayColSegments ( ScrollView * win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1950 of file tablefind.cpp.

1951  {
1952  win->Pen(color);
1953  win->Brush(ScrollView::NONE);
1954  ColSegment_IT it(segments);
1955  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1956  ColSegment *col = it.data();
1957  const TBOX &box = col->bounding_box();
1958  int left_x = box.left();
1959  int right_x = box.right();
1960  int top_y = box.top();
1961  int bottom_y = box.bottom();
1962  win->Rectangle(left_x, bottom_y, right_x, top_y);
1963  }
1964  win->UpdateWindow();
1965 }

◆ FilterFalseAlarms()

void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 1015 of file tablefind.cpp.

1015  {
1016  FilterParagraphEndings();
1017  FilterHeaderAndFooter();
1018  // TODO(nbeato): Fully justified text as non-table?
1019 }

◆ FilterHeaderAndFooter()

void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1110 of file tablefind.cpp.

1110  {
1111  // Consider top-most text colpartition as header and bottom most as footer
1112  ColPartition *header = nullptr;
1113  ColPartition *footer = nullptr;
1114  int max_top = INT32_MIN;
1115  int min_bottom = INT32_MAX;
1116  ColPartitionGridSearch gsearch(&clean_part_grid_);
1117  gsearch.StartFullSearch();
1118  ColPartition *part = nullptr;
1119  while ((part = gsearch.NextFullSearch()) != nullptr) {
1120  if (!part->IsTextType()) {
1121  continue; // Consider only text partitions
1122  }
1123  int top = part->bounding_box().top();
1124  int bottom = part->bounding_box().bottom();
1125  if (top > max_top) {
1126  max_top = top;
1127  header = part;
1128  }
1129  if (bottom < min_bottom) {
1130  min_bottom = bottom;
1131  footer = part;
1132  }
1133  }
1134  if (header) {
1135  header->clear_table_type();
1136  }
1137  if (footer) {
1138  footer->clear_table_type();
1139  }
1140 }
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919

◆ FilterParagraphEndings()

void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 1021 of file tablefind.cpp.

1021  {
1022  // Detect last line of paragraph
1023  // Iterate the ColPartitions in the grid.
1024  ColPartitionGridSearch gsearch(&clean_part_grid_);
1025  gsearch.StartFullSearch();
1026  ColPartition *part = nullptr;
1027  while ((part = gsearch.NextFullSearch()) != nullptr) {
1028  if (part->type() != PT_TABLE) {
1029  continue; // Consider only table partitions
1030  }
1031 
1032  // Paragraph ending should have flowing text above it.
1033  ColPartition *upper_part = part->nearest_neighbor_above();
1034  if (!upper_part) {
1035  continue;
1036  }
1037  if (upper_part->type() != PT_FLOWING_TEXT) {
1038  continue;
1039  }
1040  if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1041  continue;
1042  }
1043  // Check if it's the last line of a paragraph.
1044  // In most cases, a paragraph ending should be left-aligned to text line
1045  // above it. Sometimes, it could be a 2 line paragraph, in which case
1046  // the line above it is indented.
1047  // To account for that, check if the partition center is to
1048  // the left of the one above it.
1049  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1050  int upper_mid = (upper_part->bounding_box().left() +
1051  upper_part->bounding_box().right()) /
1052  2;
1053  int current_spacing = 0; // spacing of the current line to margin
1054  int upper_spacing = 0; // spacing of the previous line to the margin
1055  if (left_to_right_language_) {
1056  // Left to right languages, use mid - left to figure out the distance
1057  // the middle is from the left margin.
1058  int left = std::min(part->bounding_box().left(),
1059  upper_part->bounding_box().left());
1060  current_spacing = mid - left;
1061  upper_spacing = upper_mid - left;
1062  } else {
1063  // Right to left languages, use right - mid to figure out the distance
1064  // the middle is from the right margin.
1065  int right = std::max(part->bounding_box().right(),
1066  upper_part->bounding_box().right());
1067  current_spacing = right - mid;
1068  upper_spacing = right - upper_mid;
1069  }
1070  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1071  continue;
1072  }
1073 
1074  // Paragraphs should have similar fonts.
1075  if (!part->MatchingSizes(*upper_part) ||
1076  !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1077  kStrokeWidthConstantTolerance)) {
1078  continue;
1079  }
1080 
1081  // The last line of a paragraph should be left aligned.
1082  // TODO(nbeato): This would be untrue if the text was right aligned.
1083  // How often is that?
1084  if (part->space_to_left() >
1085  kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) {
1086  continue;
1087  }
1088  // The line above it should be right aligned (assuming justified format).
1089  // Since we can't assume justified text, we compare whitespace to text.
1090  // The above line should have majority spanning text (or the current
1091  // line could have fit on the previous line). So compare
1092  // whitespace to text.
1093  if (upper_part->bounding_box().width() <
1094  kMinParagraphEndingTextToWhitespaceRatio *
1095  upper_part->space_to_right()) {
1096  continue;
1097  }
1098 
1099  // Ledding above the line should be less than ledding below
1100  if (part->space_above() >= part->space_below() ||
1101  part->space_above() > 2 * global_median_ledding_) {
1102  continue;
1103  }
1104 
1105  // If all checks failed, it is probably text.
1106  part->clear_table_type();
1107  }
1108 }
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:126
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:140
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:132
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:141
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:122
@ PT_FLOWING_TEXT
Definition: publictypes.h:55

◆ FindNeighbors()

void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 773 of file tablefind.cpp.

773  {
774  ColPartitionGridSearch gsearch(&clean_part_grid_);
775  gsearch.StartFullSearch();
776  ColPartition *part = nullptr;
777  while ((part = gsearch.NextFullSearch()) != nullptr) {
778  // TODO(nbeato): Rename this function, meaning is different now.
779  // IT is finding nearest neighbors its own way
780  // SetVerticalSpacing(part);
781 
782  ColPartition *upper = part->SingletonPartner(true);
783  if (upper) {
784  part->set_nearest_neighbor_above(upper);
785  }
786 
787  ColPartition *lower = part->SingletonPartner(false);
788  if (lower) {
789  part->set_nearest_neighbor_below(lower);
790  }
791  }
792 }

◆ GapInXProjection()

bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1838 of file tablefind.cpp.

1838  {
1839  // Find peak value of the histogram
1840  int peak_value = 0;
1841  for (int i = 0; i < length; i++) {
1842  if (xprojection[i] > peak_value) {
1843  peak_value = xprojection[i];
1844  }
1845  }
1846  // Peak value represents the maximum number of horizontally
1847  // overlapping colpartitions, so this can be considered as the
1848  // number of rows in the table
1849  if (peak_value < kMinRowsInTable) {
1850  return false;
1851  }
1852  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1853  if (peak_value >= kLargeTableRowCount) {
1854  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1855  }
1856  // Threshold the histogram
1857  for (int i = 0; i < length; i++) {
1858  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1859  }
1860  // Find the largest run of zeros between two ones
1861  int largest_gap = 0;
1862  int run_start = -1;
1863  for (int i = 1; i < length; i++) {
1864  // detect start of a run of zeros
1865  if (xprojection[i - 1] && !xprojection[i]) {
1866  run_start = i;
1867  }
1868  // detect end of a run of zeros and update the value of largest gap
1869  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1870  int gap = i - run_start;
1871  if (gap > largest_gap) {
1872  largest_gap = gap;
1873  }
1874  run_start = -1;
1875  }
1876  }
1877  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1878 }
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:107
const int kMinRowsInTable
Definition: tablefind.cpp:112
const int kLargeTableRowCount
Definition: tablefind.cpp:109
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:106
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:136

◆ GetColumnBlocks()

void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 525 of file tablefind.cpp.

526  {
527  for (int i = 0; i < gridheight(); ++i) {
528  ColPartitionSet *columns = all_columns[i];
529  if (columns != nullptr) {
530  ColSegment_LIST new_blocks;
531  // Get boxes from the current vertical position on the grid
532  columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533  &new_blocks);
534  // Merge the new_blocks boxes into column_blocks if they are well-aligned
535  GroupColumnBlocks(&new_blocks, column_blocks);
536  }
537  }
538 }
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:541
int gridheight() const
Definition: tablefind.cpp:385

◆ GetTableColumns()

void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1320 of file tablefind.cpp.

1320  {
1321  ColSegment_IT it(table_columns);
1322  // Iterate the ColPartitions in the grid.
1323  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
1324  &clean_part_grid_);
1325  gsearch.StartFullSearch();
1326  ColPartition *part;
1327  while ((part = gsearch.NextFullSearch()) != nullptr) {
1328  if (part->inside_table_column() || part->type() != PT_TABLE) {
1329  continue; // prevent a partition to be assigned to multiple columns
1330  }
1331  const TBOX &box = part->bounding_box();
1332  auto *col = new ColSegment();
1333  col->InsertBox(box);
1334  part->set_inside_table_column(true);
1335  // Start a search below the current cell to find bottom neighbours
1336  // Note: a full search will always process things above it first, so
1337  // this should be starting at the highest cell and working its way down.
1338  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> vsearch(
1339  &clean_part_grid_);
1340  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1341  ColPartition *neighbor = nullptr;
1342  bool found_neighbours = false;
1343  while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1344  // only consider neighbors not assigned to any column yet
1345  if (neighbor->inside_table_column()) {
1346  continue;
1347  }
1348  // Horizontal lines should not break the flow
1349  if (neighbor->IsHorizontalLine()) {
1350  continue;
1351  }
1352  // presence of a non-table neighbor marks the end of current
1353  // table column
1354  if (neighbor->type() != PT_TABLE) {
1355  break;
1356  }
1357  // add the neighbor partition to the table column
1358  const TBOX &neighbor_box = neighbor->bounding_box();
1359  col->InsertBox(neighbor_box);
1360  neighbor->set_inside_table_column(true);
1361  found_neighbours = true;
1362  }
1363  if (found_neighbours) {
1364  it.add_after_then_move(col);
1365  } else {
1366  part->set_inside_table_column(false);
1367  delete col;
1368  }
1369  }
1370 }

◆ GetTableRegions()

void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1374 of file tablefind.cpp.

1375  {
1376  ColSegment_IT cit(table_columns);
1377  ColSegment_IT rit(table_regions);
1378  // Iterate through column blocks
1379  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1380  &col_seg_grid_);
1381  gsearch.StartFullSearch();
1382  ColSegment *part;
1383  int page_height = tright().y() - bleft().y();
1384  ASSERT_HOST(page_height > 0);
1385  // create a bool array to hold projection on y-axis
1386  bool *table_region = new bool[page_height];
1387  while ((part = gsearch.NextFullSearch()) != nullptr) {
1388  const TBOX &part_box = part->bounding_box();
1389  // reset the projection array
1390  for (int i = 0; i < page_height; i++) {
1391  table_region[i] = false;
1392  }
1393  // iterate through all table columns to find regions in the current
1394  // page column block
1395  cit.move_to_first();
1396  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397  TBOX col_box = cit.data()->bounding_box();
1398  // find intersection region of table column and page column
1399  TBOX intersection_box = col_box.intersection(part_box);
1400  // project table column on the y-axis
1401  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1402  table_region[i - bleft().y()] = true;
1403  }
1404  }
1405  // set x-limits of table regions to page column width
1406  TBOX current_table_box;
1407  current_table_box.set_left(part_box.left());
1408  current_table_box.set_right(part_box.right());
1409  // go through the y-axis projection to find runs of table
1410  // regions. Each run makes one table region.
1411  for (int i = 1; i < page_height; i++) {
1412  // detect start of a table region
1413  if (!table_region[i - 1] && table_region[i]) {
1414  current_table_box.set_bottom(i + bleft().y());
1415  }
1416  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1417  // detect end of a table region
1418  if (table_region[i - 1] && !table_region[i]) {
1419  current_table_box.set_top(i + bleft().y());
1420  if (!current_table_box.null_box()) {
1421  auto *seg = new ColSegment();
1422  seg->InsertBox(current_table_box);
1423  rit.add_after_then_move(seg);
1424  }
1425  }
1426  }
1427  }
1428  delete[] table_region;
1429 }
TDimension y() const
access_function
Definition: points.h:62

◆ gridheight()

int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 385 of file tablefind.cpp.

385  {
386  return clean_part_grid_.gridheight();
387 }
int gridheight() const
Definition: bbgrid.h:69

◆ GridMergeColumnBlocks()

void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1238 of file tablefind.cpp.

1238  {
1239  int margin = gridsize();
1240 
1241  // Iterate the Column Blocks in the grid.
1242  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1243  &col_seg_grid_);
1244  gsearch.StartFullSearch();
1245  ColSegment *seg;
1246  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1247  if (seg->type() != COL_TEXT) {
1248  continue; // only consider text blocks for split detection
1249  }
1250  bool neighbor_found = false;
1251  bool modified = false; // Modified at least once
1252  // keep expanding current box as long as neighboring table columns
1253  // are found above or below it.
1254  do {
1255  TBOX box = seg->bounding_box();
1256  // slightly expand the search region vertically
1257  int top_range =
1258  std::min(box.top() + margin, static_cast<int>(tright().y()));
1259  int bottom_range =
1260  std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1261  box.set_top(top_range);
1262  box.set_bottom(bottom_range);
1263  neighbor_found = false;
1264  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1265  &col_seg_grid_);
1266  rectsearch.StartRectSearch(box);
1267  ColSegment *neighbor = nullptr;
1268  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1269  if (neighbor == seg) {
1270  continue;
1271  }
1272  const TBOX &neighbor_box = neighbor->bounding_box();
1273  // If the neighbor box significantly overlaps with the current
1274  // box (due to the expansion of the current box in the
1275  // previous iteration of this loop), remove the neighbor box
1276  // and expand the current box to include it.
1277  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1278  seg->InsertBox(neighbor_box);
1279  modified = true;
1280  rectsearch.RemoveBBox();
1281  gsearch.RepositionIterator();
1282  delete neighbor;
1283  continue;
1284  }
1285  // Only expand if the neighbor box is of table type
1286  if (neighbor->type() != COL_TABLE) {
1287  continue;
1288  }
1289  // Insert the neighbor box into the current column block
1290  if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1291  seg->InsertBox(neighbor_box);
1292  neighbor_found = true;
1293  modified = true;
1294  rectsearch.RemoveBBox();
1295  gsearch.RepositionIterator();
1296  delete neighbor;
1297  }
1298  }
1299  } while (neighbor_found);
1300  if (modified) {
1301  // Because the box has changed, it has to be removed first.
1302  gsearch.RemoveBBox();
1303  col_seg_grid_.InsertBBox(true, true, seg);
1304  gsearch.RepositionIterator();
1305  }
1306  }
1307 }
@ COL_TEXT
Definition: tablefind.h:29
@ COL_TABLE
Definition: tablefind.h:29

◆ GridMergeTableRegions()

void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1437 of file tablefind.cpp.

1437  {
1438  // Iterate the table regions in the grid.
1439  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1440  &table_grid_);
1441  gsearch.StartFullSearch();
1442  ColSegment *seg = nullptr;
1443  while ((seg = gsearch.NextFullSearch()) != nullptr) {
1444  bool neighbor_found = false;
1445  bool modified = false; // Modified at least once
1446  do {
1447  // Start a rectangle search x-bounded by the image and y by the table
1448  const TBOX &box = seg->bounding_box();
1449  TBOX search_region(box);
1450  search_region.set_left(bleft().x());
1451  search_region.set_right(tright().x());
1452  neighbor_found = false;
1453  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1454  &table_grid_);
1455  rectsearch.StartRectSearch(search_region);
1456  ColSegment *neighbor = nullptr;
1457  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1458  if (neighbor == seg) {
1459  continue;
1460  }
1461  const TBOX &neighbor_box = neighbor->bounding_box();
1462  // Check if a neighbor box has a large overlap with the table
1463  // region. This may happen as a result of merging two table
1464  // regions in the previous iteration.
1465  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1466  seg->InsertBox(neighbor_box);
1467  rectsearch.RemoveBBox();
1468  gsearch.RepositionIterator();
1469  delete neighbor;
1470  modified = true;
1471  continue;
1472  }
1473  // Check if two table regions belong together based on a common
1474  // horizontal ruling line
1475  if (BelongToOneTable(box, neighbor_box)) {
1476  seg->InsertBox(neighbor_box);
1477  neighbor_found = true;
1478  modified = true;
1479  rectsearch.RemoveBBox();
1480  gsearch.RepositionIterator();
1481  delete neighbor;
1482  }
1483  }
1484  } while (neighbor_found);
1485  if (modified) {
1486  // Because the box has changed, it has to be removed first.
1487  gsearch.RemoveBBox();
1488  table_grid_.InsertBBox(true, true, seg);
1489  gsearch.RepositionIterator();
1490  }
1491  }
1492 }
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1496

◆ gridsize()

int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 379 of file tablefind.cpp.

379  {
380  return clean_part_grid_.gridsize();
381 }
int gridsize() const
Definition: bbgrid.h:63

◆ gridwidth()

int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 382 of file tablefind.cpp.

382  {
383  return clean_part_grid_.gridwidth();
384 }
int gridwidth() const
Definition: bbgrid.h:66

◆ GroupColumnBlocks()

void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 541 of file tablefind.cpp.

542  {
543  ColSegment_IT src_it(new_blocks);
544  ColSegment_IT dest_it(column_blocks);
545  // iterate through the source list
546  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547  ColSegment *src_seg = src_it.data();
548  const TBOX &src_box = src_seg->bounding_box();
549  bool match_found = false;
550  // iterate through the destination list to find a matching column block
551  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552  ColSegment *dest_seg = dest_it.data();
553  TBOX dest_box = dest_seg->bounding_box();
554  if (ConsecutiveBoxes(src_box, dest_box)) {
555  // If matching block is found, insert the current block into it
556  // and delete the source block.
557  dest_seg->InsertBox(src_box);
558  match_found = true;
559  delete src_it.extract();
560  break;
561  }
562  }
563  // If no match is found, just append the source block to column_blocks
564  if (!match_found) {
565  dest_it.add_after_then_move(src_it.extract());
566  }
567  }
568 }
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:571

◆ GrowTableBox()

void tesseract::TableFinder::GrowTableBox ( const TBOX & table_box,
TBOX * result_box
)
protected

Definition at line 1574 of file tablefind.cpp.

1574  {
1575  // TODO(nbeato): The growing code is a bit excessive right now.
1576  // By removing these lines, the partitions considered need
1577  // to have some overlap or be special cases. These lines could
1578  // be added again once a check is put in place to make sure that
1579  // growing tables don't stomp on a lot of non-table partitions.
1580 
1581  // search for horizontal ruling lines within the vertical margin
1582  // int vertical_margin = kRulingVerticalMargin * gridsize();
1583  TBOX search_box = table_box;
1584  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1585  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1586  // search_box.set_top(top);
1587  // search_box.set_bottom(bottom);
1588 
1589  GrowTableToIncludePartials(table_box, search_box, result_box);
1590  GrowTableToIncludeLines(table_box, search_box, result_box);
1591  IncludeLeftOutColumnHeaders(result_box);
1592 }
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1728
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1625
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1596

◆ GrowTableToIncludeLines()

void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX & table_box,
const TBOX & search_range,
TBOX * result_box
)
protected

Definition at line 1625 of file tablefind.cpp.

1627  {
1629  rsearch.SetUniqueMode(true);
1630  rsearch.StartRectSearch(search_range);
1631  ColPartition *part = nullptr;
1632  while ((part = rsearch.NextRectSearch()) != nullptr) {
1633  // TODO(nbeato) This should also do vertical, but column
1634  // boundaries are breaking things. This function needs to be
1635  // updated to allow vertical lines as well.
1636  if (!part->IsLineType()) {
1637  continue;
1638  }
1639  // Avoid the following function call if the result of the
1640  // function is irrelevant.
1641  const TBOX &part_box = part->bounding_box();
1642  if (result_box->contains(part_box)) {
1643  continue;
1644  }
1645  // Include a partially overlapping horizontal line only if the
1646  // extra ColPartitions that will be included due to expansion
1647  // have large side spacing w.r.t. columns containing them.
1648  if (HLineBelongsToTable(*part, table_box)) {
1649  *result_box = result_box->bounding_union(part_box);
1650  }
1651  // TODO(nbeato): Vertical
1652  }
1653 }
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1658

◆ GrowTableToIncludePartials()

void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX & table_box,
const TBOX & search_range,
TBOX * result_box
)
protected

Definition at line 1596 of file tablefind.cpp.

1598  {
1599  // Rulings are in a different grid, so search 2 grids for rulings, text,
1600  // and table partitions that are not entirely within the new box.
1601  for (int i = 0; i < 2; ++i) {
1602  ColPartitionGrid *grid =
1604  ColPartitionGridSearch rectsearch(grid);
1605  rectsearch.StartRectSearch(search_range);
1606  ColPartition *part = nullptr;
1607  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1608  // Only include text and table types.
1609  if (part->IsImageType()) {
1610  continue;
1611  }
1612  const TBOX &part_box = part->bounding_box();
1613  // Include partition in the table if more than half of it
1614  // is covered by the table
1615  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1616  *result_box = result_box->bounding_union(part_box);
1617  continue;
1618  }
1619  }
1620  }
1621 }

◆ HasLeaderAdjacent()

bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition & part)
protected

Definition at line 969 of file tablefind.cpp.

969  {
970  if (part.flow() == BTFT_LEADER) {
971  return true;
972  }
973  // Search range is left and right bounded by an offset of the
974  // median xheight. This offset is to allow some tolerance to the
975  // leaders on the page in the event that the alignment is still
976  // a bit off.
977  const TBOX &box = part.bounding_box();
978  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
979  const int top = box.top() + search_size;
980  const int bottom = box.bottom() - search_size;
982  for (int direction = 0; direction < 2; ++direction) {
983  bool right_to_left = (direction == 0);
984  int x = right_to_left ? box.right() : box.left();
985  hsearch.StartSideSearch(x, bottom, top);
986  ColPartition *leader = nullptr;
987  while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
988  // The leader could be a horizontal ruling in the grid.
989  // Make sure it is actually a leader.
990  if (leader->flow() != BTFT_LEADER) {
991  continue;
992  }
993  // This should not happen, they are in different grids.
994  ASSERT_HOST(&part != leader);
995  // Make sure the leader shares a page column with the partition,
996  // otherwise we are spreading across columns.
997  if (!part.IsInSameColumnAs(*leader)) {
998  break;
999  }
1000  // There should be a significant vertical overlap
1001  if (!leader->VSignificantCoreOverlap(part)) {
1002  continue;
1003  }
1004  // Leader passed all tests, so it is adjacent.
1005  return true;
1006  }
1007  }
1008  // No leaders are adjacent to the given partition.
1009  return false;
1010 }
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:117

◆ HasWideOrNoInterWordGap()

bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition * part) const
protected

Definition at line 875 of file tablefind.cpp.

875  {
876  // Should only get text partitions.
877  ASSERT_HOST(part->IsTextType());
878  // Blob access
879  BLOBNBOX_CLIST *part_boxes = part->boxes();
880  BLOBNBOX_C_IT it(part_boxes);
881  // Check if this is a relatively small partition (such as a single word)
882  if (part->bounding_box().width() <
883  kMinBoxesInTextPartition * part->median_height() &&
884  part_boxes->length() < kMinBoxesInTextPartition) {
885  return true;
886  }
887 
888  // Variables used to compute inter-blob spacing.
889  int current_x0 = -1;
890  int current_x1 = -1;
891  int previous_x1 = -1;
892  // Stores the maximum gap detected.
893  int largest_partition_gap_found = -1;
894  // Text partition gap limits. If this is text (and not a table),
895  // there should be at least one gap larger than min_gap and no gap
896  // larger than max_gap.
897  const double max_gap = kMaxGapInTextPartition * part->median_height();
898  const double min_gap = kMinMaxGapInTextPartition * part->median_height();
899 
900  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
901  BLOBNBOX *blob = it.data();
902  current_x0 = blob->bounding_box().left();
903  current_x1 = blob->bounding_box().right();
904  if (previous_x1 != -1) {
905  int gap = current_x0 - previous_x1;
906 
907  // TODO(nbeato): Boxes may overlap? Huh?
908  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
909  // on the top right of the page are filtered out with this line.
910  // Note 2: Iterating over blobs in a partition, so we are looking for
911  // spacing between the words.
912  if (gap < 0) {
913  // More likely case, the blobs slightly overlap. This can happen
914  // with diacritics (accents) or broken alphabet symbols (characters).
915  // Merge boxes together by taking max of right sides.
916  if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
917  previous_x1 = std::max(previous_x1, current_x1);
918  continue;
919  }
920  // Extreme case, blobs overlap significantly in the same partition...
921  // This should not happen often (if at all), but it does.
922  // TODO(nbeato): investigate cases when this happens.
923  else {
924  // The behavior before was to completely ignore this case.
925  }
926  }
927 
928  // If a large enough gap is found, mark it as a table cell (return true)
929  if (gap > max_gap) {
930  return true;
931  }
932  if (gap > largest_partition_gap_found) {
933  largest_partition_gap_found = gap;
934  }
935  }
936  previous_x1 = current_x1;
937  }
938  // Since no large gap was found, return false if the partition is too
939  // long to be a data cell
940  if (part->bounding_box().width() >
941  kMaxBoxesInDataPartition * part->median_height() ||
942  part_boxes->length() > kMaxBoxesInDataPartition) {
943  return false;
944  }
945 
946  // A partition may be a single blob. In this case, it's an isolated symbol
947  // or non-text (such as a ruling or image).
948  // Detect these as table partitions? Shouldn't this be case by case?
949  // The behavior before was to ignore this, making max_partition_gap < 0
950  // and implicitly return true. Just making it explicit.
951  if (largest_partition_gap_found == -1) {
952  return true;
953  }
954 
955  // return true if the maximum gap found is smaller than the minimum allowed
956  // max_gap in a text partition. This indicates that there is no significant
957  // space in the partition, hence it is likely a single word.
958  return largest_partition_gap_found < min_gap;
959 }
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:73
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:63
const double kMaxGapInTextPartition
Definition: tablefind.cpp:69
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:66
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:77

◆ HLineBelongsToTable()

bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition & part,
const TBOX & table_box
)
protected

Definition at line 1658 of file tablefind.cpp.

1659  {
1660  if (!part.IsHorizontalLine()) {
1661  return false;
1662  }
1663  const TBOX &part_box = part.bounding_box();
1664  if (!part_box.major_x_overlap(table_box)) {
1665  return false;
1666  }
1667  // Do not consider top-most horizontal line since it usually
1668  // originates from noise.
1669  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1670  // have neighbors solved.
1671  // if (!part.nearest_neighbor_above())
1672  // return false;
1673  const TBOX bbox = part_box.bounding_union(table_box);
1674  // In the "unioned table" box (the table extents expanded by the line),
1675  // keep track of how many partitions have significant padding to the left
1676  // and right. If more than half of the partitions covered by the new table
1677  // have significant spacing, the line belongs to the table and the table
1678  // grows to include all of the partitions.
1679  int num_extra_partitions = 0;
1680  int extra_space_to_right = 0;
1681  int extra_space_to_left = 0;
1682  // Rulings are in a different grid, so search 2 grids for rulings, text,
1683  // and table partitions that are introduced by the new box.
1684  for (int i = 0; i < 2; ++i) {
1685  ColPartitionGrid *grid =
1687  // Start a rect search on bbox
1688  ColPartitionGridSearch rectsearch(grid);
1689  rectsearch.SetUniqueMode(true);
1690  rectsearch.StartRectSearch(bbox);
1691  ColPartition *extra_part = nullptr;
1692  while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1693  // ColPartition already in table
1694  const TBOX &extra_part_box = extra_part->bounding_box();
1695  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1696  continue;
1697  }
1698  // Non-text ColPartitions do not contribute
1699  if (extra_part->IsImageType()) {
1700  continue;
1701  }
1702  // Consider this partition.
1703  num_extra_partitions++;
1704  // presence of a table cell is a strong hint, so just increment the scores
1705  // without looking at the spacing.
1706  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1707  extra_space_to_right++;
1708  extra_space_to_left++;
1709  continue;
1710  }
1711  int space_threshold = kSideSpaceMargin * part.median_height();
1712  if (extra_part->space_to_right() > space_threshold) {
1713  extra_space_to_right++;
1714  }
1715  if (extra_part->space_to_left() > space_threshold) {
1716  extra_space_to_left++;
1717  }
1718  }
1719  }
1720  // tprintf("%d %d %d\n",
1721  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1722  return (extra_space_to_right > num_extra_partitions / 2) ||
1723  (extra_space_to_left > num_extra_partitions / 2);
1724 }
const int kSideSpaceMargin
Definition: tablefind.cpp:102

◆ IncludeLeftOutColumnHeaders()

void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX * table_box)
protected

Definition at line 1728 of file tablefind.cpp.

1728  {
1729  // Start a search above the current table to look for column headers
1731  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1732  table_box->top());
1733  ColPartition *neighbor = nullptr;
1734  ColPartition *previous_neighbor = nullptr;
1735  while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1736  // Max distance to find a table heading.
1737  const int max_distance =
1738  kMaxColumnHeaderDistance * neighbor->median_height();
1739  int table_top = table_box->top();
1740  const TBOX &box = neighbor->bounding_box();
1741  // Do not continue if the next box is way above
1742  if (box.bottom() - table_top > max_distance) {
1743  break;
1744  }
1745  // Unconditionally include partitions of type TABLE or LINE
1746  // TODO(faisal): add some reasonable conditions here
1747  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1748  table_box->set_top(box.top());
1749  previous_neighbor = nullptr;
1750  continue;
1751  }
1752  // If there are two text partitions, one above the other, without a table
1753  // cell on their left or right side, consider them a barrier and quit
1754  if (previous_neighbor == nullptr) {
1755  previous_neighbor = neighbor;
1756  } else {
1757  const TBOX &previous_box = previous_neighbor->bounding_box();
1758  if (!box.major_y_overlap(previous_box)) {
1759  break;
1760  }
1761  }
1762  }
1763 }
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:85

◆ Init()

void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD & bottom_left,
const ICOORD & top_right
)

Definition at line 181 of file tablefind.cpp.

182  {
183  // Initialize clean partitions list and grid
184  clean_part_grid_.Init(grid_size, bottom_left, top_right);
185  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187  col_seg_grid_.Init(grid_size, bottom_left, top_right);
188  table_grid_.Init(grid_size, bottom_left, top_right);
189 }
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:488

◆ InitializePartitions()

void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 582 of file tablefind.cpp.

582  {
583  FindNeighbors();
584  SetPartitionSpacings(&clean_part_grid_, all_columns);
586 }
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:716
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:589

◆ InsertCleanPartitions()

void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid * grid,
TO_BLOCK * block
)

Definition at line 193 of file tablefind.cpp.

194  {
195  // Calculate stats. This lets us filter partitions in AllowTextPartition()
196  // and filter blobs in AllowBlob().
197  SetGlobalSpacings(grid);
198 
199  // Iterate the ColPartitions in the grid.
200  ColPartitionGridSearch gsearch(grid);
201  gsearch.SetUniqueMode(true);
202  gsearch.StartFullSearch();
203  ColPartition *part = nullptr;
204  while ((part = gsearch.NextFullSearch()) != nullptr) {
205  // Reject partitions with nothing useful inside of them.
206  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207  continue;
208  }
209  ColPartition *clean_part = part->ShallowCopy();
210  ColPartition *leader_part = nullptr;
211  if (part->IsLineType()) {
212  InsertRulingPartition(clean_part);
213  continue;
214  }
215  // Insert all non-text partitions to clean_parts
216  if (!part->IsTextType()) {
217  InsertImagePartition(clean_part);
218  continue;
219  }
220  // Insert text colpartitions after removing noisy components from them
221  // The leaders are split into a separate grid.
222  BLOBNBOX_CLIST *part_boxes = part->boxes();
223  BLOBNBOX_C_IT pit(part_boxes);
224  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225  BLOBNBOX *pblob = pit.data();
226  // Bad blobs... happens in UNLV set.
227  // news.3G1, page 17 (around x=6)
228  if (!AllowBlob(*pblob)) {
229  continue;
230  }
231  if (pblob->flow() == BTFT_LEADER) {
232  if (leader_part == nullptr) {
233  leader_part = part->ShallowCopy();
234  leader_part->set_flow(BTFT_LEADER);
235  }
236  leader_part->AddBox(pblob);
237  } else if (pblob->region_type() != BRT_NOISE) {
238  clean_part->AddBox(pblob);
239  }
240  }
241  clean_part->ComputeLimits();
242  ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243  InsertTextPartition(clean_part);
245  if (leader_part != nullptr) {
246  // TODO(nbeato): Note that ComputeLimits does not update the column
247  // information. So the leader may appear to span more columns than it
248  // really does later on when IsInSameColumnAs gets called to test
249  // for adjacent leaders.
250  leader_part->ComputeLimits();
251  InsertLeaderPartition(leader_part);
252  }
253  }
254 
255  // Make the partition partners better for upper and lower neighbors.
258 }
@ BRT_NOISE
Definition: blobbox.h:75
void RefinePartitionPartners(bool get_desperate)
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:437
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:411
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:419
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:503
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:395
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:422

◆ InsertFragmentedTextPartition()

void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition * part)
protected

Definition at line 403 of file tablefind.cpp.

403  {
404  ASSERT_HOST(part != nullptr);
405  if (AllowTextPartition(*part)) {
406  fragmented_text_grid_.InsertBBox(true, true, part);
407  } else {
408  delete part;
409  }
410 }
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:490

◆ InsertImagePartition()

void tesseract::TableFinder::InsertImagePartition ( ColPartition * part)
protected

Definition at line 422 of file tablefind.cpp.

422  {
423  // NOTE: If images are placed into a different grid in the future,
424  // the function SetPartitionSpacings needs to be updated. It should
425  // be the only thing that cares about image partitions.
426  clean_part_grid_.InsertBBox(true, true, part);
427 }

◆ InsertLeaderPartition()

void tesseract::TableFinder::InsertLeaderPartition ( ColPartition * part)
protected

Definition at line 411 of file tablefind.cpp.

411  {
412  ASSERT_HOST(part != nullptr);
413  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414  leader_and_ruling_grid_.InsertBBox(true, true, part);
415  } else {
416  delete part;
417  }
418 }

◆ InsertRulingPartition()

void tesseract::TableFinder::InsertRulingPartition ( ColPartition * part)
protected

Definition at line 419 of file tablefind.cpp.

419  {
420  leader_and_ruling_grid_.InsertBBox(true, true, part);
421 }

◆ InsertTextPartition()

void tesseract::TableFinder::InsertTextPartition ( ColPartition * part)
protected

Definition at line 395 of file tablefind.cpp.

395  {
396  ASSERT_HOST(part != nullptr);
397  if (AllowTextPartition(*part)) {
398  clean_part_grid_.InsertBBox(true, true, part);
399  } else {
400  delete part;
401  }
402 }

◆ LocateTables()

void tesseract::TableFinder::LocateTables ( ColPartitionGrid * grid,
ColPartitionSet **  columns,
WidthCallback  width_cb,
const FCOORD & reskew
)

Definition at line 261 of file tablefind.cpp.

263  {
264  // initialize spacing, neighbors, and columns
265  InitializePartitions(all_columns);
266 
267 #ifndef GRAPHICS_DISABLED
268  if (textord_show_tables) {
269  ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
275 
276  table_win = MakeWindow(100, 300, "Fragmented Text");
278  }
279 #endif // !GRAPHICS_DISABLED
280 
281  // mark, filter, and smooth candidate table partitions
283 
284  // Make single-column blocks from good_columns_ partitions. col_segments are
285  // moved to a grid later which takes the ownership
286  ColSegment_LIST column_blocks;
287  GetColumnBlocks(all_columns, &column_blocks);
288  // Set the ratio of candidate table partitions in each column
289  SetColumnsType(&column_blocks);
290 
291  // Move column segments to col_seg_grid_
292  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293 
294  // Detect split in column layout that might have occurred due to the
295  // presence of a table. In such a case, merge the corresponding columns.
297 
298  // Group horizontally overlapping table partitions into table columns.
299  // table_columns created here get deleted at the end of this method.
300  ColSegment_LIST table_columns;
301  GetTableColumns(&table_columns);
302 
303  // Within each column, mark the range table regions occupy based on the
304  // table columns detected. table_regions are moved to a grid later which
305  // takes the ownership
306  ColSegment_LIST table_regions;
307  GetTableRegions(&table_columns, &table_regions);
308 
309 #ifndef GRAPHICS_DISABLED
310  if (textord_tablefind_show_mark) {
311  ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314  }
315 #endif // !GRAPHICS_DISABLED
316 
317  // Merge table regions across columns for tables spanning multiple
318  // columns
319  MoveColSegmentsToGrid(&table_regions, &table_grid_);
321 
322  // Adjust table boundaries by including nearby horizontal lines and left
323  // out column headers
326 
327  if (textord_tablefind_recognize_tables) {
328  // Remove false alarms consisting of a single column
330 
331 #ifndef GRAPHICS_DISABLED
332  if (textord_show_tables) {
333  ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
335  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336  table_grid_.DisplayBoxes(table_win);
337  }
338 #endif // !GRAPHICS_DISABLED
339 
340  // Find table grid structure and reject tables that are malformed.
341  RecognizeTables();
343  RecognizeTables();
344 
345 #ifndef GRAPHICS_DISABLED
346  if (textord_show_tables) {
347  ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
350  table_grid_.DisplayBoxes(table_win);
351  }
352 #endif // !GRAPHICS_DISABLED
353  } else {
354  // Remove false alarms consisting of a single column
355  // TODO(nbeato): verify this is a NOP after structured table rejection.
356  // Right now it isn't. If the recognize function is doing what it is
357  // supposed to do, this function is obsolete.
359 
360 #ifndef GRAPHICS_DISABLED
361  if (textord_show_tables) {
362  ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
365  table_grid_.DisplayBoxes(table_win);
366  }
367 #endif // !GRAPHICS_DISABLED
368  }
369 
370  // Merge all colpartitions in table regions to make them a single
371  // colpartition and revert types of isolated table cells not
372  // assigned to any table to their original types.
373  MakeTableBlocks(grid, all_columns, width_cb);
374 }
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:649
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1950
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1320
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
Definition: tablefind.cpp:2046
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1374
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:582
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:525
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1219
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1186
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:2001

◆ MakeTableBlocks()

void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid * grid,
ColPartitionSet **  columns,
const WidthCallback & width_cb 
)
protected

Definition at line 2046 of file tablefind.cpp.

2048  {
2049  // Since we have table blocks already, remove table tags from all
2050  // colpartitions
2051  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2052  gsearch.StartFullSearch();
2053  ColPartition *part = nullptr;
2054 
2055  while ((part = gsearch.NextFullSearch()) != nullptr) {
2056  if (part->type() == PT_TABLE) {
2057  part->clear_table_type();
2058  }
2059  }
2060  // Now make a single colpartition out of each table block and remove
2061  // all colpartitions contained within a table
2062  GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
2063  &table_grid_);
2064  table_search.StartFullSearch();
2065  ColSegment *table;
2066  while ((table = table_search.NextFullSearch()) != nullptr) {
2067  const TBOX &table_box = table->bounding_box();
2068  // Start a rect search on table_box
2069  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
2070  grid);
2071  rectsearch.StartRectSearch(table_box);
2072  ColPartition *part;
2073  ColPartition *table_partition = nullptr;
2074  while ((part = rectsearch.NextRectSearch()) != nullptr) {
2075  // Do not consider image partitions
2076  if (!part->IsTextType()) {
2077  continue;
2078  }
2079  TBOX part_box = part->bounding_box();
2080  // Include partition in the table if more than half of it
2081  // is covered by the table
2082  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2083  rectsearch.RemoveBBox();
2084  if (table_partition) {
2085  table_partition->Absorb(part, width_cb);
2086  } else {
2087  table_partition = part;
2088  }
2089  }
2090  }
2091  // Insert table colpartition back to part_grid_
2092  if (table_partition) {
2093  // To match the columns used when transforming to blocks, the new table
2094  // partition must have its first and last column set at the grid y that
2095  // corresponds to its bottom.
2096  const TBOX &table_box = table_partition->bounding_box();
2097  int grid_x, grid_y;
2098  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2099  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2100  table_partition->set_table_type();
2101  table_partition->set_blob_type(BRT_TEXT);
2102  table_partition->set_flow(BTFT_CHAIN);
2103  table_partition->SetBlobTypes();
2104  grid->InsertBBox(true, true, table_partition);
2105  }
2106  }
2107 }
@ BRT_TEXT
Definition: blobbox.h:82
@ BTFT_CHAIN
Definition: blobbox.h:114

◆ MakeWindow()

ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 519 of file tablefind.cpp.

519  {
520  return clean_part_grid_.MakeWindow(x, y, window_name);
521 }
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633

◆ MarkPartitionsUsingLocalInformation()

void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 844 of file tablefind.cpp.

844  {
845  // Iterate the ColPartitions in the grid.
846  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
847  &clean_part_grid_);
848  gsearch.StartFullSearch();
849  ColPartition *part = nullptr;
850  while ((part = gsearch.NextFullSearch()) != nullptr) {
851  if (!part->IsTextType()) { // Only consider text partitions
852  continue;
853  }
854  // Only consider partitions in dominant font size or smaller
855  if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) {
856  continue;
857  }
858  // Mark partitions with a large gap, or no significant gap as
859  // table partitions.
860  // Comments: It produces several false alarms at:
861  // - last line of a paragraph (fixed)
862  // - single word section headings
863  // - page headers and footers
864  // - numbered equations
865  // - line drawing regions
866  // TODO(faisal): detect and fix above-mentioned cases
867  if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
868  part->set_table_type();
869  }
870  }
871 }
const double kMaxTableCellXheight
Definition: tablefind.cpp:81
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:875
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:969

◆ MarkTablePartitions()

void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 798 of file tablefind.cpp.

798  {
800 #ifndef GRAPHICS_DISABLED
801  if (textord_tablefind_show_mark) {
802  ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
806  }
807 #endif
809 #ifndef GRAPHICS_DISABLED
810  if (textord_tablefind_show_mark) {
811  ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
815  }
816 #endif
818 #ifndef GRAPHICS_DISABLED
819  if (textord_tablefind_show_mark) {
820  ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
824  }
825 #endif
827 #ifndef GRAPHICS_DISABLED
828  if (textord_tablefind_show_mark || textord_show_tables) {
829  ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
833  }
834 #endif
835 }
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:844

◆ MoveColSegmentsToGrid()

void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid * col_seg_grid 
)
protected

Definition at line 1219 of file tablefind.cpp.

1220  {
1221  ColSegment_IT it(segments);
1222  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1223  ColSegment *seg = it.extract();
1224  col_seg_grid->InsertBBox(true, true, seg);
1225  }
1226 }

◆ RecognizeTables()

void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1890 of file tablefind.cpp.

1890  {
1891 #ifndef GRAPHICS_DISABLED
1892  ScrollView *table_win = nullptr;
1893  if (textord_show_tables) {
1894  table_win = MakeWindow(0, 0, "Table Structure");
1897  // table_grid_.DisplayBoxes(table_win);
1898  }
1899 #endif
1900 
1901  TableRecognizer recognizer;
1902  recognizer.Init();
1903  recognizer.set_line_grid(&leader_and_ruling_grid_);
1904  recognizer.set_text_grid(&fragmented_text_grid_);
1905  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1906  recognizer.set_min_height(1.5 * gridheight());
1907  // Loop over all of the tables and try to fit them.
1908  // Store the good tables here.
1909  ColSegment_CLIST good_tables;
1910  ColSegment_C_IT good_it(&good_tables);
1911 
1913  gsearch.StartFullSearch();
1914  ColSegment *found_table = nullptr;
1915  while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1916  gsearch.RemoveBBox();
1917 
1918  // The goal is to make the tables persistent in a list.
1919  // When that happens, this will move into the search loop.
1920  const TBOX &found_box = found_table->bounding_box();
1921  StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1922 
1923  // Process a table. Good tables are inserted into the grid again later on
1924  // We can't change boxes in the grid while it is running a search.
1925  if (table_structure != nullptr) {
1926 #ifndef GRAPHICS_DISABLED
1927  if (textord_show_tables) {
1928  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1929  }
1930 #endif
1931  found_table->set_bounding_box(table_structure->bounding_box());
1932  delete table_structure;
1933  good_it.add_after_then_move(found_table);
1934  } else {
1935  delete found_table;
1936  }
1937  }
1938  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1939 
1940  // At this point, the grid is empty. We can safely insert the good tables
1941  // back into grid.
1942  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1943  table_grid_.InsertBBox(true, true, good_it.extract());
1944  }
1945 }

◆ set_global_median_blob_width()

void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 766 of file tablefind.cpp.

766  {
767  global_median_blob_width_ = width;
768 }

◆ set_global_median_ledding()

void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 769 of file tablefind.cpp.

769  {
770  global_median_ledding_ = ledding;
771 }

◆ set_global_median_xheight()

void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 763 of file tablefind.cpp.

763  {
764  global_median_xheight_ = xheight;
765 }

◆ set_left_to_right_language()

void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 177 of file tablefind.cpp.

177  {
178  left_to_right_language_ = order;
179 }

◆ set_resolution()

void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 128 of file tablefind.h.

128  {
129  resolution_ = resolution;
130  }

◆ SetColumnsType()

void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1186 of file tablefind.cpp.

1186  {
1187  ColSegment_IT it(column_blocks);
1188  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1189  ColSegment *seg = it.data();
1190  TBOX box = seg->bounding_box();
1191  int num_table_cells = 0;
1192  int num_text_cells = 0;
1193  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rsearch(
1194  &clean_part_grid_);
1195  rsearch.SetUniqueMode(true);
1196  rsearch.StartRectSearch(box);
1197  ColPartition *part = nullptr;
1198  while ((part = rsearch.NextRectSearch()) != nullptr) {
1199  if (part->type() == PT_TABLE) {
1200  num_table_cells++;
1201  } else if (part->type() == PT_FLOWING_TEXT) {
1202  num_text_cells++;
1203  }
1204  }
1205  // If a column block has no text or table partition in it, it is not needed
1206  // for table detection.
1207  if (!num_table_cells && !num_text_cells) {
1208  delete it.extract();
1209  } else {
1210  seg->set_num_table_cells(num_table_cells);
1211  seg->set_num_text_cells(num_text_cells);
1212  // set column type based on the ratio of table to text cells
1213  seg->set_type();
1214  }
1215  }
1216 }

◆ SetGlobalSpacings()

void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid * grid)
protected

Definition at line 716 of file tablefind.cpp.

716  {
717  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
718  STATS width_stats(0, kMaxBlobWidth + 1);
719  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
720  // Iterate the ColPartitions in the grid.
721  ColPartitionGridSearch gsearch(grid);
722  gsearch.SetUniqueMode(true);
723  gsearch.StartFullSearch();
724  ColPartition *part = nullptr;
725  while ((part = gsearch.NextFullSearch()) != nullptr) {
726  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
727  // ComputeLimits needs to get called somewhere outside of TableFinder
728  // to make sure the partitions are properly initialized.
729  // When this is called, SmoothPartitionPartners dies in an assert after
730  // table find runs. Alternative solution.
731  // part->ComputeLimits();
732  if (part->IsTextType()) {
733  // xheight_stats.add(part->median_height(), part->boxes_count());
734  // width_stats.add(part->median_width(), part->boxes_count());
735 
736  // This loop can be removed when above issues are fixed.
737  // Replace it with the 2 lines commented out above.
738  BLOBNBOX_C_IT it(part->boxes());
739  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740  xheight_stats.add(it.data()->bounding_box().height(), 1);
741  width_stats.add(it.data()->bounding_box().width(), 1);
742  }
743 
744  ledding_stats.add(part->space_above(), 1);
745  ledding_stats.add(part->space_below(), 1);
746  }
747  }
748  // Set estimates based on median of statistics obtained
749  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
750  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
751  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
752 #ifndef GRAPHICS_DISABLED
753  if (textord_tablefind_show_stats) {
754  const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
755  ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
756  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
757  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
758  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
759  }
760 #endif // !GRAPHICS_DISABLED
761 }
const int kMaxVerticalSpacing
Definition: tablefind.cpp:38
const int kMaxBlobWidth
Definition: tablefind.cpp:40
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:766
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:763
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:769

◆ SetPartitionSpacings()

void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid * grid,
ColPartitionSet **  all_columns 
)
static protected

Definition at line 589 of file tablefind.cpp.

590  {
591  // Iterate the ColPartitions in the grid.
592  ColPartitionGridSearch gsearch(grid);
593  gsearch.StartFullSearch();
594  ColPartition *part = nullptr;
595  while ((part = gsearch.NextFullSearch()) != nullptr) {
596  ColPartitionSet *columns = all_columns[gsearch.GridY()];
597  TBOX box = part->bounding_box();
598  int y = part->MidY();
599  ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600  ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601  // set distance from left column as space to the left
602  if (left_column) {
603  int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604  part->set_space_to_left(left_space);
605  }
606  // set distance from right column as space to the right
607  if (right_column) {
608  int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609  part->set_space_to_right(right_space);
610  }
611 
612  // Look for images that may be closer.
613  // NOTE: used to be part_grid_, might cause issues now
614  ColPartitionGridSearch hsearch(grid);
615  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616  ColPartition *neighbor = nullptr;
617  while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618  if (neighbor->type() == PT_PULLOUT_IMAGE ||
619  neighbor->type() == PT_FLOWING_IMAGE ||
620  neighbor->type() == PT_HEADING_IMAGE) {
621  int right = neighbor->bounding_box().right();
622  if (right < box.left()) {
623  int space = std::min(box.left() - right, part->space_to_left());
624  part->set_space_to_left(space);
625  }
626  }
627  }
628  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629  neighbor = nullptr;
630  while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631  if (neighbor->type() == PT_PULLOUT_IMAGE ||
632  neighbor->type() == PT_FLOWING_IMAGE ||
633  neighbor->type() == PT_HEADING_IMAGE) {
634  int left = neighbor->bounding_box().left();
635  if (left > box.right()) {
636  int space = std::min(left - box.right(), part->space_to_right());
637  part->set_space_to_right(space);
638  }
639  }
640  }
641 
642  ColPartition *upper_part = part->SingletonPartner(true);
643  if (upper_part) {
644  int space =
645  std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646  part->bounding_box().bottom()));
647  part->set_space_above(space);
648  } else {
649  // TODO(nbeato): What constitutes a good value?
650  // 0 is the default value when not set, explicitly noting it needs to
651  // be something else.
652  part->set_space_above(INT32_MAX);
653  }
654 
655  ColPartition *lower_part = part->SingletonPartner(false);
656  if (lower_part) {
657  int space =
658  std::max(0, static_cast<int>(part->bounding_box().bottom() -
659  lower_part->bounding_box().bottom()));
660  part->set_space_below(space);
661  } else {
662  // TODO(nbeato): What constitutes a good value?
663  // 0 is the default value when not set, explicitly noting it needs to
664  // be something else.
665  part->set_space_below(INT32_MAX);
666  }
667  }
668 }
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:65
@ PT_HEADING_IMAGE
Definition: publictypes.h:64
@ PT_FLOWING_IMAGE
Definition: publictypes.h:63

◆ SetVerticalSpacing()

void tesseract::TableFinder::SetVerticalSpacing ( ColPartition * part)
protected

Definition at line 671 of file tablefind.cpp.

671  {
672  TBOX box = part->bounding_box();
673  int top_range =
674  std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675  int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676  static_cast<int>(bleft().y()));
677  box.set_top(top_range);
678  box.set_bottom(bottom_range);
679 
680  TBOX part_box = part->bounding_box();
681  // Start a rect search
682  GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
683  &clean_part_grid_);
684  rectsearch.StartRectSearch(box);
685  ColPartition *neighbor;
686  int min_space_above = kMaxVerticalSpacing;
687  int min_space_below = kMaxVerticalSpacing;
688  ColPartition *above_neighbor = nullptr;
689  ColPartition *below_neighbor = nullptr;
690  while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
691  if (neighbor == part) {
692  continue;
693  }
694  TBOX neighbor_box = neighbor->bounding_box();
695  if (neighbor_box.major_x_overlap(part_box)) {
696  int gap = abs(part->median_bottom() - neighbor->median_bottom());
697  // If neighbor is below current partition
698  if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
699  min_space_below = gap;
700  below_neighbor = neighbor;
701  } // If neighbor is above current partition
702  else if (part_box.top() < neighbor_box.bottom() &&
703  gap < min_space_above) {
704  min_space_above = gap;
705  above_neighbor = neighbor;
706  }
707  }
708  }
709  part->set_space_above(min_space_above);
710  part->set_space_below(min_space_below);
711  part->set_nearest_neighbor_above(above_neighbor);
712  part->set_nearest_neighbor_below(below_neighbor);
713 }

◆ SmoothTablePartitionRuns()

void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1147 of file tablefind.cpp.

1147  {
1148  // Iterate the ColPartitions in the grid.
1150  gsearch.StartFullSearch();
1151  ColPartition *part = nullptr;
1152  while ((part = gsearch.NextFullSearch()) != nullptr) {
1153  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1154  continue; // Consider only text partitions
1155  }
1156  ColPartition *upper_part = part->nearest_neighbor_above();
1157  ColPartition *lower_part = part->nearest_neighbor_below();
1158  if (!upper_part || !lower_part) {
1159  continue;
1160  }
1161  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1162  part->set_table_type();
1163  }
1164  }
1165 
1166  // Pass 2, do the opposite. If both the upper and lower neighbors
1167  // exist and are not tables, this probably shouldn't be a table.
1168  gsearch.StartFullSearch();
1169  part = nullptr;
1170  while ((part = gsearch.NextFullSearch()) != nullptr) {
1171  if (part->type() != PT_TABLE) {
1172  continue; // Consider only text partitions
1173  }
1174  ColPartition *upper_part = part->nearest_neighbor_above();
1175  ColPartition *lower_part = part->nearest_neighbor_below();
1176 
1177  // table can't be by itself
1178  if ((upper_part && upper_part->type() != PT_TABLE) &&
1179  (lower_part && lower_part->type() != PT_TABLE)) {
1180  part->clear_table_type();
1181  }
1182  }
1183 }

◆ SplitAndInsertFragmentedTextPartition()

void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition * part)
protected

Definition at line 437 of file tablefind.cpp.

437  {
438  ASSERT_HOST(part != nullptr);
439  // Bye bye empty partitions!
440  if (part->boxes()->empty()) {
441  delete part;
442  return;
443  }
444 
445  // The AllowBlob function prevents this.
446  ASSERT_HOST(part->median_width() > 0);
447  const double kThreshold = part->median_width() * kSplitPartitionSize;
448 
449  ColPartition *right_part = part;
450  bool found_split = true;
451  while (found_split) {
452  found_split = false;
453  BLOBNBOX_C_IT box_it(right_part->boxes());
454  // Blobs are sorted left side first. If blobs overlap,
455  // the previous blob may have a "more right" right side.
456  // Account for this by always keeping the largest "right"
457  // so far.
458  int previous_right = INT32_MIN;
459 
460  // Look for the next split in the partition.
461  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462  const TBOX &box = box_it.data()->bounding_box();
463  if (previous_right != INT32_MIN &&
464  box.left() - previous_right > kThreshold) {
465  // We have a split position. Split the partition in two pieces.
466  // Insert the left piece in the grid and keep processing the right.
467  int mid_x = (box.left() + previous_right) / 2;
468  ColPartition *left_part = right_part;
469  right_part = left_part->SplitAt(mid_x);
470 
471  InsertFragmentedTextPartition(left_part);
472  found_split = true;
473  break;
474  }
475 
476  // The right side of the previous blobs.
477  previous_right = std::max(previous_right, static_cast<int>(box.right()));
478  }
479  }
480  // When a split is not found, the right part is minimized
481  // as much as possible, so process it.
482  InsertFragmentedTextPartition(right_part);
483 }
const double kSplitPartitionSize
Definition: tablefind.cpp:44
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:403

◆ tright()

const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 391 of file tablefind.cpp.

391  {
392  return clean_part_grid_.tright();
393 }
const ICOORD & tright() const
Definition: bbgrid.h:75

Member Data Documentation

◆ clean_part_grid_

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 395 of file tablefind.h.

◆ col_seg_grid_

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 403 of file tablefind.h.

◆ fragmented_text_grid_

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 401 of file tablefind.h.

◆ global_median_blob_width_

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 389 of file tablefind.h.

◆ global_median_ledding_

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 391 of file tablefind.h.

◆ global_median_xheight_

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 387 of file tablefind.h.

◆ leader_and_ruling_grid_

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 397 of file tablefind.h.

◆ left_to_right_language_

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 407 of file tablefind.h.

◆ resolution_

int tesseract::TableFinder::resolution_
protected

Definition at line 385 of file tablefind.h.

◆ table_grid_

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 405 of file tablefind.h.


The documentation for this class was generated from the following files: