tesseract  5.0.0
tesseract::TestableTableRecognizer Class Reference
Inheritance diagram for tesseract::TestableTableRecognizer:
tesseract::TableRecognizer

Public Member Functions

bool FindLinesBoundingBox (TBOX *bounding_box)
 
bool HasSignificantLines (const TBOX &guess)
 
bool RecognizeLinedTable (const TBOX &guess_box, StructuredTable *table)
 
StructuredTable * RecognizeTable (const TBOX &guess_box)
 
bool RecognizeWhitespacedTable (const TBOX &guess_box, StructuredTable *table)
 
- Public Member Functions inherited from tesseract::TableRecognizer
 TableRecognizer ()=default
 
 ~TableRecognizer ()=default
 
void Init ()
 
void set_text_grid (ColPartitionGrid *text)
 
void set_line_grid (ColPartitionGrid *lines)
 
void set_min_height (int height)
 
void set_min_width (int width)
 
void set_max_text_height (int height)
 
StructuredTable * RecognizeTable (const TBOX &guess_box)
 

Additional Inherited Members

- Protected Member Functions inherited from tesseract::TableRecognizer
bool RecognizeLinedTable (const TBOX &guess_box, StructuredTable *table)
 
bool HasSignificantLines (const TBOX &guess)
 
bool FindLinesBoundingBox (TBOX *bounding_box)
 
bool FindLinesBoundingBoxIteration (TBOX *bounding_box)
 
bool RecognizeWhitespacedTable (const TBOX &guess_box, StructuredTable *table)
 
int NextHorizontalSplit (int left, int right, int y, bool top_to_bottom)
 
- Static Protected Member Functions inherited from tesseract::TableRecognizer
static bool IsWeakTableRow (StructuredTable *table, int row)
 
- Protected Attributes inherited from tesseract::TableRecognizer
ColPartitionGrid * text_grid_ = nullptr
 
ColPartitionGrid * line_grid_ = nullptr
 
int min_height_ = 0
 
int min_width_ = 0
 
int max_text_height_ = INT32_MAX
 

Detailed Description

Definition at line 22 of file tablerecog_test.cc.

Member Function Documentation

◆ FindLinesBoundingBox()

bool tesseract::TableRecognizer::FindLinesBoundingBox

Definition at line 825 of file tablerecog.cpp.

825  {
826  // The first iteration will tell us if there are lines
827  // present and shrink the box to a minimal iterative size.
828  if (!FindLinesBoundingBoxIteration(bounding_box)) {
829  return false;
830  }
831 
832  // Keep growing until the area of the table stabilizes.
833  // The box can only get bigger, increasing area.
834  bool changed = true;
835  while (changed) {
836  changed = false;
837  int old_area = bounding_box->area();
838  bool check = FindLinesBoundingBoxIteration(bounding_box);
839  // At this point, the function will return true.
840  ASSERT_HOST(check);
841  ASSERT_HOST(bounding_box->area() >= old_area);
842  changed = (bounding_box->area() > old_area);
843  }
844 
845  return true;
846 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:848

◆ HasSignificantLines()

bool tesseract::TableRecognizer::HasSignificantLines

Definition at line 784 of file tablerecog.cpp.

784  {
785  ColPartitionGridSearch box_search(line_grid_);
786  box_search.SetUniqueMode(true);
787  box_search.StartRectSearch(guess);
788  ColPartition *line = nullptr;
789  int vertical_count = 0;
790  int horizontal_count = 0;
791 
792  while ((line = box_search.NextRectSearch()) != nullptr) {
793  if (line->IsHorizontalLine()) {
794  ++horizontal_count;
795  }
796  if (line->IsVerticalLine()) {
797  ++vertical_count;
798  }
799  }
800 
801  return vertical_count >= kLinedTableMinVerticalLines &&
802  horizontal_count >= kLinedTableMinHorizontalLines;
803 }
const int kLinedTableMinHorizontalLines
Definition: tablerecog.cpp:44
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919
const int kLinedTableMinVerticalLines
Definition: tablerecog.cpp:43
ColPartitionGrid * line_grid_
Definition: tablerecog.h:362

◆ RecognizeLinedTable()

bool tesseract::TableRecognizer::RecognizeLinedTable

Definition at line 766 of file tablerecog.cpp.

766  {
767  if (!HasSignificantLines(guess_box)) {
768  return false;
769  }
770  TBOX line_bound = guess_box;
771  if (!FindLinesBoundingBox(&line_bound)) {
772  return false;
773  }
774  table->set_bounding_box(line_bound);
775  return table->FindLinedStructure();
776 }
@ TBOX
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:825
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:784

◆ RecognizeTable()

StructuredTable * tesseract::TableRecognizer::RecognizeTable

Definition at line 741 of file tablerecog.cpp.

741  {
742  auto *table = new StructuredTable();
743  table->Init();
744  table->set_text_grid(text_grid_);
745  table->set_line_grid(line_grid_);
746  table->set_max_text_height(max_text_height_);
747 
748  // Try to solve this simple case, a table with *both*
749  // vertical and horizontal lines.
750  if (RecognizeLinedTable(guess, table)) {
751  return table;
752  }
753 
754  // Fallback to whitespace if that failed.
755  // TODO(nbeato): Break this apart to take advantage of horizontal
756  // lines or vertical lines when present.
757  if (RecognizeWhitespacedTable(guess, table)) {
758  return table;
759  }
760 
761  // No table found...
762  delete table;
763  return nullptr;
764 }
ColPartitionGrid * text_grid_
Definition: tablerecog.h:361
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:766
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:886

◆ RecognizeWhitespacedTable()

bool tesseract::TableRecognizer::RecognizeWhitespacedTable

Definition at line 886 of file tablerecog.cpp.

886  {
887  TBOX best_box = guess_box; // Best borders known.
888  int best_below = 0; // Margin size below best table.
889  int best_above = 0; // Margin size above best table.
890  TBOX adjusted = guess_box; // The search box.
891 
892  // We assume that the guess box is somewhat accurate, so we don't allow
893  // the adjusted border to pass half of the guessed area. This prevents
894  // "negative" tables from forming.
895  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
896  // Keeps track of the most columns in an accepted table. The resulting table
897  // may be less than the max, but we don't want to stray too far.
898  unsigned best_cols = 0;
899  // Make sure we find a good border.
900  bool found_good_border = false;
901 
902  // Find the bottom of the table by trying a few different locations. For
903  // each location, the top, left, and right are fixed. We start the search
904  // in a smaller table to favor best_cols getting a good estimate sooner.
905  int last_bottom = INT32_MAX;
906  int bottom =
907  NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
908  int top =
909  NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
910  adjusted.set_top(top);
911 
912  // Headers/footers can be spaced far from everything.
913  // Make sure that the space below is greater than the space above
914  // the lowest row.
915  int previous_below = 0;
916  const int kMaxChances = 10;
917  int chances = kMaxChances;
918  while (bottom != last_bottom) {
919  adjusted.set_bottom(bottom);
920 
921  if (adjusted.height() >= min_height_) {
922  // Try to fit the grid on the current box. We give it a chance
923  // if the number of columns didn't significantly drop.
924  table->set_bounding_box(adjusted);
925  if (table->FindWhitespacedStructure() &&
926  table->column_count() >= best_cols * kRequiredColumns) {
927  if (false && IsWeakTableRow(table, 0)) {
928  // Currently buggy, but was looking promising so disabled.
929  --chances;
930  } else {
931  // We favor 2 things,
932  // 1- Adding rows that have partitioned data.
933  // 2- Better margins (to find header/footer).
934  // For better tables, we just look for multiple cells in the
935  // bottom row with data in them.
936  // For margins, the space below the last row should
937  // be better than a table with the last row removed.
938  chances = kMaxChances;
939  double max_row_height = kMaxRowSize * table->median_cell_height();
940  if ((table->space_below() * kMarginFactor >= best_below &&
941  table->space_below() >= previous_below) ||
942  (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
943  best_box.set_bottom(bottom);
944  best_below = table->space_below();
945  best_cols = std::max(table->column_count(), best_cols);
946  found_good_border = true;
947  }
948  }
949  previous_below = table->space_below();
950  } else {
951  --chances;
952  }
953  }
954  if (chances <= 0) {
955  break;
956  }
957 
958  last_bottom = bottom;
959  bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
960  }
961  if (!found_good_border) {
962  return false;
963  }
964 
965  // TODO(nbeato) comments: follow modified code above... put it in a function!
966  found_good_border = false;
967  int last_top = INT32_MIN;
968  top =
969  NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
970  int previous_above = 0;
971  chances = kMaxChances;
972 
973  adjusted.set_bottom(best_box.bottom());
974  while (last_top != top) {
975  adjusted.set_top(top);
976  if (adjusted.height() >= min_height_) {
977  table->set_bounding_box(adjusted);
978  if (table->FindWhitespacedStructure() &&
979  table->column_count() >= best_cols * kRequiredColumns) {
980  int last_row = table->row_count() - 1;
981  if (false && IsWeakTableRow(table, last_row)) {
982  // Currently buggy, but was looking promising so disabled.
983  --chances;
984  } else {
985  chances = kMaxChances;
986  double max_row_height = kMaxRowSize * table->median_cell_height();
987  if ((table->space_above() * kMarginFactor >= best_above &&
988  table->space_above() >= previous_above) ||
989  (table->CountFilledCellsInRow(last_row) > 1 &&
990  table->row_height(last_row) < max_row_height)) {
991  best_box.set_top(top);
992  best_above = table->space_above();
993  best_cols = std::max(table->column_count(), best_cols);
994  found_good_border = true;
995  }
996  }
997  previous_above = table->space_above();
998  } else {
999  --chances;
1000  }
1001  }
1002  if (chances <= 0) {
1003  break;
1004  }
1005 
1006  last_top = top;
1007  top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
1008  }
1009 
1010  if (!found_good_border) {
1011  return false;
1012  }
1013 
1014  // If we get here, this shouldn't happen. It can be an assert, but
1015  // I haven't tested it enough to make it crash things.
1016  if (best_box.null_box()) {
1017  return false;
1018  }
1019 
1020  // Given the best locations, fit the box to those locations.
1021  table->set_bounding_box(best_box);
1022  return table->FindWhitespacedStructure();
1023 }
const double kRequiredColumns
Definition: tablerecog.cpp:47
const double kMaxRowSize
Definition: tablerecog.cpp:52
const double kMarginFactor
Definition: tablerecog.cpp:49
static bool IsWeakTableRow(StructuredTable *table, int row)
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)

The documentation for this class was generated from the following file: