tesseract  5.0.0
gap_map.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 
11 #include "gap_map.h"
12 
13 #include "statistc.h"
14 
15 namespace tesseract {
16 
17 BOOL_VAR(gapmap_debug, false, "Say which blocks have tables");
18 BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows");
19 BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide");
20 double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
21 
22 /*************************************************************************
23  * A block gap map is a quantised histogram of whitespace regions in the
24  * block. It is a vertical projection of wide gaps WITHIN lines
25  *
26  * The map is held as an array of counts of rows which have a wide gap
27  * covering that region of the row. Each bucket in the map represents a width
28  * of about half an xheight - (The median of the xhts in the rows is used.)
29  *
30  * The block is considered RECTANGULAR - delimited by the left and right
31  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
32  * counted.
33  *
34  *************************************************************************/
35 
36 GAPMAP::GAPMAP( // Constructor
37  TO_BLOCK *block // block
38 ) {
39  TO_ROW *row; // current row
40  BLOBNBOX_IT blob_it; // iterator
41  TBOX blob_box;
42  TBOX prev_blob_box;
43  int16_t gap_width;
44  int16_t start_of_row;
45  int16_t end_of_row;
46  STATS xht_stats(0, 128);
47  int16_t min_quantum;
48  int16_t max_quantum;
49  int16_t i;
50 
51  /*
52  Find left and right extremes and bucket size
53 */
54  map = nullptr;
55  min_left = INT16_MAX;
56  max_right = -INT16_MAX;
57  total_rows = 0;
58  any_tabs = false;
59 
60  // row iterator
61  TO_ROW_IT row_it(block->get_rows());
62  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
63  row = row_it.data();
64  if (!row->blob_list()->empty()) {
65  total_rows++;
66  xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);
67  blob_it.set_to_list(row->blob_list());
68  start_of_row = blob_it.data()->bounding_box().left();
69  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
70  if (min_left > start_of_row) {
71  min_left = start_of_row;
72  }
73  if (max_right < end_of_row) {
74  max_right = end_of_row;
75  }
76  }
77  }
78  if ((total_rows < 3) || (min_left >= max_right)) {
79  bucket_size = 0;
80  map_max = 0;
81  total_rows = 0;
82  min_left = max_right = 0;
83  return;
84  }
85  bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;
86  map_max = (max_right - min_left) / bucket_size;
87  map = new int16_t[map_max + 1];
88  for (i = 0; i <= map_max; i++) {
89  map[i] = 0;
90  }
91 
92  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
93  row = row_it.data();
94  if (!row->blob_list()->empty()) {
95  blob_it.set_to_list(row->blob_list());
96  blob_it.mark_cycle_pt();
97  blob_box = box_next(&blob_it);
98  prev_blob_box = blob_box;
99  if (gapmap_use_ends) {
100  /* Leading space */
101  gap_width = blob_box.left() - min_left;
102  if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
103  max_quantum = (blob_box.left() - min_left) / bucket_size;
104  if (max_quantum > map_max) {
105  max_quantum = map_max;
106  }
107  for (i = 0; i <= max_quantum; i++) {
108  map[i]++;
109  }
110  }
111  }
112  while (!blob_it.cycled_list()) {
113  blob_box = box_next(&blob_it);
114  gap_width = blob_box.left() - prev_blob_box.right();
115  if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
116  min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
117  max_quantum = (blob_box.left() - min_left) / bucket_size;
118  if (max_quantum > map_max) {
119  max_quantum = map_max;
120  }
121  for (i = min_quantum; i <= max_quantum; i++) {
122  map[i]++;
123  }
124  }
125  prev_blob_box = blob_box;
126  }
127  if (gapmap_use_ends) {
128  /* Trailing space */
129  gap_width = max_right - prev_blob_box.right();
130  if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
131  min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
132  if (min_quantum < 0) {
133  min_quantum = 0;
134  }
135  for (i = min_quantum; i <= map_max; i++) {
136  map[i]++;
137  }
138  }
139  }
140  }
141  }
142  for (i = 0; i <= map_max; i++) {
143  if (map[i] > total_rows / 2) {
145  (((i == 0) && (map[i + 1] <= total_rows / 2)) ||
146  ((i == map_max) && (map[i - 1] <= total_rows / 2)) ||
147  ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) &&
148  (map[i + 1] <= total_rows / 2)))) {
149  map[i] = 0; // prevent isolated quantum
150  } else {
151  any_tabs = true;
152  }
153  }
154  }
155  if (gapmap_debug && any_tabs) {
156  tprintf("Table found\n");
157  }
158 }
159 
160 /*************************************************************************
161  * GAPMAP::table_gap()
162  * Is there a bucket in the specified range where more than half the rows in the
163  * block have a wide gap?
164  *************************************************************************/
165 
166 bool GAPMAP::table_gap( // Is gap a table?
167  int16_t left, // From here
168  int16_t right // To here
169 ) {
170  int16_t min_quantum;
171  int16_t max_quantum;
172  int16_t i;
173  bool tab_found = false;
174 
175  if (!any_tabs) {
176  return false;
177  }
178 
179  min_quantum = (left - min_left) / bucket_size;
180  max_quantum = (right - min_left) / bucket_size;
181  // Clip to the bounds of the array. In some circumstances (big blob followed
182  // by small blob) max_quantum can exceed the map_max bounds, but we clip
183  // here instead, as it provides better long-term safety.
184  if (min_quantum < 0) {
185  min_quantum = 0;
186  }
187  if (max_quantum > map_max) {
188  max_quantum = map_max;
189  }
190  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) {
191  if (map[i] > total_rows / 2) {
192  tab_found = true;
193  }
194  }
195  return tab_found;
196 }
197 
198 } // namespace tesseract
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
#define double_VAR(name, val, comment)
Definition: params.h:365
bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:19
double gapmap_big_gaps
Definition: gap_map.cpp:20
bool gapmap_use_ends
Definition: gap_map.cpp:18
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool gapmap_debug
Definition: gap_map.cpp:17
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:638
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
TDimension left() const
Definition: rect.h:82
TDimension right() const
Definition: rect.h:89
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:242
GAPMAP(TO_BLOCK *block)
Definition: gap_map.cpp:36
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:166