tesseract  5.0.0
edgblob.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: edgblob.cpp (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author: Ray Smith
5  *
6  *(C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0(the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "edgblob.h"
25 
26 #include "edgloop.h"
27 #include "scanedg.h"
28 
29 #define BUCKETSIZE 16
30 
31 namespace tesseract {
32 
33 // Control parameters used in outline_complexity(), which rejects an outline
34 // if any one of the 3 conditions is satisfied:
35 // - number of children exceeds edges_max_children_per_outline
36 // - number of nested layers exceeds edges_max_children_layers
37 // - joint complexity exceeds edges_children_count_limit(as in child_count())
38 static BOOL_VAR(edges_use_new_outline_complexity, false,
39  "Use the new outline complexity module");
40 static INT_VAR(edges_max_children_per_outline, 10,
41  "Max number of children inside a character outline");
42 static INT_VAR(edges_max_children_layers, 5,
43  "Max layers of nested children inside a character outline");
44 static BOOL_VAR(edges_debug, false, "turn on debugging for this module");
45 
46 static INT_VAR(edges_children_per_grandchild, 10,
47  "Importance ratio for chucking outlines");
48 static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob");
49 static BOOL_VAR(edges_children_fix, false,
50  "Remove boxy parents of char-like children");
51 static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box");
52 static INT_VAR(edges_patharea_ratio, 40,
53  "Max lensq/area for acceptable child outline");
54 static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline");
55 static double_VAR(edges_boxarea, 0.875,
56  "Min area fraction of grandchild for box");
57 
64 OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners
65  ICOORD tright)
66  : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1),
67  bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1),
68  buckets(bxdim * bydim),
69  bl(bleft),
70  tr(tright) {}
71 
79 C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access
80  TDimension x, // image coords
81  TDimension y) {
82  return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim +
83  (x - bl.x()) / BUCKETSIZE];
84 }
85 
86 C_OUTLINE_LIST *OL_BUCKETS::start_scan() {
87  return scan_next(buckets.begin());
88 }
89 
90 C_OUTLINE_LIST *OL_BUCKETS::scan_next() {
91  return scan_next(it);
92 }
93 
94 C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) {
95  it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); });
96  if (it == buckets.end())
97  return nullptr;
98  return &*it;
99 }
100 
121 int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline
122  int32_t max_count, // max output
123  int16_t depth // recurion depth
124 ) {
125  TDimension xmin, xmax; // coord limits
126  TDimension ymin, ymax;
127  C_OUTLINE *child; // current child
128  int32_t child_count; // no of children
129  int32_t grandchild_count; // no of grandchildren
130  C_OUTLINE_IT child_it; // search iterator
131 
132  TBOX olbox = outline->bounding_box();
133  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
134  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
135  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
136  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
137  child_count = 0;
138  grandchild_count = 0;
139  if (++depth > edges_max_children_layers) { // nested loops are too deep
140  return max_count + depth;
141  }
142 
143  for (auto yindex = ymin; yindex <= ymax; yindex++) {
144  for (auto xindex = xmin; xindex <= xmax; xindex++) {
145  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
146  if (child_it.empty()) {
147  continue;
148  }
149  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
150  child_it.forward()) {
151  child = child_it.data();
152  if (child == outline || !(*child < *outline)) {
153  continue;
154  }
155  child_count++;
156 
157  if (child_count > edges_max_children_per_outline) { // too fragmented
158  if (edges_debug) {
159  tprintf(
160  "Discard outline on child_count=%d > "
161  "max_children_per_outline=%d\n",
162  child_count,
163  static_cast<int32_t>(edges_max_children_per_outline));
164  }
165  return max_count + child_count;
166  }
167 
168  // Compute the "complexity" of each child recursively
169  int32_t remaining_count = max_count - child_count - grandchild_count;
170  if (remaining_count > 0) {
171  grandchild_count += edges_children_per_grandchild *
172  outline_complexity(child, remaining_count, depth);
173  }
174  if (child_count + grandchild_count > max_count) { // too complex
175  if (edges_debug) {
176  tprintf(
177  "Disgard outline on child_count=%d + grandchild_count=%d "
178  "> max_count=%d\n",
179  child_count, grandchild_count, max_count);
180  }
181  return child_count + grandchild_count;
182  }
183  }
184  }
185  }
186  return child_count + grandchild_count;
187 }
188 
194 // TODO(rays) Merge with outline_complexity.
195 int32_t OL_BUCKETS::count_children( // recursive count
196  C_OUTLINE *outline, // parent outline
197  int32_t max_count // max output
198 ) {
199  bool parent_box; // could it be boxy
200  TDimension xmin, xmax; // coord limits
201  TDimension ymin, ymax;
202  C_OUTLINE *child; // current child
203  int32_t child_count; // no of children
204  int32_t grandchild_count; // no of grandchildren
205  int32_t parent_area; // potential box
206  float max_parent_area; // potential box
207  int32_t child_area; // current child
208  int32_t child_length; // current child
209  TBOX olbox;
210  C_OUTLINE_IT child_it; // search iterator
211 
212  olbox = outline->bounding_box();
213  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
214  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
215  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
216  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
217  child_count = 0;
218  grandchild_count = 0;
219  parent_area = 0;
220  max_parent_area = 0;
221  parent_box = true;
222  for (auto yindex = ymin; yindex <= ymax; yindex++) {
223  for (auto xindex = xmin; xindex <= xmax; xindex++) {
224  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
225  if (child_it.empty()) {
226  continue;
227  }
228  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
229  child_it.forward()) {
230  child = child_it.data();
231  if (child != outline && *child < *outline) {
232  child_count++;
233  if (child_count <= max_count) {
234  int max_grand =
235  (max_count - child_count) / edges_children_per_grandchild;
236  if (max_grand > 0) {
237  grandchild_count += count_children(child, max_grand) *
238  edges_children_per_grandchild;
239  } else {
240  grandchild_count += count_children(child, 1);
241  }
242  }
243  if (child_count + grandchild_count > max_count) {
244  if (edges_debug) {
245  tprintf("Discarding parent with child count=%d, gc=%d\n",
246  child_count, grandchild_count);
247  }
248  return child_count + grandchild_count;
249  }
250  if (parent_area == 0) {
251  parent_area = outline->outer_area();
252  if (parent_area < 0) {
253  parent_area = -parent_area;
254  }
255  max_parent_area = outline->bounding_box().area() * edges_boxarea;
256  if (parent_area < max_parent_area) {
257  parent_box = false;
258  }
259  }
260  if (parent_box &&
261  (!edges_children_fix ||
262  child->bounding_box().height() > edges_min_nonhole)) {
263  child_area = child->outer_area();
264  if (child_area < 0) {
265  child_area = -child_area;
266  }
267  if (edges_children_fix) {
268  if (parent_area - child_area < max_parent_area) {
269  parent_box = false;
270  continue;
271  }
272  if (grandchild_count > 0) {
273  if (edges_debug) {
274  tprintf(
275  "Discarding parent of area %d, child area=%d, max%g "
276  "with gc=%d\n",
277  parent_area, child_area, max_parent_area,
278  grandchild_count);
279  }
280  return max_count + 1;
281  }
282  child_length = child->pathlength();
283  if (child_length * child_length >
284  child_area * edges_patharea_ratio) {
285  if (edges_debug) {
286  tprintf(
287  "Discarding parent of area %d, child area=%d, max%g "
288  "with child length=%d\n",
289  parent_area, child_area, max_parent_area, child_length);
290  }
291  return max_count + 1;
292  }
293  }
294  if (child_area < child->bounding_box().area() * edges_childarea) {
295  if (edges_debug) {
296  tprintf(
297  "Discarding parent of area %d, child area=%d, max%g "
298  "with child rect=%d\n",
299  parent_area, child_area, max_parent_area,
300  child->bounding_box().area());
301  }
302  return max_count + 1;
303  }
304  }
305  }
306  }
307  }
308  }
309  return child_count + grandchild_count;
310 }
311 
318 void OL_BUCKETS::extract_children( // recursive count
319  C_OUTLINE *outline, // parent outline
320  C_OUTLINE_IT *it // destination iterator
321 ) {
322  TDimension xmin, xmax; // coord limits
323  TDimension ymin, ymax;
324  TBOX olbox;
325  C_OUTLINE_IT child_it; // search iterator
326 
327  olbox = outline->bounding_box();
328  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
329  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
330  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
331  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
332  for (auto yindex = ymin; yindex <= ymax; yindex++) {
333  for (auto xindex = xmin; xindex <= xmax; xindex++) {
334  child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
335  for (child_it.mark_cycle_pt(); !child_it.cycled_list();
336  child_it.forward()) {
337  if (*child_it.data() < *outline) {
338  it->add_after_then_move(child_it.extract());
339  }
340  }
341  }
342  }
343 }
344 
346 
347 void extract_edges(Image pix, // thresholded image
348  BLOCK *block) { // block to scan
349  C_OUTLINE_LIST outlines; // outlines in block
350  C_OUTLINE_IT out_it = &outlines;
351 
352  block_edges(pix, &(block->pdblk), &out_it);
353  ICOORD bleft; // block box
354  ICOORD tright;
355  block->pdblk.bounding_box(bleft, tright);
356  // make blobs
357  outlines_to_blobs(block, bleft, tright, &outlines);
358 }
359 
361 
362 static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block
363  OL_BUCKETS *buckets // output buckets
364 ) {
365  C_OUTLINE_IT out_it = outlines; // iterator
366  C_OUTLINE_IT bucket_it; // iterator in bucket
367 
368  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
369  auto outline = out_it.extract(); // take off list
370  // get box
371  const TBOX &ol_box(outline->bounding_box());
372  bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom()));
373  bucket_it.add_to_end(outline);
374  }
375 }
376 
385 static bool capture_children(OL_BUCKETS *buckets, // bucket sort class
386  C_BLOB_IT *reject_it, // dead grandchildren
387  C_OUTLINE_IT *blob_it // output outlines
388 ) {
389  // master outline
390  auto outline = blob_it->data();
391  // no of children
392  int32_t child_count;
393  if (edges_use_new_outline_complexity) {
394  child_count =
395  buckets->outline_complexity(outline, edges_children_count_limit, 0);
396  } else {
397  child_count = buckets->count_children(outline, edges_children_count_limit);
398  }
399  if (child_count > edges_children_count_limit) {
400  return false;
401  }
402 
403  if (child_count > 0) {
404  buckets->extract_children(outline, blob_it);
405  }
406  return true;
407 }
408 
415 static void empty_buckets(BLOCK *block, // block to scan
416  OL_BUCKETS *buckets // output buckets
417 ) {
418  C_OUTLINE_LIST outlines; // outlines in block
419  // iterator
420  C_OUTLINE_IT out_it = &outlines;
421  auto start_scan = buckets->start_scan();
422  if (start_scan == nullptr) {
423  return;
424  }
425  C_OUTLINE_IT bucket_it = start_scan;
426  C_BLOB_IT good_blobs = block->blob_list();
427  C_BLOB_IT junk_blobs = block->reject_blobs();
428 
429  while (!bucket_it.empty()) {
430  out_it.set_to_list(&outlines);
431  C_OUTLINE_IT parent_it; // parent outline
432  do {
433  parent_it = bucket_it; // find outermost
434  do {
435  bucket_it.forward();
436  } while (!bucket_it.at_first() &&
437  !(*parent_it.data() < *bucket_it.data()));
438  } while (!bucket_it.at_first());
439 
440  // move to new list
441  out_it.add_after_then_move(parent_it.extract());
442  // healthy blob
443  bool good_blob = capture_children(buckets, &junk_blobs, &out_it);
444  C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
445  &junk_blobs);
446 
447  if (auto l = buckets->scan_next())
448  bucket_it.set_to_list(l);
449  else
450  break;
451  }
452 }
453 
460 void outlines_to_blobs( // find blobs
461  BLOCK *block, // block to scan
462  ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) {
463  // make buckets
464  OL_BUCKETS buckets(bleft, tright);
465 
466  fill_buckets(outlines, &buckets);
467  empty_buckets(block, &buckets);
468 }
469 
470 } // namespace tesseract
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
#define INT_VAR(name, val, comment)
Definition: params.h:356
#define double_VAR(name, val, comment)
Definition: params.h:365
#define BUCKETSIZE
Definition: edgblob.cpp:29
@ TBOX
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void block_edges(Image t_pix, PDBLK *block, C_OUTLINE_IT *outline_it)
Definition: scanedg.cpp:62
void outlines_to_blobs(BLOCK *block, ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines)
Definition: edgblob.cpp:460
int16_t TDimension
Definition: tesstypes.h:32
void extract_edges(Image pix, BLOCK *block)
Definition: edgblob.cpp:347
int32_t pathlength() const
Definition: coutln.h:134
const TBOX & bounding_box() const
Definition: coutln.h:113
int32_t outer_area() const
Definition: coutln.cpp:313
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
integer coordinate
Definition: points.h:36
TDimension y() const
access_function
Definition: points.h:62
TDimension x() const
access function
Definition: points.h:58
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
int32_t area() const
Definition: rect.h:134
static void ConstructBlobsFromOutlines(bool good_blob, C_OUTLINE_LIST *outline_list, C_BLOB_IT *good_blobs_it, C_BLOB_IT *bad_blobs_it)
Definition: stepblob.cpp:188
void extract_children(C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition: edgblob.cpp:318
OL_BUCKETS(ICOORD bleft, ICOORD tright)
Definition: edgblob.cpp:64
C_OUTLINE_LIST * start_scan()
Definition: edgblob.cpp:86
C_OUTLINE_LIST * scan_next()
Definition: edgblob.cpp:90
int32_t outline_complexity(C_OUTLINE *outline, int32_t max_count, int16_t depth)
Definition: edgblob.cpp:121
int32_t count_children(C_OUTLINE *outline, int32_t max_count)
Definition: edgblob.cpp:195
C_OUTLINE_LIST * operator()(TDimension x, TDimension y)
Definition: edgblob.cpp:79