tesseract  5.0.0
imagefind.cpp
Go to the documentation of this file.
1 // File: imagefind.cpp
3 // Description: Function to find image and drawing regions in an image
4 // and create a corresponding list of empty blobs.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23 
24 #include "imagefind.h"
25 
26 #include "colpartitiongrid.h"
27 #include "linlsq.h"
28 #include "params.h"
29 #include "statistc.h"
30 
31 #include <allheaders.h>
32 
33 #include <algorithm>
34 
35 namespace tesseract {
36 
37 static INT_VAR(textord_tabfind_show_images, false, "Show image blobs");
38 
39 // Fraction of width or height of on pixels that can be discarded from a
40 // roughly rectangular image.
41 const double kMinRectangularFraction = 0.125;
42 // Fraction of width or height to consider image completely used.
43 const double kMaxRectangularFraction = 0.75;
44 // Fraction of width or height to allow transition from kMinRectangularFraction
45 // to kMaxRectangularFraction, equivalent to a dy/dx skew.
46 const double kMaxRectangularGradient = 0.1; // About 6 degrees.
47 // Minimum image size to be worth looking for images on.
48 const int kMinImageFindSize = 100;
49 // Scale factor for the rms color fit error.
50 const double kRMSFitScaling = 8.0;
51 // Min color difference to call it two colors.
52 const int kMinColorDifference = 16;
53 // Pixel padding for noise blobs and partitions when rendering on the image
54 // mask to encourage them to join together. Make it too big and images
55 // will fatten out too much and have to be clipped to text.
56 const int kNoisePadding = 4;
57 
58 // Finds image regions within the BINARY source pix (page image) and returns
59 // the image regions as a mask image.
60 // The returned pix may be nullptr, meaning no images found.
61 // If not nullptr, it must be PixDestroyed by the caller.
62 // If textord_tabfind_show_images, debug images are appended to pixa_debug.
64  // Not worth looking at small images.
65  if (pixGetWidth(pix) < kMinImageFindSize || pixGetHeight(pix) < kMinImageFindSize) {
66  return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
67  }
68 
69  // Reduce by factor 2.
70  Image pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0);
71  if (textord_tabfind_show_images && pixa_debug != nullptr) {
72  pixa_debug->AddPix(pixr, "CascadeReduced");
73  }
74 
75  // Get the halftone mask directly from Leptonica.
76  //
77  // Leptonica will print an error message and return nullptr if we call
78  // pixGenHalftoneMask(pixr, nullptr, ...) with too small image, so we
79  // want to bypass that.
80  if (pixGetWidth(pixr) < kMinImageFindSize || pixGetHeight(pixr) < kMinImageFindSize) {
81  pixr.destroy();
82  return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
83  }
84  // Get the halftone mask.
85  l_int32 ht_found = 0;
86  Pixa *pixadb = (textord_tabfind_show_images && pixa_debug != nullptr) ? pixaCreate(0) : nullptr;
87  Image pixht2 = pixGenerateHalftoneMask(pixr, nullptr, &ht_found, pixadb);
88  if (pixadb) {
89  Image pixdb = pixaDisplayTiledInColumns(pixadb, 3, 1.0, 20, 2);
90  if (textord_tabfind_show_images && pixa_debug != nullptr) {
91  pixa_debug->AddPix(pixdb, "HalftoneMask");
92  }
93  pixdb.destroy();
94  pixaDestroy(&pixadb);
95  }
96  pixr.destroy();
97  if (!ht_found && pixht2 != nullptr) {
98  pixht2.destroy();
99  }
100  if (pixht2 == nullptr) {
101  return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
102  }
103 
104  // Expand back up again.
105  Image pixht = pixExpandReplicate(pixht2, 2);
106  if (textord_tabfind_show_images && pixa_debug != nullptr) {
107  pixa_debug->AddPix(pixht, "HalftoneReplicated");
108  }
109  pixht2.destroy();
110 
111  // Fill to capture pixels near the mask edges that were missed
112  Image pixt = pixSeedfillBinary(nullptr, pixht, pix, 8);
113  pixht |= pixt;
114  pixt.destroy();
115 
116  // Eliminate lines and bars that may be joined to images.
117  Image pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3);
118  pixDilateBrick(pixfinemask, pixfinemask, 5, 5);
119  if (textord_tabfind_show_images && pixa_debug != nullptr) {
120  pixa_debug->AddPix(pixfinemask, "FineMask");
121  }
122  Image pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1);
123  Image pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0);
124  pixreduced.destroy();
125  pixDilateBrick(pixreduced2, pixreduced2, 5, 5);
126  Image pixcoarsemask = pixExpandReplicate(pixreduced2, 8);
127  pixreduced2.destroy();
128  if (textord_tabfind_show_images && pixa_debug != nullptr) {
129  pixa_debug->AddPix(pixcoarsemask, "CoarseMask");
130  }
131  // Combine the coarse and fine image masks.
132  pixcoarsemask &= pixfinemask;
133  pixfinemask.destroy();
134  // Dilate a bit to make sure we get everything.
135  pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3);
136  Image pixmask = pixExpandReplicate(pixcoarsemask, 16);
137  pixcoarsemask.destroy();
138  if (textord_tabfind_show_images && pixa_debug != nullptr) {
139  pixa_debug->AddPix(pixmask, "MaskDilated");
140  }
141  // And the image mask with the line and bar remover.
142  pixht &= pixmask;
143  pixmask.destroy();
144  if (textord_tabfind_show_images && pixa_debug != nullptr) {
145  pixa_debug->AddPix(pixht, "FinalMask");
146  }
147  // Make the result image the same size as the input.
148  Image result = pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
149  result |= pixht;
150  pixht.destroy();
151  return result;
152 }
153 
154 // Generates a Boxa, Pixa pair from the input binary (image mask) pix,
155 // analogous to pixConnComp, except that connected components which are nearly
156 // rectangular are replaced with solid rectangles.
157 // The returned boxa, pixa may be nullptr, meaning no images found.
158 // If not nullptr, they must be destroyed by the caller.
159 // Resolution of pix should match the source image (Tesseract::pix_binary_)
160 // so the output coordinate systems match.
161 void ImageFind::ConnCompAndRectangularize(Image pix, DebugPixa *pixa_debug, Boxa **boxa,
162  Pixa **pixa) {
163  *boxa = nullptr;
164  *pixa = nullptr;
165 
166  if (textord_tabfind_show_images && pixa_debug != nullptr) {
167  pixa_debug->AddPix(pix, "Conncompimage");
168  }
169  // Find the individual image regions in the mask image.
170  *boxa = pixConnComp(pix, pixa, 8);
171  // Rectangularize the individual images. If a sharp edge in vertical and/or
172  // horizontal occupancy can be found, it indicates a probably rectangular
173  // image with unwanted bits merged on, so clip to the approximate rectangle.
174  int npixes = 0;
175  if (*boxa != nullptr && *pixa != nullptr) {
176  npixes = pixaGetCount(*pixa);
177  }
178  for (int i = 0; i < npixes; ++i) {
179  int x_start, x_end, y_start, y_end;
180  Image img_pix = pixaGetPix(*pixa, i, L_CLONE);
181  if (textord_tabfind_show_images && pixa_debug != nullptr) {
182  pixa_debug->AddPix(img_pix, "A component");
183  }
185  kMaxRectangularGradient, &x_start, &y_start, &x_end, &y_end)) {
186  Image simple_pix = pixCreate(x_end - x_start, y_end - y_start, 1);
187  pixSetAll(simple_pix);
188  img_pix.destroy();
189  // pixaReplacePix takes ownership of the simple_pix.
190  pixaReplacePix(*pixa, i, simple_pix, nullptr);
191  img_pix = pixaGetPix(*pixa, i, L_CLONE);
192  // Fix the box to match the new pix.
193  l_int32 x, y, width, height;
194  boxaGetBoxGeometry(*boxa, i, &x, &y, &width, &height);
195  Box *simple_box = boxCreate(x + x_start, y + y_start, x_end - x_start, y_end - y_start);
196  boxaReplaceBox(*boxa, i, simple_box);
197  }
198  img_pix.destroy();
199  }
200 }
201 
202 // Scans horizontally on x=[x_start,x_end), starting with y=*y_start,
203 // stepping y+=y_step, until y=y_end. *ystart is input/output.
204 // If the number of black pixels in a row, pix_count fits this pattern:
205 // 0 or more rows with pix_count < min_count then
206 // <= mid_width rows with min_count <= pix_count <= max_count then
207 // a row with pix_count > max_count then
208 // true is returned, and *y_start = the first y with pix_count >= min_count.
209 static bool HScanForEdge(uint32_t *data, int wpl, int x_start, int x_end, int min_count,
210  int mid_width, int max_count, int y_end, int y_step, int *y_start) {
211  int mid_rows = 0;
212  for (int y = *y_start; y != y_end; y += y_step) {
213  // Need pixCountPixelsInRow(pix, y, &pix_count, nullptr) to count in a
214  // subset.
215  int pix_count = 0;
216  uint32_t *line = data + wpl * y;
217  for (int x = x_start; x < x_end; ++x) {
218  if (GET_DATA_BIT(line, x)) {
219  ++pix_count;
220  }
221  }
222  if (mid_rows == 0 && pix_count < min_count) {
223  continue; // In the min phase.
224  }
225  if (mid_rows == 0) {
226  *y_start = y; // Save the y_start where we came out of the min phase.
227  }
228  if (pix_count > max_count) {
229  return true; // Found the pattern.
230  }
231  ++mid_rows;
232  if (mid_rows > mid_width) {
233  break; // Middle too big.
234  }
235  }
236  return false; // Never found max_count.
237 }
238 
239 // Scans vertically on y=[y_start,y_end), starting with x=*x_start,
240 // stepping x+=x_step, until x=x_end. *x_start is input/output.
241 // If the number of black pixels in a column, pix_count fits this pattern:
242 // 0 or more cols with pix_count < min_count then
243 // <= mid_width cols with min_count <= pix_count <= max_count then
244 // a column with pix_count > max_count then
245 // true is returned, and *x_start = the first x with pix_count >= min_count.
246 static bool VScanForEdge(uint32_t *data, int wpl, int y_start, int y_end, int min_count,
247  int mid_width, int max_count, int x_end, int x_step, int *x_start) {
248  int mid_cols = 0;
249  for (int x = *x_start; x != x_end; x += x_step) {
250  int pix_count = 0;
251  uint32_t *line = data + y_start * wpl;
252  for (int y = y_start; y < y_end; ++y, line += wpl) {
253  if (GET_DATA_BIT(line, x)) {
254  ++pix_count;
255  }
256  }
257  if (mid_cols == 0 && pix_count < min_count) {
258  continue; // In the min phase.
259  }
260  if (mid_cols == 0) {
261  *x_start = x; // Save the place where we came out of the min phase.
262  }
263  if (pix_count > max_count) {
264  return true; // found the pattern.
265  }
266  ++mid_cols;
267  if (mid_cols > mid_width) {
268  break; // Middle too big.
269  }
270  }
271  return false; // Never found max_count.
272 }
273 
274 // Returns true if there is a rectangle in the source pix, such that all
275 // pixel rows and column slices outside of it have less than
276 // min_fraction of the pixels black, and within max_skew_gradient fraction
277 // of the pixels on the inside, there are at least max_fraction of the
278 // pixels black. In other words, the inside of the rectangle looks roughly
279 // rectangular, and the outside of it looks like extra bits.
280 // On return, the rectangle is defined by x_start, y_start, x_end and y_end.
281 // Note: the algorithm is iterative, allowing it to slice off pixels from
282 // one edge, allowing it to then slice off more pixels from another edge.
283 bool ImageFind::pixNearlyRectangular(Image pix, double min_fraction, double max_fraction,
284  double max_skew_gradient, int *x_start, int *y_start,
285  int *x_end, int *y_end) {
286  ASSERT_HOST(pix != nullptr);
287  *x_start = 0;
288  *x_end = pixGetWidth(pix);
289  *y_start = 0;
290  *y_end = pixGetHeight(pix);
291 
292  uint32_t *data = pixGetData(pix);
293  int wpl = pixGetWpl(pix);
294  bool any_cut = false;
295  bool left_done = false;
296  bool right_done = false;
297  bool top_done = false;
298  bool bottom_done = false;
299  do {
300  any_cut = false;
301  // Find the top/bottom edges.
302  int width = *x_end - *x_start;
303  int min_count = static_cast<int>(width * min_fraction);
304  int max_count = static_cast<int>(width * max_fraction);
305  int edge_width = static_cast<int>(width * max_skew_gradient);
306  if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, max_count, *y_end, 1,
307  y_start) &&
308  !top_done) {
309  top_done = true;
310  any_cut = true;
311  }
312  --(*y_end);
313  if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, max_count, *y_start, -1,
314  y_end) &&
315  !bottom_done) {
316  bottom_done = true;
317  any_cut = true;
318  }
319  ++(*y_end);
320 
321  // Find the left/right edges.
322  int height = *y_end - *y_start;
323  min_count = static_cast<int>(height * min_fraction);
324  max_count = static_cast<int>(height * max_fraction);
325  edge_width = static_cast<int>(height * max_skew_gradient);
326  if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, max_count, *x_end, 1,
327  x_start) &&
328  !left_done) {
329  left_done = true;
330  any_cut = true;
331  }
332  --(*x_end);
333  if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, max_count, *x_start, -1,
334  x_end) &&
335  !right_done) {
336  right_done = true;
337  any_cut = true;
338  }
339  ++(*x_end);
340  } while (any_cut);
341 
342  // All edges must satisfy the condition of sharp gradient in pixel density
343  // in order for the full rectangle to be present.
344  return left_done && right_done && top_done && bottom_done;
345 }
346 
347 // Given an input pix, and a bounding rectangle, the sides of the rectangle
348 // are shrunk inwards until they bound any black pixels found within the
349 // original rectangle. Returns false if the rectangle contains no black
350 // pixels at all.
351 bool ImageFind::BoundsWithinRect(Image pix, int *x_start, int *y_start, int *x_end, int *y_end) {
352  Box *input_box = boxCreate(*x_start, *y_start, *x_end - *x_start, *y_end - *y_start);
353  Box *output_box = nullptr;
354  pixClipBoxToForeground(pix, input_box, nullptr, &output_box);
355  bool result = output_box != nullptr;
356  if (result) {
357  l_int32 x, y, width, height;
358  boxGetGeometry(output_box, &x, &y, &width, &height);
359  *x_start = x;
360  *y_start = y;
361  *x_end = x + width;
362  *y_end = y + height;
363  boxDestroy(&output_box);
364  }
365  boxDestroy(&input_box);
366  return result;
367 }
368 
369 // Given a point in 3-D (RGB) space, returns the squared Euclidean distance
370 // of the point from the given line, defined by a pair of points in the 3-D
371 // (RGB) space, line1 and line2.
372 double ImageFind::ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2,
373  const uint8_t *point) {
374  int line_vector[kRGBRMSColors];
375  int point_vector[kRGBRMSColors];
376  for (int i = 0; i < kRGBRMSColors; ++i) {
377  line_vector[i] = static_cast<int>(line2[i]) - static_cast<int>(line1[i]);
378  point_vector[i] = static_cast<int>(point[i]) - static_cast<int>(line1[i]);
379  }
380  line_vector[L_ALPHA_CHANNEL] = 0;
381  // Now the cross product in 3d.
382  int cross[kRGBRMSColors];
383  cross[COLOR_RED] = line_vector[COLOR_GREEN] * point_vector[COLOR_BLUE] -
384  line_vector[COLOR_BLUE] * point_vector[COLOR_GREEN];
385  cross[COLOR_GREEN] = line_vector[COLOR_BLUE] * point_vector[COLOR_RED] -
386  line_vector[COLOR_RED] * point_vector[COLOR_BLUE];
387  cross[COLOR_BLUE] = line_vector[COLOR_RED] * point_vector[COLOR_GREEN] -
388  line_vector[COLOR_GREEN] * point_vector[COLOR_RED];
389  cross[L_ALPHA_CHANNEL] = 0;
390  // Now the sums of the squares.
391  double cross_sq = 0.0;
392  double line_sq = 0.0;
393  for (int j = 0; j < kRGBRMSColors; ++j) {
394  cross_sq += static_cast<double>(cross[j]) * cross[j];
395  line_sq += static_cast<double>(line_vector[j]) * line_vector[j];
396  }
397  if (line_sq == 0.0) {
398  return 0.0;
399  }
400  return cross_sq / line_sq; // This is the squared distance.
401 }
402 
403 // Returns the leptonica combined code for the given RGB triplet.
404 uint32_t ImageFind::ComposeRGB(uint32_t r, uint32_t g, uint32_t b) {
405  l_uint32 result;
406  composeRGBPixel(r, g, b, &result);
407  return result;
408 }
409 
410 // Returns the input value clipped to a uint8_t.
411 uint8_t ImageFind::ClipToByte(double pixel) {
412  if (pixel < 0.0) {
413  return 0;
414  } else if (pixel >= 255.0) {
415  return 255;
416  }
417  return static_cast<uint8_t>(pixel);
418 }
419 
420 // Computes the light and dark extremes of color in the given rectangle of
421 // the given pix, which is factor smaller than the coordinate system in rect.
422 // The light and dark points are taken to be the upper and lower 8th-ile of
423 // the most deviant of R, G and B. The value of the other 2 channels are
424 // computed by linear fit against the most deviant.
425 // The colors of the two points are returned in color1 and color2, with the
426 // alpha channel set to a scaled mean rms of the fits.
427 // If color_map1 is not null then it and color_map2 get rect pasted in them
428 // with the two calculated colors, and rms map gets a pasted rect of the rms.
429 // color_map1, color_map2 and rms_map are assumed to be the same scale as pix.
430 void ImageFind::ComputeRectangleColors(const TBOX &rect, Image pix, int factor, Image color_map1,
431  Image color_map2, Image rms_map, uint8_t *color1,
432  uint8_t *color2) {
433  ASSERT_HOST(pix != nullptr && pixGetDepth(pix) == 32);
434  // Pad the rectangle outwards by 2 (scaled) pixels if possible to get more
435  // background.
436  int width = pixGetWidth(pix);
437  int height = pixGetHeight(pix);
438  int left_pad = std::max(rect.left() - 2 * factor, 0) / factor;
439  int top_pad = (rect.top() + 2 * factor + (factor - 1)) / factor;
440  top_pad = std::min(height, top_pad);
441  int right_pad = (rect.right() + 2 * factor + (factor - 1)) / factor;
442  right_pad = std::min(width, right_pad);
443  int bottom_pad = std::max(rect.bottom() - 2 * factor, 0) / factor;
444  int width_pad = right_pad - left_pad;
445  int height_pad = top_pad - bottom_pad;
446  if (width_pad < 1 || height_pad < 1 || width_pad + height_pad < 4) {
447  return;
448  }
449  // Now crop the pix to the rectangle.
450  Box *scaled_box = boxCreate(left_pad, height - top_pad, width_pad, height_pad);
451  Image scaled = pixClipRectangle(pix, scaled_box, nullptr);
452 
453  // Compute stats over the whole image.
454  STATS red_stats(0, 256);
455  STATS green_stats(0, 256);
456  STATS blue_stats(0, 256);
457  uint32_t *data = pixGetData(scaled);
458  ASSERT_HOST(pixGetWpl(scaled) == width_pad);
459  for (int y = 0; y < height_pad; ++y) {
460  for (int x = 0; x < width_pad; ++x, ++data) {
461  int r = GET_DATA_BYTE(data, COLOR_RED);
462  int g = GET_DATA_BYTE(data, COLOR_GREEN);
463  int b = GET_DATA_BYTE(data, COLOR_BLUE);
464  red_stats.add(r, 1);
465  green_stats.add(g, 1);
466  blue_stats.add(b, 1);
467  }
468  }
469  // Find the RGB component with the greatest 8th-ile-range.
470  // 8th-iles are used instead of quartiles to get closer to the true
471  // foreground color, which is going to be faint at best because of the
472  // pre-scaling of the input image.
473  int best_l8 = static_cast<int>(red_stats.ile(0.125f));
474  int best_u8 = static_cast<int>(ceil(red_stats.ile(0.875f)));
475  int best_i8r = best_u8 - best_l8;
476  int x_color = COLOR_RED;
477  int y1_color = COLOR_GREEN;
478  int y2_color = COLOR_BLUE;
479  int l8 = static_cast<int>(green_stats.ile(0.125f));
480  int u8 = static_cast<int>(ceil(green_stats.ile(0.875f)));
481  if (u8 - l8 > best_i8r) {
482  best_i8r = u8 - l8;
483  best_l8 = l8;
484  best_u8 = u8;
485  x_color = COLOR_GREEN;
486  y1_color = COLOR_RED;
487  }
488  l8 = static_cast<int>(blue_stats.ile(0.125f));
489  u8 = static_cast<int>(ceil(blue_stats.ile(0.875f)));
490  if (u8 - l8 > best_i8r) {
491  best_i8r = u8 - l8;
492  best_l8 = l8;
493  best_u8 = u8;
494  x_color = COLOR_BLUE;
495  y1_color = COLOR_GREEN;
496  y2_color = COLOR_RED;
497  }
498  if (best_i8r >= kMinColorDifference) {
499  LLSQ line1;
500  LLSQ line2;
501  uint32_t *data = pixGetData(scaled);
502  for (int im_y = 0; im_y < height_pad; ++im_y) {
503  for (int im_x = 0; im_x < width_pad; ++im_x, ++data) {
504  int x = GET_DATA_BYTE(data, x_color);
505  int y1 = GET_DATA_BYTE(data, y1_color);
506  int y2 = GET_DATA_BYTE(data, y2_color);
507  line1.add(x, y1);
508  line2.add(x, y2);
509  }
510  }
511  double m1 = line1.m();
512  double c1 = line1.c(m1);
513  double m2 = line2.m();
514  double c2 = line2.c(m2);
515  double rms = line1.rms(m1, c1) + line2.rms(m2, c2);
516  rms *= kRMSFitScaling;
517  // Save the results.
518  color1[x_color] = ClipToByte(best_l8);
519  color1[y1_color] = ClipToByte(m1 * best_l8 + c1 + 0.5);
520  color1[y2_color] = ClipToByte(m2 * best_l8 + c2 + 0.5);
521  color1[L_ALPHA_CHANNEL] = ClipToByte(rms);
522  color2[x_color] = ClipToByte(best_u8);
523  color2[y1_color] = ClipToByte(m1 * best_u8 + c1 + 0.5);
524  color2[y2_color] = ClipToByte(m2 * best_u8 + c2 + 0.5);
525  color2[L_ALPHA_CHANNEL] = ClipToByte(rms);
526  } else {
527  // There is only one color.
528  color1[COLOR_RED] = ClipToByte(red_stats.median());
529  color1[COLOR_GREEN] = ClipToByte(green_stats.median());
530  color1[COLOR_BLUE] = ClipToByte(blue_stats.median());
531  color1[L_ALPHA_CHANNEL] = 0;
532  memcpy(color2, color1, 4);
533  }
534  if (color_map1 != nullptr) {
535  pixSetInRectArbitrary(color_map1, scaled_box,
536  ComposeRGB(color1[COLOR_RED], color1[COLOR_GREEN], color1[COLOR_BLUE]));
537  pixSetInRectArbitrary(color_map2, scaled_box,
538  ComposeRGB(color2[COLOR_RED], color2[COLOR_GREEN], color2[COLOR_BLUE]));
539  pixSetInRectArbitrary(rms_map, scaled_box, color1[L_ALPHA_CHANNEL]);
540  }
541  scaled.destroy();
542  boxDestroy(&scaled_box);
543 }
544 
545 // ================ CUTTING POLYGONAL IMAGES FROM A RECTANGLE ================
546 // The following functions are responsible for cutting a polygonal image from
547 // a rectangle: CountPixelsInRotatedBox, AttemptToShrinkBox, CutChunkFromParts
548 // with DivideImageIntoParts as the master.
549 // Problem statement:
550 // We start with a single connected component from the image mask: we get
551 // a Pix of the component, and its location on the page (im_box).
552 // The objective of cutting a polygonal image from its rectangle is to avoid
553 // interfering text, but not text that completely overlaps the image.
554 // ------------------------------ ------------------------------
555 // | Single input partition | | 1 Cut up output partitions |
556 // | | ------------------------------
557 // Av|oid | Avoid | |
558 // | | |________________________|
559 // Int|erfering | Interfering | |
560 // | | _____|__________________|
561 // T|ext | Text | |
562 // | Text-on-image | | Text-on-image |
563 // ------------------------------ --------------------------
564 // DivideImageIntoParts does this by building a ColPartition_LIST (not in the
565 // grid) with each ColPartition representing one of the rectangles needed,
566 // starting with a single rectangle for the whole image component, and cutting
567 // bits out of it with CutChunkFromParts as needed to avoid text. The output
568 // ColPartitions are supposed to be ordered from top to bottom.
569 
570 // The problem is complicated by the fact that we have rotated the coordinate
571 // system to make text lines horizontal, so if we need to look at the component
572 // image, we have to rotate the coordinates. Throughout the functions in this
573 // section im_box is the rectangle representing the image component in the
574 // rotated page coordinates (where we are building our output ColPartitions),
575 // rotation is the rotation that we used to get there, and rerotation is the
576 // rotation required to get back to original page image coordinates.
577 // To get to coordinates in the component image, pix, we rotate the im_box,
578 // the point we want to locate, and subtract the rotated point from the top-left
579 // of the rotated im_box.
580 // im_box is therefore essential to calculating coordinates within the pix.
581 
582 // Returns true if there are no black pixels in between the boxes.
583 // The im_box must represent the bounding box of the pix in tesseract
584 // coordinates, which may be negative, due to rotations to make the textlines
585 // horizontal. The boxes are rotated by rotation, which should undo such
586 // rotations, before mapping them onto the pix.
587 bool ImageFind::BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box,
588  const FCOORD &rotation, Image pix) {
589  TBOX search_box(box1);
590  search_box += box2;
591  if (box1.x_gap(box2) >= box1.y_gap(box2)) {
592  if (box1.x_gap(box2) <= 0) {
593  return true;
594  }
595  search_box.set_left(std::min(box1.right(), box2.right()));
596  search_box.set_right(std::max(box1.left(), box2.left()));
597  } else {
598  if (box1.y_gap(box2) <= 0) {
599  return true;
600  }
601  search_box.set_top(std::max(box1.bottom(), box2.bottom()));
602  search_box.set_bottom(std::min(box1.top(), box2.top()));
603  }
604  return CountPixelsInRotatedBox(search_box, im_box, rotation, pix) == 0;
605 }
606 
607 // Returns the number of pixels in box in the pix.
608 // rotation, pix and im_box are defined in the large comment above.
609 int ImageFind::CountPixelsInRotatedBox(TBOX box, const TBOX &im_box, const FCOORD &rotation,
610  Image pix) {
611  // Intersect it with the image box.
612  box &= im_box; // This is in-place box intersection.
613  if (box.null_box()) {
614  return 0;
615  }
616  box.rotate(rotation);
617  TBOX rotated_im_box(im_box);
618  rotated_im_box.rotate(rotation);
619  Image rect_pix = pixCreate(box.width(), box.height(), 1);
620  pixRasterop(rect_pix, 0, 0, box.width(), box.height(), PIX_SRC, pix,
621  box.left() - rotated_im_box.left(), rotated_im_box.top() - box.top());
622  l_int32 result;
623  pixCountPixels(rect_pix, &result, nullptr);
624  rect_pix.destroy();
625  return result;
626 }
627 
628 // The box given by slice contains some black pixels, but not necessarily
629 // over the whole box. Shrink the x bounds of slice, but not the y bounds
630 // until there is at least one black pixel in the outermost columns.
631 // rotation, rerotation, pix and im_box are defined in the large comment above.
632 static void AttemptToShrinkBox(const FCOORD &rotation, const FCOORD &rerotation, const TBOX &im_box,
633  Image pix, TBOX *slice) {
634  TBOX rotated_box(*slice);
635  rotated_box.rotate(rerotation);
636  TBOX rotated_im_box(im_box);
637  rotated_im_box.rotate(rerotation);
638  int left = rotated_box.left() - rotated_im_box.left();
639  int right = rotated_box.right() - rotated_im_box.left();
640  int top = rotated_im_box.top() - rotated_box.top();
641  int bottom = rotated_im_box.top() - rotated_box.bottom();
642  ImageFind::BoundsWithinRect(pix, &left, &top, &right, &bottom);
643  top = rotated_im_box.top() - top;
644  bottom = rotated_im_box.top() - bottom;
645  left += rotated_im_box.left();
646  right += rotated_im_box.left();
647  rotated_box.set_to_given_coords(left, bottom, right, top);
648  rotated_box.rotate(rotation);
649  slice->set_left(rotated_box.left());
650  slice->set_right(rotated_box.right());
651 }
652 
653 // The meat of cutting a polygonal image around text.
654 // This function covers the general case of cutting a box out of a box
655 // as shown:
656 // Input Output
657 // ------------------------------ ------------------------------
658 // | Single input partition | | 1 Cut up output partitions |
659 // | | ------------------------------
660 // | ---------- | --------- ----------
661 // | | box | | | 2 | box | 3 |
662 // | | | | | | is cut | |
663 // | ---------- | --------- out ----------
664 // | | ------------------------------
665 // | | | 4 |
666 // ------------------------------ ------------------------------
667 // In the context that this function is used, at most 3 of the above output
668 // boxes will be created, as the overlapping box is never contained by the
669 // input.
670 // The above cutting operation is executed for each element of part_list that
671 // is overlapped by the input box. Each modified ColPartition is replaced
672 // in place in the list by the output of the cutting operation in the order
673 // shown above, so iff no holes are ever created, the output will be in
674 // top-to-bottom order, but in extreme cases, hole creation is possible.
675 // In such cases, the output order may cause strange block polygons.
676 // rotation, rerotation, pix and im_box are defined in the large comment above.
677 static void CutChunkFromParts(const TBOX &box, const TBOX &im_box, const FCOORD &rotation,
678  const FCOORD &rerotation, Image pix, ColPartition_LIST *part_list) {
679  ASSERT_HOST(!part_list->empty());
680  ColPartition_IT part_it(part_list);
681  do {
682  ColPartition *part = part_it.data();
683  TBOX part_box = part->bounding_box();
684  if (part_box.overlap(box)) {
685  // This part must be cut and replaced with the remains. There are
686  // up to 4 pieces to be made. Start with the first one and use
687  // add_before_stay_put. For each piece if it has no black pixels
688  // left, just don't make the box.
689  // Above box.
690  if (box.top() < part_box.top()) {
691  TBOX slice(part_box);
692  slice.set_bottom(box.top());
693  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {
694  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
695  part_it.add_before_stay_put(
697  }
698  }
699  // Left of box.
700  if (box.left() > part_box.left()) {
701  TBOX slice(part_box);
702  slice.set_right(box.left());
703  if (box.top() < part_box.top()) {
704  slice.set_top(box.top());
705  }
706  if (box.bottom() > part_box.bottom()) {
707  slice.set_bottom(box.bottom());
708  }
709  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {
710  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
711  part_it.add_before_stay_put(
713  }
714  }
715  // Right of box.
716  if (box.right() < part_box.right()) {
717  TBOX slice(part_box);
718  slice.set_left(box.right());
719  if (box.top() < part_box.top()) {
720  slice.set_top(box.top());
721  }
722  if (box.bottom() > part_box.bottom()) {
723  slice.set_bottom(box.bottom());
724  }
725  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {
726  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
727  part_it.add_before_stay_put(
729  }
730  }
731  // Below box.
732  if (box.bottom() > part_box.bottom()) {
733  TBOX slice(part_box);
734  slice.set_top(box.bottom());
735  if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, pix) > 0) {
736  AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice);
737  part_it.add_before_stay_put(
739  }
740  }
741  part->DeleteBoxes();
742  delete part_it.extract();
743  }
744  part_it.forward();
745  } while (!part_it.at_first());
746 }
747 
748 // Starts with the bounding box of the image component and cuts it up
749 // so that it doesn't intersect text where possible.
750 // Strong fully contained horizontal text is marked as text on image,
751 // and does not cause a division of the image.
752 // For more detail see the large comment above on cutting polygonal images
753 // from a rectangle.
754 // rotation, rerotation, pix and im_box are defined in the large comment above.
755 static void DivideImageIntoParts(const TBOX &im_box, const FCOORD &rotation,
756  const FCOORD &rerotation, Image pix,
757  ColPartitionGridSearch *rectsearch, ColPartition_LIST *part_list) {
758  // Add the full im_box partition to the list to begin with.
759  ColPartition *pix_part =
761  ColPartition_IT part_it(part_list);
762  part_it.add_after_then_move(pix_part);
763 
764  rectsearch->StartRectSearch(im_box);
765  ColPartition *part;
766  while ((part = rectsearch->NextRectSearch()) != nullptr) {
767  TBOX part_box = part->bounding_box();
768  if (part_box.contains(im_box) && part->flow() >= BTFT_CHAIN) {
769  // This image is completely covered by an existing text partition.
770  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
771  ColPartition *pix_part = part_it.extract();
772  pix_part->DeleteBoxes();
773  delete pix_part;
774  }
775  } else if (part->flow() == BTFT_STRONG_CHAIN) {
776  // Text intersects the box.
777  TBOX overlap_box = part_box.intersection(im_box);
778  // Intersect it with the image box.
779  int black_area = ImageFind::CountPixelsInRotatedBox(overlap_box, im_box, rerotation, pix);
780  if (black_area * 2 < part_box.area() || !im_box.contains(part_box)) {
781  // Eat a piece out of the image.
782  // Pad it so that pieces eaten out look decent.
783  int padding = part->blob_type() == BRT_VERT_TEXT ? part_box.width() : part_box.height();
784  part_box.set_top(part_box.top() + padding / 2);
785  part_box.set_bottom(part_box.bottom() - padding / 2);
786  CutChunkFromParts(part_box, im_box, rotation, rerotation, pix, part_list);
787  } else {
788  // Strong overlap with the black area, so call it text on image.
789  part->set_flow(BTFT_TEXT_ON_IMAGE);
790  }
791  }
792  if (part_list->empty()) {
793  break;
794  }
795  }
796 }
797 
798 // Search for the rightmost text that overlaps vertically and is to the left
799 // of the given box, but within the given left limit.
800 static int ExpandImageLeft(const TBOX &box, int left_limit, ColPartitionGrid *part_grid) {
801  ColPartitionGridSearch search(part_grid);
802  ColPartition *part;
803  // Search right to left for any text that overlaps.
804  search.StartSideSearch(box.left(), box.bottom(), box.top());
805  while ((part = search.NextSideSearch(true)) != nullptr) {
806  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
807  const TBOX &part_box(part->bounding_box());
808  if (part_box.y_gap(box) < 0) {
809  if (part_box.right() > left_limit && part_box.right() < box.left()) {
810  left_limit = part_box.right();
811  }
812  break;
813  }
814  }
815  }
816  if (part != nullptr) {
817  // Search for the nearest text up to the one we already found.
818  TBOX search_box(left_limit, box.bottom(), box.left(), box.top());
819  search.StartRectSearch(search_box);
820  while ((part = search.NextRectSearch()) != nullptr) {
821  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
822  const TBOX &part_box(part->bounding_box());
823  if (part_box.y_gap(box) < 0) {
824  if (part_box.right() > left_limit && part_box.right() < box.left()) {
825  left_limit = part_box.right();
826  }
827  }
828  }
829  }
830  }
831  return left_limit;
832 }
833 
834 // Search for the leftmost text that overlaps vertically and is to the right
835 // of the given box, but within the given right limit.
836 static int ExpandImageRight(const TBOX &box, int right_limit, ColPartitionGrid *part_grid) {
837  ColPartitionGridSearch search(part_grid);
838  ColPartition *part;
839  // Search left to right for any text that overlaps.
840  search.StartSideSearch(box.right(), box.bottom(), box.top());
841  while ((part = search.NextSideSearch(false)) != nullptr) {
842  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
843  const TBOX &part_box(part->bounding_box());
844  if (part_box.y_gap(box) < 0) {
845  if (part_box.left() < right_limit && part_box.left() > box.right()) {
846  right_limit = part_box.left();
847  }
848  break;
849  }
850  }
851  }
852  if (part != nullptr) {
853  // Search for the nearest text up to the one we already found.
854  TBOX search_box(box.left(), box.bottom(), right_limit, box.top());
855  search.StartRectSearch(search_box);
856  while ((part = search.NextRectSearch()) != nullptr) {
857  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
858  const TBOX &part_box(part->bounding_box());
859  if (part_box.y_gap(box) < 0) {
860  if (part_box.left() < right_limit && part_box.left() > box.right()) {
861  right_limit = part_box.left();
862  }
863  }
864  }
865  }
866  }
867  return right_limit;
868 }
869 
870 // Search for the topmost text that overlaps horizontally and is below
871 // the given box, but within the given bottom limit.
872 static int ExpandImageBottom(const TBOX &box, int bottom_limit, ColPartitionGrid *part_grid) {
873  ColPartitionGridSearch search(part_grid);
874  ColPartition *part;
875  // Search right to left for any text that overlaps.
876  search.StartVerticalSearch(box.left(), box.right(), box.bottom());
877  while ((part = search.NextVerticalSearch(true)) != nullptr) {
878  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
879  const TBOX &part_box(part->bounding_box());
880  if (part_box.x_gap(box) < 0) {
881  if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) {
882  bottom_limit = part_box.top();
883  }
884  break;
885  }
886  }
887  }
888  if (part != nullptr) {
889  // Search for the nearest text up to the one we already found.
890  TBOX search_box(box.left(), bottom_limit, box.right(), box.bottom());
891  search.StartRectSearch(search_box);
892  while ((part = search.NextRectSearch()) != nullptr) {
893  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
894  const TBOX &part_box(part->bounding_box());
895  if (part_box.x_gap(box) < 0) {
896  if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) {
897  bottom_limit = part_box.top();
898  }
899  }
900  }
901  }
902  }
903  return bottom_limit;
904 }
905 
906 // Search for the bottommost text that overlaps horizontally and is above
907 // the given box, but within the given top limit.
908 static int ExpandImageTop(const TBOX &box, int top_limit, ColPartitionGrid *part_grid) {
909  ColPartitionGridSearch search(part_grid);
910  ColPartition *part;
911  // Search right to left for any text that overlaps.
912  search.StartVerticalSearch(box.left(), box.right(), box.top());
913  while ((part = search.NextVerticalSearch(false)) != nullptr) {
914  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
915  const TBOX &part_box(part->bounding_box());
916  if (part_box.x_gap(box) < 0) {
917  if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) {
918  top_limit = part_box.bottom();
919  }
920  break;
921  }
922  }
923  }
924  if (part != nullptr) {
925  // Search for the nearest text up to the one we already found.
926  TBOX search_box(box.left(), box.top(), box.right(), top_limit);
927  search.StartRectSearch(search_box);
928  while ((part = search.NextRectSearch()) != nullptr) {
929  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) {
930  const TBOX &part_box(part->bounding_box());
931  if (part_box.x_gap(box) < 0) {
932  if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) {
933  top_limit = part_box.bottom();
934  }
935  }
936  }
937  }
938  }
939  return top_limit;
940 }
941 
942 // Expands the image box in the given direction until it hits text,
943 // limiting the expansion to the given limit box, returning the result
944 // in the expanded box, and
945 // returning the increase in area resulting from the expansion.
946 static int ExpandImageDir(BlobNeighbourDir dir, const TBOX &im_box, const TBOX &limit_box,
947  ColPartitionGrid *part_grid, TBOX *expanded_box) {
948  *expanded_box = im_box;
949  switch (dir) {
950  case BND_LEFT:
951  expanded_box->set_left(ExpandImageLeft(im_box, limit_box.left(), part_grid));
952  break;
953  case BND_RIGHT:
954  expanded_box->set_right(ExpandImageRight(im_box, limit_box.right(), part_grid));
955  break;
956  case BND_ABOVE:
957  expanded_box->set_top(ExpandImageTop(im_box, limit_box.top(), part_grid));
958  break;
959  case BND_BELOW:
960  expanded_box->set_bottom(ExpandImageBottom(im_box, limit_box.bottom(), part_grid));
961  break;
962  default:
963  return 0;
964  }
965  return expanded_box->area() - im_box.area();
966 }
967 
968 // Expands the image partition into any non-text until it touches text.
969 // The expansion proceeds in the order of increasing increase in area
970 // as a heuristic to find the best rectangle by expanding in the most
971 // constrained direction first.
972 static void MaximalImageBoundingBox(ColPartitionGrid *part_grid, TBOX *im_box) {
973  bool dunnit[BND_COUNT];
974  memset(dunnit, 0, sizeof(dunnit));
975  TBOX limit_box(part_grid->bleft().x(), part_grid->bleft().y(), part_grid->tright().x(),
976  part_grid->tright().y());
977  TBOX text_box(*im_box);
978  for (int iteration = 0; iteration < BND_COUNT; ++iteration) {
979  // Find the direction with least area increase.
980  int best_delta = -1;
981  BlobNeighbourDir best_dir = BND_LEFT;
982  TBOX expanded_boxes[BND_COUNT];
983  for (int dir = 0; dir < BND_COUNT; ++dir) {
984  auto bnd = static_cast<BlobNeighbourDir>(dir);
985  if (!dunnit[bnd]) {
986  TBOX expanded_box;
987  int area_delta = ExpandImageDir(bnd, text_box, limit_box, part_grid, &expanded_boxes[bnd]);
988  if (best_delta < 0 || area_delta < best_delta) {
989  best_delta = area_delta;
990  best_dir = bnd;
991  }
992  }
993  }
994  // Run the best and remember the direction.
995  dunnit[best_dir] = true;
996  text_box = expanded_boxes[best_dir];
997  }
998  *im_box = text_box;
999 }
1000 
1001 // Helper deletes the given partition but first marks up all the blobs as
1002 // noise, so they get deleted later, and disowns them.
1003 // If the initial type of the partition is image, then it actually deletes
1004 // the blobs, as the partition owns them in that case.
1005 static void DeletePartition(ColPartition *part) {
1006  BlobRegionType type = part->blob_type();
1007  if (type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {
1008  // The partition owns the boxes of these types, so just delete them.
1009  part->DeleteBoxes(); // From a previous iteration.
1010  } else {
1011  // Once marked, the blobs will be swept up by TidyBlobs.
1012  part->set_flow(BTFT_NONTEXT);
1013  part->set_blob_type(BRT_NOISE);
1014  part->SetBlobTypes();
1015  part->DisownBoxes(); // Created before FindImagePartitions.
1016  }
1017  delete part;
1018 }
1019 
1020 // The meat of joining fragmented images and consuming ColPartitions of
1021 // uncertain type.
1022 // *part_ptr is an input/output BRT_RECTIMAGE ColPartition that is to be
1023 // expanded to consume overlapping and nearby ColPartitions of uncertain type
1024 // and other BRT_RECTIMAGE partitions, but NOT to be expanded beyond
1025 // max_image_box. *part_ptr is NOT in the part_grid.
1026 // rectsearch is already constructed on the part_grid, and is used for
1027 // searching for overlapping and nearby ColPartitions.
1028 // ExpandImageIntoParts is called iteratively until it returns false. Each
1029 // time it absorbs the nearest non-contained candidate, and everything that
1030 // is fully contained within part_ptr's bounding box.
1031 // TODO(rays) what if it just eats everything inside max_image_box in one go?
1032 static bool ExpandImageIntoParts(const TBOX &max_image_box, ColPartitionGridSearch *rectsearch,
1033  ColPartitionGrid *part_grid, ColPartition **part_ptr) {
1034  ColPartition *image_part = *part_ptr;
1035  TBOX im_part_box = image_part->bounding_box();
1036  if (textord_tabfind_show_images > 1) {
1037  tprintf("Searching for merge with image part:");
1038  im_part_box.print();
1039  tprintf("Text box=");
1040  max_image_box.print();
1041  }
1042  rectsearch->StartRectSearch(max_image_box);
1043  ColPartition *part;
1044  ColPartition *best_part = nullptr;
1045  int best_dist = 0;
1046  while ((part = rectsearch->NextRectSearch()) != nullptr) {
1047  if (textord_tabfind_show_images > 1) {
1048  tprintf("Considering merge with part:");
1049  part->Print();
1050  if (im_part_box.contains(part->bounding_box())) {
1051  tprintf("Fully contained\n");
1052  } else if (!max_image_box.contains(part->bounding_box())) {
1053  tprintf("Not within text box\n");
1054  } else if (part->flow() == BTFT_STRONG_CHAIN) {
1055  tprintf("Too strong text\n");
1056  } else {
1057  tprintf("Real candidate\n");
1058  }
1059  }
1060  if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_TEXT_ON_IMAGE ||
1061  part->blob_type() == BRT_POLYIMAGE) {
1062  continue;
1063  }
1064  TBOX box = part->bounding_box();
1065  if (max_image_box.contains(box) && part->blob_type() != BRT_NOISE) {
1066  if (im_part_box.contains(box)) {
1067  // Eat it completely.
1068  rectsearch->RemoveBBox();
1069  DeletePartition(part);
1070  continue;
1071  }
1072  int x_dist = std::max(0, box.x_gap(im_part_box));
1073  int y_dist = std::max(0, box.y_gap(im_part_box));
1074  int dist = x_dist * x_dist + y_dist * y_dist;
1075  if (dist > box.area() || dist > im_part_box.area()) {
1076  continue; // Not close enough.
1077  }
1078  if (best_part == nullptr || dist < best_dist) {
1079  // We keep the nearest qualifier, which is not necessarily the nearest.
1080  best_part = part;
1081  best_dist = dist;
1082  }
1083  }
1084  }
1085  if (best_part != nullptr) {
1086  // It needs expanding. We can do it without touching text.
1087  TBOX box = best_part->bounding_box();
1088  if (textord_tabfind_show_images > 1) {
1089  tprintf("Merging image part:");
1090  im_part_box.print();
1091  tprintf("with part:");
1092  box.print();
1093  }
1094  im_part_box += box;
1096  DeletePartition(image_part);
1097  part_grid->RemoveBBox(best_part);
1098  DeletePartition(best_part);
1099  rectsearch->RepositionIterator();
1100  return true;
1101  }
1102  return false;
1103 }
1104 
1105 // Helper function to compute the overlap area between the box and the
1106 // given list of partitions.
1107 static int IntersectArea(const TBOX &box, ColPartition_LIST *part_list) {
1108  int intersect_area = 0;
1109  ColPartition_IT part_it(part_list);
1110  // Iterate the parts and subtract intersecting area.
1111  for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
1112  ColPartition *image_part = part_it.data();
1113  TBOX intersect = box.intersection(image_part->bounding_box());
1114  intersect_area += intersect.area();
1115  }
1116  return intersect_area;
1117 }
1118 
1119 // part_list is a set of ColPartitions representing a polygonal image, and
1120 // im_box is the union of the bounding boxes of all the parts in part_list.
1121 // Tests whether part is to be consumed by the polygonal image.
1122 // Returns true if part is weak text and more than half of its area is
1123 // intersected by parts from the part_list, and it is contained within im_box.
1124 static bool TestWeakIntersectedPart(const TBOX &im_box, ColPartition_LIST *part_list,
1125  ColPartition *part) {
1126  if (part->flow() < BTFT_STRONG_CHAIN) {
1127  // A weak partition intersects the box.
1128  const TBOX &part_box = part->bounding_box();
1129  if (im_box.contains(part_box)) {
1130  int area = part_box.area();
1131  int intersect_area = IntersectArea(part_box, part_list);
1132  if (area < 2 * intersect_area) {
1133  return true;
1134  }
1135  }
1136  }
1137  return false;
1138 }
1139 
1140 // A rectangular or polygonal image has been completed, in part_list, bounding
1141 // box in im_box. We want to eliminate weak text or other uncertain partitions
1142 // (basically anything that is not BRT_STRONG_CHAIN or better) from both the
1143 // part_grid and the big_parts list that are contained within im_box and
1144 // overlapped enough by the possibly polygonal image.
1145 static void EliminateWeakParts(const TBOX &im_box, ColPartitionGrid *part_grid,
1146  ColPartition_LIST *big_parts, ColPartition_LIST *part_list) {
1147  ColPartitionGridSearch rectsearch(part_grid);
1148  ColPartition *part;
1149  rectsearch.StartRectSearch(im_box);
1150  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1151  if (TestWeakIntersectedPart(im_box, part_list, part)) {
1152  BlobRegionType type = part->blob_type();
1153  if (type == BRT_POLYIMAGE || type == BRT_RECTIMAGE) {
1154  rectsearch.RemoveBBox();
1155  DeletePartition(part);
1156  } else {
1157  // The part is mostly covered, so mark it. Non-image partitions are
1158  // kept hanging around to mark the image for pass2
1159  part->set_flow(BTFT_NONTEXT);
1160  part->set_blob_type(BRT_NOISE);
1161  part->SetBlobTypes();
1162  }
1163  }
1164  }
1165  ColPartition_IT big_it(big_parts);
1166  for (big_it.mark_cycle_pt(); !big_it.cycled_list(); big_it.forward()) {
1167  part = big_it.data();
1168  if (TestWeakIntersectedPart(im_box, part_list, part)) {
1169  // Once marked, the blobs will be swept up by TidyBlobs.
1170  DeletePartition(big_it.extract());
1171  }
1172  }
1173 }
1174 
1175 // Helper scans for good text partitions overlapping the given box.
1176 // If there are no good text partitions overlapping an expanded box, then
1177 // the box is expanded, otherwise, the original box is returned.
1178 // If good text overlaps the box, true is returned.
1179 static bool ScanForOverlappingText(ColPartitionGrid *part_grid, TBOX *box) {
1180  ColPartitionGridSearch rectsearch(part_grid);
1181  TBOX padded_box(*box);
1182  padded_box.pad(kNoisePadding, kNoisePadding);
1183  rectsearch.StartRectSearch(padded_box);
1184  ColPartition *part;
1185  bool any_text_in_padded_rect = false;
1186  while ((part = rectsearch.NextRectSearch()) != nullptr) {
1187  if (part->flow() == BTFT_CHAIN || part->flow() == BTFT_STRONG_CHAIN) {
1188  // Text intersects the box.
1189  any_text_in_padded_rect = true;
1190  const TBOX &part_box = part->bounding_box();
1191  if (box->overlap(part_box)) {
1192  return true;
1193  }
1194  }
1195  }
1196  if (!any_text_in_padded_rect) {
1197  *box = padded_box;
1198  }
1199  return false;
1200 }
1201 
1202 // Renders the boxes of image parts from the supplied list onto the image_pix,
1203 // except where they interfere with existing strong text in the part_grid,
1204 // and then deletes them.
1205 // Box coordinates are rotated by rerotate to match the image.
1206 static void MarkAndDeleteImageParts(const FCOORD &rerotate, ColPartitionGrid *part_grid,
1207  ColPartition_LIST *image_parts, Image image_pix) {
1208  if (image_pix == nullptr) {
1209  return;
1210  }
1211  int imageheight = pixGetHeight(image_pix);
1212  ColPartition_IT part_it(image_parts);
1213  for (; !part_it.empty(); part_it.forward()) {
1214  ColPartition *part = part_it.extract();
1215  TBOX part_box = part->bounding_box();
1216  BlobRegionType type = part->blob_type();
1217  if (!ScanForOverlappingText(part_grid, &part_box) || type == BRT_RECTIMAGE ||
1218  type == BRT_POLYIMAGE) {
1219  // Mark the box on the image.
1220  // All coords need to be rotated to match the image.
1221  part_box.rotate(rerotate);
1222  int left = part_box.left();
1223  int top = part_box.top();
1224  pixRasterop(image_pix, left, imageheight - top, part_box.width(), part_box.height(), PIX_SET,
1225  nullptr, 0, 0);
1226  }
1227  DeletePartition(part);
1228  }
1229 }
1230 
1231 // Locates all the image partitions in the part_grid, that were found by a
1232 // previous call to FindImagePartitions, marks them in the image_mask,
1233 // removes them from the grid, and deletes them. This makes it possible to
1234 // call FindImagePartitions again to produce less broken-up and less
1235 // overlapping image partitions.
1236 // rerotation specifies how to rotate the partition coords to match
1237 // the image_mask, since this function is used after orientation correction.
1239  Image image_mask) {
1240  // Extract the noise parts from the grid and put them on a temporary list.
1241  ColPartition_LIST parts_list;
1242  ColPartition_IT part_it(&parts_list);
1243  ColPartitionGridSearch gsearch(part_grid);
1244  gsearch.StartFullSearch();
1245  ColPartition *part;
1246  while ((part = gsearch.NextFullSearch()) != nullptr) {
1247  BlobRegionType type = part->blob_type();
1248  if (type == BRT_NOISE || type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) {
1249  part_it.add_after_then_move(part);
1250  gsearch.RemoveBBox();
1251  }
1252  }
1253  // Render listed noise partitions to the image mask.
1254  MarkAndDeleteImageParts(rerotation, part_grid, &parts_list, image_mask);
1255 }
1256 
1257 // Removes and deletes all image partitions that are too small to be worth
1258 // keeping. We have to do this as a separate phase after creating the image
1259 // partitions as the small images are needed to join the larger ones together.
1260 static void DeleteSmallImages(ColPartitionGrid *part_grid) {
1261  if (part_grid != nullptr) {
1262  return;
1263  }
1264  ColPartitionGridSearch gsearch(part_grid);
1265  gsearch.StartFullSearch();
1266  ColPartition *part;
1267  while ((part = gsearch.NextFullSearch()) != nullptr) {
1268  // Only delete rectangular images, since if it became a poly image, it
1269  // is more evidence that it is somehow important.
1270  if (part->blob_type() == BRT_RECTIMAGE) {
1271  const TBOX &part_box = part->bounding_box();
1272  if (part_box.width() < kMinImageFindSize || part_box.height() < kMinImageFindSize) {
1273  // It is too small to keep. Just make it disappear.
1274  gsearch.RemoveBBox();
1275  DeletePartition(part);
1276  }
1277  }
1278  }
1279 }
1280 
1281 // Runs a CC analysis on the image_pix mask image, and creates
1282 // image partitions from them, cutting out strong text, and merging with
1283 // nearby image regions such that they don't interfere with text.
1284 // Rotation and rerotation specify how to rotate image coords to match
1285 // the blob and partition coords and back again.
1286 // The input/output part_grid owns all the created partitions, and
1287 // the partitions own all the fake blobs that belong in the partitions.
1288 // Since the other blobs in the other partitions will be owned by the block,
1289 // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this
1290 // situation and collect the image blobs.
1291 void ImageFind::FindImagePartitions(Image image_pix, const FCOORD &rotation,
1292  const FCOORD &rerotation, TO_BLOCK *block, TabFind *tab_grid,
1293  DebugPixa *pixa_debug, ColPartitionGrid *part_grid,
1294  ColPartition_LIST *big_parts) {
1295  int imageheight = pixGetHeight(image_pix);
1296  Boxa *boxa;
1297  Pixa *pixa;
1298  ConnCompAndRectangularize(image_pix, pixa_debug, &boxa, &pixa);
1299  // Iterate the connected components in the image regions mask.
1300  int nboxes = 0;
1301  if (boxa != nullptr && pixa != nullptr) {
1302  nboxes = boxaGetCount(boxa);
1303  }
1304  for (int i = 0; i < nboxes; ++i) {
1305  l_int32 x, y, width, height;
1306  boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height);
1307  Image pix = pixaGetPix(pixa, i, L_CLONE);
1308  TBOX im_box(x, imageheight - y - height, x + width, imageheight - y);
1309  im_box.rotate(rotation); // Now matches all partitions and blobs.
1310  ColPartitionGridSearch rectsearch(part_grid);
1311  rectsearch.SetUniqueMode(true);
1312  ColPartition_LIST part_list;
1313  DivideImageIntoParts(im_box, rotation, rerotation, pix, &rectsearch, &part_list);
1314  if (textord_tabfind_show_images && pixa_debug != nullptr) {
1315  pixa_debug->AddPix(pix, "ImageComponent");
1316  tprintf("Component has %d parts\n", part_list.length());
1317  }
1318  pix.destroy();
1319  if (!part_list.empty()) {
1320  ColPartition_IT part_it(&part_list);
1321  if (part_list.singleton()) {
1322  // We didn't have to chop it into a polygon to fit around text, so
1323  // try expanding it to merge fragmented image parts, as long as it
1324  // doesn't touch strong text.
1325  ColPartition *part = part_it.extract();
1326  TBOX text_box(im_box);
1327  MaximalImageBoundingBox(part_grid, &text_box);
1328  while (ExpandImageIntoParts(text_box, &rectsearch, part_grid, &part)) {
1329  ;
1330  }
1331  part_it.set_to_list(&part_list);
1332  part_it.add_after_then_move(part);
1333  im_box = part->bounding_box();
1334  }
1335  EliminateWeakParts(im_box, part_grid, big_parts, &part_list);
1336  // Iterate the part_list and put the parts into the grid.
1337  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
1338  ColPartition *image_part = part_it.extract();
1339  im_box = image_part->bounding_box();
1340  part_grid->InsertBBox(true, true, image_part);
1341  if (!part_it.at_last()) {
1342  ColPartition *neighbour = part_it.data_relative(1);
1343  image_part->AddPartner(false, neighbour);
1344  neighbour->AddPartner(true, image_part);
1345  }
1346  }
1347  }
1348  }
1349  boxaDestroy(&boxa);
1350  pixaDestroy(&pixa);
1351  DeleteSmallImages(part_grid);
1352 #ifndef GRAPHICS_DISABLED
1353  if (textord_tabfind_show_images) {
1354  ScrollView *images_win_ = part_grid->MakeWindow(1000, 400, "With Images");
1355  part_grid->DisplayBoxes(images_win_);
1356  }
1357 #endif
1358 }
1359 
1360 } // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define INT_VAR(name, val, comment)
Definition: params.h:356
@ TBOX
const double kMinRectangularFraction
Definition: imagefind.cpp:41
BlobRegionType
Definition: blobbox.h:74
@ BRT_NOISE
Definition: blobbox.h:75
@ BRT_POLYIMAGE
Definition: blobbox.h:79
@ BRT_VERT_TEXT
Definition: blobbox.h:81
@ BRT_RECTIMAGE
Definition: blobbox.h:78
const double kRMSFitScaling
Definition: imagefind.cpp:50
const int kNoisePadding
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:115
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_TEXT_ON_IMAGE
Definition: blobbox.h:116
@ BTFT_NONTEXT
Definition: blobbox.h:112
const int kMinColorDifference
Definition: imagefind.cpp:52
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:211
const int kRGBRMSColors
Definition: colpartition.h:36
const double kMaxRectangularGradient
Definition: imagefind.cpp:46
const int kMinImageFindSize
Definition: imagefind.cpp:48
const double kMaxRectangularFraction
Definition: imagefind.cpp:43
BlobNeighbourDir
Definition: blobbox.h:89
@ BND_LEFT
Definition: blobbox.h:89
@ BND_RIGHT
Definition: blobbox.h:89
@ BND_BELOW
Definition: blobbox.h:89
@ BND_ABOVE
Definition: blobbox.h:89
@ BND_COUNT
Definition: blobbox.h:89
void AddPix(const Image pix, const char *caption)
Definition: debugpixa.h:32
void destroy()
Definition: image.cpp:32
void add(double x, double y)
Definition: linlsq.cpp:49
double m() const
Definition: linlsq.cpp:100
double c(double m) const
Definition: linlsq.cpp:116
double rms(double m, double c) const
Definition: linlsq.cpp:130
TDimension left() const
Definition: rect.h:82
int y_gap(const TBOX &box) const
Definition: rect.h:245
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
void set_right(int x)
Definition: rect.h:92
void set_left(int x)
Definition: rect.h:85
void rotate(const FCOORD &vec)
Definition: rect.h:210
TDimension top() const
Definition: rect.h:68
int x_gap(const TBOX &box) const
Definition: rect.h:238
bool null_box() const
Definition: rect.h:60
void set_bottom(int y)
Definition: rect.h:78
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void set_top(int y)
Definition: rect.h:71
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:242
double ile(double frac) const
Definition: statistc.cpp:173
void SetUniqueMode(bool mode)
Definition: bbgrid.h:249
void StartFullSearch()
Definition: bbgrid.h:701
BBC * NextFullSearch()
Definition: bbgrid.h:711
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:649
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633
static ColPartition * FakePartition(const TBOX &box, PolyBlockType block_type, BlobRegionType blob_type, BlobTextFlowType flow)
BlobRegionType blob_type() const
Definition: colpartition.h:147
const TBOX & bounding_box() const
Definition: colpartition.h:108
void AddPartner(bool upper, ColPartition *partner)
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Image pix)
Definition: imagefind.cpp:587
static bool pixNearlyRectangular(Image pix, double min_fraction, double max_fraction, double max_skew_gradient, int *x_start, int *y_start, int *x_end, int *y_end)
Definition: imagefind.cpp:283
static void ConnCompAndRectangularize(Image pix, DebugPixa *pixa_debug, Boxa **boxa, Pixa **pixa)
Definition: imagefind.cpp:161
static bool BoundsWithinRect(Image pix, int *x_start, int *y_start, int *x_end, int *y_end)
Definition: imagefind.cpp:351
static void ComputeRectangleColors(const TBOX &rect, Image pix, int factor, Image color_map1, Image color_map2, Image rms_map, uint8_t *color1, uint8_t *color2)
Definition: imagefind.cpp:430
static uint32_t ComposeRGB(uint32_t r, uint32_t g, uint32_t b)
Definition: imagefind.cpp:404
static void FindImagePartitions(Image image_pix, const FCOORD &rotation, const FCOORD &rerotation, TO_BLOCK *block, TabFind *tab_grid, DebugPixa *pixa_debug, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
Definition: imagefind.cpp:1291
static uint8_t ClipToByte(double pixel)
Definition: imagefind.cpp:411
static void TransferImagePartsToImageMask(const FCOORD &rerotation, ColPartitionGrid *part_grid, Image image_mask)
Definition: imagefind.cpp:1238
static Image FindImages(Image pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:63
static int CountPixelsInRotatedBox(TBOX box, const TBOX &im_box, const FCOORD &rotation, Image pix)
Definition: imagefind.cpp:609
static double ColorDistanceFromLine(const uint8_t *line1, const uint8_t *line2, const uint8_t *point)
Definition: imagefind.cpp:372