tesseract  5.0.0
thresholder.cpp
Go to the documentation of this file.
1 // File: thresholder.cpp
3 // Description: Base API for thresholding images in tesseract.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "otsuthr.h"
20 #include "thresholder.h"
21 #include "tprintf.h" // for tprintf
22 
23 #if defined(USE_OPENCL)
24 # include "openclwrapper.h" // for OpenclDevice
25 #endif
26 
27 #include <allheaders.h>
28 #include <tesseract/baseapi.h> // for api->GetIntVariable()
29 
30 #include <cstdint> // for uint32_t
31 #include <cstring>
32 #include <tuple>
33 
34 namespace tesseract {
35 
37  : pix_(nullptr)
38  , image_width_(0)
39  , image_height_(0)
40  , pix_channels_(0)
41  , pix_wpl_(0)
42  , scale_(1)
43  , yres_(300)
44  , estimated_res_(300) {
45  SetRectangle(0, 0, 0, 0);
46 }
47 
49  Clear();
50 }
51 
52 // Destroy the Pix if there is one, freeing memory.
54  pix_.destroy();
55 }
56 
57 // Return true if no image has been set.
59  return pix_ == nullptr;
60 }
61 
62 // SetImage makes a copy of all the image data, so it may be deleted
63 // immediately after this call.
64 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
65 // Palette color images will not work properly and must be converted to
66 // 24 bit.
67 // Binary images of 1 bit per pixel may also be given but they must be
68 // byte packed with the MSB of the first byte being the first pixel, and a
69 // one pixel is WHITE. For binary images set bytes_per_pixel=0.
70 void ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height,
71  int bytes_per_pixel, int bytes_per_line) {
72  int bpp = bytes_per_pixel * 8;
73  if (bpp == 0) {
74  bpp = 1;
75  }
76  Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
77  l_uint32 *data = pixGetData(pix);
78  int wpl = pixGetWpl(pix);
79  switch (bpp) {
80  case 1:
81  for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
82  for (int x = 0; x < width; ++x) {
83  if (imagedata[x / 8] & (0x80 >> (x % 8))) {
84  CLEAR_DATA_BIT(data, x);
85  } else {
86  SET_DATA_BIT(data, x);
87  }
88  }
89  }
90  break;
91 
92  case 8:
93  // Greyscale just copies the bytes in the right order.
94  for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
95  for (int x = 0; x < width; ++x) {
96  SET_DATA_BYTE(data, x, imagedata[x]);
97  }
98  }
99  break;
100 
101  case 24:
102  // Put the colors in the correct places in the line buffer.
103  for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
104  for (int x = 0; x < width; ++x, ++data) {
105  SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
106  SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
107  SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
108  }
109  }
110  break;
111 
112  case 32:
113  // Maintain byte order consistency across different endianness.
114  for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
115  for (int x = 0; x < width; ++x) {
116  data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
117  (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
118  }
119  }
120  break;
121 
122  default:
123  tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
124  }
125  SetImage(pix);
126  pix.destroy();
127 }
128 
129 // Store the coordinates of the rectangle to process for later use.
130 // Doesn't actually do any thresholding.
131 void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
132  rect_left_ = left;
133  rect_top_ = top;
134  rect_width_ = width;
135  rect_height_ = height;
136 }
137 
138 // Get enough parameters to be able to rebuild bounding boxes in the
139 // original image (not just within the rectangle).
140 // Left and top are enough with top-down coordinates, but
141 // the height of the rectangle and the image are needed for bottom-up.
142 void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
143  int *imageheight) {
144  *left = rect_left_;
145  *top = rect_top_;
146  *width = rect_width_;
147  *height = rect_height_;
148  *imagewidth = image_width_;
149  *imageheight = image_height_;
150 }
151 
152 // Pix vs raw, which to use? Pix is the preferred input for efficiency,
153 // since raw buffers are copied.
154 // SetImage for Pix stores its own copy of the input (converted where
155 // necessary), never just a clone, so the source pix may be pixDestroyed
156 // immediately after the call.
158  if (pix_ != nullptr) {
159  pix_.destroy();
160  }
161  Image src = pix;
162  int depth;
163  pixGetDimensions(src, &image_width_, &image_height_, &depth);
164  // Convert the image as necessary so it is one of binary, plain RGB, or
165  // 8 bit with no colormap. Guarantee that we always end up with our own copy,
166  // not just a clone of the input.
167  if (pixGetColormap(src)) {
168  Image tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC);
169  depth = pixGetDepth(tmp);
170  if (depth > 1 && depth < 8) {
171  pix_ = pixConvertTo8(tmp, false);
172  tmp.destroy();
173  } else {
174  pix_ = tmp;
175  }
176  } else if (depth > 1 && depth < 8) {
177  pix_ = pixConvertTo8(src, false);
178  } else {
179  pix_ = src.copy();
180  }
181  depth = pixGetDepth(pix_);
182  pix_channels_ = depth / 8;
183  pix_wpl_ = pixGetWpl(pix_);
184  scale_ = 1;
185  estimated_res_ = yres_ = pixGetYRes(pix_);
186  Init();
187 }
188 
189 std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
190  TessBaseAPI *api,
191  ThresholdMethod method) {
192  Image pix_binary = nullptr;
193  Image pix_thresholds = nullptr;
194 
195  if (pix_channels_ == 0) {
196  // We have a binary image, but it still has to be copied, as this API
197  // allows the caller to modify the output.
198  Image original = GetPixRect();
199  pix_binary = original.copy();
200  original.destroy();
201  return std::make_tuple(true, nullptr, pix_binary, nullptr);
202  }
203 
204  auto pix_grey = GetPixRectGrey();
205 
206  int r;
207 
208  l_int32 pix_w, pix_h;
209  pixGetDimensions(pix_grey, &pix_w, &pix_h, nullptr);
210 
211  bool thresholding_debug;
212  api->GetBoolVariable("thresholding_debug", &thresholding_debug);
213  if (thresholding_debug) {
214  tprintf("\nimage width: %d height: %d ppi: %d\n", pix_w, pix_h, yres_);
215  }
216 
217  if (method == ThresholdMethod::Sauvola) {
218  int window_size;
219  double window_size_factor;
220  api->GetDoubleVariable("thresholding_window_size", &window_size_factor);
221  window_size = window_size_factor * yres_;
222  window_size = std::max(7, window_size);
223  window_size = std::min(pix_w < pix_h ? pix_w - 3 : pix_h - 3, window_size);
224  int half_window_size = window_size / 2;
225 
226  // factor for image division into tiles; >= 1
227  l_int32 nx, ny;
228  // tiles size will be approx. 250 x 250 pixels
229  nx = std::max(1, (pix_w + 125) / 250);
230  ny = std::max(1, (pix_h + 125) / 250);
231  auto xrat = pix_w / nx;
232  auto yrat = pix_h / ny;
233  if (xrat < half_window_size + 2) {
234  nx = pix_w / (half_window_size + 2);
235  }
236  if (yrat < half_window_size + 2) {
237  ny = pix_h / (half_window_size + 2);
238  }
239 
240  double kfactor;
241  api->GetDoubleVariable("thresholding_kfactor", &kfactor);
242  kfactor = std::max(0.0, kfactor);
243 
244  if (thresholding_debug) {
245  tprintf("window size: %d kfactor: %.3f nx:%d ny: %d\n", window_size, kfactor, nx, ny);
246  }
247 
248  r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny,
249  (PIX**)pix_thresholds,
250  (PIX**)pix_binary);
251  } else { // if (method == ThresholdMethod::LeptonicaOtsu)
252  int tile_size;
253  double tile_size_factor;
254  api->GetDoubleVariable("thresholding_tile_size", &tile_size_factor);
255  tile_size = tile_size_factor * yres_;
256  tile_size = std::max(16, tile_size);
257 
258  int smooth_size;
259  double smooth_size_factor;
260  api->GetDoubleVariable("thresholding_smooth_kernel_size",
261  &smooth_size_factor);
262  smooth_size_factor = std::max(0.0, smooth_size_factor);
263  smooth_size = smooth_size_factor * yres_;
264  int half_smooth_size = smooth_size / 2;
265 
266  double score_fraction;
267  api->GetDoubleVariable("thresholding_score_fraction", &score_fraction);
268 
269  if (thresholding_debug) {
270  tprintf("tile size: %d smooth_size: %d score_fraction: %.2f\n", tile_size, smooth_size, score_fraction);
271  }
272 
273  r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size,
274  half_smooth_size, half_smooth_size,
275  score_fraction,
276  (PIX**)pix_thresholds,
277  (PIX**)pix_binary);
278  }
279 
280  bool ok = (r == 0);
281  return std::make_tuple(ok, pix_grey, pix_binary, pix_thresholds);
282 }
283 
284 // Threshold the source image as efficiently as possible to the output Pix.
285 // Creates a Pix and sets pix to point to the resulting pointer.
286 // Caller must use pixDestroy to free the created Pix.
289  if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) {
290  tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
291  return false;
292  }
293  if (pix_channels_ == 0) {
294  // We have a binary image, but it still has to be copied, as this API
295  // allows the caller to modify the output.
296  Image original = GetPixRect();
297  *pix = original.copy();
298  original.destroy();
299  } else {
301  }
302  return true;
303 }
304 
305 // Gets a pix that contains an 8 bit threshold value at each pixel. The
306 // returned pix may be an integer reduction of the binary image such that
307 // the scale factor may be inferred from the ratio of the sizes, even down
308 // to the extreme of a 1x1 pixel thresholds image.
309 // Ideally the 8 bit threshold should be the exact threshold used to generate
310 // the binary image in ThresholdToPix, but this is not a hard constraint.
311 // Returns nullptr if the input is binary. PixDestroy after use.
313  if (IsBinary()) {
314  return nullptr;
315  }
316  Image pix_grey = GetPixRectGrey();
317  int width = pixGetWidth(pix_grey);
318  int height = pixGetHeight(pix_grey);
319  std::vector<int> thresholds;
320  std::vector<int> hi_values;
321  OtsuThreshold(pix_grey, 0, 0, width, height, thresholds, hi_values);
322  pix_grey.destroy();
323  Image pix_thresholds = pixCreate(width, height, 8);
324  int threshold = thresholds[0] > 0 ? thresholds[0] : 128;
325  pixSetAllArbitrary(pix_thresholds, threshold);
326  return pix_thresholds;
327 }
328 
329 // Common initialization shared between SetImage methods.
332 }
333 
334 // Get a clone/copy of the source image rectangle.
335 // The returned Pix must be pixDestroyed.
336 // This function will be used in the future by the page layout analysis, and
337 // the layout analysis that uses it will only be available with Leptonica,
338 // so there is no raw equivalent.
340  if (IsFullImage()) {
341  // Just clone the whole thing.
342  return pix_.clone();
343  } else {
344  // Crop to the given rectangle.
345  Box *box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
346  Image cropped = pixClipRectangle(pix_, box, nullptr);
347  boxDestroy(&box);
348  return cropped;
349  }
350 }
351 
352 // Get a clone/copy of the source image rectangle, reduced to greyscale,
353 // and at the same resolution as the output binary.
354 // The returned Pix must be pixDestroyed.
355 // Provided to the classifier to extract features from the greyscale image.
357  auto pix = GetPixRect(); // May have to be reduced to grey.
358  int depth = pixGetDepth(pix);
359  if (depth != 8) {
360  if (depth == 24) {
361  auto tmp = pixConvert24To32(pix);
362  pix.destroy();
363  pix = tmp;
364  }
365  auto result = pixConvertTo8(pix, false);
366  pix.destroy();
367  return result;
368  }
369  return pix;
370 }
371 
372 // Otsu thresholds the rectangle, taking the rectangle from *this.
373 void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const {
374  std::vector<int> thresholds;
375  std::vector<int> hi_values;
376 
377  int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, rect_height_,
378  thresholds, hi_values);
379  // only use opencl if compiled w/ OpenCL and selected device is opencl
380 #ifdef USE_OPENCL
381  OpenclDevice od;
382  if (num_channels == 4 && od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0) {
383  od.ThresholdRectToPixOCL((unsigned char *)pixGetData(src_pix), num_channels,
384  pixGetWpl(src_pix) * 4, &thresholds[0], &hi_values[0], out_pix /*pix_OCL*/,
386  } else {
387 #endif
388  ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
389 #ifdef USE_OPENCL
390  }
391 #endif
392 }
393 
397 // arrays and also the bytes per pixel in src_pix.
398 void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
399  const std::vector<int> &hi_values, Image *pix) const {
400  *pix = pixCreate(rect_width_, rect_height_, 1);
401  uint32_t *pixdata = pixGetData(*pix);
402  int wpl = pixGetWpl(*pix);
403  int src_wpl = pixGetWpl(src_pix);
404  uint32_t *srcdata = pixGetData(src_pix);
405  pixSetXRes(*pix, pixGetXRes(src_pix));
406  pixSetYRes(*pix, pixGetYRes(src_pix));
407  for (int y = 0; y < rect_height_; ++y) {
408  const uint32_t *linedata = srcdata + (y + rect_top_) * src_wpl;
409  uint32_t *pixline = pixdata + y * wpl;
410  for (int x = 0; x < rect_width_; ++x) {
411  bool white_result = true;
412  for (int ch = 0; ch < num_channels; ++ch) {
413  int pixel = GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
414  if (hi_values[ch] >= 0 && (pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
415  white_result = false;
416  break;
417  }
418  }
419  if (white_result) {
420  CLEAR_DATA_BIT(pixline, x);
421  } else {
422  SET_DATA_BIT(pixline, x);
423  }
424  }
425  }
426 }
427 
428 } // namespace tesseract.
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int OtsuThreshold(Image src_pix, int left, int top, int width, int height, std::vector< int > &thresholds, std::vector< int > &hi_values)
Definition: otsuthr.cpp:38
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:301
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:317
virtual Image GetPixRectThresholds()
int pix_wpl_
Words per line of pix_.
Definition: thresholder.h:188
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
bool IsFullImage() const
Return true if we are processing the full image.
Definition: thresholder.h:165
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:58
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:70
int estimated_res_
Resolution estimate from text size.
Definition: thresholder.h:192
virtual std::tuple< bool, Image, Image, Image > Threshold(TessBaseAPI *api, ThresholdMethod method)
void SetRectangle(int left, int top, int width, int height)
virtual void Init()
Common initialization shared between SetImage methods.
void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const
int scale_
Scale factor from original image.
Definition: thresholder.h:190
int pix_channels_
Number of 8-bit channels in pix_.
Definition: thresholder.h:187
int yres_
y pixels/inch in source image.
Definition: thresholder.h:191
int image_width_
Width of source pix_.
Definition: thresholder.h:185
virtual Image GetPixRectGrey()
void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector< int > &thresholds, const std::vector< int > &hi_values, Image *pix) const
virtual bool ThresholdToPix(Image *pix)
Returns false on error.
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:84
int image_height_
Height of source pix_.
Definition: thresholder.h:186
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:53
Image copy() const
Definition: image.cpp:28
Image clone() const
Definition: image.cpp:24
void destroy()
Definition: image.cpp:32