tesseract  5.0.0
baseapi_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 
14 #include "cycletimer.h" // for CycleTimer
15 #include "log.h" // for LOG
16 #include "ocrblock.h" // for class BLOCK
17 #include "pageres.h"
18 
19 #include <tesseract/baseapi.h>
20 
21 #include <allheaders.h>
22 #include "gmock/gmock-matchers.h"
23 
24 #include <memory>
25 #include <regex>
26 #include <string>
27 #include <vector>
28 
29 namespace tesseract {
30 
31 using ::testing::ContainsRegex;
32 using ::testing::HasSubstr;
33 
// Parallel, nullptr-terminated tables consumed by TestMultipleTessInstances:
// entry i of each table belongs to the same test case. The pointers are const
// so a test cannot accidentally re-point an entry.

// Language codes handed to TessBaseAPI::Init().
static const char *const langs[] = {
    "eng",
    "vie",
    "hin",
    "ara",
    nullptr,
};

// Image file names; the test joins them onto TESTING_DIR.
static const char *const image_files[] = {
    "HelloGoogle.tif",
    "viet.tif",
    "raaj.tif",
    "arabic.tif",
    nullptr,
};

// Expected OCR text per image. Non-ASCII entries are spelled as UTF-8 byte
// escapes so the file itself stays ASCII.
static const char *const gt_text[] = {
    "Hello Google",
    "\x74\x69\xe1\xba\xbf\x6e\x67",
    "\xe0\xa4\xb0\xe0\xa4\xbe\xe0\xa4\x9c",
    "\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a",
    nullptr,
};
40 
42  FRIEND_TEST(TesseractTest, LSTMGeometryTest);
43 };
44 
46  tess->SetImage(pix);
47  char *result = tess->GetUTF8Text();
48  std::string ocr_result = result;
49  delete[] result;
50  trim(ocr_result);
51  return ocr_result;
52 }
53 
54 // The fixture for testing Tesseract.
55 class TesseractTest : public testing::Test {
56 protected:
57  static std::string TestDataNameToPath(const std::string &name) {
58  return file::JoinPath(TESTING_DIR, name);
59  }
60  static std::string TessdataPath() {
61  return TESSDATA_DIR;
62  }
63 };
64 
65 // Test static TessBaseAPI (like it is used by tesserocr).
66 TEST_F(TesseractTest, StaticTessBaseAPI) {
67  static tesseract::TessBaseAPI api;
68  api.End();
69 }
70 
71 // Tests that Tesseract gets exactly the right answer on phototest.
72 TEST_F(TesseractTest, BasicTesseractTest) {
74  std::string truth_text;
75  std::string ocr_text;
76  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
77  Image src_pix = pixRead(TestDataNameToPath("phototest.tif").c_str());
78  CHECK(src_pix);
79  ocr_text = GetCleanedTextResult(&api, src_pix);
80  CHECK_OK(
81  file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
82  trim(truth_text);
83  EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
84  src_pix.destroy();
85  } else {
86  // eng.traineddata not found.
87  GTEST_SKIP();
88  }
89 }
90 
91 // Test that api.GetComponentImages() will return a set of images for
92 // paragraphs even if text recognition was not run.
93 TEST_F(TesseractTest, IteratesParagraphsEvenIfNotDetected) {
95  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
97  api.SetVariable("paragraph_debug_level", "3");
98 #if 0 // TODO: b622.png is missing
99  Pix* src_pix = pixRead(TestDataNameToPath("b622.png").c_str());
100  CHECK(src_pix);
101  api.SetImage(src_pix);
102  Boxa* para_boxes =
103  api.GetComponentImages(tesseract::RIL_PARA, true, nullptr, nullptr);
104  EXPECT_TRUE(para_boxes != nullptr);
105  Boxa* block_boxes =
106  api.GetComponentImages(tesseract::RIL_BLOCK, true, nullptr, nullptr);
107  EXPECT_TRUE(block_boxes != nullptr);
108  // TODO(eger): Get paragraphs out of this page pre-text.
109  EXPECT_GE(boxaGetCount(para_boxes), boxaGetCount(block_boxes));
110  boxaDestroy(&block_boxes);
111  boxaDestroy(&para_boxes);
112  src_pix.destroy();
113 #endif
114  } else {
115  // eng.traineddata not found.
116  GTEST_SKIP();
117  }
118 }
119 
120 // We should get hOCR output and not seg fault, even if the api caller doesn't
121 // call SetInputName().
122 TEST_F(TesseractTest, HOCRWorksWithoutSetInputName) {
124  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
125  // eng.traineddata not found.
126  GTEST_SKIP();
127  return;
128  }
129  Image src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
130  CHECK(src_pix);
131  api.SetImage(src_pix);
132  char *result = api.GetHOCRText(0);
133  EXPECT_TRUE(result != nullptr);
134  EXPECT_THAT(result, HasSubstr("Hello"));
135  EXPECT_THAT(result, HasSubstr("<div class='ocr_page'"));
136  delete[] result;
137  src_pix.destroy();
138 }
139 
140 // hOCR output should contain baseline info for upright textlines.
141 TEST_F(TesseractTest, HOCRContainsBaseline) {
143  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
144  // eng.traineddata not found.
145  GTEST_SKIP();
146  return;
147  }
148  Image src_pix = pixRead(TestDataNameToPath("HelloGoogle.tif").c_str());
149  CHECK(src_pix);
150  api.SetInputName("HelloGoogle.tif");
151  api.SetImage(src_pix);
152  char *result = api.GetHOCRText(0);
153  EXPECT_TRUE(result != nullptr);
154  EXPECT_THAT(result, HasSubstr("Hello"));
155  EXPECT_TRUE(std::regex_search(
156  result, std::regex{"<span class='ocr_line'[^>]* baseline [-.0-9]+ [-.0-9]+"}));
157 
158  delete[] result;
159  src_pix.destroy();
160 }
161 
162 // Tests that Tesseract gets exactly the right answer on some page numbers.
163 TEST_F(TesseractTest, AdaptToWordStrTest) {
164 #ifdef DISABLED_LEGACY_ENGINE
165  // Skip test because TessBaseAPI::AdaptToWordStr is missing.
166  GTEST_SKIP();
167 #else
168  static const char *kTrainingPages[] = {"136.tif", "256.tif", "410.tif", "432.tif", "540.tif",
169  "692.tif", "779.tif", "793.tif", "808.tif", "815.tif",
170  "12.tif", "12.tif", nullptr};
171  static const char *kTrainingText[] = {"1 3 6", "2 5 6", "4 1 0", "4 3 2", "5 4 0",
172  "6 9 2", "7 7 9", "7 9 3", "8 0 8", "8 1 5",
173  "1 2", "1 2", nullptr};
174  static const char *kTestPages[] = {"324.tif", "433.tif", "12.tif", nullptr};
175  static const char *kTestText[] = {"324", "433", "12", nullptr};
177  std::string truth_text;
178  std::string ocr_text;
179  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) == -1) {
180  // eng.traineddata not found.
181  GTEST_SKIP();
182  return;
183  }
184  api.SetVariable("matcher_sufficient_examples_for_prototyping", "1");
185  api.SetVariable("classify_class_pruner_threshold", "220");
186  // Train on the training text.
187  for (int i = 0; kTrainingPages[i] != nullptr; ++i) {
188  std::string image_file = TestDataNameToPath(kTrainingPages[i]);
189  Image src_pix = pixRead(image_file.c_str());
190  CHECK(src_pix);
191  api.SetImage(src_pix);
192  EXPECT_TRUE(api.AdaptToWordStr(tesseract::PSM_SINGLE_WORD, kTrainingText[i]))
193  << "Failed to adapt to text \"" << kTrainingText[i] << "\" on image " << image_file;
194  src_pix.destroy();
195  }
196  // Test the test text.
197  api.SetVariable("tess_bn_matching", "1");
199  for (int i = 0; kTestPages[i] != nullptr; ++i) {
200  Image src_pix = pixRead(TestDataNameToPath(kTestPages[i]).c_str());
201  CHECK(src_pix);
202  ocr_text = GetCleanedTextResult(&api, src_pix);
203  trim(truth_text);
204  EXPECT_STREQ(kTestText[i], ocr_text.c_str());
205  src_pix.destroy();
206  }
207 #endif
208 }
209 
210 // Tests that LSTM gets exactly the right answer on phototest.
211 TEST_F(TesseractTest, BasicLSTMTest) {
213  std::string truth_text;
214  std::string ocr_text;
215  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
216  // eng.traineddata not found.
217  GTEST_SKIP();
218  return;
219  }
220  Image src_pix = pixRead(TestDataNameToPath("phototest_2.tif").c_str());
221  CHECK(src_pix);
222  ocr_text = GetCleanedTextResult(&api, src_pix);
223  CHECK_OK(
224  file::GetContents(TestDataNameToPath("phototest.gold.txt"), &truth_text, file::Defaults()));
225  trim(truth_text);
226  EXPECT_STREQ(truth_text.c_str(), ocr_text.c_str());
227  src_pix.destroy();
228 }
229 
230 // Test that LSTM's character bounding boxes are properly converted to
231 // Tesseract structures. Note that we can't guarantee that LSTM's
232 // character boxes fall completely within Tesseract's word box because
233 // the baseline denormalization/normalization transforms may introduce
234 // errors due to float/int conversions (e.g., see OUTLINE::move() in
235 // ccstruct/poutline.h) Instead, we do a loose check.
236 TEST_F(TesseractTest, LSTMGeometryTest) {
237  Image src_pix = pixRead(TestDataNameToPath("deslant.tif").c_str());
239  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
240  // eng.traineddata not found.
241  GTEST_SKIP();
242  return;
243  }
244  api.SetImage(src_pix);
245  ASSERT_EQ(api.Recognize(nullptr), 0);
246 
247  const PAGE_RES *page_res = api.GetPageRes();
248  PAGE_RES_IT page_res_it(const_cast<PAGE_RES *>(page_res));
249  page_res_it.restart_page();
250  BLOCK *block = page_res_it.block()->block;
251  CHECK(block);
252 
253  // extract word and character boxes for each word
254  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
255  WERD_RES *word = page_res_it.word();
256  CHECK(word);
257  CHECK(word->best_choice);
258  CHECK_GT(word->best_choice->length(), 0);
259  CHECK(word->word);
260  CHECK(word->box_word);
261  // tesseract's word box
262  TBOX tess_blob_box;
263  tess_blob_box = word->word->bounding_box();
264  tess_blob_box.rotate(block->re_rotation());
265  // verify that each of LSTM's character boxes lies close to within
266  // tesseract's word box
267  for (int i = 0; i < word->box_word->length(); ++i) {
268  TBOX lstm_blob_box = word->box_word->BlobBox(i);
269  // LSTM character box should not spill out of tesseract word box
270  // by more than a few pixels in any direction
271  EXPECT_LT(tess_blob_box.left() - lstm_blob_box.left(), 5);
272  EXPECT_LT(lstm_blob_box.right() - tess_blob_box.right(), 5);
273  EXPECT_LT(tess_blob_box.bottom() - lstm_blob_box.bottom(), 5);
274  EXPECT_LT(lstm_blob_box.top() - tess_blob_box.top(), 5);
275  }
276  }
277  src_pix.destroy();
278 }
279 
280 TEST_F(TesseractTest, InitConfigOnlyTest) {
281  // Languages for testing initialization.
282  const char *langs[] = {"eng", "chi_tra", "jpn", "vie"};
283  std::unique_ptr<tesseract::TessBaseAPI> api;
284  CycleTimer timer;
285  for (auto &lang : langs) {
286  api = std::make_unique<tesseract::TessBaseAPI>();
287  timer.Restart();
288  EXPECT_EQ(0, api->Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY));
289  timer.Stop();
290  LOG(INFO) << "Lang " << lang << " took " << timer.GetInMs() << "ms in regular init";
291  }
292  // Init variables to set for config-only initialization.
293  std::vector<std::string> vars_vec, vars_values;
294  vars_vec.emplace_back("tessedit_init_config_only");
295  vars_values.emplace_back("1");
296  LOG(INFO) << "Switching to config only initialization:";
297  for (auto &lang : langs) {
298  api = std::make_unique<tesseract::TessBaseAPI>();
299  timer.Restart();
300  EXPECT_EQ(0, api->Init(TessdataPath().c_str(), lang, tesseract::OEM_TESSERACT_ONLY, nullptr, 0,
301  &vars_vec, &vars_values, false));
302  timer.Stop();
303  LOG(INFO) << "Lang " << lang << " took " << timer.GetInMs() << "ms in config-only init";
304  }
305 }
306 
307 // Tests if two instances of Tesseract/LSTM can co-exist in the same thread.
308 // NOTE: This is not an exhaustive test and current support for multiple
309 // instances in Tesseract is fragile. This test is intended largely as a means
310 // of detecting and guarding against the existing support being possibly broken
311 // by future CLs. TessBaseAPI instances are initialized using the default
312 // OEM_DEFAULT mode.
313 TEST(TesseractInstanceTest, TestMultipleTessInstances) {
314  int num_langs = 0;
315  while (langs[num_langs] != nullptr) {
316  ++num_langs;
317  }
318 
319  const std::string kTessdataPath = TESSDATA_DIR;
320 
321  // Preload images and verify that OCR is correct on them individually.
322  std::vector<Image > pix(num_langs);
323  for (int i = 0; i < num_langs; ++i) {
324  std::string tracestring = "Single instance test with lang = ";
325  tracestring += langs[i];
326  SCOPED_TRACE(tracestring);
327  std::string path = file::JoinPath(TESTING_DIR, image_files[i]);
328  pix[i] = pixRead(path.c_str());
329  QCHECK(pix[i] != nullptr) << "Could not read " << path;
330 
332  EXPECT_EQ(0, tess.Init(kTessdataPath.c_str(), langs[i]));
333  std::string ocr_result = GetCleanedTextResult(&tess, pix[i]);
334  EXPECT_STREQ(gt_text[i], ocr_result.c_str());
335  }
336 
337  // Process the images in all pairwise combinations of associated languages.
338  std::string ocr_result[2];
339  for (int i = 0; i < num_langs; ++i) {
340  for (int j = i + 1; j < num_langs; ++j) {
341  tesseract::TessBaseAPI tess1, tess2;
342  tess1.Init(kTessdataPath.c_str(), langs[i]);
343  tess2.Init(kTessdataPath.c_str(), langs[j]);
344 
345  ocr_result[0] = GetCleanedTextResult(&tess1, pix[i]);
346  ocr_result[1] = GetCleanedTextResult(&tess2, pix[j]);
347 
348  EXPECT_FALSE(strcmp(gt_text[i], ocr_result[0].c_str()) ||
349  strcmp(gt_text[j], ocr_result[1].c_str()))
350  << "OCR failed on language pair " << langs[i] << "-" << langs[j];
351  }
352  }
353 
354  for (int i = 0; i < num_langs; ++i) {
355  pix[i].destroy();
356  }
357 }
358 
359 // Tests whether Tesseract parameters are correctly set for the two instances.
360 TEST(TesseractInstanceTest, TestMultipleTessInstanceVariables) {
361  std::string illegal_name = "an_illegal_name";
362  std::string langs[2] = {"eng", "hin"};
363  std::string int_param_name = "tessedit_pageseg_mode";
364  int int_param[2] = {1, 2};
365  std::string int_param_str[2] = {"1", "2"};
366  std::string bool_param_name = "tessedit_ambigs_training";
367  bool bool_param[2] = {false, true};
368  std::string bool_param_str[2] = {"F", "T"};
369  std::string str_param_name = "tessedit_char_blacklist";
370  std::string str_param[2] = {"abc", "def"};
371  std::string double_param_name = "segment_penalty_dict_frequent_word";
372  std::string double_param_str[2] = {"0.01", "2"};
373  double double_param[2] = {0.01, 2};
374 
375  const std::string kTessdataPath = TESSDATA_DIR;
376 
377  tesseract::TessBaseAPI tess1, tess2;
378  for (int i = 0; i < 2; ++i) {
379  tesseract::TessBaseAPI *api = (i == 0) ? &tess1 : &tess2;
380  api->Init(kTessdataPath.c_str(), langs[i].c_str());
381  api->SetVariable(illegal_name.c_str(), "none");
382  api->SetVariable(int_param_name.c_str(), int_param_str[i].c_str());
383  api->SetVariable(bool_param_name.c_str(), bool_param_str[i].c_str());
384  api->SetVariable(str_param_name.c_str(), str_param[i].c_str());
385  api->SetVariable(double_param_name.c_str(), double_param_str[i].c_str());
386  }
387  for (int i = 0; i < 2; ++i) {
388  tesseract::TessBaseAPI *api = (i == 0) ? &tess1 : &tess2;
389  EXPECT_FALSE(api->GetStringVariable(illegal_name.c_str()));
390  int intvar;
391  EXPECT_TRUE(api->GetIntVariable(int_param_name.c_str(), &intvar));
392  EXPECT_EQ(int_param[i], intvar);
393  bool boolvar;
394  EXPECT_TRUE(api->GetBoolVariable(bool_param_name.c_str(), &boolvar));
395  EXPECT_EQ(bool_param[i], boolvar);
396  EXPECT_STREQ(str_param[i].c_str(), api->GetStringVariable(str_param_name.c_str()));
397  double doublevar;
398  EXPECT_TRUE(api->GetDoubleVariable(double_param_name.c_str(), &doublevar));
399  EXPECT_EQ(double_param[i], doublevar);
400  }
401 }
402 
403 } // namespace tesseract
@ LOG
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_GT(test, value)
Definition: include_gunit.h:81
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ INFO
Definition: log.h:28
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266
@ PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:170
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
std::string TestDataNameToPath(const std::string &name)
TEST_F(EuroText, FastLatinOCR)
std::string GetCleanedTextResult(tesseract::TessBaseAPI *tess, Image pix)
Definition: baseapi_test.cc:45
TEST(TesseractInstanceTest, TestMultipleTessInstances)
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:831
const PAGE_RES * GetPageRes() const
Definition: baseapi.h:762
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:508
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:291
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:276
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:267
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:365
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:573
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:699
const char * GetStringVariable(const char *name) const
Definition: baseapi.cpp:311
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
Definition: baseapi.cpp:1795
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:301
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:317
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84
unsigned length() const
Definition: boxword.h:81
void destroy()
Definition: image.cpp:32
FCOORD re_rotation() const
Definition: ocrblock.h:129
WERD_CHOICE * best_choice
Definition: pageres.h:239
tesseract::BoxWord * box_word
Definition: pageres.h:270
WERD_RES * restart_page()
Definition: pageres.h:710
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
unsigned length() const
Definition: ratngs.h:283
TDimension left() const
Definition: rect.h:82
void rotate(const FCOORD &vec)
Definition: rect.h:210
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
TBOX bounding_box() const
Definition: werd.cpp:155
static std::string TessdataPath()
Definition: baseapi_test.cc:60
static std::string TestDataNameToPath(const std::string &name)
Definition: baseapi_test.cc:57
void Stop()
Definition: cycletimer.h:48
void Restart()
Definition: cycletimer.h:43
int64_t GetInMs() const
Definition: cycletimer.h:54
static int Defaults()
Definition: include_gunit.h:61
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52