tesseract  5.0.0
applybox_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <allheaders.h>
13 #include <tesseract/baseapi.h>
15 #include <string>
16 #include "boxread.h"
17 #include "rect.h"
18 
19 #include "include_gunit.h"
20 
21 namespace tesseract {
22 
// Expected OCR text for the character-level tests (passed with
// line_mode == false): the words are separated by spaces.
const char *kTruthTextWords{"To simple burn running of goods lately.\n"};
// Expected OCR text for the line-level tests (passed with
// line_mode == true): the same sentence with no spaces between words.
const char *kTruthTextLine{"Tosimpleburnrunningofgoodslately.\n"};
25 
26 // The fixture for testing Tesseract.
27 class ApplyBoxTest : public testing::Test {
28 protected:
29  std::string TestDataNameToPath(const std::string &name) {
30  return file::JoinPath(TESTING_DIR, name);
31  }
32  std::string TessdataPath() {
33  return TESSDATA_DIR;
34  }
35 
37  src_pix_ = nullptr;
38  }
39  ~ApplyBoxTest() override {
40  src_pix_.destroy();
41  }
42 
43  bool SetImage(const char *filename) {
44  bool found = false;
45  src_pix_.destroy();
46  src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
47  if (api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY) != -1) {
50  api_.SetVariable("tessedit_make_boxes_from_boxes", "1");
51  api_.SetInputName(TestDataNameToPath(filename).c_str());
52  found = true;
53  }
54  return found;
55  }
56 
57  // Runs ApplyBoxes (via setting the appropriate variables and Recognize)
58  // and checks that the output ocr text matches the truth_str, and that
59  // the boxes match the given box file well enough.
60  // If line_mode is true, ApplyBoxes is run in line segmentation mode,
61  // otherwise the input box file is assumed to have character-level boxes.
62  void VerifyBoxesAndText(const char *imagefile, const char *truth_str, const char *target_box_file,
63  bool line_mode) {
64  if (!SetImage(imagefile)) {
65  // eng.traineddata not found or other problem during Init.
66  GTEST_SKIP();
67  return;
68  }
69  if (line_mode) {
70  api_.SetVariable("tessedit_resegment_from_line_boxes", "1");
71  } else {
72  api_.SetVariable("tessedit_resegment_from_boxes", "1");
73  }
74  api_.Recognize(nullptr);
75  char *ocr_text = api_.GetUTF8Text();
76  EXPECT_STREQ(truth_str, ocr_text);
77  delete[] ocr_text;
78  // Test the boxes by reading the target box file in parallel with the
79  // bounding boxes in the ocr output.
80  std::string box_filename = TestDataNameToPath(target_box_file);
81  FILE *box_file = OpenBoxFile(box_filename.c_str());
82  ASSERT_TRUE(box_file != nullptr);
83  int height = pixGetHeight(src_pix_);
85  do {
86  int left, top, right, bottom;
87  EXPECT_TRUE(it->BoundingBox(tesseract::RIL_SYMBOL, &left, &top, &right, &bottom));
88  TBOX ocr_box(ICOORD(left, height - bottom), ICOORD(right, height - top));
89  int line_number = 0;
90  TBOX truth_box;
91  std::string box_text;
92  EXPECT_TRUE(ReadNextBox(0, &line_number, box_file, box_text, &truth_box));
93  // Testing for major overlap is a bit weak, but if they all
94  // major overlap successfully, then it has to be fairly close.
95  EXPECT_TRUE(ocr_box.major_overlap(truth_box));
96  // Also check that the symbol text matches the box text.
97  char *symbol_text = it->GetUTF8Text(tesseract::RIL_SYMBOL);
98  EXPECT_STREQ(box_text.c_str(), symbol_text);
99  delete[] symbol_text;
100  } while (it->Next(tesseract::RIL_SYMBOL));
101  delete it;
102  }
103 
105  std::string ocr_text_;
107 };
108 
109 // Tests character-level applyboxes on normal Times New Roman.
110 TEST_F(ApplyBoxTest, TimesCharLevel) {
111  VerifyBoxesAndText("trainingtimes.tif", kTruthTextWords, "trainingtimes.box", false);
112 }
113 
114 // Tests character-level applyboxes on italic Times New Roman.
115 TEST_F(ApplyBoxTest, ItalicCharLevel) {
116  VerifyBoxesAndText("trainingital.tif", kTruthTextWords, "trainingital.box", false);
117 }
118 
119 // Tests line-level applyboxes on normal Times New Roman.
120 TEST_F(ApplyBoxTest, TimesLineLevel) {
121  VerifyBoxesAndText("trainingtimesline.tif", kTruthTextLine, "trainingtimes.box", true);
122 }
123 
124 // Tests line-level applyboxes on italic Times New Roman.
125 TEST_F(ApplyBoxTest, ItalLineLevel) {
126  VerifyBoxesAndText("trainingitalline.tif", kTruthTextLine, "trainingital.box", true);
127 }
128 
129 } // namespace tesseract
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266
const char * kTruthTextLine
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
FILE * OpenBoxFile(const char *fname)
Definition: boxread.cpp:59
const char * kTruthTextWords
TEST_F(EuroText, FastLatinOCR)
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:146
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:831
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:508
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:276
ResultIterator * GetIterator()
Definition: baseapi.cpp:1313
void SetInputName(const char *name)
Definition: baseapi.cpp:267
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:365
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:573
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool Next(PageIteratorLevel level) override
void destroy()
Definition: image.cpp:32
integer coordinate
Definition: points.h:36
bool major_overlap(const TBOX &box) const
Definition: rect.h:374
std::string TestDataNameToPath(const std::string &name)
void VerifyBoxesAndText(const char *imagefile, const char *truth_str, const char *target_box_file, bool line_mode)
tesseract::TessBaseAPI api_
bool SetImage(const char *filename)
std::string TessdataPath()
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65