tesseract  5.0.0
tatweel_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #if defined(_WIN32)
13 # include <io.h> // for _access
14 #else
15 # include <unistd.h> // for access
16 #endif
17 
18 #include "dawg.h"
19 #include "include_gunit.h"
20 #include "trie.h"
21 #include "unicharset.h"
22 #include "util/utf8/unicodetext.h" // for UnicodeText
23 
24 namespace tesseract {
25 
// Reports whether `filename` names an existing file system entry.
// Replacement for std::filesystem::exists (C++17): calls _access on Windows
// and access elsewhere, with mode 0 (existence check only).
// A null `filename` is reported as non-existent instead of being forwarded,
// because the Windows CRT invokes its invalid-parameter handler for a null
// path and POSIX access() must not be handed a null pointer.
static bool file_exists(const char *filename) {
  if (filename == nullptr) {
    return false;
  }
#if defined(_WIN32)
  return _access(filename, 0) == 0;
#else
  return access(filename, 0) == 0;
#endif
}
34 
35 class TatweelTest : public ::testing::Test {
36 protected:
37  void SetUp() override {
38  static std::locale system_locale("");
39  std::locale::global(system_locale);
40  }
41 
43  std::string filename = TestDataNameToPath("ara.wordlist");
44  if (file_exists(filename.c_str())) {
45  std::string wordlist("\u0640");
46  CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
47  // Put all the unicodes in the unicharset_.
48  UnicodeText text;
49  text.PointToUTF8(wordlist.data(), wordlist.size());
50  int num_tatweel = 0;
51  for (auto it = text.begin(); it != text.end(); ++it) {
52  std::string utf8 = it.get_utf8_string();
53  if (utf8.find("\u0640") != std::string::npos)
54  ++num_tatweel;
55  unicharset_.unichar_insert(utf8.c_str());
56  }
57  LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
58  EXPECT_GT(num_tatweel, 0);
59  }
60  }
61 
62  std::string TestDataNameToPath(const std::string &name) {
63  return file::JoinPath(TESTDATA_DIR, name);
64  }
66 };
67 
68 TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
69  // This test verifies that the unicharset ignores the Tatweel character.
70  for (int i = 0; i < unicharset_.size(); ++i) {
71  const char *utf8 = unicharset_.id_to_unichar(i);
72  EXPECT_EQ(strstr(utf8, reinterpret_cast<const char *>(u8"\u0640")), nullptr);
73  }
74 }
75 
76 TEST_F(TatweelTest, DictIgnoresTatweel) {
77  // This test verifies that the dictionary ignores the Tatweel character.
78  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0);
79  std::string filename = TestDataNameToPath("ara.wordlist");
80  if (!file_exists(filename.c_str())) {
81  LOG(INFO) << "Skip test because of missing " << filename;
82  GTEST_SKIP();
83  } else {
84  EXPECT_TRUE(trie.read_and_add_word_list(filename.c_str(), unicharset_,
86  EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
87  }
88 }
89 
90 TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
91  // This test verifies that a load of an existing unicharset keeps any
92  // existing tatweel for backwards compatibility.
93  std::string filename = TestDataNameToPath("ara.unicharset");
94  if (!file_exists(filename.c_str())) {
95  LOG(INFO) << "Skip test because of missing " << filename;
96  GTEST_SKIP();
97  } else {
98  EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));
99  int num_tatweel = 0;
100  for (int i = 0; i < unicharset_.size(); ++i) {
101  const char *utf8 = unicharset_.id_to_unichar(i);
102  if (strstr(utf8, reinterpret_cast<const char *>(u8"\u0640")) != nullptr) {
103  ++num_tatweel;
104  }
105  }
106  LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
107  EXPECT_EQ(num_tatweel, 4);
108  }
109 }
110 
111 } // namespace tesseract
@ LOG
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ INFO
Definition: log.h:28
@ DAWG_TYPE_WORD
Definition: dawg.h:66
std::string TestDataNameToPath(const std::string &name)
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
TEST_F(EuroText, FastLatinOCR)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:68
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:273
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
static int Defaults()
Definition: include_gunit.h:61
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52
void SetUp() override
Definition: tatweel_test.cc:37
std::string TestDataNameToPath(const std::string &name)
Definition: tatweel_test.cc:62
const_iterator end() const
Definition: unicodetext.cc:412
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
const_iterator begin() const
Definition: unicodetext.cc:408