tesseract  5.0.0
dawg_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 
14 #include "ratngs.h"
15 #include "trie.h"
16 #include "unicharset.h"
17 
18 #include <sys/stat.h>
19 #include <cstdlib> // for system
20 #include <fstream> // for ifstream
21 #include <set>
22 #include <string>
23 #include <vector>
24 
25 #ifndef SW_TESTING // The defaults below are only defined when SW_TESTING is not set.
26 # define wordlist2dawg_prog "wordlist2dawg" // Program name passed to RunCommand for the wordlist -> dawg step.
27 # define dawg2wordlist_prog "dawg2wordlist" // Program name passed to RunCommand for the dawg -> wordlist step.
28 #endif
29 
30 namespace tesseract {
31 
32 // Test some basic functionality dealing with Dawgs (compressed dictionaries,
33 // aka Directed Acyclic Word Graphs).
34 class DawgTest : public testing::Test {
35 protected:
36  void SetUp() override {
37  std::locale::global(std::locale(""));
39  }
40 
41  void LoadWordlist(const std::string &filename, std::set<std::string> *words) const {
42  std::ifstream file(filename);
43  if (file.is_open()) {
44  std::string line;
45  while (getline(file, line)) {
46  // Remove trailing line terminators from line.
47  while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
48  line.resize(line.size() - 1);
49  }
50  // Add line to set.
51  words->insert(line.c_str());
52  }
53  file.close();
54  }
55  }
56  std::string TessBinaryPath(const std::string &name) const {
57  return file::JoinPath(TESSBIN_DIR, name);
58  }
59  std::string OutputNameToPath(const std::string &name) const {
60  return file::JoinPath(FLAGS_test_tmpdir, name);
61  }
62  int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2,
63  const std::string &arg3) const {
64  std::string cmdline = TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
65  return system(cmdline.c_str());
66  }
67  // Test that we are able to convert a wordlist file (one "word" per line) to
68  // a dawg (a compressed format) and then extract the original wordlist back
69  // out using the tools "wordlist2dawg" and "dawg2wordlist."
70  void TestDawgRoundTrip(const std::string &unicharset_filename,
71  const std::string &wordlist_filename) const {
72  std::set<std::string> orig_words, roundtrip_words;
73  std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);
74  std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);
75  std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
76  std::string output_wordlist = OutputNameToPath(wordlist_filename);
77  LoadWordlist(orig_wordlist, &orig_words);
78  EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
79  EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0);
80  LoadWordlist(output_wordlist, &roundtrip_words);
81  EXPECT_EQ(orig_words, roundtrip_words);
82  }
83 };
84 
85 TEST_F(DawgTest, TestDawgConversion) {
86  TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq");
87 }
88 
89 TEST_F(DawgTest, TestMatching) {
90  UNICHARSET unicharset;
91  unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str());
92  tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, unicharset.size(), 0);
93  WERD_CHOICE space_apos(" '", unicharset);
94  trie.add_word_to_dawg(space_apos);
95 
96  WERD_CHOICE space(" ", unicharset);
97 
98  // partial match ok - then good!
99  EXPECT_TRUE(trie.prefix_in_dawg(space, false));
100  // require complete match - not present.
101  EXPECT_FALSE(trie.word_in_dawg(space));
102  EXPECT_FALSE(trie.prefix_in_dawg(space, true));
103 
104  // partial or complete match ok for full word:
105  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
106  EXPECT_TRUE(trie.word_in_dawg(space_apos));
107  EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
108 }
109 
110 } // namespace tesseract
#define dawg2wordlist_prog
Definition: dawg_test.cc:27
#define wordlist2dawg_prog
Definition: dawg_test.cc:26
@ DAWG_TYPE_WORD
Definition: dawg.h:66
@ NGRAM_PERM
Definition: ratngs.h:237
TEST_F(EuroText, FastLatinOCR)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:41
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159
int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2, const std::string &arg3) const
Definition: dawg_test.cc:62
void SetUp() override
Definition: dawg_test.cc:36
void LoadWordlist(const std::string &filename, std::set< std::string > *words) const
Definition: dawg_test.cc:41
std::string TessBinaryPath(const std::string &name) const
Definition: dawg_test.cc:56
void TestDawgRoundTrip(const std::string &unicharset_filename, const std::string &wordlist_filename) const
Definition: dawg_test.cc:70
std::string OutputNameToPath(const std::string &name) const
Definition: dawg_test.cc:59
static void MakeTmpdir()
Definition: include_gunit.h:38
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65