tesseract  5.0.0
dawg.cpp
Go to the documentation of this file.
1 /********************************************************************************
2  *
3  * File: dawg.cpp (Formerly dawg.c)
4  * Description: Use a Directed Acyclic Word Graph
5  * Author: Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *********************************************************************************/
19 /*----------------------------------------------------------------------
20  I n c l u d e s
21 ----------------------------------------------------------------------*/
22 
23 #include "dawg.h"
24 
25 #include "dict.h"
26 #include "helpers.h"
27 #include "tprintf.h"
28 
29 #include <memory>
30 
31 /*----------------------------------------------------------------------
32  F u n c t i o n s f o r D a w g
33 ----------------------------------------------------------------------*/
34 namespace tesseract {
35 
36 // Destructor.
37 // It is defined here, so the compiler can create a single vtable
38 // instead of weak vtables in every compilation unit.
39 Dawg::~Dawg() = default;
40 
42  bool requires_complete) const {
43  if (word.empty()) {
44  return !requires_complete;
45  }
46  NODE_REF node = 0;
47  int end_index = word.length() - 1;
48  for (int i = 0; i < end_index; i++) {
49  EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
50  if (edge == NO_EDGE) {
51  return false;
52  }
53  if ((node = next_node(edge)) == 0) {
54  // This only happens if all words following this edge terminate --
55  // there are no larger words. See Trie::add_word_to_dawg()
56  return false;
57  }
58  }
59  // Now check the last character.
60  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
61  NO_EDGE;
62 }
63 
64 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
65  return prefix_in_dawg(word, true);
66 }
67 
68 int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset,
69  bool enable_wildcard) const {
70  if (filename == nullptr) {
71  return 0;
72  }
73 
74  FILE *word_file;
75  char string[CHARS_PER_LINE];
76  int misses = 0;
77  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
78 
79  word_file = fopen(filename, "r");
80  if (word_file == nullptr) {
81  tprintf("Error: Could not open file %s\n", filename);
82  ASSERT_HOST(word_file);
83  }
84 
85  while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) {
86  chomp_string(string); // remove newline
87  WERD_CHOICE word(string, unicharset);
88  if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
89  if (!match_words(&word, 0, 0,
90  enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
91  tprintf("Missing word: %s\n", string);
92  ++misses;
93  }
94  } else {
95  tprintf("Failed to create a valid word from %s\n", string);
96  }
97  }
98  fclose(word_file);
99  // Make sure the user sees this with fprintf instead of tprintf.
100  if (debug_level_) {
101  tprintf("Number of lost words=%d\n", misses);
102  }
103  return misses;
104 }
105 
106 void Dawg::iterate_words(const UNICHARSET &unicharset,
107  std::function<void(const WERD_CHOICE *)> cb) const {
108  WERD_CHOICE word(&unicharset);
109  iterate_words_rec(word, 0, cb);
110 }
111 
112 static void CallWithUTF8(const std::function<void(const char *)> &cb,
113  const WERD_CHOICE *wc) {
114  std::string s;
115  wc->string_and_lengths(&s, nullptr);
116  cb(s.c_str());
117 }
118 
119 void Dawg::iterate_words(const UNICHARSET &unicharset,
120  const std::function<void(const char *)> &cb) const {
121  using namespace std::placeholders; // for _1
122  std::function<void(const WERD_CHOICE *)> shim(
123  std::bind(CallWithUTF8, cb, _1));
124  WERD_CHOICE word(&unicharset);
125  iterate_words_rec(word, 0, shim);
126 }
127 
129  const WERD_CHOICE &word_so_far, NODE_REF to_explore,
130  const std::function<void(const WERD_CHOICE *)> &cb) const {
131  NodeChildVector children;
132  this->unichar_ids_of(to_explore, &children, false);
133  for (auto &i : children) {
134  WERD_CHOICE next_word(word_so_far);
135  next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0);
136  if (this->end_of_word(i.edge_ref)) {
137  cb(&next_word);
138  }
139  NODE_REF next = next_node(i.edge_ref);
140  if (next != 0) {
141  iterate_words_rec(next_word, next, cb);
142  }
143  }
144 }
145 
146 bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node,
147  UNICHAR_ID wildcard) const {
148  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
149  bool any_matched = false;
150  NodeChildVector vec;
151  this->unichar_ids_of(node, &vec, false);
152  for (auto &i : vec) {
153  word->set_unichar_id(i.unichar_id, index);
154  if (match_words(word, index, node, wildcard)) {
155  any_matched = true;
156  }
157  }
158  word->set_unichar_id(wildcard, index);
159  return any_matched;
160  } else {
161  auto word_end = index == word->length() - 1;
162  auto edge = edge_char_of(node, word->unichar_id(index), word_end);
163  if (edge != NO_EDGE) { // normal edge in DAWG
164  node = next_node(edge);
165  if (word_end) {
166  if (debug_level_ > 1) {
167  word->print("match_words() found: ");
168  }
169  return true;
170  } else if (node != 0) {
171  return match_words(word, index + 1, node, wildcard);
172  }
173  }
174  }
175  return false;
176 }
177 
178 void Dawg::init(int unicharset_size) {
179  ASSERT_HOST(unicharset_size > 0);
180  unicharset_size_ = unicharset_size;
181  // Set bit masks. We will use the value unicharset_size_ as a null char, so
182  // the actual number of unichars is unicharset_size_ + 1.
183  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
185  letter_mask_ = ~(~0ull << flag_start_bit_);
188 }
189 
190 /*----------------------------------------------------------------------
191  F u n c t i o n s f o r S q u i s h e d D a w g
192 ----------------------------------------------------------------------*/
193 
195  delete[] edges_;
196 }
197 
199  bool word_end) const {
200  EDGE_REF edge = node;
201  if (node == 0) { // binary search
202  EDGE_REF start = 0;
203  EDGE_REF end = num_forward_edges_in_node0 - 1;
204  int compare;
205  while (start <= end) {
206  edge = (start + end) >> 1; // (start + end) / 2
207  compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id,
208  edges_[edge]);
209  if (compare == 0) { // given == vec[k]
210  return edge;
211  } else if (compare == 1) { // given > vec[k]
212  start = edge + 1;
213  } else { // given < vec[k]
214  end = edge - 1;
215  }
216  }
217  } else { // linear search
218  if (edge != NO_EDGE && edge_occupied(edge)) {
219  do {
220  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
221  (!word_end || end_of_word_from_edge_rec(edges_[edge]))) {
222  return (edge);
223  }
224  } while (!last_edge(edge++));
225  }
226  }
227  return (NO_EDGE); // not found
228 }
229 
230 int32_t SquishedDawg::num_forward_edges(NODE_REF node) const {
231  EDGE_REF edge = node;
232  int32_t num = 0;
233 
234  if (forward_edge(edge)) {
235  do {
236  num++;
237  } while (!last_edge(edge++));
238  }
239 
240  return (num);
241 }
242 
243 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
244  if (node == NO_EDGE) {
245  return; // nothing to print
246  }
247 
248  EDGE_REF edge = node;
249  const char *forward_string = "FORWARD";
250  const char *backward_string = " ";
251 
252  const char *last_string = "LAST";
253  const char *not_last_string = " ";
254 
255  const char *eow_string = "EOW";
256  const char *not_eow_string = " ";
257 
258  const char *direction;
259  const char *is_last;
260  const char *eow;
261 
262  UNICHAR_ID unichar_id;
263 
264  if (edge_occupied(edge)) {
265  do {
266  direction = forward_edge(edge) ? forward_string : backward_string;
267  is_last = last_edge(edge) ? last_string : not_last_string;
268  eow = end_of_word(edge) ? eow_string : not_eow_string;
269 
270  unichar_id = edge_letter(edge);
271  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
272  edge, next_node(edge), unichar_id, direction, is_last, eow);
273 
274  if (edge - node > max_num_edges) {
275  return;
276  }
277  } while (!last_edge(edge++));
278 
279  if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
280  do {
281  direction = forward_edge(edge) ? forward_string : backward_string;
282  is_last = last_edge(edge) ? last_string : not_last_string;
283  eow = end_of_word(edge) ? eow_string : not_eow_string;
284 
285  unichar_id = edge_letter(edge);
286  tprintf(REFFORMAT " : next = " REFFORMAT
287  ", unichar_id = %d, %s %s %s\n",
288  edge, next_node(edge), unichar_id, direction, is_last, eow);
289 
290  if (edge - node > MAX_NODE_EDGES_DISPLAY) {
291  return;
292  }
293  } while (!last_edge(edge++));
294  }
295  } else {
296  tprintf(REFFORMAT " : no edges in this node\n", node);
297  }
298  tprintf("\n");
299 }
300 
301 void SquishedDawg::print_edge(EDGE_REF edge) const {
302  if (edge == NO_EDGE) {
303  tprintf("NO_EDGE\n");
304  } else {
305  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n",
306  edge, next_node(edge), edge_letter(edge),
307  (forward_edge(edge) ? "FORWARD" : " "),
308  (last_edge(edge) ? "LAST" : " "),
309  (end_of_word(edge) ? "EOW" : ""));
310  }
311 }
312 
313 bool SquishedDawg::read_squished_dawg(TFile *file) {
314  if (debug_level_) {
315  tprintf("Reading squished dawg\n");
316  }
317 
318  // Read the magic number and check that it matches kDawgMagicNumber, as
319  // auto-endian fixing should make sure it is always correct.
320  int16_t magic;
321  if (!file->DeSerialize(&magic)) {
322  return false;
323  }
324  if (magic != kDawgMagicNumber) {
325  tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber);
326  return false;
327  }
328 
329  int32_t unicharset_size;
330  if (!file->DeSerialize(&unicharset_size)) {
331  return false;
332  }
333  if (!file->DeSerialize(&num_edges_)) {
334  return false;
335  }
336  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
337  Dawg::init(unicharset_size);
338 
339  edges_ = new EDGE_RECORD[num_edges_];
340  if (!file->DeSerialize(&edges_[0], num_edges_)) {
341  return false;
342  }
343  if (debug_level_ > 2) {
344  tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
345  type_, lang_.c_str(), perm_, unicharset_size_, num_edges_);
346  for (EDGE_REF edge = 0; edge < num_edges_; ++edge) {
347  print_edge(edge);
348  }
349  }
350  return true;
351 }
352 
353 std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
354  int32_t *num_nodes) const {
355  EDGE_REF edge;
356  std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]);
357  int32_t node_counter;
358  int32_t num_edges;
359 
360  for (edge = 0; edge < num_edges_; edge++) { // init all slots
361  node_map[edge] = -1;
362  }
363 
364  node_counter = num_forward_edges(0);
365 
366  *num_nodes = 0;
367  for (edge = 0; edge < num_edges_; edge++) { // search all slots
368 
369  if (forward_edge(edge)) {
370  (*num_nodes)++; // count nodes links
371  node_map[edge] = (edge ? node_counter : 0);
372  num_edges = num_forward_edges(edge);
373  if (edge != 0) {
374  node_counter += num_edges;
375  }
376  edge += num_edges;
377  if (edge >= num_edges_) {
378  break;
379  }
380  if (backward_edge(edge)) {
381  while (!last_edge(edge++)) {
382  ;
383  }
384  }
385  edge--;
386  }
387  }
388  return node_map;
389 }
390 
392  EDGE_REF edge;
393  int32_t num_edges;
394  int32_t node_count = 0;
395  EDGE_REF old_index;
396  EDGE_RECORD temp_record;
397 
398  if (debug_level_) {
399  tprintf("write_squished_dawg\n");
400  }
401 
402  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
403 
404  // Write the magic number to help detecting a change in endianness.
405  int16_t magic = kDawgMagicNumber;
406  if (!file->Serialize(&magic)) {
407  return false;
408  }
409  if (!file->Serialize(&unicharset_size_)) {
410  return false;
411  }
412 
413  // Count the number of edges in this Dawg.
414  num_edges = 0;
415  for (edge = 0; edge < num_edges_; edge++) {
416  if (forward_edge(edge)) {
417  num_edges++;
418  }
419  }
420 
421  // Write edge count to file.
422  if (!file->Serialize(&num_edges)) {
423  return false;
424  }
425 
426  if (debug_level_) {
427  tprintf("%d nodes in DAWG\n", node_count);
428  tprintf("%d edges in DAWG\n", num_edges);
429  }
430 
431  for (edge = 0; edge < num_edges_; edge++) {
432  if (forward_edge(edge)) { // write forward edges
433  do {
434  old_index = next_node_from_edge_rec(edges_[edge]);
435  set_next_node(edge, node_map[old_index]);
436  temp_record = edges_[edge];
437  if (!file->Serialize(&temp_record)) {
438  return false;
439  }
440  set_next_node(edge, old_index);
441  } while (!last_edge(edge++));
442 
443  if (edge >= num_edges_) {
444  break;
445  }
446  if (backward_edge(edge)) { // skip back links
447  while (!last_edge(edge++)) {
448  ;
449  }
450  }
451 
452  edge--;
453  }
454  }
455  return true;
456 }
457 
458 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:79
#define REFFORMAT
Definition: dawg.h:85
#define NUM_FLAG_BITS
Definition: dawg.h:84
#define CHARS_PER_LINE
Definition: dict.h:44
int64_t EDGE_REF
Definition: dawg.h:49
uint64_t EDGE_RECORD
Definition: dawg.h:47
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int64_t NODE_REF
Definition: dawg.h:50
void chomp_string(char *str)
Definition: helpers.h:89
int UNICHAR_ID
Definition: unichar.h:36
std::vector< NodeChild > NodeChildVector
Definition: dawg.h:60
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:340
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:309
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
bool empty() const
Definition: ratngs.h:280
unsigned length() const
Definition: ratngs.h:283
void print() const
Definition: ratngs.h:557
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:210
int flag_start_bit_
Definition: dawg.h:314
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:41
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:247
uint64_t flags_mask_
Definition: dawg.h:311
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
virtual bool end_of_word(EDGE_REF edge_ref) const =0
uint64_t next_node_mask_
Definition: dawg.h:310
std::string lang_
Definition: dawg.h:302
void iterate_words(const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
Definition: dawg.cpp:106
bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:146
int next_node_start_bit_
Definition: dawg.h:315
DawgType type_
Definition: dawg.h:303
int unicharset_size_
Definition: dawg.h:313
void init(int unicharset_size)
Definition: dawg.cpp:178
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:223
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:227
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:68
uint64_t letter_mask_
Definition: dawg.h:312
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:305
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int debug_level_
Definition: dawg.h:317
virtual ~Dawg()
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, const std::function< void(const WERD_CHOICE *)> &cb) const
Definition: dawg.cpp:128
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:113
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:198
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:391
void print_node(NODE_REF node, int max_num_edges) const override
Definition: dawg.cpp:243
~SquishedDawg() override
Definition: dawg.cpp:194