tesseract  5.0.0
trie.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: trie.cpp (Formerly trie.c)
4  * Description: Functions to build a trie data structure.
5  * Author: Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 /*----------------------------------------------------------------------
20  I n c l u d e s
21 ----------------------------------------------------------------------*/
22 
23 #include "trie.h"
24 
25 #include "dawg.h"
26 #include "dict.h"
27 #include "helpers.h"
28 #include "kdpair.h"
29 
30 namespace tesseract {
31 
32 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
33 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
34 const char kForceReverse[] = "RRP_FORCE_REVERSE";
35 
37 
38 const char Trie::kAlphaPatternUnicode[] = "\u2000";
39 const char Trie::kDigitPatternUnicode[] = "\u2001";
40 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
41 const char Trie::kPuncPatternUnicode[] = "\u2003";
42 const char Trie::kLowerPatternUnicode[] = "\u2004";
43 const char Trie::kUpperPatternUnicode[] = "\u2005";
44 
45 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
46  return RTLReversePolicyNames[reverse_policy];
47 }
48 
49 // Reset the Trie to empty.
50 void Trie::clear() {
51  for (auto node : nodes_) {
52  delete node;
53  }
54  nodes_.clear();
55  root_back_freelist_.clear();
56  num_edges_ = 0;
57  new_dawg_node(); // Need to allocate node 0.
58 }
59 
60 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
61  UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr,
62  EDGE_INDEX *edge_index) const {
63  if (debug_level_ == 3) {
64  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
65  " direction %d word_end %d unichar_id %d, exploring node:\n",
66  node_ref, next_node, direction, word_end, unichar_id);
67  if (node_ref != NO_EDGE) {
68  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
69  }
70  }
71  if (node_ref == NO_EDGE) {
72  return false;
73  }
74  assert(static_cast<size_t>(node_ref) < nodes_.size());
75  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
76  : nodes_[node_ref]->backward_edges;
77  int vec_size = vec.size();
78  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
79  EDGE_INDEX start = 0;
80  EDGE_INDEX end = vec_size - 1;
81  EDGE_INDEX k;
82  int compare;
83  while (start <= end) {
84  k = (start + end) >> 1; // (start + end) / 2
85  compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]);
86  if (compare == 0) { // given == vec[k]
87  *edge_ptr = &(vec[k]);
88  *edge_index = k;
89  return true;
90  } else if (compare == 1) { // given > vec[k]
91  start = k + 1;
92  } else { // given < vec[k]
93  end = k - 1;
94  }
95  }
96  } else { // linear search
97  for (int i = 0; i < vec_size; ++i) {
98  EDGE_RECORD &edge_rec = vec[i];
99  if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec),
100  end_of_word_from_edge_rec(edge_rec), unichar_id_from_edge_rec(edge_rec))) {
101  *edge_ptr = &(edge_rec);
102  *edge_index = i;
103  return true;
104  }
105  }
106  }
107  return false; // not found
108 }
109 
110 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, int direction,
111  bool word_end, UNICHAR_ID unichar_id) {
112  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
113  : &(nodes_[node1]->backward_edges);
114  unsigned search_index;
115  if (node1 == 0 && direction == FORWARD_EDGE) {
116  search_index = 0; // find the index to make the add sorted
117  while (search_index < vec->size() &&
118  given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) {
119  search_index++;
120  }
121  } else {
122  search_index = vec->size(); // add is unsorted, so index does not matter
123  }
124  EDGE_RECORD edge_rec;
125  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
126  if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
127  EDGE_INDEX edge_index = root_back_freelist_.back();
128  root_back_freelist_.pop_back();
129  (*vec)[edge_index] = edge_rec;
130  } else if (search_index < vec->size()) {
131  vec->insert(vec->begin() + search_index, edge_rec);
132  } else {
133  vec->push_back(edge_rec);
134  }
135  if (debug_level_ > 1) {
136  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
137  print_edge_rec(edge_rec);
138  tprintf("\n");
139  }
140  num_edges_++;
141  return true;
142 }
143 
144 void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool marker_flag,
145  UNICHAR_ID unichar_id) {
146  EDGE_RECORD *back_edge_ptr;
147  EDGE_INDEX back_edge_index;
148  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr,
149  &back_edge_index));
150  if (marker_flag) {
151  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
152  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
153  }
154  // Mark both directions as end of word.
155  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
156  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
157 }
158 
159 bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
160  if (word.length() <= 0) {
161  return false; // can't add empty words
162  }
163  if (repetitions != nullptr) {
164  ASSERT_HOST(repetitions->size() == word.length());
165  }
166  // Make sure the word does not contain invalid unchar ids.
167  for (unsigned i = 0; i < word.length(); ++i) {
168  if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
169  return false;
170  }
171  }
172 
173  EDGE_RECORD *edge_ptr;
174  NODE_REF last_node = 0;
175  NODE_REF the_next_node;
176  bool marker_flag = false;
177  EDGE_INDEX edge_index;
178  int32_t still_finding_chars = true;
179  int32_t word_end = false;
180  bool add_failed = false;
181  bool found;
182 
183  if (debug_level_ > 1) {
184  word.print("\nAdding word: ");
185  }
186 
187  UNICHAR_ID unichar_id;
188  unsigned i;
189  for (i = 0; i < word.length() - 1; ++i) {
190  unichar_id = word.unichar_id(i);
191  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
192  if (debug_level_ > 1) {
193  tprintf("Adding letter %d\n", unichar_id);
194  }
195  if (still_finding_chars) {
196  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
197  &edge_index);
198  if (found && debug_level_ > 1) {
199  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", edge_index, last_node);
200  }
201  if (!found) {
202  still_finding_chars = false;
203  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
204  // We hit the end of an existing word, but the new word is longer.
205  // In this case we have to disconnect the existing word from the
206  // backwards root node, mark the current position as end-of-word
207  // and add new nodes for the increased length. Disconnecting the
208  // existing word from the backwards root node requires a linear
209  // search, so it is much faster to add the longest words first,
210  // to avoid having to come here.
211  word_end = true;
212  still_finding_chars = false;
213  remove_edge(last_node, 0, word_end, unichar_id);
214  } else {
215  // We have to add a new branch here for the new word.
216  if (marker_flag) {
217  set_marker_flag_in_edge_rec(edge_ptr);
218  }
219  last_node = next_node_from_edge_rec(*edge_ptr);
220  }
221  }
222  if (!still_finding_chars) {
223  the_next_node = new_dawg_node();
224  if (debug_level_ > 1) {
225  tprintf("adding node " REFFORMAT "\n", the_next_node);
226  }
227  if (the_next_node == 0) {
228  add_failed = true;
229  break;
230  }
231  if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {
232  add_failed = true;
233  break;
234  }
235  word_end = false;
236  last_node = the_next_node;
237  }
238  }
239  the_next_node = 0;
240  unichar_id = word.unichar_id(i);
241  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
242  if (debug_level_ > 1) {
243  tprintf("Adding letter %d\n", unichar_id);
244  }
245  if (still_finding_chars &&
246  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) {
247  // An extension of this word already exists in the trie, so we
248  // only have to add the ending flags in both directions.
249  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id);
250  } else {
251  // Add a link to node 0. All leaves connect to node 0 so the back links can
252  // be used in reduction to a dawg. This root backward node has one edge
253  // entry for every word, (except prefixes of longer words) so it is huge.
254  if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) {
255  add_failed = true;
256  }
257  }
258  if (add_failed) {
259  tprintf("Re-initializing document dictionary...\n");
260  clear();
261  return false;
262  } else {
263  return true;
264  }
265 }
266 
268  auto *node = new TRIE_NODE_RECORD();
269  nodes_.push_back(node);
270  return nodes_.size() - 1;
271 }
272 
273 bool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
274  Trie::RTLReversePolicy reverse_policy) {
275  std::vector<std::string> word_list;
276  if (!read_word_list(filename, &word_list)) {
277  return false;
278  }
279  std::sort(word_list.begin(), word_list.end(),
280  [](auto &s1, auto &s2) { return s1.size() > s2.size(); });
281  return add_word_list(word_list, unicharset, reverse_policy);
282 }
283 
284 bool Trie::read_word_list(const char *filename, std::vector<std::string> *words) {
285  FILE *word_file;
286  char line_str[CHARS_PER_LINE];
287  int word_count = 0;
288 
289  word_file = fopen(filename, "rb");
290  if (word_file == nullptr) {
291  return false;
292  }
293 
294  while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
295  chomp_string(line_str); // remove newline
296  std::string word_str(line_str);
297  ++word_count;
298  if (debug_level_ && word_count % 10000 == 0) {
299  tprintf("Read %d words so far\n", word_count);
300  }
301  words->push_back(word_str);
302  }
303  if (debug_level_) {
304  tprintf("Read %d words total.\n", word_count);
305  }
306  fclose(word_file);
307  return true;
308 }
309 
310 bool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
311  Trie::RTLReversePolicy reverse_policy) {
312  for (const auto &i : words) {
313  WERD_CHOICE word(i.c_str(), unicharset);
314  if (word.empty() || word.contains_unichar_id(INVALID_UNICHAR_ID)) {
315  continue;
316  }
317  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) ||
318  reverse_policy == RRP_FORCE_REVERSE) {
320  }
321  if (!word_in_dawg(word)) {
322  add_word_to_dawg(word);
323  if (!word_in_dawg(word)) {
324  tprintf("Error: word '%s' not in DAWG after adding it\n", i.c_str());
325  return false;
326  }
327  }
328  }
329  return true;
330 }
331 
339  unicharset->unichar_insert(kPuncPatternUnicode);
345  initialized_patterns_ = true;
346  unicharset_size_ = unicharset->size();
347 }
348 
349 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
350  std::vector<UNICHAR_ID> *vec) const {
351  bool is_alpha = unicharset.get_isalpha(unichar_id);
352  if (is_alpha) {
353  vec->push_back(alpha_pattern_);
354  vec->push_back(alphanum_pattern_);
355  if (unicharset.get_islower(unichar_id)) {
356  vec->push_back(lower_pattern_);
357  } else if (unicharset.get_isupper(unichar_id)) {
358  vec->push_back(upper_pattern_);
359  }
360  }
361  if (unicharset.get_isdigit(unichar_id)) {
362  vec->push_back(digit_pattern_);
363  if (!is_alpha) {
364  vec->push_back(alphanum_pattern_);
365  }
366  }
367  if (unicharset.get_ispunctuation(unichar_id)) {
368  vec->push_back(punc_pattern_);
369  }
370 }
371 
373  if (ch == 'c') {
374  return alpha_pattern_;
375  } else if (ch == 'd') {
376  return digit_pattern_;
377  } else if (ch == 'n') {
378  return alphanum_pattern_;
379  } else if (ch == 'p') {
380  return punc_pattern_;
381  } else if (ch == 'a') {
382  return lower_pattern_;
383  } else if (ch == 'A') {
384  return upper_pattern_;
385  } else {
386  return INVALID_UNICHAR_ID;
387  }
388 }
389 
390 bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) {
391  if (!initialized_patterns_) {
392  tprintf("please call initialize_patterns() before read_pattern_list()\n");
393  return false;
394  }
395 
396  FILE *pattern_file = fopen(filename, "rb");
397  if (pattern_file == nullptr) {
398  tprintf("Error opening pattern file %s\n", filename);
399  return false;
400  }
401 
402  int pattern_count = 0;
403  char string[CHARS_PER_LINE];
404  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
405  chomp_string(string); // remove newline
406  // Parse the pattern and construct a unichar id vector.
407  // Record the number of repetitions of each unichar in the parallel vector.
408  WERD_CHOICE word(&unicharset);
409  std::vector<bool> repetitions_vec;
410  const char *str_ptr = string;
411  int step = unicharset.step(str_ptr);
412  bool failed = false;
413  while (step > 0) {
414  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
415  if (step == 1 && *str_ptr == '\\') {
416  ++str_ptr;
417  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
418  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
419  } else {
420 #if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.
421  if (word.length() < kSaneNumConcreteChars) {
422  tprintf(
423  "Please provide at least %d concrete characters at the"
424  " beginning of the pattern\n",
426  failed = true;
427  break;
428  }
429 #endif
430  // Parse character class from expression.
431  curr_unichar_id = character_class_to_pattern(*str_ptr);
432  }
433  } else {
434  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
435  }
436  if (curr_unichar_id == INVALID_UNICHAR_ID) {
437  failed = true;
438  break; // failed to parse this pattern
439  }
440  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
441  repetitions_vec.push_back(false);
442  str_ptr += step;
443  step = unicharset.step(str_ptr);
444  // Check if there is a repetition pattern specified after this unichar.
445  if (step == 1 && *str_ptr == '\\' && *(str_ptr + 1) == '*') {
446  repetitions_vec[repetitions_vec.size() - 1] = true;
447  str_ptr += 2;
448  step = unicharset.step(str_ptr);
449  }
450  }
451  if (failed) {
452  tprintf("Invalid user pattern %s\n", string);
453  continue;
454  }
455  // Insert the pattern into the trie.
456  if (debug_level_ > 2) {
457  tprintf("Inserting expanded user pattern %s\n", word.debug_string().c_str());
458  }
459  if (!this->word_in_dawg(word)) {
460  this->add_word_to_dawg(word, &repetitions_vec);
461  if (!this->word_in_dawg(word)) {
462  tprintf("Error: failed to insert pattern '%s'\n", string);
463  }
464  }
465  ++pattern_count;
466  }
467  if (debug_level_) {
468  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
469  }
470  fclose(pattern_file);
471  return true;
472 }
473 
474 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
475  UNICHAR_ID unichar_id) {
476  EDGE_RECORD *edge_ptr = nullptr;
477  EDGE_INDEX edge_index = 0;
478  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index));
479  if (debug_level_ > 1) {
480  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
481  print_edge_rec(*edge_ptr);
482  tprintf("\n");
483  }
484  if (direction == FORWARD_EDGE) {
485  nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
486  } else if (node1 == 0) {
487  KillEdge(&nodes_[node1]->backward_edges[edge_index]);
488  root_back_freelist_.push_back(edge_index);
489  } else {
490  nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
491  }
492  --num_edges_;
493 }
494 
495 // Some optimizations employed in add_word_to_dawg and trie_to_dawg:
496 // 1 Avoid insertion sorting or bubble sorting the tail root node
497 // (back links on node 0, a list of all the leaves.). The node is
498 // huge, and sorting it with n^2 time is terrible.
499 // 2 Avoid using vector::erase on the tail root node.
500 // (a) During add of words to the trie, zero-out the unichars and
501 // keep a freelist of spaces to re-use.
502 // (b) During reduction, just zero-out the unichars of deleted back
503 // links, skipping zero entries while searching.
504 // 3 Avoid linear search of the tail root node. This has to be done when
505 // a suffix is added to an existing word. Adding words by decreasing
506 // length avoids this problem entirely. Words can still be added in
507 // any order, but it is faster to add the longest first.
509  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
510  if (debug_level_ > 2) {
511  print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
512  }
513  std::vector<bool> reduced_nodes(nodes_.size());
514  this->reduce_node_input(0, reduced_nodes);
515 
516  if (debug_level_ > 2) {
517  print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
518  }
519  // Build a translation map from node indices in nodes_ vector to
520  // their target indices in EDGE_ARRAY.
521  std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);
522  unsigned i;
523  for (i = 0; i < nodes_.size(); ++i) {
524  node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
525  }
526  int num_forward_edges = node_ref_map[i];
527 
528  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
529  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
530  auto edge_array = new EDGE_RECORD[num_forward_edges];
531  EDGE_ARRAY edge_array_ptr = edge_array;
532  for (i = 0; i < nodes_.size(); ++i) {
533  TRIE_NODE_RECORD *node_ptr = nodes_[i];
534  int end = node_ptr->forward_edges.size();
535  for (int j = 0; j < end; ++j) {
536  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
537  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
538  ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());
539  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
540  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
541  end_of_word_from_edge_rec(edge_rec), unichar_id);
542  if (j == end - 1) {
543  set_marker_flag_in_edge_rec(edge_array_ptr);
544  }
545  ++edge_array_ptr;
546  }
547  }
548 
549  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
550  debug_level_);
551 }
552 
554  const EDGE_RECORD &edge2) {
555  if (debug_level_ > 1) {
556  tprintf("\nCollapsing node %" PRIi64 ":\n", node);
558  tprintf("Candidate edges: ");
559  print_edge_rec(edge1);
560  tprintf(", ");
561  print_edge_rec(edge2);
562  tprintf("\n\n");
563  }
564  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
565  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
566  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
567  // Translate all edges going to/from next_node2 to go to/from next_node1.
568  EDGE_RECORD *edge_ptr = nullptr;
569  EDGE_INDEX edge_index;
570  // The backward link in node to next_node2 will be zeroed out by the caller.
571  // Copy all the backward links in next_node2 to node next_node1
572  for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
573  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
574  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
575  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
576  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
577  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
578  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end,
579  curr_unichar_id);
580  // Relocate the corresponding forward edge in curr_next_node
581  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end,
582  curr_unichar_id, &edge_ptr, &edge_index));
583  set_next_node_in_edge_rec(edge_ptr, next_node1);
584  }
585  int next_node2_num_edges =
586  (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size());
587  if (debug_level_ > 1) {
588  tprintf("removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2);
589  }
590  next_node2_ptr->forward_edges.clear();
591  next_node2_ptr->backward_edges.clear();
592  num_edges_ -= next_node2_num_edges;
593  return true;
594 }
595 
596 bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
597  EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {
598  if (debug_level_ > 1) {
599  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
600  }
601  // Compare each of the edge pairs with the given unichar_id.
602  bool did_something = false;
603  for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {
604  // Find the first edge that can be eliminated.
605  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
606  while (i < backward_edges->size()) {
607  if (!DeadEdge((*backward_edges)[i])) {
608  curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
609  if (curr_unichar_id != unichar_id) {
610  return did_something;
611  }
612  if (can_be_eliminated((*backward_edges)[i])) {
613  break;
614  }
615  }
616  ++i;
617  }
618  if (i == backward_edges->size()) {
619  break;
620  }
621  const EDGE_RECORD &edge_rec = (*backward_edges)[i];
622  // Compare it to the rest of the edges with the given unichar_id.
623  for (auto j = i + 1; j < backward_edges->size(); ++j) {
624  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
625  if (DeadEdge(next_edge_rec)) {
626  continue;
627  }
628  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
629  if (next_id != unichar_id) {
630  break;
631  }
632  if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) &&
633  can_be_eliminated(next_edge_rec) &&
634  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
635  reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
636  did_something = true;
637  KillEdge(&(*backward_edges)[j]);
638  }
639  }
640  }
641  return did_something;
642 }
643 
645  int num_edges = edges->size();
646  if (num_edges <= 1) {
647  return;
648  }
649  std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
650  sort_vec.reserve(num_edges);
651  for (int i = 0; i < num_edges; ++i) {
652  sort_vec.emplace_back(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]);
653  }
654  std::sort(sort_vec.begin(), sort_vec.end());
655  for (int i = 0; i < num_edges; ++i) {
656  (*edges)[i] = sort_vec[i].data();
657  }
658 }
659 
660 void Trie::reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes) {
661  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
662  sort_edges(&backward_edges);
663  if (debug_level_ > 1) {
664  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
666  }
667 
668  EDGE_INDEX edge_index = 0;
669  while (static_cast<size_t>(edge_index) < backward_edges.size()) {
670  if (DeadEdge(backward_edges[edge_index])) {
671  continue;
672  }
673  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]);
674  while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) {
675  ;
676  }
677  while (static_cast<size_t>(++edge_index) < backward_edges.size()) {
678  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
679  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) {
680  break;
681  }
682  }
683  }
684  reduced_nodes[node] = true; // mark as reduced
685 
686  if (debug_level_ > 1) {
687  tprintf("Node " REFFORMAT " after reduction:\n", node);
689  }
690 
691  for (auto &backward_edge : backward_edges) {
692  if (DeadEdge(backward_edge)) {
693  continue;
694  }
695  NODE_REF next_node = next_node_from_edge_rec(backward_edge);
696  if (next_node != 0 && !reduced_nodes[next_node]) {
697  reduce_node_input(next_node, reduced_nodes);
698  }
699  }
700 }
701 
702 void Trie::print_node(NODE_REF node, int max_num_edges) const {
703  if (node == NO_EDGE) {
704  return; // nothing to print
705  }
706  TRIE_NODE_RECORD *node_ptr = nodes_[node];
707  int num_fwd = node_ptr->forward_edges.size();
708  int num_bkw = node_ptr->backward_edges.size();
709  EDGE_VECTOR *vec;
710  for (int dir = 0; dir < 2; ++dir) {
711  if (dir == 0) {
712  vec = &(node_ptr->forward_edges);
713  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
714  } else {
715  vec = &(node_ptr->backward_edges);
716  tprintf("\t");
717  }
718  int i;
719  for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) {
720  if (DeadEdge((*vec)[i])) {
721  continue;
722  }
723  print_edge_rec((*vec)[i]);
724  tprintf(" ");
725  }
726  if (dir == 0 ? i < num_fwd : i < num_bkw) {
727  tprintf("...");
728  }
729  tprintf("\n");
730  }
731 }
732 
733 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define WERD_END_FLAG
Definition: dawg.h:82
#define BACKWARD_EDGE
Definition: dawg.h:78
#define FORWARD_EDGE
Definition: dawg.h:77
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:79
#define MARKER_FLAG
Definition: dawg.h:80
#define REFFORMAT
Definition: dawg.h:85
#define CHARS_PER_LINE
Definition: dict.h:44
const char kDoNotReverse[]
Definition: trie.cpp:32
uint64_t EDGE_RECORD
Definition: dawg.h:47
std::vector< EDGE_RECORD > EDGE_VECTOR
Definition: trie.h:39
const char kReverseIfHasRTL[]
Definition: trie.cpp:33
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int64_t NODE_REF
Definition: dawg.h:50
void chomp_string(char *str)
Definition: helpers.h:89
const char kForceReverse[]
Definition: trie.cpp:34
int64_t EDGE_INDEX
Definition: trie.h:38
int UNICHAR_ID
Definition: unichar.h:36
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:48
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:36
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:411
std::string debug_string() const
Definition: ratngs.h:475
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:309
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
bool empty() const
Definition: ratngs.h:280
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:349
unsigned length() const
Definition: ratngs.h:283
void print() const
Definition: ratngs.h:557
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
int step(const char *str) const
Definition: unicharset.cpp:211
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533
size_t size() const
Definition: unicharset.h:355
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:275
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:210
int flag_start_bit_
Definition: dawg.h:314
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:247
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:232
std::string lang_
Definition: dawg.h:302
DawgType type_
Definition: dawg.h:303
int unicharset_size_
Definition: dawg.h:313
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:223
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:214
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:237
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:227
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:305
int debug_level_
Definition: dawg.h:317
EDGE_VECTOR backward_edges
Definition: trie.h:43
EDGE_VECTOR forward_edges
Definition: trie.h:42
UNICHAR_ID alpha_pattern_
Definition: trie.h:410
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:295
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:150
bool initialized_patterns_
Definition: trie.h:416
static const char kLowerPatternUnicode[]
Definition: trie.h:70
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:273
static const char kAlphanumPatternUnicode[]
Definition: trie.h:68
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:369
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:348
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:123
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:332
TRIE_NODES nodes_
Definition: trie.h:402
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:553
void clear()
Definition: trie.cpp:50
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:326
static const char kUpperPatternUnicode[]
Definition: trie.h:71
UNICHAR_ID digit_pattern_
Definition: trie.h:411
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:508
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:110
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:95
UNICHAR_ID lower_pattern_
Definition: trie.h:414
static const char kDigitPatternUnicode[]
Definition: trie.h:67
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:390
void reduce_node_input(NODE_REF node, std::vector< bool > &reduced_nodes)
Definition: trie.cpp:660
UNICHAR_ID upper_pattern_
Definition: trie.h:415
UNICHAR_ID punc_pattern_
Definition: trie.h:413
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:474
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:644
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, std::vector< bool > &reduced_nodes)
Definition: trie.cpp:596
UNICHAR_ID alphanum_pattern_
Definition: trie.h:412
static const char kPuncPatternUnicode[]
Definition: trie.h:69
RTLReversePolicy
Definition: trie.h:55
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
@ RRP_FORCE_REVERSE
Definition: trie.h:58
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const override
Definition: trie.cpp:349
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159
std::vector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:404
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:372
bool add_word_list(const std::vector< std::string > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:310
void print_node(NODE_REF node, int max_num_edges) const override
Definition: trie.cpp:702
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:45
uint64_t num_edges_
Definition: trie.h:405
static const char kAlphaPatternUnicode[]
Definition: trie.h:66
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:154
NODE_REF new_dawg_node()
Definition: trie.cpp:267
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:319
static const int kSaneNumConcreteChars
Definition: trie.h:62
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:311
bool read_word_list(const char *filename, std::vector< std::string > *words)
Definition: trie.cpp:284
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:144