tesseract  5.0.0
trie.h
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: trie.h
4  * Description: Functions to build a trie data structure.
5  * Author: Mark Seaman, SW Productivity
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 #ifndef TRIE_H
20 #define TRIE_H
21 
22 #include "dawg.h"
23 
24 namespace tesseract {
25 
26 class UNICHARSET;
27 
28 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
29 // max int32, we will need to change vector to use int64 for size
30 // and address indices. This does not seem to be needed immediately,
31 // since currently the largest number of edges limit used by tesseract
32 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
33 // There are also int casts below to satisfy the WIN32 compiler that would
34 // need to be changed.
35 // It might be cleanest to change the types of most of the Trie/Dawg related
36 // typedefs to int and restrict the casts to extracting these values from
37 // the 64 bit EDGE_RECORD.
38 using EDGE_INDEX = int64_t; // index of an edge in a given node
39 using EDGE_VECTOR = std::vector<EDGE_RECORD>;
40 
44 };
45 using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;
46 
53 class TESS_API Trie : public Dawg {
54 public:
59  };
60 
61  // Minimum number of concrete characters at the beginning of user patterns.
62  static const int kSaneNumConcreteChars = 0;
63  // Various unicode whitespace characters are used to denote unichar patterns,
64  // (character classifier would never produce these whitespace characters as a
65  // valid classification).
66  static const char kAlphaPatternUnicode[];
67  static const char kDigitPatternUnicode[];
68  static const char kAlphanumPatternUnicode[];
69  static const char kPuncPatternUnicode[];
70  static const char kLowerPatternUnicode[];
71  static const char kUpperPatternUnicode[];
72 
73  static const char *get_reverse_policy_name(RTLReversePolicy reverse_policy);
74 
75  // max_num_edges argument allows limiting the amount of memory this
76  // Trie can consume (if a new word insert would cause the Trie to
77  // contain more edges than max_num_edges, all the edges are cleared
78  // so that new inserts can proceed).
79  Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
80  : Dawg(type, lang, perm, debug_level) {
81  init(unicharset_size);
82  deref_node_index_mask_ = ~letter_mask_;
83  new_dawg_node(); // need to allocate node 0
84  }
85  ~Trie() override {
86  for (auto node : nodes_) {
87  delete node;
88  }
89  }
90 
91  // Reset the Trie to empty.
92  void clear();
93 
95  EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override {
96  EDGE_RECORD *edge_ptr;
97  EDGE_INDEX edge_index;
98  if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
99  &edge_index)) {
100  return NO_EDGE;
101  }
102  return make_edge_ref(node_ref, edge_index);
103  }
104 
109  void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override {
110  const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;
111  for (auto &edge : forward_edges) {
112  if (!word_end || end_of_word_from_edge_rec(edge)) {
113  vec->push_back(
114  NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
115  }
116  }
117  }
118 
123  NODE_REF next_node(EDGE_REF edge_ref) const override {
124  if (edge_ref == NO_EDGE || num_edges_ == 0) {
125  return NO_EDGE;
126  }
127  return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
128  }
129 
134  bool end_of_word(EDGE_REF edge_ref) const override {
135  if (edge_ref == NO_EDGE || num_edges_ == 0) {
136  return false;
137  }
138  return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
139  }
140 
142  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
143  if (edge_ref == NO_EDGE || num_edges_ == 0) {
144  return INVALID_UNICHAR_ID;
145  }
146  return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
147  }
148  // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
149  // the edge dead.
150  void KillEdge(EDGE_RECORD *edge_rec) const {
151  *edge_rec &= ~letter_mask_;
152  *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
153  }
154  bool DeadEdge(const EDGE_RECORD &edge_rec) const {
155  return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
156  }
157 
158  // Prints the contents of the node indicated by the given NODE_REF.
159  // At most max_num_edges will be printed.
160  void print_node(NODE_REF node, int max_num_edges) const override;
161 
162  // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
163  // Eliminates redundant edges and returns the pointer to the SquishedDawg.
164  // Note: the caller is responsible for deallocating memory associated
165  // with the returned SquishedDawg pointer.
166  SquishedDawg *trie_to_dawg();
167 
168  // Reads a list of words from the given file and adds into the Trie.
169  // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
170  // policy and information in the unicharset.
171  // Returns false on error.
172  bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
173  Trie::RTLReversePolicy reverse);
174 
175  // Reads a list of words from the given file.
176  // Returns false on error.
177  bool read_word_list(const char *filename, std::vector<std::string> *words);
178  // Adds a list of words previously read using read_word_list to the trie
179  // using the given unicharset and reverse_policy to convert to unichar-ids.
180  // Returns false on error.
181  bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
182  Trie::RTLReversePolicy reverse_policy);
183 
184  // Inserts the list of patterns from the given file into the Trie.
185  // The pattern list file should contain one pattern per line in UTF-8 format.
186  //
187  // Each pattern can contain any non-whitespace characters, however only the
188  // patterns that contain characters from the unicharset of the corresponding
189  // language will be useful.
190  // The only meta character is '\'. To be used in a pattern as an ordinary
191  // string it should be escaped with '\' (e.g. string "C:\Documents" should
192  // be written in the patterns file as "C:\\Documents").
193  // This function supports a very limited regular expression syntax. One can
194  // express a character, a certain character class and a number of times the
195  // entity should be repeated in the pattern.
196  //
197  // To denote a character class use one of:
198  // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
199  // \d - unichar for which UNICHARSET::get_isdigit() is true
200  // \n - unichar for which UNICHARSET::get_isdigit() or
201  // UNICHARSET::isalpha() are true
202  // \p - unichar for which UNICHARSET::get_ispunct() is true
203  // \a - unichar for which UNICHARSET::get_islower() is true
204  // \A - unichar for which UNICHARSET::get_isupper() is true
205  //
206  // \* could be specified after each character or pattern to indicate that
207  // the character/pattern can be repeated any number of times before the next
208  // character/pattern occurs.
209  //
210  // Examples:
211  // 1-8\d\d-GOOG-411 will be expanded to strings:
212  // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
213  //
214  // http://www.\n\*.com will be expanded to strings like:
215  // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
216  //
217  // Note: In choosing which patterns to include please be aware of the fact
218  // providing very generic patterns will make tesseract run slower.
219  // For example \n\* at the beginning of the pattern will make Tesseract
220  // consider all the combinations of proposed character choices for each
221  // of the segmentations, which will be unacceptably slow.
222  // Because of potential problems with speed that could be difficult to
223  // identify, each user pattern has to have at least kSaneNumConcreteChars
224  // concrete characters from the unicharset at the beginning.
225  bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
226 
227  // Initializes the values of *_pattern_ unichar ids.
228  // This function should be called before calling read_pattern_list().
229  void initialize_patterns(UNICHARSET *unicharset);
230 
231  // Fills in the given unichar id vector with the unichar ids that represent
232  // the patterns of the character classes of the given unichar_id.
233  void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
234  std::vector<UNICHAR_ID> *vec) const override;
235 
236  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
237  // a self loop and the given unichar_id matches the unichar_id stored in the
238  // EDGE_RECORD, returns NO_EDGE otherwise.
240  bool word_end) const override {
241  if (edge_ref == NO_EDGE) {
242  return NO_EDGE;
243  }
244  EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245  return (marker_flag_from_edge_rec(*edge_rec) &&
246  unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247  word_end == end_of_word_from_edge_rec(*edge_rec))
248  ? edge_ref
249  : NO_EDGE;
250  }
251 
252  // Adds a word to the Trie (creates the necessary nodes and edges).
253  //
254  // If repetitions vector is not nullptr, each entry in the vector indicates
255  // whether the unichar id with the corresponding index in the word is allowed
256  // to repeat an unlimited number of times. For each entry that is true, MARKER
257  // flag of the corresponding edge created for this unichar id is set to true).
258  //
259  // Return true if add succeeded, false otherwise (e.g. when a word contained
260  // an invalid unichar id or the trie was getting too large and was cleared).
261  bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
262  bool add_word_to_dawg(const WERD_CHOICE &word) {
263  return add_word_to_dawg(word, nullptr);
264  }
265 
266 protected:
267  // The structure of an EDGE_REF for Trie edges is as follows:
268  // [LETTER_START_BIT, flag_start_bit_):
269  // edge index in *_edges in a TRIE_NODE_RECORD
270  // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
271  //
272  // With this arrangement there are enough bits to represent edge indices
273  // (each node can have at most unicharset_size_ forward edges and
274  // the position of flag_start_bit is set to be log2(unicharset_size_)).
275  // It is also possible to accommodate a maximum number of nodes that is at
276  // least as large as that of the SquishedDawg representation (in SquishedDawg
277  // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
278  // the next node index).
279  //
280 
281  // Returns the pointer to EDGE_RECORD after decoding the location
282  // of the edge from the information in the given EDGE_REF.
283  // This function assumes that EDGE_REF holds valid node/edge indices.
284  inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
285  int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);
286  int node_index = static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
287  TRIE_NODE_RECORD *node_rec = nodes_[node_index];
288  return &(node_rec->forward_edges[edge_index]);
289  }
291  inline EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const {
292  return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT));
293  }
295  inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end,
296  UNICHAR_ID unichar_id) {
297  EDGE_RECORD flags = 0;
298  if (repeats) {
299  flags |= MARKER_FLAG;
300  }
301  if (word_end) {
302  flags |= WERD_END_FLAG;
303  }
304  if (direction == BACKWARD_EDGE) {
305  flags |= DIRECTION_FLAG;
306  }
307  *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
308  (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
309  }
311  inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
312  tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
313  marker_flag_from_edge_rec(edge_rec) ? "R," : "",
314  (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
315  end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec));
316  }
317  // Returns true if the next node in recorded the given EDGE_RECORD
318  // has exactly one forward edge.
319  inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
320  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321  return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
322  }
323 
324  // Prints the contents of the Trie.
325  // At most max_num_edges will be printed for each node.
326  void print_all(const char *msg, int max_num_edges) {
327  tprintf("\n__________________________\n%s\n", msg);
328  for (size_t i = 0; i < nodes_.size(); ++i) {
329  print_node(i, max_num_edges);
330  }
331  tprintf("__________________________\n");
332  }
333 
334  // Finds the edge with the given direction, word_end and unichar_id
335  // in the node indicated by node_ref. Fills in the pointer to the
336  // EDGE_RECORD and the index of the edge with the the values
337  // corresponding to the edge found. Returns true if an edge was found.
338  bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
339  UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
340 
341  // Adds an single edge linkage between node1 and node2 in the direction
342  // indicated by direction argument.
343  bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end,
344  UNICHAR_ID unichar_id);
345 
346  // Adds forward edge linkage from node1 to node2 and the corresponding
347  // backward edge linkage in the other direction.
348  bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end,
349  UNICHAR_ID unichar_id) {
350  return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&
351  add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));
352  }
353 
354  // Sets the word ending flags in an already existing edge pair.
355  // Returns true on success.
356  void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats,
357  UNICHAR_ID unichar_id);
358 
359  // Allocates space for a new node in the Trie.
360  NODE_REF new_dawg_node();
361 
362  // Removes a single edge linkage to between node1 and node2 in the
363  // direction indicated by direction argument.
364  void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
365  UNICHAR_ID unichar_id);
366 
367  // Removes forward edge linkage from node1 to node2 and the corresponding
368  // backward edge linkage in the other direction.
369  void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) {
370  remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
371  remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
372  }
373 
374  // Compares edge1 and edge2 in the given node to see if they point to two
375  // next nodes that could be collapsed. If they do, performs the reduction
376  // and returns true.
377  bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2);
378 
379  // Assuming that edge_index indicates the first edge in a group of edges
380  // in this node with a particular letter value, looks through these edges
381  // to see if any of them can be collapsed. If so does it. Returns to the
382  // caller when all edges with this letter have been reduced.
383  // Returns true if further reduction is possible with this same letter.
384  bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
385  EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);
386 
393  void sort_edges(EDGE_VECTOR *edges);
394 
396  void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);
397 
398  // Returns the pattern unichar id for the given character class code.
399  UNICHAR_ID character_class_to_pattern(char ch);
400 
401  // Member variables
402  TRIE_NODES nodes_; // vector of nodes in the Trie
403  // Freelist of edges in the root backwards node that were previously zeroed.
404  std::vector<EDGE_INDEX> root_back_freelist_;
405  uint64_t num_edges_ = 0; // sum of all edges (forward and backward)
406  uint64_t deref_direction_mask_ = 0; // mask for EDGE_REF to extract direction
407  uint64_t deref_node_index_mask_ = 0; // mask for EDGE_REF to extract node index
408  // Variables for translating character class codes denoted in user patterns
409  // file to the unichar ids used to represent them in a Trie.
410  UNICHAR_ID alpha_pattern_ = 0;
411  UNICHAR_ID digit_pattern_ = 0;
412  UNICHAR_ID alphanum_pattern_ = 0;
413  UNICHAR_ID punc_pattern_ = 0;
414  UNICHAR_ID lower_pattern_ = 0;
415  UNICHAR_ID upper_pattern_ = 0;
416  bool initialized_patterns_ = false;
417 };
418 
419 } // namespace tesseract
420 
421 #endif
#define WERD_END_FLAG
Definition: dawg.h:82
#define BACKWARD_EDGE
Definition: dawg.h:78
#define FORWARD_EDGE
Definition: dawg.h:77
#define MARKER_FLAG
Definition: dawg.h:80
#define REFFORMAT
Definition: dawg.h:85
#define LETTER_START_BIT
Definition: dawg.h:83
#define DIRECTION_FLAG
Definition: dawg.h:81
DawgType
Definition: dawg.h:64
int64_t EDGE_REF
Definition: dawg.h:49
uint64_t EDGE_RECORD
Definition: dawg.h:47
std::vector< EDGE_RECORD > EDGE_VECTOR
Definition: trie.h:39
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int64_t NODE_REF
Definition: dawg.h:50
std::vector< TRIE_NODE_RECORD * > TRIE_NODES
Definition: trie.h:45
int64_t EDGE_INDEX
Definition: trie.h:38
int UNICHAR_ID
Definition: unichar.h:36
std::vector< NodeChild > NodeChildVector
Definition: dawg.h:60
PermuterType
Definition: ratngs.h:231
EDGE_VECTOR backward_edges
Definition: trie.h:43
EDGE_VECTOR forward_edges
Definition: trie.h:42
~Trie() override
Definition: trie.h:85
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:295
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:150
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
Definition: trie.h:109
Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: trie.h:79
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:369
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:348
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:123
TRIE_NODES nodes_
Definition: trie.h:402
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:326
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:95
bool add_word_to_dawg(const WERD_CHOICE &word)
Definition: trie.h:262
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Definition: trie.h:142
EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:239
RTLReversePolicy
Definition: trie.h:55
@ RRP_DO_NO_REVERSE
Definition: trie.h:56
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
@ RRP_FORCE_REVERSE
Definition: trie.h:58
bool end_of_word(EDGE_REF edge_ref) const override
Definition: trie.h:134
std::vector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:404
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:284
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:154
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:291
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:319
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:311
#define TESS_API
Definition: export.h:34