tesseract  5.0.0
unicharmap.cpp
Go to the documentation of this file.
1 // File: unicharmap.cpp
3 // Description: Unicode character/ligature to integer id class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "unicharmap.h"
20 
21 #include <tesseract/unichar.h>
22 
23 #include <cassert>
24 
25 namespace tesseract {
26 
27 UNICHARMAP::UNICHARMAP() : nodes(nullptr) {}
28 
30  delete[] nodes;
31 }
32 
33 // Search the given unichar representation in the tree, using length characters
34 // from it maximum. Each character in the string is interpreted as an index in
35 // an array of nodes.
36 UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const {
37  UNICHARMAP_NODE *current_nodes = nodes;
38 
39  assert(*unichar_repr != '\0');
40  assert(length > 0 && length <= UNICHAR_LEN);
41 
42  int index = 0;
43  if (length <= 0 || unichar_repr[index] == '\0') {
44  return INVALID_UNICHAR_ID;
45  }
46  do {
47  if (index + 1 >= length || unichar_repr[index + 1] == '\0') {
48  return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
49  }
50  current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
51  ++index;
52  } while (true);
53 }
54 
55 // Search the given unichar representation in the tree, creating the possibly
56 // missing nodes. Once the right place has been found, insert the given id and
57 // update the inserted flag to keep track of the insert. Each character in the
58 // string is interpreted as an index in an array of nodes.
59 void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) {
60  const char *current_char = unichar_repr;
61  if (*current_char == '\0') {
62  return;
63  }
64  UNICHARMAP_NODE **current_nodes_pointer = &nodes;
65  do {
66  if (*current_nodes_pointer == nullptr) {
67  *current_nodes_pointer = new UNICHARMAP_NODE[256];
68  }
69  if (current_char[1] == '\0') {
70  (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id;
71  return;
72  }
73  current_nodes_pointer =
74  &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children);
75  ++current_char;
76  } while (true);
77 }
78 
79 // Search the given unichar representation in the tree, using length characters
80 // from it maximum. Each character in the string is interpreted as an index in
81 // an array of nodes. Stop once the tree does not have anymore nodes or once we
82 // found the right unichar_repr.
83 bool UNICHARMAP::contains(const char *const unichar_repr, int length) const {
84  if (unichar_repr == nullptr || *unichar_repr == '\0') {
85  return false;
86  }
87  if (length <= 0 || length > UNICHAR_LEN) {
88  return false;
89  }
90  int index = 0;
91  if (unichar_repr[index] == '\0') {
92  return false;
93  }
94  UNICHARMAP_NODE *current_nodes = nodes;
95 
96  while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') {
97  current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
98  ++index;
99  }
100  return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
101  current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
102 }
103 
104 // Return the minimum number of characters that must be used from this string
105 // to obtain a match in the UNICHARMAP.
106 int UNICHARMAP::minmatch(const char *const unichar_repr) const {
107  const char *current_char = unichar_repr;
108  if (*current_char == '\0') {
109  return 0;
110  }
111  UNICHARMAP_NODE *current_nodes = nodes;
112 
113  while (current_nodes != nullptr && *current_char != '\0') {
114  if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) {
115  return current_char + 1 - unichar_repr;
116  }
117  current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children;
118  ++current_char;
119  }
120  return 0;
121 }
122 
124  delete[] nodes;
125  nodes = nullptr;
126 }
127 
128 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {}
129 
130 // Recursively delete the children
131 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
132  delete[] children;
133 }
134 
135 } // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:33
int UNICHAR_ID
Definition: unichar.h:36
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:83
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:36
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:106