tesseract  5.0.0
unicharcompress.h
Go to the documentation of this file.
1 // File: unicharcompress.h
3 // Description: Unicode re-encoding using a sequence of smaller numbers in
4 // place of a single large code for CJK, similarly for Indic,
5 // and dissection of ligatures for other scripts.
6 // Author: Ray Smith
7 //
8 // (C) Copyright 2015, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
22 #define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
23 
24 #include <unordered_map>
25 #include <vector>
26 #include "serialis.h"
27 #include "unicharset.h"
28 
29 namespace tesseract {
30 
31 // Trivial class to hold the code for a recoded unichar-id.
33 public:
34  // The maximum length of a code.
35  static const int kMaxCodeLen = 9;
36 
37  RecodedCharID() : self_normalized_(1), length_(0) {
38  memset(code_, 0, sizeof(code_));
39  }
40  void Truncate(int length) {
41  length_ = length;
42  }
43  // Sets the code value at the given index in the code.
44  void Set(int index, int value) {
45  code_[index] = value;
46  if (length_ <= index) {
47  length_ = index + 1;
48  }
49  }
50  // Shorthand for setting codes of length 3, as all Hangul and Han codes are
51  // length 3.
52  void Set3(int code0, int code1, int code2) {
53  length_ = 3;
54  code_[0] = code0;
55  code_[1] = code1;
56  code_[2] = code2;
57  }
58  bool empty() const {
59  return length_ == 0;
60  }
61  // Accessors
62  int length() const {
63  return length_;
64  }
65  int operator()(int index) const {
66  return code_[index];
67  }
68 
69  // Writes to the given file. Returns false in case of error.
70  bool Serialize(TFile *fp) const {
71  return fp->Serialize(&self_normalized_) && fp->Serialize(&length_) &&
72  fp->Serialize(&code_[0], length_);
73  }
74  // Reads from the given file. Returns false in case of error.
75  bool DeSerialize(TFile *fp) {
76  return fp->DeSerialize(&self_normalized_) && fp->DeSerialize(&length_) &&
77  fp->DeSerialize(&code_[0], length_);
78  }
79  bool operator==(const RecodedCharID &other) const {
80  if (length_ != other.length_) {
81  return false;
82  }
83  for (int i = 0; i < length_; ++i) {
84  if (code_[i] != other.code_[i]) {
85  return false;
86  }
87  }
88  return true;
89  }
90  // Hash functor for RecodedCharID.
92  uint64_t operator()(const RecodedCharID &code) const {
93  uint64_t result = 0;
94  for (int i = 0; i < code.length_; ++i) {
95  result ^= static_cast<uint64_t>(code(i)) << (7 * i);
96  }
97  return result;
98  }
99  };
100 
101 private:
102  // True if this code is self-normalizing, ie is the master entry for indices
103  // that map to the same code. Has boolean value, but int8_t for serialization.
104  int8_t self_normalized_;
105  // The number of elements in use in code_.
106  int32_t length_;
107  // The re-encoded form of the unichar-id to which this RecodedCharID relates.
108  int32_t code_[kMaxCodeLen];
109 };
110 
111 // Class holds a "compression" of a unicharset to simplify the learning problem
112 // for a neural-network-based classifier.
113 // Objectives:
114 // 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
115 // a sequence of 3 codes with much fewer values.
116 // This is achieved using the Jamo coding for Hangul and the Unicode
117 // Radical-Stroke-index for Han.
118 // 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
119 // as the unicode sequence (but coded in a more compact space).
120 // 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
121 // and not significantly distinct shapes (quotes) together, ie
122 // represent the fi ligature as the f-i pair, and fold u+2019 and
123 // friends all onto ascii single '
124 // 4 The null character and mapping to target activations:
125 // To save horizontal coding space, the compressed codes are generally mapped
126 // to target network activations without intervening null characters, BUT
127 // in the case of ligatures, such as ff, null characters have to be included
128 // so existence of repeated codes is detected at codebook-building time, and
129 // null characters are embedded directly into the codes, so the rest of the
130 // system doesn't need to worry about the problem (much). There is still an
131 // effect on the range of ways in which the target activations can be
132 // generated.
133 //
134 // The computed code values are compact (no unused values), and, for CJK,
135 // unique (each code position uses a disjoint set of values from each other code
136 // position). For non-CJK, the same code value CAN be used in multiple
137 // positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
138 // is the same code as is used for the single f.
140 public:
141  UnicharCompress();
142  UnicharCompress(const UnicharCompress &src);
143  ~UnicharCompress();
144  UnicharCompress &operator=(const UnicharCompress &src);
145 
146  // The 1st Hangul unicode.
147  static const int kFirstHangul = 0xac00;
148  // The number of Hangul unicodes.
149  static const int kNumHangul = 11172;
150  // The number of Jamos for each of the 3 parts of a Hangul character, being
151  // the Leading consonant, Vowel and Trailing consonant.
152  static const int kLCount = 19;
153  static const int kVCount = 21;
154  static const int kTCount = 28;
155 
156  // Computes the encoding for the given unicharset. It is a requirement that
157  // the file training/langdata/radical-stroke.txt have been read into the
158  // input string radical_stroke_table.
159  // Returns false if the encoding cannot be constructed.
160  bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table);
161  // Sets up an encoder that doesn't change the unichars at all, so it just
162  // passes them through unchanged.
163  void SetupPassThrough(const UNICHARSET &unicharset);
164  // Sets up an encoder directly using the given encoding vector, which maps
165  // unichar_ids to the given codes.
166  void SetupDirect(const std::vector<RecodedCharID> &codes);
167 
168  // Returns the number of different values that can be used in a code, ie
169  // 1 + the maximum value that will ever be used by an RecodedCharID code in
170  // any position in its array.
171  int code_range() const {
172  return code_range_;
173  }
174 
175  // Encodes a single unichar_id. Returns the length of the code, (or zero if
176  // invalid input), and the encoding itself in code.
177  int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const;
178  // Decodes code, returning the original unichar-id, or
179  // INVALID_UNICHAR_ID if the input is invalid.
180  int DecodeUnichar(const RecodedCharID &code) const;
181  // Returns true if the given code is a valid start or single code.
182  bool IsValidFirstCode(int code) const {
183  return is_valid_start_[code];
184  }
185  // Returns a list of valid non-final next codes for a given prefix code,
186  // which may be empty.
187  const std::vector<int> *GetNextCodes(const RecodedCharID &code) const {
188  auto it = next_codes_.find(code);
189  return it == next_codes_.end() ? nullptr : it->second;
190  }
191  // Returns a list of valid final codes for a given prefix code, which may
192  // be empty.
193  const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const {
194  auto it = final_codes_.find(code);
195  return it == final_codes_.end() ? nullptr : it->second;
196  }
197 
198  // Writes to the given file. Returns false in case of error.
199  bool Serialize(TFile *fp) const;
200  // Reads from the given file. Returns false in case of error.
201 
202  bool DeSerialize(TFile *fp);
203 
204  // Returns a string containing a text file that describes the encoding thus:
205  // <index>[,<index>]*<tab><UTF8-str><newline>
206  // In words, a comma-separated list of one or more indices, followed by a tab
207  // and the UTF-8 string that the code represents per line. Most simple scripts
208  // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
209  // and the Indic scripts will contain a many-to-many mapping.
210  // See the class comment above for details.
211  std::string GetEncodingAsString(const UNICHARSET &unicharset) const;
212 
213  // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
214  // Note that the returned values are 0-based indices, NOT unicode Jamo.
215  // Returns false if the input is not in the Hangul unicode range.
216  static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing);
217 
218 private:
219  // Renumbers codes to eliminate unused values.
220  void DefragmentCodeValues(int encoded_null);
221  // Computes the value of code_range_ from the encoder_.
222  void ComputeCodeRange();
223  // Initializes the decoding hash_map from the encoder_ array.
224  void SetupDecoder();
225  // Frees allocated memory.
226  void Cleanup();
227 
228  // The encoder that maps a unichar-id to a sequence of small codes.
229  // encoder_ is the only part that is serialized. The rest is computed on load.
230  std::vector<RecodedCharID> encoder_;
231  // Decoder converts the output of encoder back to a unichar-id.
232  std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_;
233  // True if the index is a valid single or start code.
234  std::vector<bool> is_valid_start_;
235  // Maps a prefix code to a list of valid next codes.
236  // The map owns the vectors.
237  std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
238  next_codes_;
239  // Maps a prefix code to a list of valid final codes.
240  // The map owns the vectors.
241  std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
242  final_codes_;
243  // Max of any value in encoder_ + 1.
244  int code_range_;
245 };
246 
247 } // namespace tesseract.
248 
249 #endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
bool DeSerialize(bool swap, FILE *fp, std::vector< T > &data)
Definition: helpers.h:220
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
bool Serialize(const std::string &data)
Definition: serialis.cpp:107
bool DeSerialize(TFile *fp)
bool Serialize(TFile *fp) const
void Truncate(int length)
void Set(int index, int value)
static const int kMaxCodeLen
int operator()(int index) const
void Set3(int code0, int code1, int code2)
bool operator==(const RecodedCharID &other) const
uint64_t operator()(const RecodedCharID &code) const
const std::vector< int > * GetFinalCodes(const RecodedCharID &code) const
const std::vector< int > * GetNextCodes(const RecodedCharID &code) const
bool IsValidFirstCode(int code) const
#define TESS_API
Definition: export.h:34