tesseract  5.0.0
unicharcompress.cpp
Go to the documentation of this file.
1 // File: unicharcompress.cpp
3 // Description: Unicode re-encoding using a sequence of smaller numbers in
4 // place of a single large code for CJK, similarly for Indic,
5 // and dissection of ligatures for other scripts.
6 // Author: Ray Smith
7 //
8 // (C) Copyright 2015, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include "unicharcompress.h"
22 #include <algorithm>
23 #include <memory>
24 #include "tprintf.h"
25 
26 namespace tesseract {
27 
28 // String used to represent the null_id in direct_set.
29 static const char *kNullChar = "<nul>";
30 // Radix to make unique values from the stored radical codes.
31 const int kRadicalRadix = 29;
32 
33 // "Hash" function for const std::vector<int> computes the sum of elements.
34 // Build a unique number for each code sequence that we can use as the index in
35 // a hash map of ints instead of trying to hash the vectors.
36 static int RadicalPreHash(const std::vector<int> &rs) {
37  size_t result = 0;
38  for (int radical : rs) {
39  result *= kRadicalRadix;
40  result += radical;
41  }
42  return result;
43 }
44 
45 // A hash map to convert unicodes to radical encoding.
46 using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
47 // A hash map to count occurrences of each radical encoding.
48 using RSCounts = std::unordered_map<int, int>;
49 
50 static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) {
51  if (radical_data_line.empty() || (radical_data_line)[0] == '#') {
52  return true;
53  }
54  std::vector<std::string> entries = split(radical_data_line, ' ');
55  if (entries.size() < 2) {
56  return false;
57  }
58  char *end = nullptr;
59  int unicode = strtol(&entries[0][0], &end, 10);
60  if (*end != '\0') {
61  return false;
62  }
63  std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
64  for (size_t i = 1; i < entries.size(); ++i) {
65  int radical = strtol(&entries[i][0], &end, 10);
66  if (*end != '\0') {
67  return false;
68  }
69  radicals->push_back(radical);
70  }
71  (*radical_map)[unicode] = std::move(radicals);
72  return true;
73 }
74 
75 // Helper function builds the RSMap from the radical-stroke file, which has
76 // already been read into a string. Returns false on error.
77 // The radical_stroke_table is non-const because it gets split and the caller
78 // is unlikely to want to use it again.
79 static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
80  std::vector<std::string> lines = split(radical_data, '\n');
81  for (unsigned i = 0; i < lines.size(); ++i) {
82  if (!DecodeRadicalLine(lines[i], radical_map)) {
83  tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
84  return false;
85  }
86  }
87  return true;
88 }
89 
90 UnicharCompress::UnicharCompress() : code_range_(0) {}
92  *this = src;
93 }
95  Cleanup();
96 }
98  Cleanup();
99  encoder_ = src.encoder_;
100  code_range_ = src.code_range_;
101  SetupDecoder();
102  return *this;
103 }
104 
105 // Computes the encoding for the given unicharset. It is a requirement that
106 // the file training/langdata/radical-stroke.txt have been read into the
107 // input string radical_stroke_table.
108 // Returns false if the encoding cannot be constructed.
109 bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
110  std::string *radical_stroke_table) {
111  RSMap radical_map;
112  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
113  return false;
114  }
115  encoder_.clear();
116  UNICHARSET direct_set;
117  // To avoid unused codes, clear the special codes from the direct_set.
118  direct_set.clear();
119  // Always keep space as 0;
120  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
121  // Null char is next if we have one.
122  if (null_id >= 0) {
123  direct_set.unichar_insert(kNullChar);
124  }
125  RSCounts radical_counts;
126  // In the initial map, codes [0, unicharset.size()) are
127  // reserved for non-han/hangul sequences of 1 or more unicodes.
128  int hangul_offset = unicharset.size();
129  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
130  const int kTotalJamos = kLCount + kVCount + kTCount;
131  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
132  // to measure the number of radicals and strokes, initially we use the same
133  // code range for all 3 Han code positions, and fix them after.
134  int han_offset = hangul_offset + kTotalJamos;
135  for (unsigned u = 0; u <= unicharset.size(); ++u) {
136  // We special-case allow null_id to be equal to unicharset.size() in case
137  // there is no space in unicharset for it.
138  if (u == unicharset.size() && static_cast<int>(u) != null_id) {
139  break; // Finished
140  }
141  RecodedCharID code;
142  // Convert to unicodes.
143  std::vector<char32> unicodes;
144  std::string cleaned;
145  if (u < unicharset.size()) {
146  cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
147  }
148  if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
149  // Check single unicodes for Hangul/Han and encode if so.
150  int unicode = unicodes[0];
151  int leading, vowel, trailing;
152  auto it = radical_map.find(unicode);
153  if (it != radical_map.end()) {
154  // This is Han. Use the radical codes directly.
155  int num_radicals = it->second->size();
156  for (int c = 0; c < num_radicals; ++c) {
157  code.Set(c, han_offset + (*it->second)[c]);
158  }
159  int pre_hash = RadicalPreHash(*it->second);
160  int num_samples = radical_counts[pre_hash]++;
161  if (num_samples > 0) {
162  code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
163  }
164  } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
165  // This is Hangul. Since we know the exact size of each part at compile
166  // time, it gets the bottom set of codes.
167  code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
168  trailing + kLCount + kVCount + hangul_offset);
169  }
170  }
171  // If the code is still empty, it wasn't Han or Hangul.
172  if (code.empty()) {
173  // Special cases.
174  if (u == UNICHAR_SPACE) {
175  code.Set(0, 0); // Space.
176  } else if (static_cast<int>(u) == null_id ||
177  (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
178  code.Set(0, direct_set.unichar_to_id(kNullChar));
179  } else {
180  // Add the direct_set unichar-ids of the unicodes in sequence to the
181  // code.
182  for (int uni : unicodes) {
183  int position = code.length();
184  if (position >= RecodedCharID::kMaxCodeLen) {
185  tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
186  return false;
187  }
188  UNICHAR unichar(uni);
189  char *utf8 = unichar.utf8_str();
190  if (!direct_set.contains_unichar(utf8)) {
191  direct_set.unichar_insert(utf8);
192  }
193  code.Set(position, direct_set.unichar_to_id(utf8));
194  delete[] utf8;
195  if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
196  // Code space got bigger!
197  tprintf("Code space expanded from original unicharset!!\n");
198  return false;
199  }
200  }
201  }
202  }
203  encoder_.push_back(code);
204  }
205  // Now renumber Han to make all codes unique. We already added han_offset to
206  // all Han. Now separate out the radical, stroke, and count codes for Han.
207  int code_offset = 0;
208  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
209  int max_offset = 0;
210  for (unsigned u = 0; u < unicharset.size(); ++u) {
211  RecodedCharID *code = &encoder_[u];
212  if (code->length() <= i) {
213  continue;
214  }
215  max_offset = std::max(max_offset, (*code)(i)-han_offset);
216  code->Set(i, (*code)(i) + code_offset);
217  }
218  if (max_offset == 0) {
219  break;
220  }
221  code_offset += max_offset + 1;
222  }
223  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
224  SetupDecoder();
225  return true;
226 }
227 
228 // Sets up an encoder that doesn't change the unichars at all, so it just
229 // passes them through unchanged.
231  std::vector<RecodedCharID> codes;
232  for (unsigned u = 0; u < unicharset.size(); ++u) {
233  RecodedCharID code;
234  code.Set(0, u);
235  codes.push_back(code);
236  }
237  if (!unicharset.has_special_codes()) {
238  RecodedCharID code;
239  code.Set(0, unicharset.size());
240  codes.push_back(code);
241  }
242  SetupDirect(codes);
243 }
244 
245 // Sets up an encoder directly using the given encoding vector, which maps
246 // unichar_ids to the given codes.
247 void UnicharCompress::SetupDirect(const std::vector<RecodedCharID> &codes) {
248  encoder_ = codes;
249  ComputeCodeRange();
250  SetupDecoder();
251 }
252 
253 // Renumbers codes to eliminate unused values.
254 void UnicharCompress::DefragmentCodeValues(int encoded_null) {
255  // There may not be any Hangul, but even if there is, it is possible that not
256  // all codes are used. Likewise with the Han encoding, it is possible that not
257  // all numbers of strokes are used.
258  ComputeCodeRange();
259  std::vector<int> offsets(code_range_);
260  // Find which codes are used
261  for (auto &code : encoder_) {
262  for (int i = 0; i < code.length(); ++i) {
263  offsets[code(i)] = 1;
264  }
265  }
266  // Compute offsets based on code use.
267  int offset = 0;
268  for (unsigned i = 0; i < offsets.size(); ++i) {
269  // If not used, decrement everything above here.
270  // We are moving encoded_null to the end, so it is not "used".
271  if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {
272  --offset;
273  } else {
274  offsets[i] = offset;
275  }
276  }
277  if (encoded_null >= 0) {
278  // The encoded_null is moving to the end, for the benefit of TensorFlow,
279  // which is offsets.size() + offsets.back().
280  offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
281  }
282  // Now apply the offsets.
283  for (auto &c : encoder_) {
284  RecodedCharID *code = &c;
285  for (int i = 0; i < code->length(); ++i) {
286  int value = (*code)(i);
287  code->Set(i, value + offsets[value]);
288  }
289  }
290  ComputeCodeRange();
291 }
292 
293 // Encodes a single unichar_id. Returns the length of the code, or zero if
294 // invalid input, and the encoding itself
295 int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const {
296  if (unichar_id >= encoder_.size()) {
297  return 0;
298  }
299  *code = encoder_[unichar_id];
300  return code->length();
301 }
302 
303 // Decodes code, returning the original unichar-id, or
304 // INVALID_UNICHAR_ID if the input is invalid.
306  int len = code.length();
307  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) {
308  return INVALID_UNICHAR_ID;
309  }
310  auto it = decoder_.find(code);
311  if (it == decoder_.end()) {
312  return INVALID_UNICHAR_ID;
313  }
314  return it->second;
315 }
316 
317 // Writes to the given file. Returns false in case of error.
319  return fp->Serialize(encoder_);
320 }
321 
322 // Reads from the given file. Returns false in case of error.
324  if (!fp->DeSerialize(encoder_)) {
325  return false;
326  }
327  ComputeCodeRange();
328  SetupDecoder();
329  return true;
330 }
331 
332 // Returns a string containing a text file that describes the encoding thus:
333 // <index>[,<index>]*<tab><UTF8-str><newline>
334 // In words, a comma-separated list of one or more indices, followed by a tab
335 // and the UTF-8 string that the code represents per line. Most simple scripts
336 // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
337 // and the Indic scripts will contain a many-to-many mapping.
338 // See the class comment above for details.
339 std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const {
340  std::string encoding;
341  for (unsigned c = 0; c < encoder_.size(); ++c) {
342  const RecodedCharID &code = encoder_[c];
343  if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
344  // Don't show the duplicate entry.
345  continue;
346  }
347  encoding += std::to_string(code(0));
348  for (int i = 1; i < code.length(); ++i) {
349  encoding += "," + std::to_string(code(i));
350  }
351  encoding += "\t";
352  if (c >= unicharset.size() ||
353  (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) {
354  encoding += kNullChar;
355  } else {
356  encoding += unicharset.id_to_unichar(c);
357  }
358  encoding += "\n";
359  }
360  return encoding;
361 }
362 
363 // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
364 // Note that the returned values are 0-based indices, NOT unicode Jamo.
365 // Returns false if the input is not in the Hangul unicode range.
366 /* static */
367 bool UnicharCompress::DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing) {
368  if (unicode < kFirstHangul) {
369  return false;
370  }
371  int offset = unicode - kFirstHangul;
372  if (offset >= kNumHangul) {
373  return false;
374  }
375  const int kNCount = kVCount * kTCount;
376  *leading = offset / kNCount;
377  *vowel = (offset % kNCount) / kTCount;
378  *trailing = offset % kTCount;
379  return true;
380 }
381 
382 // Computes the value of code_range_ from the encoder_.
383 void UnicharCompress::ComputeCodeRange() {
384  code_range_ = -1;
385  for (auto &code : encoder_) {
386  for (int i = 0; i < code.length(); ++i) {
387  if (code(i) > code_range_) {
388  code_range_ = code(i);
389  }
390  }
391  }
392  ++code_range_;
393 }
394 
395 // Initializes the decoding hash_map from the encoding array.
396 void UnicharCompress::SetupDecoder() {
397  Cleanup();
398  is_valid_start_.clear();
399  is_valid_start_.resize(code_range_);
400  for (unsigned c = 0; c < encoder_.size(); ++c) {
401  const RecodedCharID &code = encoder_[c];
402  decoder_[code] = c;
403  is_valid_start_[code(0)] = true;
404  RecodedCharID prefix = code;
405  int len = code.length() - 1;
406  prefix.Truncate(len);
407  auto final_it = final_codes_.find(prefix);
408  if (final_it == final_codes_.end()) {
409  auto *code_list = new std::vector<int>;
410  code_list->push_back(code(len));
411  final_codes_[prefix] = code_list;
412  while (--len >= 0) {
413  prefix.Truncate(len);
414  auto next_it = next_codes_.find(prefix);
415  if (next_it == next_codes_.end()) {
416  auto *code_list = new std::vector<int>;
417  code_list->push_back(code(len));
418  next_codes_[prefix] = code_list;
419  } else {
420  // We still have to search the list as we may get here via multiple
421  // lengths of code.
422  if (!contains(*next_it->second, code(len))) {
423  next_it->second->push_back(code(len));
424  }
425  break; // This prefix has been processed.
426  }
427  }
428  } else {
429  if (!contains(*final_it->second, code(len))) {
430  final_it->second->push_back(code(len));
431  }
432  }
433  }
434 }
435 
436 // Frees allocated memory.
437 void UnicharCompress::Cleanup() {
438  decoder_.clear();
439  is_valid_start_.clear();
440  for (auto &next_code : next_codes_) {
441  delete next_code.second;
442  }
443  for (auto &final_code : final_codes_) {
444  delete final_code.second;
445  }
446  next_codes_.clear();
447  final_codes_.clear();
448 }
449 
450 } // namespace tesseract.
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:41
const int kRadicalRadix
std::unordered_map< int, std::unique_ptr< std::vector< int > >> RSMap
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
std::unordered_map< int, int > RSCounts
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
bool contains(const std::vector< T > &data, const T &value)
Definition: helpers.h:37
char * utf8_str() const
Definition: unichar.cpp:134
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
bool Serialize(const std::string &data)
Definition: serialis.cpp:107
void Set(int index, int value)
static const int kMaxCodeLen
void Set3(int code0, int code1, int code2)
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
UnicharCompress & operator=(const UnicharCompress &src)
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
static const int kFirstHangul
void SetupPassThrough(const UNICHARSET &unicharset)
int DecodeUnichar(const RecodedCharID &code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool Serialize(TFile *fp) const
void SetupDirect(const std::vector< RecodedCharID > &codes)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool has_special_codes() const
Definition: unicharset.h:757
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265