tesseract  5.0.0
tesseract::UnicharCompress Class Reference

#include <unicharcompress.h>

Public Member Functions

 UnicharCompress ()
 
 UnicharCompress (const UnicharCompress &src)
 
 ~UnicharCompress ()
 
UnicharCompress & operator= (const UnicharCompress &src)
 
bool ComputeEncoding (const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
 
void SetupPassThrough (const UNICHARSET &unicharset)
 
void SetupDirect (const std::vector< RecodedCharID > &codes)
 
int code_range () const
 
int EncodeUnichar (unsigned unichar_id, RecodedCharID *code) const
 
int DecodeUnichar (const RecodedCharID &code) const
 
bool IsValidFirstCode (int code) const
 
const std::vector< int > * GetNextCodes (const RecodedCharID &code) const
 
const std::vector< int > * GetFinalCodes (const RecodedCharID &code) const
 
bool Serialize (TFile *fp) const
 
bool DeSerialize (TFile *fp)
 
std::string GetEncodingAsString (const UNICHARSET &unicharset) const
 

Static Public Member Functions

static bool DecomposeHangul (int unicode, int *leading, int *vowel, int *trailing)
 

Static Public Attributes

static const int kFirstHangul = 0xac00
 
static const int kNumHangul = 11172
 
static const int kLCount = 19
 
static const int kVCount = 21
 
static const int kTCount = 28
 

Detailed Description

Definition at line 139 of file unicharcompress.h.

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

tesseract::UnicharCompress::UnicharCompress ( )

Definition at line 90 of file unicharcompress.cpp.

90 : code_range_(0) {}

◆ UnicharCompress() [2/2]

tesseract::UnicharCompress::UnicharCompress ( const UnicharCompress & src)

Definition at line 91 of file unicharcompress.cpp.

91  {
92  *this = src;
93 }

◆ ~UnicharCompress()

tesseract::UnicharCompress::~UnicharCompress ( )

Definition at line 94 of file unicharcompress.cpp.

94  {
95  Cleanup();
96 }

Member Function Documentation

◆ code_range()

int tesseract::UnicharCompress::code_range ( ) const
inline

Definition at line 171 of file unicharcompress.h.

171  {
172  return code_range_;
173  }

◆ ComputeEncoding()

bool tesseract::UnicharCompress::ComputeEncoding ( const UNICHARSET & unicharset,
int  null_id,
std::string *  radical_stroke_table 
)

Definition at line 109 of file unicharcompress.cpp.

110  {
111  RSMap radical_map;
112  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
113  return false;
114  }
115  encoder_.clear();
116  UNICHARSET direct_set;
117  // To avoid unused codes, clear the special codes from the direct_set.
118  direct_set.clear();
119  // Always keep space as 0;
120  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
121  // Null char is next if we have one.
122  if (null_id >= 0) {
123  direct_set.unichar_insert(kNullChar);
124  }
125  RSCounts radical_counts;
126  // In the initial map, codes [0, unicharset.size()) are
127  // reserved for non-han/hangul sequences of 1 or more unicodes.
128  int hangul_offset = unicharset.size();
129  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
130  const int kTotalJamos = kLCount + kVCount + kTCount;
131  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
132  // to measure the number of radicals and strokes, initially we use the same
133  // code range for all 3 Han code positions, and fix them after.
134  int han_offset = hangul_offset + kTotalJamos;
135  for (unsigned u = 0; u <= unicharset.size(); ++u) {
136  // We special-case allow null_id to be equal to unicharset.size() in case
137  // there is no space in unicharset for it.
138  if (u == unicharset.size() && static_cast<int>(u) != null_id) {
139  break; // Finished
140  }
141  RecodedCharID code;
142  // Convert to unicodes.
143  std::vector<char32> unicodes;
144  std::string cleaned;
145  if (u < unicharset.size()) {
146  cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
147  }
148  if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
149  // Check single unicodes for Hangul/Han and encode if so.
150  int unicode = unicodes[0];
151  int leading, vowel, trailing;
152  auto it = radical_map.find(unicode);
153  if (it != radical_map.end()) {
154  // This is Han. Use the radical codes directly.
155  int num_radicals = it->second->size();
156  for (int c = 0; c < num_radicals; ++c) {
157  code.Set(c, han_offset + (*it->second)[c]);
158  }
159  int pre_hash = RadicalPreHash(*it->second);
160  int num_samples = radical_counts[pre_hash]++;
161  if (num_samples > 0) {
162  code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
163  }
164  } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
165  // This is Hangul. Since we know the exact size of each part at compile
166  // time, it gets the bottom set of codes.
167  code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
168  trailing + kLCount + kVCount + hangul_offset);
169  }
170  }
171  // If the code is still empty, it wasn't Han or Hangul.
172  if (code.empty()) {
173  // Special cases.
174  if (u == UNICHAR_SPACE) {
175  code.Set(0, 0); // Space.
176  } else if (static_cast<int>(u) == null_id ||
177  (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
178  code.Set(0, direct_set.unichar_to_id(kNullChar));
179  } else {
180  // Add the direct_set unichar-ids of the unicodes in sequence to the
181  // code.
182  for (int uni : unicodes) {
183  int position = code.length();
184  if (position >= RecodedCharID::kMaxCodeLen) {
185  tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
186  return false;
187  }
188  UNICHAR unichar(uni);
189  char *utf8 = unichar.utf8_str();
190  if (!direct_set.contains_unichar(utf8)) {
191  direct_set.unichar_insert(utf8);
192  }
193  code.Set(position, direct_set.unichar_to_id(utf8));
194  delete[] utf8;
195  if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
196  // Code space got bigger!
197  tprintf("Code space expanded from original unicharset!!\n");
198  return false;
199  }
200  }
201  }
202  }
203  encoder_.push_back(code);
204  }
205  // Now renumber Han to make all codes unique. We already added han_offset to
206  // all Han. Now separate out the radical, stroke, and count codes for Han.
207  int code_offset = 0;
208  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
209  int max_offset = 0;
210  for (unsigned u = 0; u < unicharset.size(); ++u) {
211  RecodedCharID *code = &encoder_[u];
212  if (code->length() <= i) {
213  continue;
214  }
215  max_offset = std::max(max_offset, (*code)(i)-han_offset);
216  code->Set(i, (*code)(i) + code_offset);
217  }
218  if (max_offset == 0) {
219  break;
220  }
221  code_offset += max_offset + 1;
222  }
223  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
224  SetupDecoder();
225  return true;
226 }
const int kRadicalRadix
std::unordered_map< int, std::unique_ptr< std::vector< int > >> RSMap
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
std::unordered_map< int, int > RSCounts
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
static const int kMaxCodeLen
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265

◆ DecodeUnichar()

int tesseract::UnicharCompress::DecodeUnichar ( const RecodedCharID & code) const

Definition at line 305 of file unicharcompress.cpp.

305  {
306  int len = code.length();
307  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) {
308  return INVALID_UNICHAR_ID;
309  }
310  auto it = decoder_.find(code);
311  if (it == decoder_.end()) {
312  return INVALID_UNICHAR_ID;
313  }
314  return it->second;
315 }

◆ DecomposeHangul()

bool tesseract::UnicharCompress::DecomposeHangul ( int  unicode,
int *  leading,
int *  vowel,
int *  trailing 
)
static

Definition at line 367 of file unicharcompress.cpp.

367  {
368  if (unicode < kFirstHangul) {
369  return false;
370  }
371  int offset = unicode - kFirstHangul;
372  if (offset >= kNumHangul) {
373  return false;
374  }
375  const int kNCount = kVCount * kTCount;
376  *leading = offset / kNCount;
377  *vowel = (offset % kNCount) / kTCount;
378  *trailing = offset % kTCount;
379  return true;
380 }
static const int kFirstHangul

◆ DeSerialize()

bool tesseract::UnicharCompress::DeSerialize ( TFile * fp)

Definition at line 323 of file unicharcompress.cpp.

323  {
324  if (!fp->DeSerialize(encoder_)) {
325  return false;
326  }
327  ComputeCodeRange();
328  SetupDecoder();
329  return true;
330 }

◆ EncodeUnichar()

int tesseract::UnicharCompress::EncodeUnichar ( unsigned  unichar_id,
RecodedCharID * code 
) const

Definition at line 295 of file unicharcompress.cpp.

295  {
296  if (unichar_id >= encoder_.size()) {
297  return 0;
298  }
299  *code = encoder_[unichar_id];
300  return code->length();
301 }

◆ GetEncodingAsString()

std::string tesseract::UnicharCompress::GetEncodingAsString ( const UNICHARSET & unicharset) const

Definition at line 339 of file unicharcompress.cpp.

339  {
340  std::string encoding;
341  for (unsigned c = 0; c < encoder_.size(); ++c) {
342  const RecodedCharID &code = encoder_[c];
343  if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
344  // Don't show the duplicate entry.
345  continue;
346  }
347  encoding += std::to_string(code(0));
348  for (int i = 1; i < code.length(); ++i) {
349  encoding += "," + std::to_string(code(i));
350  }
351  encoding += "\t";
352  if (c >= unicharset.size() ||
353  (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) {
354  encoding += kNullChar;
355  } else {
356  encoding += unicharset.id_to_unichar(c);
357  }
358  encoding += "\n";
359  }
360  return encoding;
361 }

◆ GetFinalCodes()

const std::vector<int>* tesseract::UnicharCompress::GetFinalCodes ( const RecodedCharID & code) const
inline

Definition at line 193 of file unicharcompress.h.

193  {
194  auto it = final_codes_.find(code);
195  return it == final_codes_.end() ? nullptr : it->second;
196  }

◆ GetNextCodes()

const std::vector<int>* tesseract::UnicharCompress::GetNextCodes ( const RecodedCharID & code) const
inline

Definition at line 187 of file unicharcompress.h.

187  {
188  auto it = next_codes_.find(code);
189  return it == next_codes_.end() ? nullptr : it->second;
190  }

◆ IsValidFirstCode()

bool tesseract::UnicharCompress::IsValidFirstCode ( int  code) const
inline

Definition at line 182 of file unicharcompress.h.

182  {
183  return is_valid_start_[code];
184  }

◆ operator=()

UnicharCompress & tesseract::UnicharCompress::operator= ( const UnicharCompress & src)

Definition at line 97 of file unicharcompress.cpp.

97  {
98  Cleanup();
99  encoder_ = src.encoder_;
100  code_range_ = src.code_range_;
101  SetupDecoder();
102  return *this;
103 }

◆ Serialize()

bool tesseract::UnicharCompress::Serialize ( TFile * fp) const

Definition at line 318 of file unicharcompress.cpp.

318  {
319  return fp->Serialize(encoder_);
320 }

◆ SetupDirect()

void tesseract::UnicharCompress::SetupDirect ( const std::vector< RecodedCharID > &  codes)

Definition at line 247 of file unicharcompress.cpp.

247  {
248  encoder_ = codes;
249  ComputeCodeRange();
250  SetupDecoder();
251 }

◆ SetupPassThrough()

void tesseract::UnicharCompress::SetupPassThrough ( const UNICHARSET & unicharset)

Definition at line 230 of file unicharcompress.cpp.

230  {
231  std::vector<RecodedCharID> codes;
232  for (unsigned u = 0; u < unicharset.size(); ++u) {
233  RecodedCharID code;
234  code.Set(0, u);
235  codes.push_back(code);
236  }
237  if (!unicharset.has_special_codes()) {
238  RecodedCharID code;
239  code.Set(0, unicharset.size());
240  codes.push_back(code);
241  }
242  SetupDirect(codes);
243 }
void SetupDirect(const std::vector< RecodedCharID > &codes)

Member Data Documentation

◆ kFirstHangul

const int tesseract::UnicharCompress::kFirstHangul = 0xac00
static

Definition at line 147 of file unicharcompress.h.

◆ kLCount

const int tesseract::UnicharCompress::kLCount = 19
static

Definition at line 152 of file unicharcompress.h.

◆ kNumHangul

const int tesseract::UnicharCompress::kNumHangul = 11172
static

Definition at line 149 of file unicharcompress.h.

◆ kTCount

const int tesseract::UnicharCompress::kTCount = 28
static

Definition at line 154 of file unicharcompress.h.

◆ kVCount

const int tesseract::UnicharCompress::kVCount = 21
static

Definition at line 153 of file unicharcompress.h.


The documentation for this class was generated from the following files: