tesseract  5.0.0
recodebeam_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 #include "log.h" // for LOG
14 
15 #include "matrix.h"
16 #include "normstrngs.h"
17 #include "pageres.h"
18 #include "ratngs.h"
19 #include "recodebeam.h"
20 #include "unicharcompress.h"
22 
23 #include "helpers.h"
24 
25 namespace tesseract {
26 
27 // Number of characters to test beam search with.
28 const int kNumChars = 100;
29 // Amount of extra random data to pad with after.
30 const int kPadding = 64;
31 // Dictionary test data.
32 // The top choice is: "Gef s wordsright.".
33 // The desired phrase is "Gets words right.".
34 // There is a competing dictionary phrase: "Get swords right.".
35 // ... due to the following errors from the network:
36 // f stronger than t in "Get".
37 // weak space between Gef and s and between s and words.
38 // weak space between words and right.
39 const char *kGWRTops[] = {"G", "e", "f", " ", "s", " ", "w", "o", "r", "d",
40  "s", "", "r", "i", "g", "h", "t", ".", nullptr};
41 const float kGWRTopScores[] = {0.99, 0.85, 0.87, 0.55, 0.99, 0.65, 0.89, 0.99, 0.99,
42  0.99, 0.99, 0.95, 0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
43 const char *kGWR2nds[] = {"C", "c", "t", "", "S", "", "W", "O", "t", "h",
44  "S", " ", "t", "I", "9", "b", "f", ",", nullptr};
45 const float kGWR2ndScores[] = {0.01, 0.10, 0.12, 0.42, 0.01, 0.25, 0.10, 0.01, 0.01,
46  0.01, 0.01, 0.05, 0.01, 0.09, 0.09, 0.09, 0.05, 0.25};
47 
// Synthetic per-position network outputs for the Chinese dictionary test:
// kZHTops/kZHTopScores hold the top choice and its score at each position,
// kZH2nds/kZH2ndScores the second choice and its score. The string arrays are
// nullptr-terminated; the score arrays have one entry per string.
48 const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
49 const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
50 const char *kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
51 const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
52 
// Synthetic per-position network outputs for the Vietnamese multi-code test,
// laid out like the kZH* arrays above. An empty string in the second-choice
// array is encoded as the null character by EncodeUTF8.
53 const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
54 const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
55 const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
56 const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
57 
58 class RecodeBeamTest : public ::testing::Test {
59 protected:
// Fixture setup: installs the user-preferred locale (std::locale("")) as the
// global C++ locale.
60  void SetUp() override {
61  std::locale::global(std::locale(""));
63  }
64 
// Calls End() on lstm_dict_ when the fixture is destroyed.
66  ~RecodeBeamTest() override {
67  lstm_dict_.End();
68  }
69 
70  // Loads and compresses the given unicharset.
71  void LoadUnicharset(const std::string &unicharset_name) {
72  std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
73  std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
74  std::string radical_data;
75  CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
76  CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
79  std::string radical_str(radical_data.c_str());
80  EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
81  RecodedCharID code;
83  encoded_null_char_ = code(0);
84  // Space should encode as itself.
86  EXPECT_EQ(UNICHAR_SPACE, code(0));
87  std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
88  std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
89  std::string encoding_str(&encoding[0], encoding.size());
90  CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
91  LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
92  }
93  // Loads the dictionary.
94  void LoadDict(const std::string &lang) {
95  std::string traineddata_name = lang + ".traineddata";
96  std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
97  lstm_dict_.SetupForLoad(nullptr);
99  mgr.Init(traineddata_file.c_str());
100  lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
102  }
103 
104  // Expects the appropriate results from the compressed_ ccutil_.unicharset.
106  const std::vector<int> &transcription) {
107  // Get the utf8 string of the transcription.
108  std::string truth_utf8;
109  for (int i : transcription) {
110  truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
111  }
113  ExpectCorrect(output, truth_utf8, nullptr, &words);
114  }
115  void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,
116  Dict *dict, PointerVector<WERD_RES> *words) {
117  RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
118  beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
119  // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
120  // beam_search.DebugBeams(ccutil_.unicharset);
121  std::vector<int> labels, xcoords;
122  beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
123  LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
124  // Now decode using recoder_.
125  std::string decoded;
126  int end = 1;
127  for (unsigned start = 0; start < labels.size(); start = end) {
128  RecodedCharID code;
129  unsigned index = start;
130  int uni_id = INVALID_UNICHAR_ID;
131  do {
132  code.Set(code.length(), labels[index++]);
133  uni_id = recoder_.DecodeUnichar(code);
134  } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
135  (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
136  EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
137  // To the extent of truth_utf8, we expect decoded to match, but if
138  // transcription is shorter, that is OK too, as we may just be testing
139  // that we get a valid sequence when padded with random data.
140  if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
141  decoded += ccutil_.unicharset.id_to_unichar(uni_id);
142  }
143  end = index;
144  }
145  EXPECT_EQ(truth_utf8, decoded);
146 
147  // Check that ExtractBestPathAsUnicharIds does the same thing.
148  std::vector<int> unichar_ids;
149  std::vector<float> certainties, ratings;
150  beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
151  &ratings, &xcoords);
152  std::string u_decoded;
153  float total_rating = 0.0f;
154  for (unsigned u = 0; u < unichar_ids.size(); ++u) {
155  // To the extent of truth_utf8, we expect decoded to match, but if
156  // transcription is shorter, that is OK too, as we may just be testing
157  // that we get a valid sequence when padded with random data.
158  if (u_decoded.size() < truth_utf8.size()) {
159  const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
160  total_rating += ratings[u];
161  LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
162  << certainties[u] << ", r=" << ratings[u] << "r_sum="
163  << total_rating << " @" << xcoords[u] << "\n";
164  if (str[0] == ' ') {
165  total_rating = 0.0f;
166  }
167  u_decoded += str;
168  }
169  }
170  EXPECT_EQ(truth_utf8, u_decoded);
171 
172  // Check that ExtractBestPathAsWords does the same thing.
173  TBOX line_box(0, 0, 100, 10);
174  for (int i = 0; i < 2; ++i) {
175  beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
176  std::string w_decoded;
177  for (int w = 0; w < words->size(); ++w) {
178  const WERD_RES *word = (*words)[w];
179  if (w_decoded.size() < truth_utf8.size()) {
180  if (!w_decoded.empty() && word->word->space()) {
181  w_decoded += " ";
182  }
183  w_decoded += word->best_choice->unichar_string().c_str();
184  }
185  LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
186  << ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
187  << ", perm=" << word->best_choice->permuter() << "\n";
188  }
189  std::string w_trunc(w_decoded.data(), truth_utf8.size());
190  if (truth_utf8 != w_trunc) {
193  tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
194  w_trunc.assign(w_decoded.data(), truth_utf8.size());
195  }
196  EXPECT_EQ(truth_utf8, w_trunc);
197  }
198  }
199  // Generates easy encoding of the given unichar_ids, and pads with at least
200  // padding of random data.
201  GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const std::vector<int> &unichar_ids,
202  int padding) {
203  int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
204  int num_codes = recoder_.code_range();
205  GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
206  // Fill with random data.
207  TRand random;
208  for (int t = 0; t < width; ++t) {
209  for (int i = 0; i < num_codes; ++i) {
210  outputs(t, i) = random.UnsignedRand(0.25);
211  }
212  }
213  int t = 0;
214  for (int unichar_id : unichar_ids) {
215  RecodedCharID code;
216  int len = recoder_.EncodeUnichar(unichar_id, &code);
217  EXPECT_NE(0, len);
218  for (int j = 0; j < len; ++j) {
219  // Make the desired answer a clear winner.
220  if (j > 0 && code(j) == code(j - 1)) {
221  // We will collapse adjacent equal codes so put a null in between.
222  outputs(t++, encoded_null_char_) = 1.0f;
223  }
224  outputs(t++, code(j)) = 1.0f;
225  }
226  // Put a 0 as a null char in between.
227  outputs(t++, encoded_null_char_) = 1.0f;
228  }
229  // Normalize the probs.
230  for (int t = 0; t < width; ++t) {
231  double sum = 0.0;
232  for (int i = 0; i < num_codes; ++i) {
233  sum += outputs(t, i);
234  }
235  for (int i = 0; i < num_codes; ++i) {
236  outputs(t, i) /= sum;
237  }
238  }
239 
240  return outputs;
241  }
242  // Encodes a utf8 string (character) as unichar_id, then recodes, and sets
243  // the score for the appropriate sequence of codes, returning the ending t.
244  int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random,
245  GENERIC_2D_ARRAY<float> *outputs) {
246  int t = start_t;
247  std::vector<int> unichar_ids;
248  EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
249  if (unichar_ids.empty() || utf8_str[0] == '\0') {
250  unichar_ids.clear();
251  unichar_ids.push_back(unichar_null_char_);
252  }
253  int num_ids = unichar_ids.size();
254  for (int u = 0; u < num_ids; ++u) {
255  RecodedCharID code;
256  int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
257  EXPECT_NE(0, len);
258  for (int i = 0; i < len; ++i) {
259  // Apply the desired score.
260  (*outputs)(t++, code(i)) = score;
261  if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
262  int dups = static_cast<int>(random->UnsignedRand(3.0));
263  for (int d = 0; d < dups; ++d) {
264  // Duplicate the desired score.
265  (*outputs)(t++, code(i)) = score;
266  }
267  }
268  }
269  if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
270  int dups = static_cast<int>(random->UnsignedRand(3.0));
271  for (int d = 0; d < dups; ++d) {
272  // Add a random number of nulls as well.
273  (*outputs)(t++, encoded_null_char_) = score;
274  }
275  }
276  }
277  return t;
278  }
279  // Generates an encoding of the given 4 arrays as synthetic network scores.
280  // uses scores1 for chars1 and scores2 for chars2, and everything else gets
281  // the leftovers shared out equally. Note that empty string encodes as the
282  // null_char_.
283  GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
284  const char *chars2[], const float scores2[],
285  TRand *random) {
286  int width = 0;
287  while (chars1[width] != nullptr) {
288  ++width;
289  }
290  int padding = width * RecodedCharID::kMaxCodeLen;
291  int num_codes = recoder_.code_range();
292  GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
293  int t = 0;
294  for (int i = 0; i < width; ++i) {
295  // In case there is overlap in the codes between 1st and 2nd choice, it
296  // is better to encode the 2nd choice first.
297  int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
298  int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
299  // Advance t to the max end, setting everything else to the leftovers.
300  int max_t = std::max(end_t1, end_t2);
301  while (t < max_t) {
302  double total_score = 0.0;
303  for (int j = 0; j < num_codes; ++j) {
304  total_score += outputs(t, j);
305  }
306  double null_remainder = (1.0 - total_score) / 2.0;
307  double remainder = null_remainder / (num_codes - 2);
308  if (outputs(t, encoded_null_char_) < null_remainder) {
309  outputs(t, encoded_null_char_) += null_remainder;
310  } else {
311  remainder += remainder;
312  }
313  for (int j = 0; j < num_codes; ++j) {
314  if (outputs(t, j) == 0.0f) {
315  outputs(t, j) = remainder;
316  }
317  }
318  ++t;
319  }
320  }
321  // Fill the rest with null chars.
322  while (t < width + padding) {
323  outputs(t++, encoded_null_char_) = 1.0f;
324  }
325  return outputs;
326  }
332 };
333 
334 TEST_F(RecodeBeamTest, DoesChinese) {
335  LOG(INFO) << "Testing chi_tra"
336  << "\n";
337  LoadUnicharset("chi_tra.unicharset");
338  // Correctly reproduce the first kNumchars characters from easy output.
339  std::vector<int> transcription;
340  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
341  transcription.push_back(i);
342  }
343  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
344  ExpectCorrect(outputs, transcription);
345  LOG(INFO) << "Testing chi_sim"
346  << "\n";
347  LoadUnicharset("chi_sim.unicharset");
348  // Correctly reproduce the first kNumchars characters from easy output.
349  transcription.clear();
350  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
351  transcription.push_back(i);
352  }
353  outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
354  ExpectCorrect(outputs, transcription);
355 }
356 
357 TEST_F(RecodeBeamTest, DoesJapanese) {
358  LOG(INFO) << "Testing jpn"
359  << "\n";
360  LoadUnicharset("jpn.unicharset");
361  // Correctly reproduce the first kNumchars characters from easy output.
362  std::vector<int> transcription;
363  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
364  transcription.push_back(i);
365  }
366  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
367  ExpectCorrect(outputs, transcription);
368 }
369 
370 TEST_F(RecodeBeamTest, DoesKorean) {
371  LOG(INFO) << "Testing kor"
372  << "\n";
373  LoadUnicharset("kor.unicharset");
374  // Correctly reproduce the first kNumchars characters from easy output.
375  std::vector<int> transcription;
376  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
377  transcription.push_back(i);
378  }
379  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
380  ExpectCorrect(outputs, transcription);
381 }
382 
383 TEST_F(RecodeBeamTest, DoesKannada) {
384  LOG(INFO) << "Testing kan"
385  << "\n";
386  LoadUnicharset("kan.unicharset");
387  // Correctly reproduce the first kNumchars characters from easy output.
388  std::vector<int> transcription;
389  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
390  transcription.push_back(i);
391  }
392  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
393  ExpectCorrect(outputs, transcription);
394 }
395 
396 TEST_F(RecodeBeamTest, DoesMarathi) {
397  LOG(INFO) << "Testing mar"
398  << "\n";
399  LoadUnicharset("mar.unicharset");
400  // Correctly reproduce the first kNumchars characters from easy output.
401  std::vector<int> transcription;
402  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
403  transcription.push_back(i);
404  }
405  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
406  ExpectCorrect(outputs, transcription);
407 }
408 
409 TEST_F(RecodeBeamTest, DoesEnglish) {
410  LOG(INFO) << "Testing eng"
411  << "\n";
412  LoadUnicharset("eng.unicharset");
413  // Correctly reproduce the first kNumchars characters from easy output.
414  std::vector<int> transcription;
415  for (int i = SPECIAL_UNICHAR_CODES_COUNT; i < kNumChars; ++i) {
416  transcription.push_back(i);
417  }
418  GENERIC_2D_ARRAY<float> outputs = GenerateRandomPaddedOutputs(transcription, kPadding);
419  ExpectCorrect(outputs, transcription);
420 }
421 
422 TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
423  LOG(INFO) << "Testing eng dictionary"
424  << "\n";
425  LoadUnicharset("eng_beam.unicharset");
426  GENERIC_2D_ARRAY<float> outputs =
427  GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);
428  std::string default_str;
429  for (int i = 0; kGWRTops[i] != nullptr; ++i) {
430  default_str += kGWRTops[i];
431  }
433  ExpectCorrect(outputs, default_str, nullptr, &words);
434  // Now try again with the dictionary.
435  LoadDict("eng_beam");
436  ExpectCorrect(outputs, "Gets words right.", &lstm_dict_, &words);
437 }
438 
439 TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
440  LOG(INFO) << "Testing zh_hans dictionary"
441  << "\n";
442  LoadUnicharset("zh_hans.unicharset");
443  GENERIC_2D_ARRAY<float> outputs =
444  GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
446  ExpectCorrect(outputs, "实学储啬投学生", nullptr, &words);
447  // Each is an individual word, with permuter = top choice.
448  EXPECT_EQ(7, words.size());
449  for (int w = 0; w < words.size(); ++w) {
450  EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());
451  }
452  // Now try again with the dictionary.
453  LoadDict("zh_hans");
454  ExpectCorrect(outputs, "实学储啬投学生", &lstm_dict_, &words);
455  // Number of words expected.
456  const int kNumWords = 5;
457  // Content of the words.
458  const char *kWords[kNumWords] = {"实学", "储", "啬", "投", "学生"};
459  // Permuters of the words.
460  const int kWordPerms[kNumWords] = {SYSTEM_DAWG_PERM, TOP_CHOICE_PERM, TOP_CHOICE_PERM,
462  EXPECT_EQ(kNumWords, words.size());
463  for (int w = 0; w < kNumWords && w < words.size(); ++w) {
464  EXPECT_STREQ(kWords[w], words[w]->best_choice->unichar_string().c_str());
465  EXPECT_EQ(kWordPerms[w], words[w]->best_choice->permuter());
466  }
467 }
468 
469 // Tests that a recoder built with decomposed unicode allows true ctc
470 // arbitrary duplicates and inserted nulls inside the multicode sequence.
471 TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
472  LOG(INFO) << "Testing duplicates in multi-code sequences"
473  << "\n";
474  LoadUnicharset("vie.d.unicharset");
475  tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
476  TRand random;
477  GENERIC_2D_ARRAY<float> outputs =
478  GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
480  std::string truth_str;
482  tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
483  ExpectCorrect(outputs, truth_str, nullptr, &words);
484 }
485 
486 } // namespace tesseract
@ LOG
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ INFO
Definition: log.h:28
const float kGWR2ndScores[]
const char * kGWRTops[]
const float kZH2ndScores[]
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
const char * kVi2nds[]
const char * kViTops[]
const float kViTopScores[]
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
const int kNumChars
const char * kZH2nds[]
const float kVi2ndScores[]
const char * kZHTops[]
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:240
@ TOP_CHOICE_PERM
Definition: ratngs.h:234
TEST_F(EuroText, FastLatinOCR)
const float kZHTopScores[]
const float kGWRTopScores[]
const char * kGWR2nds[]
const int kPadding
WERD_CHOICE * best_choice
Definition: pageres.h:239
float certainty() const
Definition: ratngs.h:311
uint8_t permuter() const
Definition: ratngs.h:327
float rating() const
Definition: ratngs.h:308
std::string & unichar_string()
Definition: ratngs.h:515
uint8_t space() const
Definition: werd.h:100
UNICHARSET unicharset
Definition: ccutil.h:61
unsigned size() const
Definition: genericvector.h:74
double UnsignedRand(double range)
Definition: helpers.h:80
bool Init(const char *data_file_name)
void Set(int index, int value)
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool IsValidFirstCode(int code) const
int DecodeUnichar(const RecodedCharID &code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
bool has_special_codes() const
Definition: unicharset.h:757
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
size_t size() const
Definition: unicharset.h:355
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:291
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:180
void End()
Definition: dict.cpp:379
bool FinishLoad()
Definition: dict.cpp:357
void Decode(const NetworkIO &output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode=0)
Definition: recodebeam.cpp:89
void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset, std::vector< int > *unichar_ids, std::vector< float > *certs, std::vector< float > *ratings, std::vector< int > *xcoords) const
Definition: recodebeam.cpp:230
void ExtractBestPathAsLabels(std::vector< int > *labels, std::vector< int > *xcoords) const
Definition: recodebeam.cpp:207
void ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug, const UNICHARSET *unicharset, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
Definition: recodebeam.cpp:245
static int Defaults()
Definition: include_gunit.h:61
static void MakeTmpdir()
Definition: include_gunit.h:38
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52
int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)
void LoadUnicharset(const std::string &unicharset_name)
void LoadDict(const std::string &lang)
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::string &truth_utf8, Dict *dict, PointerVector< WERD_RES > *words)
GENERIC_2D_ARRAY< float > GenerateRandomPaddedOutputs(const std::vector< int > &unichar_ids, int padding)
GENERIC_2D_ARRAY< float > GenerateSyntheticOutputs(const char *chars1[], const float scores1[], const char *chars2[], const float scores2[], TRand *random)