tesseract  5.0.0
tesseract::RecodeBeamTest Class Reference
Inheritance diagram for tesseract::RecodeBeamTest:

Protected Member Functions

void SetUp () override
 
 RecodeBeamTest ()
 
 ~RecodeBeamTest () override
 
void LoadUnicharset (const std::string &unicharset_name)
 
void LoadDict (const std::string &lang)
 
void ExpectCorrect (const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)
 
void ExpectCorrect (const GENERIC_2D_ARRAY< float > &output, const std::string &truth_utf8, Dict *dict, PointerVector< WERD_RES > *words)
 
GENERIC_2D_ARRAY< float > GenerateRandomPaddedOutputs (const std::vector< int > &unichar_ids, int padding)
 
int EncodeUTF8 (const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)
 
GENERIC_2D_ARRAY< float > GenerateSyntheticOutputs (const char *chars1[], const float scores1[], const char *chars2[], const float scores2[], TRand *random)
 

Protected Attributes

UnicharCompress recoder_
 
int unichar_null_char_ = 0
 
int encoded_null_char_ = 0
 
CCUtil ccutil_
 
Dict lstm_dict_
 

Detailed Description

Definition at line 58 of file recodebeam_test.cc.

Constructor & Destructor Documentation

◆ RecodeBeamTest()

tesseract::RecodeBeamTest::RecodeBeamTest ( )
inline protected

◆ ~RecodeBeamTest()

tesseract::RecodeBeamTest::~RecodeBeamTest ( )
inline override protected

Definition at line 66 of file recodebeam_test.cc.

66  {
67  lstm_dict_.End();
68  }
void End()
Definition: dict.cpp:379

Member Function Documentation

◆ EncodeUTF8()

int tesseract::RecodeBeamTest::EncodeUTF8 ( const char *  utf8_str,
float  score,
int  start_t,
TRand *  random,
GENERIC_2D_ARRAY< float > *  outputs 
)
inline protected

Definition at line 244 of file recodebeam_test.cc.

245  {
246  int t = start_t;
247  std::vector<int> unichar_ids;
248  EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
249  if (unichar_ids.empty() || utf8_str[0] == '\0') {
250  unichar_ids.clear();
251  unichar_ids.push_back(unichar_null_char_);
252  }
253  int num_ids = unichar_ids.size();
254  for (int u = 0; u < num_ids; ++u) {
255  RecodedCharID code;
256  int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
257  EXPECT_NE(0, len);
258  for (int i = 0; i < len; ++i) {
259  // Apply the desired score.
260  (*outputs)(t++, code(i)) = score;
261  if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
262  int dups = static_cast<int>(random->UnsignedRand(3.0));
263  for (int d = 0; d < dups; ++d) {
264  // Duplicate the desired score.
265  (*outputs)(t++, code(i)) = score;
266  }
267  }
268  }
269  if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
270  int dups = static_cast<int>(random->UnsignedRand(3.0));
271  for (int d = 0; d < dups; ++d) {
272  // Add a random number of nulls as well.
273  (*outputs)(t++, encoded_null_char_) = score;
274  }
275  }
276  }
277  return t;
278  }
UNICHARSET unicharset
Definition: ccutil.h:61
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239

◆ ExpectCorrect() [1/2]

void tesseract::RecodeBeamTest::ExpectCorrect ( const GENERIC_2D_ARRAY< float > &  output,
const std::string &  truth_utf8,
Dict *  dict,
PointerVector< WERD_RES > *  words 
)
inline protected

Definition at line 115 of file recodebeam_test.cc.

116  {
117  RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
118  beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
119  // Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
120  // beam_search.DebugBeams(ccutil_.unicharset);
121  std::vector<int> labels, xcoords;
122  beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
123  LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
124  // Now decode using recoder_.
125  std::string decoded;
126  int end = 1;
127  for (unsigned start = 0; start < labels.size(); start = end) {
128  RecodedCharID code;
129  unsigned index = start;
130  int uni_id = INVALID_UNICHAR_ID;
131  do {
132  code.Set(code.length(), labels[index++]);
133  uni_id = recoder_.DecodeUnichar(code);
134  } while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
135  (uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
136  EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
137  // To the extent of truth_utf8, we expect decoded to match, but if
138  // transcription is shorter, that is OK too, as we may just be testing
139  // that we get a valid sequence when padded with random data.
140  if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
141  decoded += ccutil_.unicharset.id_to_unichar(uni_id);
142  }
143  end = index;
144  }
145  EXPECT_EQ(truth_utf8, decoded);
146 
147  // Check that ExtractBestPathAsUnicharIds does the same thing.
148  std::vector<int> unichar_ids;
149  std::vector<float> certainties, ratings;
150  beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
151  &ratings, &xcoords);
152  std::string u_decoded;
153  float total_rating = 0.0f;
154  for (unsigned u = 0; u < unichar_ids.size(); ++u) {
155  // To the extent of truth_utf8, we expect decoded to match, but if
156  // transcription is shorter, that is OK too, as we may just be testing
157  // that we get a valid sequence when padded with random data.
158  if (u_decoded.size() < truth_utf8.size()) {
159  const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
160  total_rating += ratings[u];
161  LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
162  << certainties[u] << ", r=" << ratings[u] << "r_sum="
163  << total_rating << " @" << xcoords[u] << "\n";
164  if (str[0] == ' ') {
165  total_rating = 0.0f;
166  }
167  u_decoded += str;
168  }
169  }
170  EXPECT_EQ(truth_utf8, u_decoded);
171 
172  // Check that ExtractBestPathAsWords does the same thing.
173  TBOX line_box(0, 0, 100, 10);
174  for (int i = 0; i < 2; ++i) {
175  beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
176  std::string w_decoded;
177  for (int w = 0; w < words->size(); ++w) {
178  const WERD_RES *word = (*words)[w];
179  if (w_decoded.size() < truth_utf8.size()) {
180  if (!w_decoded.empty() && word->word->space()) {
181  w_decoded += " ";
182  }
183  w_decoded += word->best_choice->unichar_string().c_str();
184  }
185  LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
186  << ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
187  << ", perm=" << word->best_choice->permuter() << "\n";
188  }
189  std::string w_trunc(w_decoded.data(), truth_utf8.size());
190  if (truth_utf8 != w_trunc) {
193  tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
194  w_trunc.assign(w_decoded.data(), truth_utf8.size());
195  }
196  EXPECT_EQ(truth_utf8, w_trunc);
197  }
198  }
@ TBOX
@ LOG
@ INFO
Definition: log.h:28
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
bool IsValidFirstCode(int code) const
int DecodeUnichar(const RecodedCharID &code) const
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ ExpectCorrect() [2/2]

void tesseract::RecodeBeamTest::ExpectCorrect ( const GENERIC_2D_ARRAY< float > &  output,
const std::vector< int > &  transcription 
)
inline protected

Definition at line 105 of file recodebeam_test.cc.

106  {
107  // Get the utf8 string of the transcription.
108  std::string truth_utf8;
109  for (int i : transcription) {
110  truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
111  }
112  PointerVector<WERD_RES> words;
113  ExpectCorrect(output, truth_utf8, nullptr, &words);
114  }
void ExpectCorrect(const GENERIC_2D_ARRAY< float > &output, const std::vector< int > &transcription)

◆ GenerateRandomPaddedOutputs()

GENERIC_2D_ARRAY<float> tesseract::RecodeBeamTest::GenerateRandomPaddedOutputs ( const std::vector< int > &  unichar_ids,
int  padding 
)
inline protected

Definition at line 201 of file recodebeam_test.cc.

202  {
203  int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
204  int num_codes = recoder_.code_range();
205  GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
206  // Fill with random data.
207  TRand random;
208  for (int t = 0; t < width; ++t) {
209  for (int i = 0; i < num_codes; ++i) {
210  outputs(t, i) = random.UnsignedRand(0.25);
211  }
212  }
213  int t = 0;
214  for (int unichar_id : unichar_ids) {
215  RecodedCharID code;
216  int len = recoder_.EncodeUnichar(unichar_id, &code);
217  EXPECT_NE(0, len);
218  for (int j = 0; j < len; ++j) {
219  // Make the desired answer a clear winner.
220  if (j > 0 && code(j) == code(j - 1)) {
221  // We will collapse adjacent equal codes so put a null in between.
222  outputs(t++, encoded_null_char_) = 1.0f;
223  }
224  outputs(t++, code(j)) = 1.0f;
225  }
226  // Put a 0 as a null char in between.
227  outputs(t++, encoded_null_char_) = 1.0f;
228  }
229  // Normalize the probs.
230  for (int t = 0; t < width; ++t) {
231  double sum = 0.0;
232  for (int i = 0; i < num_codes; ++i) {
233  sum += outputs(t, i);
234  }
235  for (int i = 0; i < num_codes; ++i) {
236  outputs(t, i) /= sum;
237  }
238  }
239 
240  return outputs;
241  }

◆ GenerateSyntheticOutputs()

GENERIC_2D_ARRAY<float> tesseract::RecodeBeamTest::GenerateSyntheticOutputs ( const char *  chars1[],
const float  scores1[],
const char *  chars2[],
const float  scores2[],
TRand *  random 
)
inline protected

Definition at line 283 of file recodebeam_test.cc.

285  {
286  int width = 0;
287  while (chars1[width] != nullptr) {
288  ++width;
289  }
290  int padding = width * RecodedCharID::kMaxCodeLen;
291  int num_codes = recoder_.code_range();
292  GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
293  int t = 0;
294  for (int i = 0; i < width; ++i) {
295  // In case there is overlap in the codes between 1st and 2nd choice, it
296  // is better to encode the 2nd choice first.
297  int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
298  int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
299  // Advance t to the max end, setting everything else to the leftovers.
300  int max_t = std::max(end_t1, end_t2);
301  while (t < max_t) {
302  double total_score = 0.0;
303  for (int j = 0; j < num_codes; ++j) {
304  total_score += outputs(t, j);
305  }
306  double null_remainder = (1.0 - total_score) / 2.0;
307  double remainder = null_remainder / (num_codes - 2);
308  if (outputs(t, encoded_null_char_) < null_remainder) {
309  outputs(t, encoded_null_char_) += null_remainder;
310  } else {
311  remainder += remainder;
312  }
313  for (int j = 0; j < num_codes; ++j) {
314  if (outputs(t, j) == 0.0f) {
315  outputs(t, j) = remainder;
316  }
317  }
318  ++t;
319  }
320  }
321  // Fill the rest with null chars.
322  while (t < width + padding) {
323  outputs(t++, encoded_null_char_) = 1.0f;
324  }
325  return outputs;
326  }
int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random, GENERIC_2D_ARRAY< float > *outputs)

◆ LoadDict()

void tesseract::RecodeBeamTest::LoadDict ( const std::string &  lang)
inline protected

Definition at line 94 of file recodebeam_test.cc.

94  {
95  std::string traineddata_name = lang + ".traineddata";
96  std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
97  lstm_dict_.SetupForLoad(nullptr);
99  mgr.Init(traineddata_file.c_str());
100  lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
102  }
bool Init(const char *data_file_name)
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
Definition: dict.cpp:291
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:180
bool FinishLoad()
Definition: dict.cpp:357
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65

◆ LoadUnicharset()

void tesseract::RecodeBeamTest::LoadUnicharset ( const std::string &  unicharset_name)
inline protected

Definition at line 71 of file recodebeam_test.cc.

71  {
72  std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
73  std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
74  std::string radical_data;
75  CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
76  CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
79  std::string radical_str(radical_data.c_str());
80  EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
81  RecodedCharID code;
83  encoded_null_char_ = code(0);
84  // Space should encode as itself.
86  EXPECT_EQ(UNICHAR_SPACE, code(0));
87  std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
88  std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
89  std::string encoding_str(&encoding[0], encoding.size());
90  CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
91  LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
92  }
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool has_special_codes() const
Definition: unicharset.h:757
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355
static int Defaults()
Definition: include_gunit.h:61
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52

◆ SetUp()

void tesseract::RecodeBeamTest::SetUp ( )
inline override protected

Definition at line 60 of file recodebeam_test.cc.

60  {
61  std::locale::global(std::locale(""));
63  }
static void MakeTmpdir()
Definition: include_gunit.h:38

Member Data Documentation

◆ ccutil_

CCUtil tesseract::RecodeBeamTest::ccutil_
protected

Definition at line 330 of file recodebeam_test.cc.

◆ encoded_null_char_

int tesseract::RecodeBeamTest::encoded_null_char_ = 0
protected

Definition at line 329 of file recodebeam_test.cc.

◆ lstm_dict_

Dict tesseract::RecodeBeamTest::lstm_dict_
protected

Definition at line 331 of file recodebeam_test.cc.

◆ recoder_

UnicharCompress tesseract::RecodeBeamTest::recoder_
protected

Definition at line 327 of file recodebeam_test.cc.

◆ unichar_null_char_

int tesseract::RecodeBeamTest::unichar_null_char_ = 0
protected

Definition at line 328 of file recodebeam_test.cc.


The documentation for this class was generated from the following file: