tesseract  5.0.0
tesseract::UnicharcompressTest Class Reference
Inheritance diagram for tesseract::UnicharcompressTest:

Protected Member Functions

void SetUp () override
 
void LoadUnicharset (const std::string &unicharset_name)
 
void SerializeAndUndo ()
 
bool IsCJKLang (const std::string &lang)
 
bool IsIndicLang (const std::string &lang)
 
void ExpectCorrect (const std::string &lang)
 
void CheckCodeExtensions (const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)
 

Protected Attributes

UnicharCompress compressed_
 
UNICHARSET unicharset_
 
int null_char_
 
int encoded_null_char_
 

Detailed Description

Definition at line 24 of file unicharcompress_test.cc.

Member Function Documentation

◆ CheckCodeExtensions()

void tesseract::UnicharcompressTest::CheckCodeExtensions ( const RecodedCharID &  code,
const std::vector< RecodedCharID > &  times_seen 
)
inline protected

Definition at line 135 of file unicharcompress_test.cc.

136  {
137  RecodedCharID extended = code;
138  int length = code.length();
139  const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
140  if (final_codes != nullptr) {
141  for (int ending : *final_codes) {
142  EXPECT_GT(times_seen[ending](length), 0);
143  extended.Set(length, ending);
144  int unichar_id = compressed_.DecodeUnichar(extended);
145  EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
146  }
147  }
148  const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
149  if (next_codes != nullptr) {
150  for (int extension : *next_codes) {
151  EXPECT_GT(times_seen[extension](length), 0);
152  extended.Set(length, extension);
153  CheckCodeExtensions(extended, times_seen);
154  }
155  }
156  }
const std::vector< int > * GetFinalCodes(const RecodedCharID &code) const
const std::vector< int > * GetNextCodes(const RecodedCharID &code) const
int DecodeUnichar(const RecodedCharID &code) const
void CheckCodeExtensions(const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)

◆ ExpectCorrect()

void tesseract::UnicharcompressTest::ExpectCorrect ( const std::string &  lang)
inline protected

Definition at line 75 of file unicharcompress_test.cc.

75  {
76  // Count the number of times each code is used in each element of
77  // RecodedCharID.
78  RecodedCharID zeros;
79  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
80  zeros.Set(i, 0);
81  }
82  int code_range = compressed_.code_range();
83  std::vector<RecodedCharID> times_seen(code_range, zeros);
84  for (int u = 0; u <= unicharset_.size(); ++u) {
85  if (u != UNICHAR_SPACE && u != null_char_ &&
86  (u == unicharset_.size() ||
87  (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) {
88  continue; // Not used so not encoded.
89  }
90  RecodedCharID code;
91  int len = compressed_.EncodeUnichar(u, &code);
92  // Check round-trip encoding.
93  int unichar_id;
94  std::vector<UNICHAR_ID> normed_ids;
95  if (u == null_char_ || u == unicharset_.size()) {
96  unichar_id = null_char_;
97  } else {
98  unichar_id = u;
99  }
100  EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
101  // Check that the codes are valid.
102  for (int i = 0; i < len; ++i) {
103  int code_val = code(i);
104  EXPECT_GE(code_val, 0);
105  EXPECT_LT(code_val, code_range);
106  times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
107  }
108  }
109  // Check that each code is used in at least one position.
110  for (int c = 0; c < code_range; ++c) {
111  int num_used = 0;
112  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
113  if (times_seen[c](i) != 0) {
114  ++num_used;
115  }
116  }
117  EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
118  }
119  // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
120  // and create valid codes.
121  RecodedCharID code;
122  CheckCodeExtensions(code, times_seen);
123  // Finally, we achieved all that using a codebook < 10% of the size of
124  // the original unicharset, for CK or Indic, and 20% with J, but just
125  // no bigger for all others.
126  if (IsCJKLang(lang) || IsIndicLang(lang)) {
127  EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
128  } else {
129  EXPECT_LE(code_range, unicharset_.size() + 1);
130  }
131  LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " << code_range;
132  }
@ LOG
@ INFO
Definition: log.h:28
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool has_special_codes() const
Definition: unicharset.h:757
size_t size() const
Definition: unicharset.h:355
bool IsIndicLang(const std::string &lang)
bool IsCJKLang(const std::string &lang)

◆ IsCJKLang()

bool tesseract::UnicharcompressTest::IsCJKLang ( const std::string &  lang)
inline protected

Definition at line 63 of file unicharcompress_test.cc.

63  {
64  return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || lang == "jpn";
65  }

◆ IsIndicLang()

bool tesseract::UnicharcompressTest::IsIndicLang ( const std::string &  lang)
inline protected

Definition at line 67 of file unicharcompress_test.cc.

67  {
68  return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || lang == "mar" ||
69  lang == "nep" || lang == "san" || lang == "bod" || lang == "dzo" || lang == "guj" ||
70  lang == "kan" || lang == "mal" || lang == "ori" || lang == "pan" || lang == "sin" ||
71  lang == "tam" || lang == "tel";
72  }

◆ LoadUnicharset()

void tesseract::UnicharcompressTest::LoadUnicharset ( const std::string &  unicharset_name)
inline protected

Definition at line 32 of file unicharcompress_test.cc.

32  {
33  std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
34  std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
35  std::string radical_data;
36  CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
37  CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
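38  std::string radical_str(radical_data.c_str());
39  null_char_ = unicharset_.has_special_codes() ? UNICHAR_BROKEN : unicharset_.size();
40  compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
41  // Get the encoding of the null char.
42  RecodedCharID code;
43  compressed_.EncodeUnichar(null_char_, &code);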
44  encoded_null_char_ = code(0);
45  std::string output_name =
46  file::JoinPath(FLAGS_test_tmpdir, unicharset_name) + ".encoding.txt";
47  std::string encoding = compressed_.GetEncodingAsString(unicharset_);
48  std::string encoding_str(&encoding[0], encoding.size());
49  CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
50  LOG(INFO) << "Wrote encoding to:" << output_name;
51  }
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ UNICHAR_BROKEN
Definition: unicharset.h:38
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
static int Defaults()
Definition: include_gunit.h:61
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52

◆ SerializeAndUndo()

void tesseract::UnicharcompressTest::SerializeAndUndo ( )
inline protected

Definition at line 53 of file unicharcompress_test.cc.

53  {
54  std::vector<char> data;
55  TFile wfp;
56  wfp.OpenWrite(&data);
57  EXPECT_TRUE(compressed_.Serialize(&wfp));
58  TFile rfp;
59  rfp.Open(&data[0], data.size());
60  EXPECT_TRUE(compressed_.DeSerialize(&rfp));
61  }
bool Serialize(TFile *fp) const

◆ SetUp()

void tesseract::UnicharcompressTest::SetUp ( )
inline override protected

Definition at line 26 of file unicharcompress_test.cc.

26  {
27  std::locale::global(std::locale(""));
28  file::MakeTmpdir();
29  }
static void MakeTmpdir()
Definition: include_gunit.h:38

Member Data Documentation

◆ compressed_

UnicharCompress tesseract::UnicharcompressTest::compressed_
protected

Definition at line 158 of file unicharcompress_test.cc.

◆ encoded_null_char_

int tesseract::UnicharcompressTest::encoded_null_char_
protected

Definition at line 162 of file unicharcompress_test.cc.

◆ null_char_

int tesseract::UnicharcompressTest::null_char_
protected

Definition at line 160 of file unicharcompress_test.cc.

◆ unicharset_

UNICHARSET tesseract::UnicharcompressTest::unicharset_
protected

Definition at line 159 of file unicharcompress_test.cc.


The documentation for this class was generated from the following file: