tesseract  5.0.0
validate_khmer.cpp
Go to the documentation of this file.
1 #include "validate_khmer.h"
2 #include "errcode.h"
3 #include "tprintf.h"
4 
5 namespace tesseract {
6 
7 // Returns whether codes matches the pattern for a Khmer Grapheme.
8 // Taken from unicode standard:
9 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
10 // where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
11 // to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
12 // Translated to the codes used by the CharClass enum:
13 // C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
14 // Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
15 // Also the Consonant class here includes independent vowels, as they are
16 // treated the same anyway.
17 // In the split grapheme mode, the only characters that get grouped are the
18 // HC and the {Z|z}M. The Unicode chapter on Khmer only mentions the joiners in
19 // the BNF syntax, so it is unclear what they are meant to do.
21  const unsigned num_codes = codes_.size(); // Number of entries available in codes_.
22  if (codes_used_ == num_codes) { // Nothing left to consume, so there is no grapheme to validate.
23  return false;
24  }
25  if (codes_[codes_used_].first == CharClass::kOther) { // A kOther code is consumed on its own.
26  UseMultiCode(1); // Result is not checked here; true is returned regardless.
27  return true;
28  }
30  if (report_errors_) { // NOTE(review): listing line 29 (the condition opening this error branch) is missing from this listing.
31  tprintf("Invalid start of Khmer syllable:0x%x\n", codes_[codes_used_].second);
32  }
33  return false;
34  }
35  if (UseMultiCode(1)) { // Consume the leading code; a true result ends the grapheme successfully (presumably end of input - confirm in validator.h).
36  return true;
37  }
38  if (codes_[codes_used_].first == CharClass::kRobat || // Optional single code after the base; NOTE(review): listing line 39 (rest of the condition) is missing.
40  if (UseMultiCode(1)) {
41  return true;
42  }
43  }
44  while (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama && // Repeated Virama-led pairs; NOTE(review): listing lines 45-46 are missing.
47  if (UseMultiCode(2)) { // The pair is consumed two codes at a time.
48  return true;
49  }
50  if (codes_[codes_used_].first == CharClass::kRobat) { // Optional Robat after each pair (appears to match {HC {R}}* in the BNF above).
51  if (UseMultiCode(1)) {
52  return true;
53  }
54  }
55  }
56  unsigned num_matra_parts = 0; // Number of codes that will be passed to UseMultiCode together with the matra.
57  if (codes_[codes_used_].second == kZeroWidthJoiner || // Optional joiner before the matra; NOTE(review): listing line 58 (rest of the condition) is missing.
59  if (CodeOnlyToOutput()) { // A true result is reported below as an unterminated joiner.
60  if (report_errors_) {
61  tprintf("Unterminated joiner: 0x%x\n", output_.back());
62  }
63  return false;
64  }
65  ++num_matra_parts; // Count the joiner so it is included in the matra's UseMultiCode call.
66  }
67  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
68  // own or as an addition to other matras.
69  if (codes_[codes_used_].first == CharClass::kMatra || // NOTE(review): listing line 70 (rest of the condition) is missing.
71  ++num_matra_parts;
72  if (UseMultiCode(num_matra_parts)) { // Consume the matra plus any counted joiner in one call.
73  return true;
74  }
75  } else if (num_matra_parts) { // A joiner was counted but the code after it did not satisfy the matra test.
76  if (report_errors_) {
77  tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", output_.back(),
78  codes_[codes_used_].second);
79  }
80  return false;
81  }
84  if (UseMultiCode(1)) { // NOTE(review): listing lines 82-83 (the enclosing condition) are missing.
85  return true;
86  }
87  }
89  if (UseMultiCode(1)) { // NOTE(review): listing line 88 (the enclosing condition) is missing.
90  return true;
91  }
92  }
93  if (codes_used_ + 1 < num_codes && codes_[codes_used_].first == CharClass::kVirama && // Optional trailing Virama-led pair (appears to be the final {HC} of the BNF); NOTE(review): listing lines 94-95 are missing.
96  if (UseMultiCode(2)) {
97  return true;
98  }
99  }
100  return true; // Every visible check passed without an error return, so the grapheme is accepted.
101 }
102 
104  if (IsVedicAccent(ch)) { // Vedic accents are classified before any code-block offset is computed.
105  return CharClass::kVedicMark;
106  }
107  if (ch == kZeroWidthNonJoiner) { // NOTE(review): listing line 108 (the return statement) is missing from this listing.
109  }
110  if (ch == kZeroWidthJoiner) { // NOTE(review): listing line 111 (the return statement) is missing from this listing.
112  }
113  // Offset from the start of the relevant unicode code block aka code page.
114  int off = ch - static_cast<char32>(script_);
115  // Anything in another code block is other.
116  if (off < 0 || off >= kIndicCodePageSize) {
117  return CharClass::kOther;
118  }
119  if (off <= 0x33) { // Offsets 0x00-0x33 map to kConsonant.
120  return CharClass::kConsonant;
121  }
122  if (off <= 0x45) { // Offsets 0x34-0x45 (those not caught above) map to kMatra.
123  return CharClass::kMatra;
124  }
125  if (off == 0x46) {
126  return CharClass::kMatraPiece;
127  }
128  if (off == 0x4c) { // Tested before the <= 0x51 range below, so 0x4c is always kRobat.
129  return CharClass::kRobat;
130  }
131  if (off == 0x49 || off == 0x4a) { // Also tested before the <= 0x51 range, so these are always kNukta.
132  return CharClass::kNukta;
133  }
134  if (off <= 0x51) { // Remaining offsets 0x47-0x51; NOTE(review): listing line 135 (the return statement) is missing.
136  }
137  if (off == 0x52) {
138  return CharClass::kVirama;
139  }
140  return CharClass::kOther; // Offsets above 0x52 that are still inside the code page.
141 }
142 
143 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
std::vector< char32 > output_
Definition: validator.h:229
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:178
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
static const int kIndicCodePageSize
Definition: validator.h:207
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98