tesseract  5.0.0
validate_grapheme.cpp
Go to the documentation of this file.
1 #include "validate_grapheme.h"
2 #include "tprintf.h"
3 #include "unicode/uchar.h" // From libicu
4 
5 namespace tesseract {
6 
8  const unsigned num_codes = codes_.size();
9  char32 prev_prev_ch = ' ';
10  char32 prev_ch = ' ';
12  int num_codes_in_grapheme = 0;
13  while (codes_used_ < num_codes) {
14  CharClass cc = codes_[codes_used_].first;
15  char32 ch = codes_[codes_used_].second;
16  const bool is_combiner = cc == CharClass::kCombiner || cc == CharClass::kVirama;
17 // TODO: Make this code work well with RTL text.
18 // See
19 // https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751
20 #if 0
21  // Reject easily detected badly formed sequences.
22  if (prev_cc == CharClass::kWhitespace && is_combiner) {
23  if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
24  return false;
25  }
26 #endif
27  if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
28  if (report_errors_) {
29  tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
30  }
31  return false;
32  }
33  if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
34  IsBadlyFormed(prev_ch, ch)) {
35  return false;
36  }
37  bool prev_is_fwd_combiner = prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
38  (prev_ch == kZeroWidthNonJoiner &&
39  (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
40  if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) {
41  break;
42  }
44  ++num_codes_in_grapheme;
45  prev_prev_ch = prev_ch;
46  prev_ch = ch;
47  prev_cc = cc;
48  }
49  if (num_codes_in_grapheme > 0) {
50  MultiCodePart(num_codes_in_grapheme);
51  }
52  return true;
53 }
54 
56  if (IsVedicAccent(ch)) {
57  return CharClass::kVedicMark;
58  }
59  // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
60  // always combine with the previous character.
61  if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) {
62  return CharClass::kVirama;
63  }
64  if (u_isUWhiteSpace(ch)) {
66  }
67  // Workaround for Javanese Aksara's Taling, do not label it as a combiner
68  if (ch == 0xa9ba) {
69  return CharClass::kConsonant;
70  }
71  int char_type = u_charType(ch);
72  if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
73  char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
74  ch == kZeroWidthJoiner) {
75  return CharClass::kCombiner;
76  }
77  return CharClass::kOther;
78 }
79 
80 // Helper returns true if the sequence prev_ch,ch is invalid.
81 bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
82  // Reject badly formed Indic vowels.
83  if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
84  if (report_errors_) {
85  tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
86  }
87  return true;
88  }
89  if (IsBadlyFormedThai(prev_ch, ch)) {
90  if (report_errors_) {
91  tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
92  }
93  return true;
94  }
95  return false;
96 }
97 
98 // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
99 // Some vowels in Indic scripts may be analytically decomposed into atomic pairs
100 // of components that are themselves valid unicode symbols. (See Table 12-1 in
101 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
102 // for examples in Devanagari). The Unicode standard discourages specifying
103 // vowels this way, but they are sometimes encountered in text, probably because
104 // some editors still permit it. Renderers however dislike such pairs, and so
105 // this function may be used to detect their occurrence for removal.
106 // TODO(rays) This function only covers a subset of Indic languages and doesn't
107 // include all rules. Add rules as appropriate to support other languages or
108 // find a way to generalize these existing rules that makes use of the
109 // regularity of the mapping from ISCII to Unicode.
110 /* static */
111 bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
112  return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) || (prev_ch == 0x909 && ch == 0x941) ||
113  (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
114  (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
115  (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
116  // Illegal combinations of two dependent Devanagari vowels.
117  (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
118  // Dependent Devanagari vowels following a virama.
119  (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
120  // Bengali vowels (Table 9-5, pg 313)
121  (prev_ch == 0x985 && ch == 0x9BE) ||
122  // Telugu vowels (Table 9-19, pg 331)
123  (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
124  // Kannada vowels (Table 9-20, pg 332)
125  (prev_ch == 0xC92 && ch == 0xCCC));
126 }
127 
128 // Helper returns true if ch is a Thai consonant.
129 static bool IsThaiConsonant(char32 ch) {
130  return 0xe01 <= ch && ch <= 0xe2e;
131 }
132 
133 // Helper returns true is ch is a before-consonant vowel.
134 static bool IsThaiBeforeConsonantVowel(char32 ch) {
135  return 0xe40 <= ch && ch <= 0xe44;
136 }
137 
138 // Helper returns true if ch is a Thai tone mark.
139 static bool IsThaiToneMark(char32 ch) {
140  return 0xe48 <= ch && ch <= 0xe4b;
141 }
142 
143 // Helper returns true if ch is a Thai vowel that may be followed by a tone
144 // mark.
145 static bool IsThaiTonableVowel(char32 ch) {
146  return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
147 }
148 
149 // Helper returns true if the sequence prev_ch,ch is invalid Thai.
150 // These rules come from a native Thai speaker, and are not covered by the
151 // Thai section in the unicode book:
152 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
153 // Comments below added by Ray interpreting the code ranges.
154 /* static */
155 bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
156  // Tone marks must follow consonants or specific vowels.
157  if (IsThaiToneMark(ch) && !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
158  return true;
159  }
160  // Tonable vowels must follow consonants.
161  if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
162  return true;
163  }
164  // Thanthakhat must follow consonant or specific vowels.
165  if (ch == 0xe4c && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
166  return true;
167  }
168  // Nikkhahit must follow a consonant ?or certain markers?.
169  // TODO(rays) confirm this, but there were so many in the ground truth of the
170  // validation set that it seems reasonable to assume it is valid.
171  if (ch == 0xe4d && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
172  return true;
173  }
174  // The vowels e30, e32, e33 can be used more liberally.
175  if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
176  !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
177  !(prev_ch == 0xe32 && ch == 0xe30) && !(prev_ch == 0xe4d && ch == 0xe32)) {
178  return true;
179  }
180  // Some vowels come before consonants, and therefore cannot follow things
181  // that cannot end a syllable.
182  if (IsThaiBeforeConsonantVowel(ch) &&
183  (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || prev_ch == 0xe37)) {
184  return true;
185  }
186  // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
187  if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
188  return true;
189  }
190  return false;
191 }
192 
193 } // namespace tesseract
signed int char32
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:51
CharClass UnicodeToCharClass(char32 ch) const override
bool ConsumeGraphemeIfValid() override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:178
unsigned codes_used_
Definition: validator.h:231
void MultiCodePart(unsigned length)
Definition: validator.h:176
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98