tesseract  5.0.0
validate_javanese.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: validate_javanese.cpp
3  * Description: Text validator for Javanese Script - aksara jawa.
4  * Author: Shree Devi Kumar
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  * http://www.apache.org/licenses/LICENSE-2.0
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  **********************************************************************/
17 
18 #include "validate_javanese.h"
19 #include "errcode.h"
20 #include "tprintf.h"
21 
22 namespace tesseract {
23 
24 // Returns whether codes matches the pattern for a Javanese Grapheme.
25 // Taken from unicode standard:
26 // http://www.unicode.org/charts/PDF/UA980.pdf
27 // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
28 // The Consonant class here includes independent vowels.
29 // The order of components in an orthographic syllable as expressed in BNF is:
30 // {C F} C {{R}Y} {V{A}} {Z}
31 // Translated to the codes used by the CharClass enum:
32 // [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
33 // Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
34 // Validation rules copied from validate_indic.cpp and modified for Javanese.
35 // Indic - for reference
36 // + vowel Grapheme: V[D](v)*
37 // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
38 
// NOTE(review): the embedded listing numbers jump from 38 to 40 here, so the
// line that would hold this function's signature is absent from this listing.
// Presumably this is ValidateJavanese::ConsumeGraphemeIfValid() — confirm
// against the original file. Listing lines 41, 44, 46 and 47 are absent as
// well, so some case labels below are not visible.
// Dispatches on the class of the next unconsumed code.
40  switch (codes_[codes_used_].first) {
// Consonant-led grapheme: the head must validate first, then the tail.
42  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
43  case CharClass::kVowel:
45  return ConsumeVowelIfValid();
48  // Apart from within an aksara, joiners are silently dropped.
49  if (report_errors_) {
50  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
51  }
// Step past the code without copying it to the output.
52  ++codes_used_;
53  return true;
54  case CharClass::kOther:
// A code of class kOther is handed to UseMultiCode on its own; the result of
// that call is ignored and the grapheme is accepted.
55  UseMultiCode(1);
56  return true;
57  default:
// Any remaining class cannot start a grapheme: optionally report, then reject.
58  if (report_errors_) {
59  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
60  static_cast<int>(codes_[codes_used_].first),
61  codes_[codes_used_].second);
62  }
63  return false;
64  }
65 }
66 
67 // Helper consumes/copies a virama and any associated post-virama joiners.
68 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
69 // no joiner at all) must be followed by a consonant.
70 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
71 // consonant, space, or character from a different script. We clean up the
72 // representation to make it consistent by adding a ZWNJ if missing from a
73 // non-linking virama. Returns false with an invalid sequence.
//
// joiner: the joiner pair seen before the virama. A first member equal to
//   CharClass::kOther selects the "no pre-virama joiner" branch below; any
//   other value selects the pre-virama joiner branch.
// post_matra: when true, a ZWJ directly after the virama is rejected (returns
//   false); otherwise the virama is handled by the explicit [H z] branch.
//
// NOTE(review): the embedded listing numbers skip 77, 91, 116, 118 and 132
// inside this function, so some statements (including the tail of the last
// condition) are not present in this listing — confirm against the original
// file before relying on the flow as shown.
74 bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
75  const unsigned num_codes = codes_.size();
76  if (joiner.first == CharClass::kOther) {
// (listing line 77 is absent here)
78  if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthJoiner) {
79  // Post-matra viramas must be explicit, so no joiners allowed here.
80  if (post_matra) {
81  if (report_errors_) {
82  tprintf("ZWJ after a post-matra virama!!\n");
83  }
84  return false;
85  }
// NOTE(review): codes_used_ is unsigned, so codes_used_ - 2 assumes at least
// two codes have already been consumed at this point — confirm.
86  if (codes_used_ + 1 < num_codes && codes_[codes_used_ - 2].second != kCakra &&
87  (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
88  codes_[codes_used_ + 1].second == kPengkal ||
89  codes_[codes_used_ + 1].second == kCakra)) {
90  // This combination will be picked up later.
// (listing line 91 is absent here)
92  } else {
93  // Half-form with optional Nukta.
// len counts the output codes not yet accounted for by output_used_, plus one.
94  unsigned len = output_.size() + 1 - output_used_;
95  if (UseMultiCode(len)) {
96  return true;
97  }
98  }
99  if (codes_used_ < num_codes && codes_[codes_used_].second == kZeroWidthNonJoiner) {
// A ZWNJ here is accepted only when the first pending output code is kCakra.
100  if (output_used_ == output_.size() || output_[output_used_] != kCakra) {
101  if (report_errors_) {
102  tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", static_cast<int>(script_));
103  }
104  return false;
105  }
// NOTE(review): the "Sinhala" wording here and in the message above appears to
// be inherited from validate_indic.cpp (see the file header) — confirm.
106  // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
107  if (UseMultiCode(4)) {
108  return true;
109  }
110  }
// No ZWJ follows: the virama is explicit when the input has ended, the next
// code is not a consonant, or the caller passed post_matra.
111  } else if (codes_used_ == num_codes || codes_[codes_used_].first != CharClass::kConsonant ||
112  post_matra) {
113  if (codes_used_ == num_codes || codes_[codes_used_].second != kZeroWidthNonJoiner) {
114  // It is valid to have an unterminated virama at the end of a word, but
115  // for consistency, we will always add ZWNJ if not present.
// (listing line 116 is absent here)
117  } else {
// (listing line 118 is absent here)
119  }
120  // Explicit virama [H z]
121  MultiCodePart(2);
122  }
123  } else {
124  // Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true result from UseMultiCode is treated as an error here: per the message
// below it means no second consonant follows (presumably the input ran out —
// confirm in validator.h).
125  if (UseMultiCode(2)) {
126  if (report_errors_) {
127  tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
128  }
129  return false;
130  }
// (listing line 132, the rest of this condition, is absent)
131  if (codes_[codes_used_].second == kZeroWidthJoiner ||
133  if (report_errors_) {
134  tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
135  codes_[codes_used_].second);
136  }
137  return false;
138  }
139  }
140  // It is good so far as it goes.
141  return true;
142 }
143 
144 // Helper consumes/copies a series of consonants separated by viramas while
145 // valid, but not any vowel or other modifiers.
// Returns false only when ConsumeViramaIfValid rejects a virama; every other
// visible exit returns true.
// NOTE(review): the embedded listing numbers skip 150, 161 and 173 inside this
// function, so some statements (including the tail of the joiner condition)
// are not present in this listing — confirm against the original file.
146 bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
147  const unsigned num_codes = codes_.size();
148  // Consonant aksara
149  do {
// (listing line 150 is absent here)
151  // Special Sinhala case of [H Z Yayana/Rayana].
// index is only used after the size guard below has passed, so it is not read
// when fewer than three codes are in output_.
152  int index = output_.size() - 3;
153  if (output_used_ + 3 <= output_.size() &&
154  (output_.back() == kPengkal || output_.back() == kCakra) && IsVirama(output_[index]) &&
155  output_[index + 1] == kZeroWidthJoiner) {
156  MultiCodePart(3);
157  }
158  bool have_nukta = false;
159  if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kNukta) {
160  have_nukta = true;
// (listing line 161 is absent here)
162  }
163  // Test for subscript conjunct.
// have_nukta contributes 0 or 1 to the arithmetic below.
164  index = output_.size() - 2 - have_nukta;
165  if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
166  IsVirama(output_[index])) {
167  // Output previous virama, consonant + optional nukta.
168  MultiCodePart(2 + have_nukta);
169  }
// kOther in the first member marks "no joiner seen".
170  IndicPair joiner(CharClass::kOther, 0);
171  if (codes_used_ < num_codes && (codes_[codes_used_].second == kZeroWidthJoiner ||
172  (codes_[codes_used_].second == kZeroWidthNonJoiner &&
// (listing line 173, the rest of this condition, is absent)
174  joiner = codes_[codes_used_];
// A joiner that is the last code is skipped and the head is accepted.
175  if (++codes_used_ == num_codes) {
176  if (report_errors_) {
177  tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), joiner.second);
178  }
179  return true;
180  }
// The joiner is copied to the output only when a virama follows it; otherwise
// it is dropped and joiner is reset to the "no joiner" value.
181  if (codes_[codes_used_].first == CharClass::kVirama) {
182  output_.push_back(joiner.second);
183  } else {
184  if (report_errors_) {
185  tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", output_.back(), joiner.second,
186  codes_[codes_used_].second);
187  }
188  joiner = std::make_pair(CharClass::kOther, 0);
189  }
190  }
191  if (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kVirama) {
// Called with post_matra == false from the consonant head.
192  if (!ConsumeViramaIfValid(joiner, false)) {
193  return false;
194  }
195  } else {
196  break; // No virama, so the run of consonants is over.
197  }
// Keep looping while another consonant follows the virama just handled.
198  } while (codes_used_ < num_codes && codes_[codes_used_].first == CharClass::kConsonant);
// Some output codes are still pending beyond output_used_.
199  if (output_used_ < output_.size()) {
200  MultiCodePart(1);
201  }
202  return true;
203 }
204 
205 // Helper consumes/copies a tail part of a consonant, comprising optional
206 // matra/piece, vowel modifier, vedic mark, terminating virama.
207 bool ValidateJavanese::ConsumeConsonantTailIfValid() {
208  if (codes_used_ == codes_.size()) {
209  return true;
210  }
211  // No virama: Finish the grapheme.
212  // Are multiple matras allowed?
213  if (codes_[codes_used_].first == CharClass::kMatra) {
214  if (UseMultiCode(1)) {
215  return true;
216  }
217  if (codes_[codes_used_].first == CharClass::kMatraPiece) {
218  if (UseMultiCode(1)) {
219  return true;
220  }
221  }
222  }
223  // Tarung also used for long versions of u and o vowels and vocalic r
224  // Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ
225  while (codes_[codes_used_].first == CharClass::kMatraPiece) {
226  if (UseMultiCode(1)) {
227  return true;
228  }
229  }
230  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
231  if (UseMultiCode(1)) {
232  return true;
233  }
234  // Only Malayalam allows only repeated 0xd02.
235  if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) {
236  break;
237  }
238  }
239  while (codes_[codes_used_].first == CharClass::kVedicMark) {
240  if (UseMultiCode(1)) {
241  return true;
242  }
243  }
244  if (codes_[codes_used_].first == CharClass::kVirama) {
245  if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
246  return false;
247  }
248  }
249  // What we have consumed so far is a valid consonant cluster.
250  if (output_used_ < output_.size()) {
251  MultiCodePart(1);
252  }
253 
254  return true;
255 }
256 
257 // Helper consumes/copies a vowel and optional modifiers.
// Every visible exit returns true.
// NOTE(review): the embedded listing number 267 is skipped inside the first
// loop, so the statement that owns the `break` and one of the closing braces
// is not present in this listing — confirm against the original file.
258 bool ValidateJavanese::ConsumeVowelIfValid() {
// The vowel itself is consumed unconditionally; a true result ends the grapheme.
259  if (UseMultiCode(1)) {
260  return true;
261  }
262  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
263  if (UseMultiCode(1)) {
264  return true;
265  }
266  // Only Malayalam allows repeated modifiers?
// (listing line 267 is absent here)
268  break;
269  }
270  }
// Any number of vedic marks may follow.
271  while (codes_[codes_used_].first == CharClass::kVedicMark) {
272  if (UseMultiCode(1)) {
273  return true;
274  }
275  }
276  // What we have consumed so far is a valid vowel cluster.
277  return true;
278 }
279 
// NOTE(review): the embedded listing numbers jump from 279 to 281 here, so the
// line that would hold this function's signature is absent from this listing.
// Presumably this is ValidateJavanese::UnicodeToCharClass(char32 ch) — confirm
// against the original file. Listing lines 282, 285 and 294 are absent as
// well, which is why three of the branches below appear empty.
// Maps a code point to a CharClass from its offset relative to script_.
281  if (ch == kZeroWidthNonJoiner) {
// (listing line 282 is absent here)
283  }
284  if (ch == kZeroWidthJoiner) {
// (listing line 285 is absent here)
286  }
287  // Offset from the start of the relevant unicode code block aka code page.
288  int off = ch - static_cast<char32>(script_);
289  // Anything in another code block is other.
290  if (off < 0 || off >= kIndicCodePageSize) {
291  return CharClass::kOther;
292  }
// The tests below run in ascending order, so each one only sees offsets that
// the earlier returns did not already claim.
293  if (off < 0x4) {
// (listing line 294 is absent here)
295  }
296  if (off <= 0x32) {
297  return CharClass::kConsonant; // includes independent vowels
298  }
299  if (off == 0x33) {
300  return CharClass::kNukta; // A9B3 CECAK TELU
301  }
302  if (off == 0x34) {
303  return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
304  }
// Reached for offsets 0x35-0x39.
305  if (off <= 0x39) {
306  return CharClass::kMatra;
307  }
// Reached for offset 0x3a only.
308  if (off <= 0x3a) {
309  return CharClass::kConsonant; // A9BA TALING - pre base vowel
310  }
// Reached for offsets 0x3b-0x3d.
311  if (off <= 0x3d) {
312  return CharClass::kMatra;
313  }
// Reached for offsets 0x3e-0x3f.
314  if (off <= 0x3f) {
315  return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
316  }
317  if (off == 0x40) {
318  return CharClass::kVirama; // A9C0 PANGKON
319  }
// Everything above 0x40 that is still inside the code page.
320  return CharClass::kOther;
321 }
322 
323 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:51
bool ConsumeGraphemeIfValid() override
Validator::CharClass UnicodeToCharClass(char32 ch) const override
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
std::vector< char32 > output_
Definition: validator.h:229
unsigned output_used_
Definition: validator.h:233
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
void MultiCodePart(unsigned length)
Definition: validator.h:176
static bool IsVirama(char32 unicode)
Definition: validator.cpp:169
static const int kIndicCodePageSize
Definition: validator.h:207
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:135
bool IsSubscriptScript() const
Definition: validator.cpp:184
std::vector< IndicPair > codes_
Definition: validator.h:225
static const char32 kZeroWidthJoiner
Definition: validator.h:98