tesseract  5.0.0
validate_indic_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include "include_gunit.h"
13 #include "normstrngs.h"
14 #include "normstrngs_test.h"
15 
16 namespace tesseract {
17 
18 // Though the unicode example for Telugu in section 12.7:
19 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
20 // shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to
21 // suppress a conjunct that would otherwise occur. If a consonant is followed
22 // by a virama and then by a non-Indic character, OpenType will presume that
23 // the user simply meant to suppress the inherent vowel of the consonant
24 // and render it as the consonant with an explicit virama, the same as if
25 // a ZWNJ had followed. Since this is confusing to an OCR engine, the
26 // normalizer always puts a terminating ZWNJ on the end if not present,
27 // and accepts the string as valid.
28 TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) { // A string ending in a virama is expected to come back with a ZWNJ appended.
29  std::string str = "\u0c15\u0c4d"; // KA - virama
30  std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ
31  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); // numeric args are unicode_count=3, glyph_count=2, grapheme_count=1
32  // Same result if we started with the normalized string.
33  ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); // already-terminated input: same counts, same target
34 }
35 
36 // Only one dependent vowel is allowed.
37 TEST(ValidateIndicTest, OnlyOneDependentVowel) { // Feeds the normalizer a consonant carrying two dependent vowel signs.
38  std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU: two dependent vowels on one consonant
39  std::string dest;
41  str.c_str(), &dest)) // NOTE(review): listing lines 40 and 42 are absent from this rendering, so the assertion macro and its failure message are not visible; presumably the tail of a NormalizeUTF8String call (its last two parameters are str8, normalized) -- confirm against the real file.
43 }
44 
45 // [c26][c4d][c01]
46 // A consonant (DA) followed by the virama followed by a bindu
47 // Syllable modifiers [c01][c02][c03] all modify the pronunciation of
48 // the vowel in a syllable, as does the virama [c4d]. You can only
49 // have one of these on a syllable.
50 //
51 // References:
52 // http://www.omniglot.com/writing/telugu.htm
53 TEST(ValidateIndicTest, OnlyOneVowelModifier) { // Three cases: virama + candrabindu, visarga + candrabindu, and doubled anusvara.
54  std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu
55  std::string result;
57  str.c_str(), &result)); // NOTE(review): listing line 56 (the head of this assertion) is absent from this rendering; presumably a NormalizeUTF8String call -- confirm.
58  // It made 1 grapheme of 4 chars, by terminating the explicit virama with a ZWNJ (U+200C).
59  EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result); // DA virama ZWNJ candrabindu
60 
61  str = "\u0995\u0983\u0981"; // KA visarga candrabindu
63  str.c_str(), &result)); // NOTE(review): listing line 62 is absent; the expected outcome for this input is not visible here.
64 
65  // Exception: Malayalam allows multiple anusvara.
66  str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara
68  str.c_str(), &result)); // NOTE(review): listing line 67 (assertion head) is absent from this rendering.
69  EXPECT_EQ(str, result); // the doubled-anusvara input must come back unchanged
70 }
71 
72 // [c28][c02][c3f]
73 // A consonant (NA) followed by the Anusvara/sunna and another matra (I).
74 // The anusvara [c02] is a pronunciation directive
75 // for a whole syllable and only appears at the end of the syllable
76 // References:
77 // + Unicode v9, 12.1 "Modifier Mark Rules R10,"
78 // and the Microsoft page
79 // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
80 TEST(ValidateIndicTest, VowelModifierMustBeLast) { // Exercises both orderings of a vowel modifier (sunna) and a matra.
81  std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I: modifier placed before the matra
82  std::string dest;
84  str.c_str(), &dest)) // NOTE(review): listing lines 83 and 85 are absent from this rendering, so the assertion macro and failure message are not visible; presumably a NormalizeUTF8String call -- confirm.
86  // Swap c02/c3f and all is ok.
87  str = "\u0c28\u0c3f\u0c02"; // NA I Sunna: matra first, modifier last
89  str.c_str(), &dest)) // NOTE(review): listing lines 88 and 90 are absent from this rendering.
91  EXPECT_EQ(dest, str); // the reordered input must come back unchanged
92 }
93 
94 // [c05][c47]
95 // A Vowel (A) followed by a combining vowel/matra (EE).
96 // In Telugu, matras are only put on consonants, not independent
97 // vowels.
98 // References:
99 // + Unicode v9, 12.1:
100 // Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
101 // + http://varamozhi.sourceforge.net/iscii91.pdf
102 TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) { // Contrasts a matra on an independent vowel with a matra on a consonant.
103  std::string str = "\u0c05\u0c47"; // A EE: matra attached to an independent vowel
104  std::string dest;
106  str.c_str(), &dest)) // NOTE(review): listing lines 105 and 107 are absent from this rendering, so the assertion macro and failure message are not visible; presumably a NormalizeUTF8String call -- confirm.
108  str = "\u0c1e\u0c3e"; // NYA AA: matra attached to a consonant
110  str.c_str(), &dest)) // NOTE(review): listing lines 109 and 111 are absent from this rendering.
112  EXPECT_EQ(dest, str); // the consonant + matra input must come back unchanged
113 }
114 
115 // Sub-graphemes are allowed if GraphemeNorm is turned off.
116 TEST(ValidateIndicTest, SubGraphemes) { // Runs a lone dependent vowel sign through two normalization calls.
117  std::string str = "\u0d3e"; // AA: a dependent vowel with no base consonant
118  std::string dest;
120  str.c_str(), &dest)) // NOTE(review): listing lines 119, 121 and 122 are absent from this rendering, so neither assertion head is visible; presumably the two calls differ in their GraphemeNorm argument -- confirm against the real file.
123  str.c_str(), &dest)) // NOTE(review): listing line 124 is also absent.
125  EXPECT_EQ(dest, str); // after the second call the output must equal the input
126 }
127 
128 TEST(ValidateIndicTest, Nukta) { // Checks glyph splitting of KA + Nukta + Virama + HA, then sets up the swapped Virama/Nukta order.
129  std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA
130  std::vector<std::string> glyphs;
132  GraphemeNormMode::kGlyphSplit, true, str.c_str(), // NOTE(review): listing line 131 (call head) is absent; these look like the g_mode, report_errors and str8 arguments of NormalizeCleanAndSegmentUTF8 -- confirm.
133  &glyphs));
134  EXPECT_EQ(glyphs.size(), 3);
135  EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9")); // last glyph is Virama + HA
136  // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
137  std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA; NOTE(review): listing line 138, which presumably uses str2, is absent from this rendering.
139 }
140 
141 // Sinhala has some of its own specific rules. See www.macciato.com/sinhala
142 TEST(ValidateIndicTest, SinhalaRakaransaya) { // Covers the KA + Virama + ZWJ + Rayanna sequence, alone and with a trailing dependent vowel.
143  std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna
144  std::string dest;
146  str.c_str(), &dest)) // NOTE(review): listing lines 145 and 147 are absent from this rendering; presumably a NormalizeUTF8String call -- confirm.
148  EXPECT_EQ(dest, str); // the sequence must come back unchanged
149  std::vector<std::string> glyphs;
151  GraphemeNormMode::kGlyphSplit, true, str.c_str(), // NOTE(review): listing line 150 (call head) is absent; presumably NormalizeCleanAndSegmentUTF8 -- confirm.
152  &glyphs));
153  EXPECT_EQ(glyphs.size(), 2);
154  EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb")); // second glyph is the whole Virama + ZWJ + Rayanna unit
155  // Can be followed by a dependent vowel.
156  str += "\u0dd9"; // E
158  str.c_str(), &dest)) // NOTE(review): listing lines 157 and 159 are absent from this rendering.
160  EXPECT_EQ(dest, str); // still unchanged with the vowel appended
161 }
162 
163 TEST(ValidateIndicTest, SinhalaYansaya) { // Covers the KA + Virama + ZWJ + Yayanna sequence, alone and with a trailing dependent vowel.
164  std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna
165  std::string dest;
167  str.c_str(), &dest)) // NOTE(review): listing lines 166 and 168 are absent from this rendering; presumably a NormalizeUTF8String call -- confirm.
169  EXPECT_EQ(dest, str); // the sequence must come back unchanged
170  // Can be followed by a dependent vowel.
171  str += "\u0ddd"; // OO
173  str.c_str(), &dest)) // NOTE(review): listing lines 172 and 174 are absent from this rendering.
175  EXPECT_EQ(dest, str); // still unchanged with the vowel appended
176  std::vector<std::string> glyphs;
178  GraphemeNormMode::kGlyphSplit, true, str.c_str(), // NOTE(review): listing line 177 (call head) is absent; presumably NormalizeCleanAndSegmentUTF8 -- confirm. Note that str now includes the OO vowel.
179  &glyphs));
180  EXPECT_EQ(glyphs.size(), 3);
181  EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba")); // middle glyph is the Virama + ZWJ + Yayanna unit
182 }
183 
184 TEST(ValidateIndicTest, SinhalaRepaya) { // Segments KA + Rayanna + Virama + ZWJ + MA in two modes and compares the pieces.
185  std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA
186  std::vector<std::string> glyphs;
188  GraphemeNormMode::kCombined, true, str.c_str(), // NOTE(review): listing line 187 (call head) is absent; presumably NormalizeCleanAndSegmentUTF8 -- confirm.
189  &glyphs));
190  EXPECT_EQ(glyphs.size(), 2);
191  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8")); // kCombined: Rayanna + Virama + ZWJ + MA stay together
193  GraphemeNormMode::kGlyphSplit, true, str.c_str(), // NOTE(review): listing line 192 (call head) is absent from this rendering.
194  &glyphs));
195  EXPECT_EQ(glyphs.size(), 3);
196  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); // kGlyphSplit: Rayanna + Virama + ZWJ is split from MA
197 }
198 
199 TEST(ValidateIndicTest, SinhalaSpecials) { // Pins the exact glyph split of two Sinhala strings containing repeated Virama + ZWJ sequences.
200  // Sinhala has some exceptions from the usual rules.
201  std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
202  std::vector<std::string> glyphs;
204  GraphemeNormMode::kGlyphSplit, true, str.c_str(), // NOTE(review): listing line 203 (call head) is absent; presumably NormalizeCleanAndSegmentUTF8 -- confirm.
205  &glyphs));
206  EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs); // the glyph dump is streamed into the failure message
207  EXPECT_EQ(glyphs[0], std::string("\u0dc0"));
208  EXPECT_EQ(glyphs[1], std::string("\u0d9c"));
209  EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb")); // Virama ZWJ Rayanna
210  EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d")); // Virama ZWJ on its own
211  EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d")); // Rayanna Virama ZWJ
212  str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
214  GraphemeNormMode::kGlyphSplit, true, str.c_str(), // NOTE(review): listing line 213 (call head) is absent from this rendering.
215  &glyphs));
216  EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);
217  EXPECT_EQ(glyphs[0], std::string("\u0dc3"));
218  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); // Rayanna Virama ZWJ
219  EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d")); // the same sequence again
220  EXPECT_EQ(glyphs[3], std::string("\u0dcf"));
221 }
222 
223 } // namespace tesseract
std::string PrintString32WithUnicodes(const std::string &str)
void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode, int unicode_count, int glyph_count, int grapheme_count, const std::string &target_str)
std::string PrintStringVectorWithUnicodes(const std::vector< std::string > &glyphs)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:179
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
TEST(TesseractInstanceTest, TestMultipleTessInstances)