tesseract  5.0.0
tesseract::ValidateJavanese Class Reference

#include <validate_javanese.h>

Inheritance diagram for tesseract::ValidateJavanese:
tesseract::Validator

Public Member Functions

 ValidateJavanese (ViramaScript script, bool report_errors)
 
 ~ValidateJavanese () override=default
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
Validator::CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum class  CharClass {
  kConsonant = 'C' , kVowel = 'V' , kVirama = 'H' , kMatra = 'M' ,
  kMatraPiece = 'P' , kVowelModifier = 'D' , kZeroWidthNonJoiner = 'z' , kZeroWidthJoiner = 'Z' ,
  kVedicMark = 'v' , kNukta = 'N' , kRobat = 'R' , kOther = 'O' ,
  kWhitespace = ' ' , kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 28 of file validate_javanese.h.

Constructor & Destructor Documentation

◆ ValidateJavanese()

tesseract::ValidateJavanese::ValidateJavanese ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 30 of file validate_javanese.h.

30 : Validator(script, report_errors) {}
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:137

◆ ~ValidateJavanese()

tesseract::ValidateJavanese::~ValidateJavanese ( )
override default

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateJavanese::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 39 of file validate_javanese.cpp.

39  {
40  switch (codes_[codes_used_].first) {
41  case CharClass::kConsonant:
42  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
43  case CharClass::kVowel:
44  case CharClass::kVedicMark:
45  return ConsumeVowelIfValid();
46  case CharClass::kZeroWidthJoiner:
47  case CharClass::kZeroWidthNonJoiner:
48  // Apart from within an aksara, joiners are silently dropped.
49  if (report_errors_) {
50  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
51  }
52  ++codes_used_;
53  return true;
54  case CharClass::kOther:
55  UseMultiCode(1);
56  return true;
57  default:
58  if (report_errors_) {
59  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
60  static_cast<int>(codes_[codes_used_].first),
61  codes_[codes_used_].second);
62  }
63  return false;
64  }
65 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
unsigned codes_used_
Definition: validator.h:231
bool UseMultiCode(unsigned length)
Definition: validator.h:189
std::vector< IndicPair > codes_
Definition: validator.h:225

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateJavanese::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 280 of file validate_javanese.cpp.

280  {
281  if (ch == kZeroWidthNonJoiner) {
282  return CharClass::kZeroWidthNonJoiner;
283  }
284  if (ch == kZeroWidthJoiner) {
285  return CharClass::kZeroWidthJoiner;
286  }
287  // Offset from the start of the relevant unicode code block aka code page.
288  int off = ch - static_cast<char32>(script_);
289  // Anything in another code block is other.
290  if (off < 0 || off >= kIndicCodePageSize) {
291  return CharClass::kOther;
292  }
293  if (off < 0x4) {
294  return CharClass::kVowelModifier;
295  }
296  if (off <= 0x32) {
297  return CharClass::kConsonant; // includes independent vowels
298  }
299  if (off == 0x33) {
300  return CharClass::kNukta; // A9B3 CECAK TELU
301  }
302  if (off == 0x34) {
303  return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
304  }
305  if (off <= 0x39) {
306  return CharClass::kMatra;
307  }
308  if (off <= 0x3a) {
309  return CharClass::kConsonant; // A9BA TALING - pre base vowel
310  }
311  if (off <= 0x3d) {
312  return CharClass::kMatra;
313  }
314  if (off <= 0x3f) {
315  return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
316  }
317  if (off == 0x40) {
318  return CharClass::kVirama; // A9C0 PANGKON
319  }
320  return CharClass::kOther;
321 }
signed int char32
static const char32 kZeroWidthNonJoiner
Definition: validator.h:97
ViramaScript script_
Definition: validator.h:223
static const int kIndicCodePageSize
Definition: validator.h:207
static const char32 kZeroWidthJoiner
Definition: validator.h:98

The documentation for this class was generated from the following files: