tesseract  5.0.0
unilib.cc
Go to the documentation of this file.
1 
17 // Author: sligocki@google.com (Shawn Ligocki)
18 
19 #include "util/utf8/unilib.h"
20 
21 #include "syntaxnet/base.h"
22 #include "third_party/utf/utf.h"
23 
24 namespace UniLib {
25 
26 // Codepoints not allowed for interchange are:
27 // C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020),
28 // Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A),
29 // Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D)
30 // C1 controls: U+007F to U+009F
31 // Surrogates: U+D800 to U+DFFF
32 // Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
34  return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
35  (c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) ||
36  (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE);
37 }
38 
39 int SpanInterchangeValid(const char *begin, int byte_length) {
40  char32 rune;
41  const char *p = begin;
42  const char *end = begin + byte_length;
43  while (p < end) {
44  int bytes_consumed = charntorune(&rune, p, end - p);
45  // We want to accept Runeerror == U+FFFD as a valid char, but it is used
46  // by chartorune to indicate error. Luckily, the real codepoint is size 3
47  // while errors return bytes_consumed <= 1.
48  if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) {
49  break; // Found
50  }
51  p += bytes_consumed;
52  }
53  return p - begin;
54 }
55 
56 } // namespace UniLib
signed int char32
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64
@ Runeerror
Definition: utf.h:25
Definition: unilib.cc:24
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39