tesseract  5.0.0
unichar.cpp
Go to the documentation of this file.
1 // File: unichar.cpp
3 // Description: Unicode character/ligature class.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <tesseract/unichar.h>
20 #include "errcode.h"
21 #include "tprintf.h"
22 
23 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
24 
25 namespace tesseract {
26 
27 // Construct from a utf8 string. If len<0 then the string is null terminated.
28 // If the string is too long to fit in the UNICHAR then it takes only what
29 // will fit. Checks for illegal input and stops at an illegal sequence.
30 // The resulting UNICHAR may be empty.
31 UNICHAR::UNICHAR(const char *utf8_str, int len) {
32  int total_len = 0;
33  int step = 0;
34  if (len < 0) {
35  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
36  ;
37  }
38  }
39  for (total_len = 0; total_len < len; total_len += step) {
40  step = utf8_step(utf8_str + total_len);
41  if (total_len + step > UNICHAR_LEN) {
42  break; // Too long.
43  }
44  if (step == 0) {
45  break; // Illegal first byte.
46  }
47  int i;
48  for (i = 1; i < step; ++i) {
49  if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
50  break;
51  }
52  }
53  if (i < step) {
54  break; // Illegal surrogate
55  }
56  }
57  memcpy(chars, utf8_str, total_len);
58  if (total_len < UNICHAR_LEN) {
59  chars[UNICHAR_LEN - 1] = total_len;
60  while (total_len < UNICHAR_LEN - 1) {
61  chars[total_len++] = 0;
62  }
63  }
64 }
65 
66 // Construct from a single UCS4 character. Illegal values are ignored,
67 // resulting in an empty UNICHAR.
68 UNICHAR::UNICHAR(int unicode) {
69  const int bytemask = 0xBF;
70  const int bytemark = 0x80;
71 
72  if (unicode < 0x80) {
73  chars[UNICHAR_LEN - 1] = 1;
74  chars[2] = 0;
75  chars[1] = 0;
76  chars[0] = static_cast<char>(unicode);
77  } else if (unicode < 0x800) {
78  chars[UNICHAR_LEN - 1] = 2;
79  chars[2] = 0;
80  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81  unicode >>= 6;
82  chars[0] = static_cast<char>(unicode | 0xc0);
83  } else if (unicode < 0x10000) {
84  chars[UNICHAR_LEN - 1] = 3;
85  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
86  unicode >>= 6;
87  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
88  unicode >>= 6;
89  chars[0] = static_cast<char>(unicode | 0xe0);
90  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
91  chars[UNICHAR_LEN - 1] = 4;
92  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
93  unicode >>= 6;
94  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
95  unicode >>= 6;
96  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
97  unicode >>= 6;
98  chars[0] = static_cast<char>(unicode | 0xf0);
99  } else {
100  memset(chars, 0, UNICHAR_LEN);
101  }
102 }
103 
104 // Get the first character as UCS-4.
105 int UNICHAR::first_uni() const {
106  static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
107  int uni = 0;
108  int len = utf8_step(chars);
109  const char *src = chars;
110 
111  switch (len) {
112  default:
113  break;
114  case 4:
115  uni += static_cast<unsigned char>(*src++);
116  uni <<= 6;
117  // Fall through.
118  case 3:
119  uni += static_cast<unsigned char>(*src++);
120  uni <<= 6;
121  // Fall through.
122  case 2:
123  uni += static_cast<unsigned char>(*src++);
124  uni <<= 6;
125  // Fall through.
126  case 1:
127  uni += static_cast<unsigned char>(*src++);
128  }
129  uni -= utf8_offsets[len];
130  return uni;
131 }
132 
133 // Get a terminated UTF8 string: Must delete[] it after use.
134 char *UNICHAR::utf8_str() const {
135  int len = utf8_len();
136  char *str = new char[len + 1];
137  memcpy(str, chars, len);
138  str[len] = 0;
139  return str;
140 }
141 
142 // Get the number of bytes in the first character of the given utf8 string.
143 int UNICHAR::utf8_step(const char *utf8_str) {
144  static const char utf8_bytes[256] = {
145  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153  3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
154 
155  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
156 }
157 
159  ASSERT_HOST(it_ != nullptr);
160  int step = utf8_step(it_);
161  if (step == 0) {
162  tprintf("ERROR: Illegal UTF8 encountered.\n");
163  for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
164  tprintf("Index %d char = 0x%x\n", i, it_[i]);
165  }
166  step = 1;
167  }
168  it_ += step;
169  return *this;
170 }
171 
173  ASSERT_HOST(it_ != nullptr);
174  const int len = utf8_step(it_);
175  if (len == 0) {
176  tprintf("WARNING: Illegal UTF8 encountered\n");
177  return ' ';
178  }
179  UNICHAR uch(it_, len);
180  return uch.first_uni();
181 }
182 
183 int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {
184  ASSERT_HOST(it_ != nullptr);
185  const int len = utf8_step(it_);
186  if (len == 0) {
187  tprintf("WARNING: Illegal UTF8 encountered\n");
188  utf8_output[0] = ' ';
189  return 1;
190  }
191  strncpy(utf8_output, it_, len);
192  return len;
193 }
194 
196  ASSERT_HOST(it_ != nullptr);
197  const int len = utf8_step(it_);
198  if (len == 0) {
199  tprintf("WARNING: Illegal UTF8 encountered\n");
200  return 1;
201  }
202  return len;
203 }
204 
206  return utf8_step(it_) > 0;
207 }
208 
211 }
212 
214  return UNICHAR::const_iterator(utf8_str + len);
215 }
216 
217 // Converts a utf-8 string to a vector of unicodes.
218 // Returns an empty vector if the input contains invalid UTF-8.
219 /* static */
220 std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {
221  const int utf8_length = strlen(utf8_str);
222  std::vector<char32> unicodes;
223  unicodes.reserve(utf8_length);
224  const_iterator end_it(end(utf8_str, utf8_length));
225  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
226  if (it.is_legal()) {
227  unicodes.push_back(*it);
228  } else {
229  unicodes.clear();
230  return unicodes;
231  }
232  }
233  return unicodes;
234 }
235 
236 // Returns an empty string if the input contains an invalid unicode.
237 std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {
238  std::string utf8_str;
239  for (char32 ch : str32) {
240  UNICHAR uni_ch(ch);
241  int step;
242  if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
243  utf8_str.append(uni_ch.utf8(), step);
244  } else {
245  return "";
246  }
247  }
248  return utf8_str;
249 }
250 
251 } // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:23
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:51
char * utf8_str() const
Definition: unichar.cpp:134
const char * utf8() const
Definition: unichar.h:83
int first_uni() const
Definition: unichar.cpp:105
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
int utf8_len() const
Definition: unichar.h:77
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:237
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143
const_iterator & operator++()
Definition: unichar.cpp:158
int get_utf8(char *buf) const
Definition: unichar.cpp:183