tesseract  5.0.0
utf.h File Reference
#include <stdint.h>

Go to the source code of this file.

Typedefs

typedef signed int Rune
 

Enumerations

enum  {
  UTFmax = 4 , Runesync = 0x80 , Runeself = 0x80 , Runeerror = 0xFFFD ,
  Runemax = 0x10FFFF
}
 

Functions

int runetochar (char *s, const Rune *r)
 
int chartorune (Rune *r, const char *s)
 
int charntorune (Rune *r, const char *s, int n)
 
int isvalidcharntorune (const char *str, int n, Rune *r, int *consumed)
 
int runelen (Rune r)
 
int runenlen (const Rune *r, int n)
 
int fullrune (const char *s, int n)
 
int utflen (const char *s)
 
int utfnlen (const char *s, long n)
 
const char * utfrune (const char *s, Rune r)
 
const char * utfrrune (const char *s, Rune r)
 
const char * utfutf (const char *s1, const char *s2)
 
char * utfecpy (char *s1, char *es1, const char *s2)
 
Rune * runestrcat (Rune *s1, const Rune *s2)
 
Rune * runestrncat (Rune *s1, const Rune *s2, long n)
 
const Rune * runestrchr (const Rune *s, Rune c)
 
int runestrcmp (const Rune *s1, const Rune *s2)
 
int runestrncmp (const Rune *s1, const Rune *s2, long n)
 
Rune * runestrcpy (Rune *s1, const Rune *s2)
 
Rune * runestrncpy (Rune *s1, const Rune *s2, long n)
 
Rune * runestrecpy (Rune *s1, Rune *es1, const Rune *s2)
 
Rune * runestrdup (const Rune *s)
 
const Rune * runestrrchr (const Rune *s, Rune c)
 
long runestrlen (const Rune *s)
 
const Rune * runestrstr (const Rune *s1, const Rune *s2)
 
Rune toupperrune (Rune r)
 
Rune tolowerrune (Rune r)
 
Rune totitlerune (Rune r)
 
int isupperrune (Rune r)
 
int islowerrune (Rune r)
 
int istitlerune (Rune r)
 
int isalpharune (Rune r)
 
int isdigitrune (Rune r)
 
int isideographicrune (Rune r)
 
int isspacerune (Rune r)
 

Typedef Documentation

◆ Rune

typedef signed int Rune

Definition at line 19 of file utf.h.

Enumeration Type Documentation

◆ anonymous enum

anonymous enum
Enumerator
UTFmax 
Runesync 
Runeself 
Runeerror 
Runemax 

Definition at line 21 of file utf.h.

21  {
22  UTFmax = 4, /* maximum bytes per rune */
23  Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
24  Runeself = 0x80, /* rune and UTF sequences are the same (<) */
25  Runeerror = 0xFFFD, /* decoding error in UTF */
26  Runemax = 0x10FFFF, /* maximum rune value */
27 };
@ Runemax
Definition: utf.h:26
@ Runesync
Definition: utf.h:23
@ UTFmax
Definition: utf.h:22
@ Runeerror
Definition: utf.h:25
@ Runeself
Definition: utf.h:24

Function Documentation

◆ charntorune()

int charntorune ( Rune *  r,
const char *  s,
int  n 
)

Definition at line 64 of file rune.c.

64  {
65  int c, c1, c2, c3;
66  long l;
67 
68  /* When we're not allowed to read anything */
69  if (length <= 0) {
70  goto badlen;
71  }
72 
73  /*
74  * one character sequence (7-bit value)
75  * 00000-0007F => T1
76  */
77  c = *(uchar *)str;
78  if (c < Tx) {
79  *rune = c;
80  return 1;
81  }
82 
83  // If we can't read more than one character we must stop
84  if (length <= 1) {
85  goto badlen;
86  }
87 
88  /*
89  * two character sequence (11-bit value)
90  * 0080-07FF => T2 Tx
91  */
92  c1 = *(uchar *)(str + 1) ^ Tx;
93  if (c1 & Testx)
94  goto bad;
95  if (c < T3) {
96  if (c < T2)
97  goto bad;
98  l = ((c << Bitx) | c1) & Rune2;
99  if (l <= Rune1)
100  goto bad;
101  *rune = l;
102  return 2;
103  }
104 
105  // If we can't read more than two characters we must stop
106  if (length <= 2) {
107  goto badlen;
108  }
109 
110  /*
111  * three character sequence (16-bit value)
112  * 0800-FFFF => T3 Tx Tx
113  */
114  c2 = *(uchar *)(str + 2) ^ Tx;
115  if (c2 & Testx)
116  goto bad;
117  if (c < T4) {
118  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
119  if (l <= Rune2)
120  goto bad;
121  *rune = l;
122  return 3;
123  }
124 
125  if (length <= 3)
126  goto badlen;
127 
128  /*
129  * four character sequence (21-bit value)
130  * 10000-1FFFFF => T4 Tx Tx Tx
131  */
132  c3 = *(uchar *)(str + 3) ^ Tx;
133  if (c3 & Testx)
134  goto bad;
135  if (c < T5) {
136  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
137  if (l <= Rune3)
138  goto bad;
139  if (l > Runemax)
140  goto bad;
141  *rune = l;
142  return 4;
143  }
144 
145  // Support for 5-byte or longer UTF-8 would go here, but
146  // since we don't have that, we'll just fall through to bad.
147 
148  /*
149  * bad decoding
150  */
151 bad:
152  *rune = Bad;
153  return 1;
154 badlen:
155  *rune = Bad;
156  return 0;
157 }
@ T4
Definition: rune.c:31
@ Testx
Definition: rune.c:41
@ T3
Definition: rune.c:30
@ Rune4
Definition: rune.c:37
@ Rune3
Definition: rune.c:36
@ T5
Definition: rune.c:32
@ T2
Definition: rune.c:29
@ Rune2
Definition: rune.c:35
@ Rune1
Definition: rune.c:34
@ Tx
Definition: rune.c:28
@ Bad
Definition: rune.c:43
@ Bitx
Definition: rune.c:21
unsigned char uchar
Definition: utfdef.h:8

◆ chartorune()

int chartorune ( Rune *  r,
const char *  s 
)

Definition at line 163 of file rune.c.

163  {
164  int c, c1, c2, c3;
165  long l;
166 
167  /*
168  * one character sequence
169  * 00000-0007F => T1
170  */
171  c = *(uchar *)str;
172  if (c < Tx) {
173  *rune = c;
174  return 1;
175  }
176 
177  /*
178  * two character sequence
179  * 0080-07FF => T2 Tx
180  */
181  c1 = *(uchar *)(str + 1) ^ Tx;
182  if (c1 & Testx)
183  goto bad;
184  if (c < T3) {
185  if (c < T2)
186  goto bad;
187  l = ((c << Bitx) | c1) & Rune2;
188  if (l <= Rune1)
189  goto bad;
190  *rune = l;
191  return 2;
192  }
193 
194  /*
195  * three character sequence
196  * 0800-FFFF => T3 Tx Tx
197  */
198  c2 = *(uchar *)(str + 2) ^ Tx;
199  if (c2 & Testx)
200  goto bad;
201  if (c < T4) {
202  l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
203  if (l <= Rune2)
204  goto bad;
205  *rune = l;
206  return 3;
207  }
208 
209  /*
210  * four character sequence (21-bit value)
211  * 10000-1FFFFF => T4 Tx Tx Tx
212  */
213  c3 = *(uchar *)(str + 3) ^ Tx;
214  if (c3 & Testx)
215  goto bad;
216  if (c < T5) {
217  l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
218  if (l <= Rune3)
219  goto bad;
220  if (l > Runemax)
221  goto bad;
222  *rune = l;
223  return 4;
224  }
225 
226  /*
227  * Support for 5-byte or longer UTF-8 would go here, but
228  * since we don't have that, we'll just fall through to bad.
229  */
230 
231  /*
232  * bad decoding
233  */
234 bad:
235  *rune = Bad;
236  return 1;
237 }

◆ fullrune()

int fullrune ( const char *  s,
int  n 
)

Definition at line 326 of file rune.c.

326  {
327  if (n > 0) {
328  int c = *(uchar *)str;
329  if (c < Tx)
330  return 1;
331  if (n > 1) {
332  if (c < T3)
333  return 1;
334  if (n > 2) {
335  if (c < T4 || n > 3)
336  return 1;
337  }
338  }
339  }
340  return 0;
341 }

◆ isalpharune()

int isalpharune ( Rune  r)

◆ isdigitrune()

int isdigitrune ( Rune  r)

◆ isideographicrune()

int isideographicrune ( Rune  r)

◆ islowerrune()

int islowerrune ( Rune  r)

◆ isspacerune()

int isspacerune ( Rune  r)

◆ istitlerune()

int istitlerune ( Rune  r)

◆ isupperrune()

int isupperrune ( Rune  r)

◆ isvalidcharntorune()

int isvalidcharntorune ( const char *  str,
int  n,
Rune *  r,
int *  consumed 
)

Definition at line 239 of file rune.c.

239  {
240  *consumed = charntorune(rune, str, length);
241  return *rune != Runeerror || *consumed == 3;
242 }
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64

◆ runelen()

int runelen ( Rune  r)

Definition at line 299 of file rune.c.

299  {
300  char str[10];
301 
302  return runetochar(str, &rune);
303 }
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244

◆ runenlen()

int runenlen ( const Rune *  r,
int  n 
)

Definition at line 305 of file rune.c.

305  {
306  int nb;
307  ulong c; /* Rune is signed, so use unsigned for range check. */
308 
309  nb = 0;
310  while (nrune--) {
311  c = *r++;
312  if (c <= Rune1)
313  nb++;
314  else if (c <= Rune2)
315  nb += 2;
316  else if (c <= Rune3)
317  nb += 3;
318  else if (c <= Runemax)
319  nb += 4;
320  else
321  nb += 3; /* Runeerror = 0xFFFD, see runetochar */
322  }
323  return nb;
324 }
unsigned long ulong
Definition: utfdef.h:11

◆ runestrcat()

Rune* runestrcat ( Rune *  s1,
const Rune *  s2 
)

◆ runestrchr()

const Rune* runestrchr ( const Rune *  s,
Rune  c 
)

◆ runestrcmp()

int runestrcmp ( const Rune *  s1,
const Rune *  s2 
)

◆ runestrcpy()

Rune* runestrcpy ( Rune *  s1,
const Rune *  s2 
)

◆ runestrdup()

Rune* runestrdup ( const Rune *  s)

◆ runestrecpy()

Rune* runestrecpy ( Rune *  s1,
Rune *  es1,
const Rune *  s2 
)

◆ runestrlen()

long runestrlen ( const Rune *  s)

◆ runestrncat()

Rune* runestrncat ( Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrncmp()

int runestrncmp ( const Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrncpy()

Rune* runestrncpy ( Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrrchr()

const Rune* runestrrchr ( const Rune *  s,
Rune  c 
)

◆ runestrstr()

const Rune* runestrstr ( const Rune *  s1,
const Rune *  s2 
)

◆ runetochar()

int runetochar ( char *  s,
const Rune *  r 
)

Definition at line 244 of file rune.c.

244  {
245  /* Runes are signed, so convert to unsigned for range check. */
246  unsigned long c;
247 
248  /*
249  * one character sequence
250  * 00000-0007F => 00-7F
251  */
252  c = *rune;
253  if (c <= Rune1) {
254  str[0] = c;
255  return 1;
256  }
257 
258  /*
259  * two character sequence
260  * 0080-07FF => T2 Tx
261  */
262  if (c <= Rune2) {
263  str[0] = T2 | (c >> 1 * Bitx);
264  str[1] = Tx | (c & Maskx);
265  return 2;
266  }
267 
268  /*
269  * If the Rune is out of range, convert it to the error rune.
270  * Do this test here because the error rune encodes to three bytes.
271  * Doing it earlier would duplicate work, since an out of range
272  * Rune wouldn't have fit in one or two bytes.
273  */
274  if (c > Runemax)
275  c = Runeerror;
276 
277  /*
278  * three character sequence
279  * 0800-FFFF => T3 Tx Tx
280  */
281  if (c <= Rune3) {
282  str[0] = T3 | (c >> 2 * Bitx);
283  str[1] = Tx | ((c >> 1 * Bitx) & Maskx);
284  str[2] = Tx | (c & Maskx);
285  return 3;
286  }
287 
288  /*
289  * four character sequence (21-bit value)
290  * 10000-1FFFFF => T4 Tx Tx Tx
291  */
292  str[0] = T4 | (c >> 3 * Bitx);
293  str[1] = Tx | ((c >> 2 * Bitx) & Maskx);
294  str[2] = Tx | ((c >> 1 * Bitx) & Maskx);
295  str[3] = Tx | (c & Maskx);
296  return 4;
297 }
@ Maskx
Definition: rune.c:40

◆ tolowerrune()

Rune tolowerrune ( Rune  r)

◆ totitlerune()

Rune totitlerune ( Rune  r)

◆ toupperrune()

Rune toupperrune ( Rune  r)

◆ utfecpy()

char* utfecpy ( char *  s1,
char *  es1,
const char *  s2 
)

◆ utflen()

int utflen ( const char *  s)

◆ utfnlen()

int utfnlen ( const char *  s,
long  n 
)

◆ utfrrune()

const char* utfrrune ( const char *  s,
Rune  r 
)

◆ utfrune()

const char* utfrune ( const char *  s,
Rune  r 
)

◆ utfutf()

const char* utfutf ( const char *  s1,
const char *  s2 
)