tesseract  5.0.0
context.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: context.cpp (Formerly context.c)
4  * Description: Context checking functions
5  * Author: Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1990, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 
20 #include "dict.h"
21 #include "unicharset.h"
22 
23 namespace tesseract {
24 
// Thresholds used by Dict::absolute_garbage():
//  - words shorter than this many unichars are never reported as garbage;
static constexpr int kMinAbsoluteGarbageWordLength = 10;
//  - a long-enough word is garbage when its fraction of alphabetic or digit
//    unichars falls strictly below this value.
static constexpr float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
27 
// State-transition table for the capitalisation checker in Dict::case_ok().
//
// Rows are the current state; columns are the class of the next character:
//   column 0 = P (anything that is not upper, lower or digit)
//   column 1 = U (upper case)
//   column 2 = L (lower case)
//   column 3 = D (digit)
// An entry of -1 means the sequence is rejected ("error on case").
const int case_state_table[6][4] = {
    //  P   U   L   D
    {0, 1, 5, 4},    // 0. Beginning of word
    {0, 3, 2, 4},    // 1. After initial capital
    {0, -1, 2, -1},  // 2. After lower case
    {0, 3, -1, 4},   // 3. After upper case
    {0, -1, -1, 4},  // 4. After a digit
    {5, -1, 2, -1},  // 5. After initial lower case
};
44 
45 int Dict::case_ok(const WERD_CHOICE &word) const {
46  int state = 0;
47  const UNICHARSET *unicharset = word.unicharset();
48  for (unsigned x = 0; x < word.length(); ++x) {
49  UNICHAR_ID ch_id = word.unichar_id(x);
50  if (unicharset->get_isupper(ch_id)) {
51  state = case_state_table[state][1];
52  } else if (unicharset->get_islower(ch_id)) {
53  state = case_state_table[state][2];
54  } else if (unicharset->get_isdigit(ch_id)) {
55  state = case_state_table[state][3];
56  } else {
57  state = case_state_table[state][0];
58  }
59  if (state == -1) {
60  return false;
61  }
62  }
63  return state != 5; // single lower is bad
64 }
65 
66 bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
67  if (word.length() < kMinAbsoluteGarbageWordLength) {
68  return false;
69  }
70  int num_alphanum = 0;
71  for (unsigned x = 0; x < word.length(); ++x) {
72  num_alphanum +=
73  (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74  }
75  return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76  kMinAbsoluteGarbageAlphanumFrac);
77 }
78 
79 } // namespace tesseract
int UNICHAR_ID
Definition: unichar.h:36
const int case_state_table[6][4]
Definition: context.cpp:28
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
unsigned length() const
Definition: ratngs.h:283
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:45
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:66