tesseract  5.0.0
permdawg.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: permdawg.cpp (Formerly permdawg.c)
4  * Description: Scale word choices by a dictionary
5  * Author: Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 /*----------------------------------------------------------------------
20  I n c l u d e s
21 ----------------------------------------------------------------------*/
22 
23 #include "dawg.h"
24 #include "params.h"
25 #include "stopper.h"
26 #include "tprintf.h"
27 
28 #include <algorithm>
29 #include <cctype>
30 #include "dict.h"
31 
32 /*----------------------------------------------------------------------
33  F u n c t i o n s
34 ----------------------------------------------------------------------*/
35 namespace tesseract {
36 
43 void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
44  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
45  bool word_ending, WERD_CHOICE *word, float certainties[],
46  float *limit, WERD_CHOICE *best_choice, int *attempts_left,
47  void *void_more_args) {
48  auto *more_args = static_cast<DawgArgs *>(void_more_args);
49  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50  int word_index = word->length() - 1;
51  if (best_choice->rating() < *limit) {
52  return;
53  }
54  // Look up char in DAWG
55 
56  // If the current unichar is an ngram first try calling
57  // letter_is_okay() for each unigram it contains separately.
58  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59  bool checked_unigrams = false;
60  if (getUnicharset().get_isngram(orig_uch_id)) {
61  if (dawg_debug_level) {
62  tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63  }
64  int num_unigrams = 0;
65  word->remove_last_unichar_id();
66  std::vector<UNICHAR_ID> encoding;
67  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68  // Since the string came out of the unicharset, failure is impossible.
69  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70  bool unigrams_ok = true;
71  // Construct DawgArgs that reflect the current state.
72  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73  DawgPositionVector unigram_updated_dawgs;
74  DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75  // Check unigrams in the ngram with letter_is_okay().
76  for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77  UNICHAR_ID uch_id = encoding[i];
78  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79  ++num_unigrams;
80  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81  unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
82  word->unichar_id(word_index + num_unigrams - 1),
83  word_ending && i == encoding.size() - 1);
84  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
85  if (dawg_debug_level) {
86  tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87  unigrams_ok ? "OK" : "not OK");
88  }
89  }
90  // Restore the word and copy the updated dawg state if needed.
91  while (num_unigrams-- > 0) {
92  word->remove_last_unichar_id();
93  }
94  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95  if (unigrams_ok) {
96  checked_unigrams = true;
97  more_args->permuter = unigram_dawg_args.permuter;
98  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
99  }
100  }
101 
102  // Check which dawgs from the dawgs_ vector contain the word
103  // up to and including the current unichar.
104  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
105  word->unichar_id(word_index), word_ending)) {
106  // Add a new word choice
107  if (word_ending) {
108  if (dawg_debug_level) {
109  tprintf("found word = %s\n", word->debug_string().c_str());
110  }
111  if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112  if (output_ambig_words_file_ == nullptr) {
113  output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114  if (output_ambig_words_file_ == nullptr) {
115  tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116  exit(1);
117  }
118  std::string word_str;
119  word->string_and_lengths(&word_str, nullptr);
120  word_str += " ";
121  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122  }
123  std::string word_str;
124  word->string_and_lengths(&word_str, nullptr);
125  word_str += " ";
126  fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127  }
128  WERD_CHOICE *adjusted_word = word;
129  adjusted_word->set_permuter(more_args->permuter);
130  update_best_choice(*adjusted_word, best_choice);
131  } else { // search the next letter
132  // Make updated_* point to the next entries in the DawgPositionVector
133  // arrays (that were originally created in dawg_permute_and_select)
134  ++(more_args->updated_dawgs);
135  // Make active_dawgs and constraints point to the updated ones.
136  ++(more_args->active_dawgs);
137  permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138  certainties, limit, best_choice, attempts_left, more_args);
139  // Restore previous state to explore another letter in this position.
140  --(more_args->updated_dawgs);
141  --(more_args->active_dawgs);
142  }
143  } else {
144  if (dawg_debug_level) {
145  tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146  }
147  }
148 }
149 
160  float rating_limit) {
161  auto *best_choice = new WERD_CHOICE(&getUnicharset());
162  best_choice->make_bad();
163  best_choice->set_rating(rating_limit);
164  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
165  return best_choice;
166  }
167  auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168  init_active_dawgs(&(active_dawgs[0]), true);
169  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
171 
172  float certainties[MAX_WERD_LENGTH];
174  int attempts_left = max_permuter_attempts;
175  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177  delete[] active_dawgs;
178  return best_choice;
179 }
180 
187 void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
188  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
189  WERD_CHOICE *word, float certainties[], float *limit,
190  WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
191  if (debug) {
192  tprintf(
193  "%s permute_choices: char_choice_index=%d"
194  " limit=%g rating=%g, certainty=%g word=%s\n",
195  debug, char_choice_index, *limit, word->rating(), word->certainty(),
196  word->debug_string().c_str());
197  }
198  if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199  BLOB_CHOICE_IT blob_choice_it;
200  blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202  (*attempts_left)--;
203  append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204  prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205  more_args);
206  if (*attempts_left <= 0) {
207  if (debug) {
208  tprintf("permute_choices(): attempts_left is 0\n");
209  }
210  break;
211  }
212  }
213  }
214 }
215 
224 void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
225  const BLOB_CHOICE &blob_choice, int char_choice_index,
226  const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
227  float certainties[], float *limit, WERD_CHOICE *best_choice,
228  int *attempts_left, void *more_args) {
229  auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230 
231  // Deal with fragments.
232  CHAR_FRAGMENT_INFO char_frag_info;
233  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234  prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235  return; // blob_choice must be an invalid fragment
236  }
237  // Search the next letter if this character is a fragment.
238  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239  permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240  limit, best_choice, attempts_left, more_args);
241  return;
242  }
243 
244  // Add the next unichar.
245  float old_rating = word->rating();
246  float old_certainty = word->certainty();
247  uint8_t old_permuter = word->permuter();
248  certainties[word->length()] = char_frag_info.certainty;
249  word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250  char_frag_info.rating, char_frag_info.certainty);
251 
252  // Explore the next unichar.
253  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254  word, certainties, limit, best_choice, attempts_left, more_args);
255 
256  // Remove the unichar we added to explore other choices in it's place.
257  word->remove_last_unichar_id();
258  word->set_rating(old_rating);
259  word->set_certainty(old_certainty);
260  word->set_permuter(old_permuter);
261 }
262 
288 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
289  const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
290  int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
291  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292  const CHAR_FRAGMENT *prev_fragment =
293  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294 
295  // Print debug info for fragments.
296  if (debug && (prev_fragment || this_fragment)) {
297  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298  getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299  if (prev_fragment) {
300  tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301  }
302  if (this_fragment) {
303  tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304  }
305  }
306 
307  char_frag_info->unichar_id = curr_unichar_id;
308  char_frag_info->fragment = this_fragment;
309  char_frag_info->rating = curr_rating;
310  char_frag_info->certainty = curr_certainty;
311  char_frag_info->num_fragments = 1;
312  if (prev_fragment && !this_fragment) {
313  if (debug) {
314  tprintf("Skip choice with incomplete fragment\n");
315  }
316  return false;
317  }
318  if (this_fragment) {
319  // We are dealing with a fragment.
320  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321  if (prev_fragment) {
322  if (!this_fragment->is_continuation_of(prev_fragment)) {
323  if (debug) {
324  tprintf("Non-matching fragment piece\n");
325  }
326  return false;
327  }
328  if (this_fragment->is_ending()) {
329  char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330  char_frag_info->fragment = nullptr;
331  if (debug) {
332  tprintf("Built character %s from fragments\n",
333  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334  }
335  } else {
336  if (debug) {
337  tprintf("Record fragment continuation\n");
338  }
339  char_frag_info->fragment = this_fragment;
340  }
341  // Update certainty and rating.
342  char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344  char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345  } else {
346  if (this_fragment->is_beginning()) {
347  if (debug) {
348  tprintf("Record fragment beginning\n");
349  }
350  } else {
351  if (debug) {
352  tprintf("Non-starting fragment piece with no prev_fragment\n");
353  }
354  return false;
355  }
356  }
357  }
358  if (word_ending && char_frag_info->fragment) {
359  if (debug) {
360  tprintf("Word can not end with a fragment\n");
361  }
362  return false;
363  }
364  return true;
365 }
366 
367 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define MAX_WERD_LENGTH
Definition: dict.h:45
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UNICHAR_ID
Definition: unichar.h:36
@ NO_PERM
Definition: ratngs.h:232
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:623
float certainty() const
Definition: ratngs.h:87
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81
float rating() const
Definition: ratngs.h:84
std::string debug_string() const
Definition: ratngs.h:475
float certainty() const
Definition: ratngs.h:311
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
uint8_t permuter() const
Definition: ratngs.h:327
void set_certainty(float new_val)
Definition: ratngs.h:353
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:424
void set_permuter(uint8_t perm)
Definition: ratngs.h:356
const UNICHARSET * unicharset() const
Definition: ratngs.h:277
unsigned length() const
Definition: ratngs.h:283
void remove_last_unichar_id()
Definition: ratngs.h:451
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
float rating() const
Definition: ratngs.h:308
void set_rating(float new_val)
Definition: ratngs.h:350
bool is_ending() const
Definition: unicharset.h:121
static std::string to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:76
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:109
bool is_beginning() const
Definition: unicharset.h:116
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769
const CHAR_FRAGMENT * fragment
Definition: dict.h:51
DawgPositionVector * updated_dawgs
Definition: dict.h:88
DawgPositionVector * active_dawgs
Definition: dict.h:87
PermuterType permuter
Definition: dict.h:89
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:187
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:224
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:210
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:159
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:43
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:610
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:288