tesseract  5.0.0
recogtraining.cpp
Go to the documentation of this file.
1 // File: recogtraining.cpp
3 // Description: Functions for ambiguity and parameter training.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "tesseractclass.h"
20 
21 #include "boxread.h"
22 #include "control.h"
23 #include "host.h" // for NearlyEqual
24 #include "ratngs.h"
25 #ifndef DISABLED_LEGACY_ENGINE
26 # include "reject.h"
27 #endif
28 #include "stopper.h"
29 
30 namespace tesseract {
31 
// Largest difference allowed between corresponding edges of a
// Tesseract-identified box and a box-file box for the two edges to be
// treated as matching (used as the NearlyEqual tolerance in this file).
constexpr int16_t kMaxBoxEdgeDiff = 2;
33 
34 // Sets flags necessary for recognition in the training mode.
35 // Opens and returns the pointer to the output file.
36 FILE *Tesseract::init_recog_training(const char *filename) {
37  if (tessedit_ambigs_training) {
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
40  // Explore all segmentations.
41  getDict().stopper_no_acceptable_choices.set_value(true);
42  }
43 
44  std::string output_fname = filename;
45  const char *lastdot = strrchr(output_fname.c_str(), '.');
46  if (lastdot != nullptr) {
47  output_fname[lastdot - output_fname.c_str()] = '\0';
48  }
49  output_fname += ".txt";
50  FILE *output_file = fopen(output_fname.c_str(), "a+");
51  if (output_file == nullptr) {
52  tprintf("Error: Could not open file %s\n", output_fname.c_str());
53  ASSERT_HOST(output_file);
54  }
55  return output_file;
56 }
57 
58 // Copies the bounding box from page_res_it->word() to the given TBOX.
59 static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
60  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
61  page_res_it->forward();
62  }
63 
64  if (page_res_it->word() != nullptr) {
65  *tbox = page_res_it->word()->word->bounding_box();
66 
67  // If tbox->left() is negative, the training image has vertical text and
68  // all the coordinates of bounding boxes of page_res are rotated by 90
69  // degrees in a counterclockwise direction. We need to rotate the TBOX back
70  // in order to compare with the TBOXes of box files.
71  if (tbox->left() < 0) {
72  tbox->rotate(FCOORD(0.0, -1.0));
73  }
74 
75  return true;
76  } else {
77  return false;
78  }
79 }
80 
81 // This function takes tif/box pair of files and runs recognition on the image,
82 // while making sure that the word bounds that tesseract identified roughly
83 // match to those specified by the input box file. For each word (ngram in a
84 // single bounding box from the input box file) it outputs the ocred result,
85 // the correct label, rating and certainty.
86 void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
87  volatile ETEXT_DESC *monitor, FILE *output_file) {
88  std::string box_fname = filename;
89  const char *lastdot = strrchr(box_fname.c_str(), '.');
90  if (lastdot != nullptr) {
91  box_fname[lastdot - box_fname.c_str()] = '\0';
92  }
93  box_fname += ".box";
94  // ReadNextBox() will close box_file
95  FILE *box_file = fopen(box_fname.c_str(), "r");
96  if (box_file == nullptr) {
97  tprintf("Error: Could not open file %s\n", box_fname.c_str());
98  ASSERT_HOST(box_file);
99  }
100 
101  PAGE_RES_IT page_res_it;
102  page_res_it.page_res = page_res;
103  page_res_it.restart_page();
104  std::string label;
105 
106  // Process all the words on this page.
107  TBOX tbox; // tesseract-identified box
108  TBOX bbox; // box from the box file
109  bool keep_going;
110  int line_number = 0;
111  int examined_words = 0;
112  do {
113  keep_going = read_t(&page_res_it, &tbox);
114  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
115  // Align bottom left points of the TBOXes.
116  while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117  if (bbox.bottom() < tbox.bottom()) {
118  page_res_it.forward();
119  keep_going = read_t(&page_res_it, &tbox);
120  } else {
121  keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
122  }
123  }
124  while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125  if (bbox.left() > tbox.left()) {
126  page_res_it.forward();
127  keep_going = read_t(&page_res_it, &tbox);
128  } else {
129  keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
130  }
131  }
132  // OCR the word if top right points of the TBOXes are similar.
133  if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
134  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
135  ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
136  examined_words++;
137  }
138  page_res_it.forward();
139  } while (keep_going);
140 
141  // Set up scripts on all of the words that did not get sent to
142  // ambigs_classify_and_output. They all should have, but if all the
143  // werd_res's don't get uch_sets, tesseract will crash when you try
144  // to iterate over them. :-(
145  int total_words = 0;
146  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
147  if (page_res_it.word()) {
148  if (page_res_it.word()->uch_set == nullptr) {
149  page_res_it.word()->SetupFake(unicharset);
150  }
151  total_words++;
152  }
153  }
154  if (examined_words < 0.85 * total_words) {
155  tprintf(
156  "TODO(antonova): clean up recog_training_segmented; "
157  " It examined only a small fraction of the ambigs image.\n");
158  }
159  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
160 }
161 
162 // Helper prints the given set of blob choices.
163 static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
164  const char *label, FILE *output_file) {
165  float rating = 0.0f;
166  float certainty = 0.0f;
167  for (int i = 0; i < length; ++i) {
168  const BLOB_CHOICE *blob_choice = blob_choices[i];
169  fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
170  rating += blob_choice->rating();
171  if (certainty > blob_choice->certainty()) {
172  certainty = blob_choice->certainty();
173  }
174  }
175  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
176 }
177 
178 // Helper recursively prints all paths through the ratings matrix, starting
179 // at column col.
180 static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
181  const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
182  const char *label, FILE *output_file) {
183  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
184  if (ratings.get(col, row) != NOT_CLASSIFIED) {
185  BLOB_CHOICE_IT bc_it(ratings.get(col, row));
186  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
187  blob_choices[length] = bc_it.data();
188  if (row + 1 < dim) {
189  PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
190  output_file);
191  } else {
192  PrintPath(length + 1, blob_choices, unicharset, label, output_file);
193  }
194  }
195  }
196  }
197 }
198 
199 // Runs classify_word_pass1() on the current word. Outputs Tesseract's
200 // raw choice as a result of the classification. For words labeled with a
201 // single unichar also outputs all alternatives from blob_choices of the
202 // best choice.
203 void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
204  FILE *output_file) {
205  // Classify word.
206  fflush(stdout);
207  WordData word_data(*pr_it);
208  SetupWordPassN(1, &word_data);
209  classify_word_and_language(1, pr_it, &word_data);
210  WERD_RES *werd_res = word_data.word;
211  WERD_CHOICE *best_choice = werd_res->best_choice;
212  ASSERT_HOST(best_choice != nullptr);
213 
214  // Compute the number of unichars in the label.
215  std::vector<UNICHAR_ID> encoding;
216  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
217  tprintf("Not outputting illegal unichar %s\n", label);
218  return;
219  }
220 
221  // Dump all paths through the ratings matrix (which is normally small).
222  int dim = werd_res->ratings->dimension();
223  const auto **blob_choices = new const BLOB_CHOICE *[dim];
224  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
225  delete[] blob_choices;
226 }
227 
228 } // namespace tesseract
#define NOT_CLASSIFIED
Definition: matrix.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int16_t kMaxBoxEdgeDiff
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:146
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166
Dict & getDict() override
void recog_training_segmented(const char *filename, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
FILE * init_recog_training(const char *filename)
int dimension() const
Definition: matrix.h:612
WERD_CHOICE * best_choice
Definition: pageres.h:239
const UNICHARSET * uch_set
Definition: pageres.h:201
MATRIX * ratings
Definition: pageres.h:235
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:353
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * restart_page()
Definition: pageres.h:710
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
float certainty() const
Definition: ratngs.h:87
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81
float rating() const
Definition: ratngs.h:84
TDimension left() const
Definition: rect.h:82
void rotate(const FCOORD &vec)
Definition: rect.h:210
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
TBOX bounding_box() const
Definition: werd.cpp:155
UNICHARSET unicharset
Definition: ccutil.h:61
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279