tesseract  5.0.0
linerec.cpp
Go to the documentation of this file.
1 // File: linerec.cpp
3 // Description: Top-level line-based recognition module for Tesseract.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2013, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
17 
18 #include "tesseractclass.h"
19 
20 #include <allheaders.h>
21 #include "boxread.h"
22 #include "imagedata.h" // for ImageData
23 #include "lstmrecognizer.h"
24 #include "pageres.h"
25 #include "recodebeam.h"
26 #include "tprintf.h"
27 
28 #include <algorithm>
29 
30 namespace tesseract {
31 
// Multiplier applied to a word's certainty so that the value is more
// comparable to the certainties used by Tesseract.
constexpr float kCertaintyScale = 7.0f;
// Lowest certainty at which a dictionary word is still acceptable.
constexpr float kWorstDictCertainty = -25.0f;
36 
37 // Generates training data for training a line recognizer, eg LSTM.
38 // Breaks the page into lines, according to the boxes, and writes them to a
39 // serialized DocumentData based on output_basename.
40 // Return true if successful, false if an error occurred.
41 bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
42  BLOCK_LIST *block_list) {
43  std::string lstmf_name = output_basename + ".lstmf";
44  DocumentData images(lstmf_name);
45  if (applybox_page > 0) {
46  // Load existing document for the previous pages.
47  if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
48  tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
49  return false;
50  }
51  }
52  std::vector<TBOX> boxes;
53  std::vector<std::string> texts;
54  // Get the boxes for this page, if there are any.
55  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
56  boxes.empty()) {
57  tprintf("Failed to read boxes from %s\n", input_imagename);
58  return false;
59  }
60  TrainFromBoxes(boxes, texts, block_list, &images);
61  if (images.PagesSize() == 0) {
62  tprintf("Failed to read pages from %s\n", input_imagename);
63  return false;
64  }
65  images.Shuffle();
66  if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
67  tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
68  return false;
69  }
70  return true;
71 }
72 
73 // Generates training data for training a line recognizer, eg LSTM.
74 // Breaks the boxes into lines, normalizes them, converts to ImageData and
75 // appends them to the given training_data.
76 void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
77  BLOCK_LIST *block_list, DocumentData *training_data) {
78  auto box_count = boxes.size();
79  // Process all the text lines in this page, as defined by the boxes.
80  unsigned end_box = 0;
81  // Don't let \t, which marks newlines in the box file, get into the line
82  // content, as that makes the line unusable in training.
83  while (end_box < texts.size() && texts[end_box] == "\t") {
84  ++end_box;
85  }
86  for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
87  // Find the textline of boxes starting at start and their bounding box.
88  TBOX line_box = boxes[start_box];
89  std::string line_str = texts[start_box];
90  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
91  line_box += boxes[end_box];
92  line_str += texts[end_box];
93  }
94  // Find the most overlapping block.
95  BLOCK *best_block = nullptr;
96  int best_overlap = 0;
97  BLOCK_IT b_it(block_list);
98  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
99  BLOCK *block = b_it.data();
100  if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
101  continue; // Not a text block.
102  }
103  TBOX block_box = block->pdblk.bounding_box();
104  block_box.rotate(block->re_rotation());
105  if (block_box.major_overlap(line_box)) {
106  TBOX overlap_box = line_box.intersection(block_box);
107  if (overlap_box.area() > best_overlap) {
108  best_overlap = overlap_box.area();
109  best_block = block;
110  }
111  }
112  }
113  ImageData *imagedata = nullptr;
114  if (best_block == nullptr) {
115  tprintf("No block overlapping textline: %s\n", line_str.c_str());
116  } else {
117  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
118  }
119  if (imagedata != nullptr) {
120  training_data->AddPageToDocument(imagedata);
121  }
122  // Don't let \t, which marks newlines in the box file, get into the line
123  // content, as that makes the line unusable in training.
124  while (end_box < texts.size() && texts[end_box] == "\t") {
125  ++end_box;
126  }
127  }
128 }
129 
130 // Returns an Imagedata containing the image of the given box,
131 // and ground truth boxes/truth text if available in the input.
132 // The image is not normalized in any way.
133 ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
134  const std::vector<std::string> &texts, int start_box, int end_box,
135  const BLOCK &block) {
136  TBOX revised_box;
137  ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);
138  if (image_data == nullptr) {
139  return nullptr;
140  }
141  image_data->set_page_number(applybox_page);
142  // Copy the boxes and shift them so they are relative to the image.
143  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
144  ICOORD shift = -revised_box.botleft();
145  std::vector<TBOX> line_boxes;
146  std::vector<std::string> line_texts;
147  for (int b = start_box; b < end_box; ++b) {
148  TBOX box = boxes[b];
149  box.rotate(block_rotation);
150  box.move(shift);
151  line_boxes.push_back(box);
152  line_texts.push_back(texts[b]);
153  }
154  std::vector<int> page_numbers(line_boxes.size(), applybox_page);
155  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156  return image_data;
157 }
158 
159 // Helper gets the image of a rectangle, using the block.re_rotation() if
160 // needed to get to the image, and rotating the result back to horizontal
161 // layout. (CJK characters will be on their left sides) The vertical text flag
162 // is set in the returned ImageData if the text was originally vertical, which
163 // can be used to invoke a different CJK recognition engine. The revised_box
164 // is also returned to enable calculation of output bounding boxes.
165 ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
166  TBOX *revised_box) const {
167  TBOX wbox = box;
168  wbox.pad(padding, padding);
169  *revised_box = wbox;
170  // Number of clockwise 90 degree rotations needed to get back to tesseract
171  // coords from the clipped image.
172  int num_rotations = 0;
173  if (block.re_rotation().y() > 0.0f) {
174  num_rotations = 1;
175  } else if (block.re_rotation().x() < 0.0f) {
176  num_rotations = 2;
177  } else if (block.re_rotation().y() < 0.0f) {
178  num_rotations = 3;
179  }
180  // Handle two cases automatically: 1 the box came from the block, 2 the box
181  // came from a box file, and refers to the image, which the block may not.
182  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
183  revised_box->rotate(block.re_rotation());
184  }
185  // Now revised_box always refers to the image.
186  // BestPix is never colormapped, but may be of any depth.
187  Image pix = BestPix();
188  int width = pixGetWidth(pix);
189  int height = pixGetHeight(pix);
190  TBOX image_box(0, 0, width, height);
191  // Clip to image bounds;
192  *revised_box &= image_box;
193  if (revised_box->null_box()) {
194  return nullptr;
195  }
196  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
197  revised_box->height());
198  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
199  boxDestroy(&clip_box);
200  if (box_pix == nullptr) {
201  return nullptr;
202  }
203  if (num_rotations > 0) {
204  Image rot_pix = pixRotateOrth(box_pix, num_rotations);
205  box_pix.destroy();
206  box_pix = rot_pix;
207  }
208  // Convert sub-8-bit images to 8 bit.
209  int depth = pixGetDepth(box_pix);
210  if (depth < 8) {
211  Image grey;
212  grey = pixConvertTo8(box_pix, false);
213  box_pix.destroy();
214  box_pix = grey;
215  }
216  bool vertical_text = false;
217  if (num_rotations > 0) {
218  // Rotated the clipped revised box back to internal coordinates.
219  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
220  revised_box->rotate(rotation);
221  if (num_rotations != 2) {
222  vertical_text = true;
223  }
224  }
225  return new ImageData(vertical_text, box_pix);
226 }
227 
228 // Recognizes a word or group of words, converting to WERD_RES in *words.
229 // Analogous to classify_word_pass1, but can handle a group of words as well.
230 void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
231  PointerVector<WERD_RES> *words) {
232  TBOX word_box = word->word->bounding_box();
233  // Get the word image - no frills.
234  if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
235  // In single word mode, use the whole image without any other row/word
236  // interpretation.
237  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238  } else {
239  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240  if (baseline + row->descenders() < word_box.bottom()) {
241  word_box.set_bottom(baseline + row->descenders());
242  }
243  if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
244  word_box.set_top(baseline + row->x_height() + row->ascenders());
245  }
246  }
247  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
248  if (im_data == nullptr) {
249  return;
250  }
251 
252  bool do_invert = tessedit_do_invert;
253  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
254  kWorstDictCertainty / kCertaintyScale, word_box, words,
255  lstm_choice_mode, lstm_choice_iterations);
256  delete im_data;
257  SearchWords(words);
258 }
259 
260 // Apply segmentation search to the given set of words, within the constraints
261 // of the existing ratings matrix. If there is already a best_choice on a word
262 // leaves it untouched and just sets the done/accepted etc flags.
264  // Run the segmentation search on the network outputs and make a BoxWord
265  // for each of the output words.
266  // If we drop a word as junk, then there is always a space in front of the
267  // next.
268  const Dict *stopper_dict = lstm_recognizer_->GetDict();
269  if (stopper_dict == nullptr) {
270  stopper_dict = &getDict();
271  }
272  for (unsigned w = 0; w < words->size(); ++w) {
273  WERD_RES *word = (*words)[w];
274  if (word->best_choice == nullptr) {
275  // It is a dud.
276  word->SetupFake(lstm_recognizer_->GetUnicharset());
277  } else {
278  // Set the best state.
279  for (unsigned i = 0; i < word->best_choice->length(); ++i) {
280  int length = word->best_choice->state(i);
281  word->best_state.push_back(length);
282  }
283  word->reject_map.initialise(word->best_choice->length());
284  word->tess_failed = false;
285  word->tess_accepted = true;
286  word->tess_would_adapt = false;
287  word->done = true;
288  word->tesseract = this;
289  float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
290  word_certainty *= kCertaintyScale;
291  if (getDict().stopper_debug_level >= 1) {
292  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
293  word->best_choice->certainty(), word->space_certainty,
294  std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
295  word_certainty);
296  word->best_choice->print();
297  }
298  word->best_choice->set_certainty(word_certainty);
299 
300  word->tess_accepted = stopper_dict->AcceptableResult(word);
301  }
302  }
303 }
304 
305 } // namespace tesseract.
@ TBOX
const float kWorstDictCertainty
Definition: linerec.cpp:35
@ PSM_SINGLE_WORD
Treat the image as a single word.
Definition: publictypes.h:170
const float kCertaintyScale
Definition: linerec.cpp:33
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kImagePadding
Definition: imagedata.h:39
@ baseline
Definition: mfoutline.h:53
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:75
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:230
bool TrainLineRecognizer(const char *input_imagename, const std::string &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:41
ImageData * GetLineData(const TBOX &line_box, const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:133
Dict & getDict() override
Image BestPix() const
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:165
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:263
void TrainFromBoxes(const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:76
void destroy()
Definition: image.cpp:32
void AddBoxes(const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, const std::vector< int > &box_pages)
Definition: imagedata.cpp:315
void set_page_number(int num)
Definition: imagedata.h:92
TESS_API bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:421
size_t PagesSize() const
Definition: imagedata.h:198
TESS_API bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:402
TESS_API void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:433
FCOORD re_rotation() const
Definition: ocrblock.h:129
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
float x_height() const
Definition: ocrrow.h:66
float base_line(float xpos) const
Definition: ocrrow.h:61
float ascenders() const
Definition: ocrrow.h:84
float descenders() const
Definition: ocrrow.h:87
tesseract::Tesseract * tesseract
Definition: pageres.h:278
WERD_CHOICE * best_choice
Definition: pageres.h:239
std::vector< int > best_state
Definition: pageres.h:283
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:353
float space_certainty
Definition: pageres.h:319
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
integer coordinate
Definition: points.h:36
float y() const
Definition: points.h:209
float x() const
Definition: points.h:206
bool IsText() const
Definition: polyblk.h:52
float certainty() const
Definition: ratngs.h:311
void set_certainty(float new_val)
Definition: ratngs.h:353
unsigned state(unsigned index) const
Definition: ratngs.h:299
unsigned length() const
Definition: ratngs.h:283
void print() const
Definition: ratngs.h:557
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
void move(const ICOORD vec)
Definition: rect.h:170
const ICOORD & botleft() const
Definition: rect.h:102
void rotate(const FCOORD &vec)
Definition: rect.h:210
TDimension top() const
Definition: rect.h:68
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:84
bool null_box() const
Definition: rect.h:60
void set_bottom(int y)
Definition: rect.h:78
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void pad(int xpad, int ypad)
Definition: rect.h:144
bool major_overlap(const TBOX &box) const
Definition: rect.h:374
int32_t area() const
Definition: rect.h:134
void set_top(int y)
Definition: rect.h:71
void initialise(uint16_t length)
Definition: rejctmap.cpp:67
TBOX bounding_box() const
Definition: werd.cpp:155
unsigned size() const
Definition: genericvector.h:74
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:111
const UNICHARSET & GetUnicharset() const
const Dict * GetDict() const
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)