tesseract  5.0.0
paragraphs_internal.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: paragraphs_internal.h
3  * Description: Paragraph Detection internal data structures.
4  * Author: David Eger
5  *
6  * (C) Copyright 2011, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
20 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21 
22 #include <tesseract/publictypes.h> // for ParagraphJustification
23 #include "paragraphs.h"
24 
25 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
26 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
27 
28 namespace tesseract {
29 
30 class UNICHARSET;
31 class WERD_CHOICE;
32 
33 // Return whether the given word is likely to be a list item start word.
35 bool AsciiLikelyListItem(const std::string &word);
36 
37 // Set right word attributes given either a unicharset and werd or a utf8
38 // string.
40 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
41  bool *is_list, bool *starts_idea, bool *ends_idea);
42 
43 // Set left word attributes given either a unicharset and werd or a utf8 string.
45 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
46  bool *is_list, bool *starts_idea, bool *ends_idea);
47 
// Classification of a text line with respect to paragraph structure.
// Each enumerator's value is a mnemonic character.
enum LineType {
  // First line of a paragraph.
  LT_START = 'S',
  // Continuation line of a paragraph.
  LT_BODY = 'C',
  // No clues.
  LT_UNKNOWN = 'U',
  // Matches for both LT_START and LT_BODY.
  LT_MULTIPLE = 'M'
};
54 
55 // The first paragraph in a page of body text is often un-indented.
56 // This is a typographic convention which is common to indicate either that:
57 // (1) The paragraph is the continuation of a previous paragraph, or
58 // (2) The paragraph is the first paragraph in a chapter.
59 //
60 // I refer to such paragraphs as "crown"s, and the output of the paragraph
61 // detection algorithm attempts to give them the same paragraph model as
62 // the rest of the body text.
63 //
64 // Nonetheless, while building hypotheses, it is useful to mark the lines
65 // of crown paragraphs temporarily as crowns, either aligned left or right.
66 extern const ParagraphModel *kCrownLeft;
67 extern const ParagraphModel *kCrownRight;
68 
69 inline bool StrongModel(const ParagraphModel *model) {
70  return model != nullptr && model != kCrownLeft && model != kCrownRight;
71 }
72 
74  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
75  LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}
76  LineHypothesis(const LineHypothesis &other) = default;
77 
78  // Copy assignment operator.
79  LineHypothesis &operator=(const LineHypothesis &other) = default;
80 
81  bool operator==(const LineHypothesis &other) const {
82  return ty == other.ty && model == other.model;
83  }
84 
87 };
88 
89 class ParagraphTheory; // Forward Declaration
90 
91 using SetOfModels = std::vector<const ParagraphModel *>;
92 
93 // Row Scratch Registers are data generated by the paragraph detection
94 // algorithm based on a RowInfo input.
96 public:
97  // We presume row will outlive us.
98  void Init(const RowInfo &row);
99 
100  LineType GetLineType() const;
101 
102  LineType GetLineType(const ParagraphModel *model) const;
103 
104  // Mark this as a start line type, sans model. This is useful for the
105  // initial marking of probable body lines or paragraph start lines.
106  void SetStartLine();
107 
108  // Mark this as a body line type, sans model. This is useful for the
109  // initial marking of probable body lines or paragraph start lines.
110  void SetBodyLine();
111 
112  // Record that this row fits as a paragraph start line in the given model.
113  void AddStartLine(const ParagraphModel *model);
114  // Record that this row fits as a paragraph body line in the given model.
115  void AddBodyLine(const ParagraphModel *model);
116 
117  // Clear all hypotheses about this line.
118  void SetUnknown() {
119  hypotheses_.clear();
120  }
121 
122  // Append all hypotheses of strong models that match this row as a start.
123  void StartHypotheses(SetOfModels *models) const;
124 
125  // Append all hypotheses of strong models matching this row.
126  void StrongHypotheses(SetOfModels *models) const;
127 
128  // Append all hypotheses for this row.
129  void NonNullHypotheses(SetOfModels *models) const;
130 
131  // Discard any hypotheses whose model is not in the given list.
132  void DiscardNonMatchingHypotheses(const SetOfModels &models);
133 
134  // If we have only one hypothesis and that is that this line is a paragraph
135  // start line of a certain model, return that model. Else return nullptr.
136  const ParagraphModel *UniqueStartHypothesis() const;
137 
138  // If we have only one hypothesis and that is that this line is a paragraph
139  // body line of a certain model, return that model. Else return nullptr.
140  const ParagraphModel *UniqueBodyHypothesis() const;
141 
142  // Return the indentation for the side opposite of the aligned side.
144  switch (just) {
146  return lindent_;
148  return rindent_;
149  default:
150  return lindent_ > rindent_ ? lindent_ : rindent_;
151  }
152  }
153 
154  // Return the indentation for the side the text is aligned to.
156  switch (just) {
158  return rindent_;
160  return lindent_;
161  default:
162  return lindent_ > rindent_ ? lindent_ : rindent_;
163  }
164  }
165 
166  // Append header fields to a vector of row headings.
167  static void AppendDebugHeaderFields(std::vector<std::string> &header);
168 
169  // Append data for this row to a vector of debug strings.
170  void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
171 
172  const RowInfo *ri_;
173 
174  // These four values form a horizontal box model for the white space
175  // on the edges of each line. At each point in the algorithm, the following
176  // shall hold:
177  // ri_->pix_ldistance = lmargin_ + lindent_
178  // ri_->pix_rdistance = rindent_ + rmargin_
179  int lmargin_;
180  int lindent_;
181  int rindent_;
182  int rmargin_;
183 
184 private:
185  // Hypotheses of either LT_START or LT_BODY
186  std::vector<LineHypothesis> hypotheses_;
187 };
188 
189 // A collection of convenience functions for wrapping the set of
190 // Paragraph Models we believe correctly model the paragraphs in the image.
192 public:
193  // We presume models will outlive us, and that models will take ownership
194  // of any ParagraphModel *'s we add.
195  explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {}
196  std::vector<ParagraphModel *> &models() {
197  return *models_;
198  }
199  const std::vector<ParagraphModel *> &models() const {
200  return *models_;
201  }
202 
203  // Return an existing model if one that is Comparable() can be found.
204  // Else, allocate a new copy of model to save and return a pointer to it.
205  const ParagraphModel *AddModel(const ParagraphModel &model);
206 
207  // Discard any models we've made that are not in the list of used models.
208  void DiscardUnusedModels(const SetOfModels &used_models);
209 
210  // Return the set of all non-centered models.
212 
213  // If any of the non-centered paragraph models we know about fit
214  // rows[start, end), return it. Else nullptr.
215  const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
216  int end) const;
217 
218  int IndexOf(const ParagraphModel *model) const;
219 
220 private:
221  std::vector<ParagraphModel *> *models_;
222  std::vector<ParagraphModel *> models_we_added_;
223 };
224 
225 bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
226  const ParagraphModel *model);
227 bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
228  const ParagraphModel *model);
229 bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
230  const ParagraphModel *model);
231 
232 // A class for smearing Paragraph Model hypotheses to surrounding rows.
233 // The idea here is that StrongEvidenceClassify first marks only exceedingly
234 // obvious start and body rows and constructs models of them. Thereafter,
235 // we may have left over unmarked lines (mostly end-of-paragraph lines) which
236 // were too short to have much confidence about, but which fit the models we've
237 // constructed perfectly and which we ought to mark. This class is used to
238 // "smear" our models over the text.
240 public:
241  ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
242  ParagraphTheory *theory);
243 
244  // Smear forward paragraph models from existing row markings to subsequent
245  // text lines if they fit, and mark any thereafter still unmodeled rows
246  // with any model in the theory that fits them.
247  void Smear();
248 
249 private:
250  // Record in open_models_ for rows [row_start, row_end) the list of models
251  // currently open at each row.
252  // A model is still open in a row if some previous row has said model as a
253  // start hypothesis, and all rows since (including this row) would fit as
254  // either a body or start line in that model.
255  void CalculateOpenModels(int row_start, int row_end);
256 
257  SetOfModels &OpenModels(int row) {
258  return open_models_[row - row_start_ + 1];
259  }
260 
261  ParagraphTheory *theory_;
262  std::vector<RowScratchRegisters> *rows_;
263  int row_start_;
264  int row_end_;
265 
266  // open_models_ corresponds to rows[row_start_ - 1, row_end_]
267  //
268  // open_models_: Contains models for which there was an active (open) paragraph
269  // as of the previous line and for which the left and right
270  // indents admit the possibility that this text line continues
271  // to fit the same model.
272  // TODO(eger): Think about whether we can get rid of "Open" models and just
273  // use the current hypotheses on RowScratchRegisters.
274  std::vector<SetOfModels> open_models_;
275 };
276 
277 // Clear all hypotheses about lines [start, end) and reset the margins to the
278 // percentile (0..100) value of the left and right row edges for this run of
279 // rows.
280 void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
281  int end, int percentile);
282 
283 // Return the median inter-word space in rows[row_start, row_end).
284 int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
285 
286 // Return whether the first word on the after line can fit in the space at
287 // the end of the before line (knowing which way the text is aligned and read).
288 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
289  tesseract::ParagraphJustification justification);
290 
291 // Return whether the first word on the after line can fit in the space at
292 // the end of the before line (not knowing the text alignment).
293 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
294 
295 // Do rows[start, end) form a single instance of the given paragraph model?
296 bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
297  const ParagraphModel *model);
298 
299 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
300 // normalize each row_owner to point to an actual PARA, and output the
301 // paragraphs in order onto paragraphs.
302 void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
303 
304 } // namespace tesseract
305 
306 #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
bool StrongModel(const ParagraphModel *model)
ParagraphJustification
Definition: publictypes.h:248
@ JUSTIFICATION_LEFT
Definition: publictypes.h:250
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:252
std::vector< const ParagraphModel * > SetOfModels
int InterwordSpace(const std::vector< RowScratchRegisters > &rows, int row_start, int row_end)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool RowsFitModel(const std::vector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void RecomputeMarginsAndClearHypotheses(std::vector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:477
void CanonicalizeDetectionResults(std::vector< PARA * > *row_owners, PARA_LIST *paragraphs)
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:56
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:58
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:431
bool CrownCompatible(const std::vector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool AsciiLikelyListItem(const std::string &word)
Definition: paragraphs.cpp:282
bool operator==(const LineHypothesis &other) const
LineHypothesis & operator=(const LineHypothesis &other)=default
const ParagraphModel * model
LineHypothesis(LineType line_type, const ParagraphModel *m)
LineHypothesis(const LineHypothesis &other)=default
int AlignsideIndent(tesseract::ParagraphJustification just) const
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:645
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:669
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:661
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:637
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:653
static void AppendDebugHeaderFields(std::vector< std::string > &header)
Definition: paragraphs.cpp:510
void AppendDebugInfo(const ParagraphTheory &theory, std::vector< std::string > &dbg) const
Definition: paragraphs.cpp:515
int OffsideIndent(tesseract::ParagraphJustification just) const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:684
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:629
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:676
void Init(const RowInfo &row)
Definition: paragraphs.cpp:548
void NonCenteredModels(SetOfModels *models)
std::vector< ParagraphModel * > & models()
const std::vector< ParagraphModel * > & models() const
ParagraphTheory(std::vector< ParagraphModel * > *models)
const ParagraphModel * Fits(const std::vector< RowScratchRegisters > *rows, int start, int end) const
void DiscardUnusedModels(const SetOfModels &used_models)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * AddModel(const ParagraphModel &model)
ParagraphModelSmearer(std::vector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
#define TESS_API
Definition: export.h:34