tesseract  5.0.0
ocrpara.h
Go to the documentation of this file.
1 // File: ocrpara.h
3 // Description: OCR Paragraph Output Type
4 // Author: David Eger
5 //
6 // (C) Copyright 2010, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
20 #define TESSERACT_CCSTRUCT_OCRPARA_H_
21 
22 #include "elst.h"
23 
24 #include <tesseract/publictypes.h>
25 
26 namespace tesseract {
27 
28 class ParagraphModel;
29 
30 struct PARA : public ELIST_LINK {
31 public:
32  PARA()
33  : model(nullptr)
34  , is_list_item(false)
35  , is_very_first_or_continuation(false)
36  , has_drop_cap(false) {}
37 
38  // We do not own the model, we just reference it.
39  // model may be nullptr if there is not a good model for this paragraph.
40  const ParagraphModel *model;
41 
42  bool is_list_item;
43 
44  // The first paragraph on a page often lacks a first line indent, but should
45  // still be modeled by the same model as other body text paragraphs on the
46  // page.
47  bool is_very_first_or_continuation;
48 
49  // Does this paragraph begin with a drop cap?
50  bool has_drop_cap;
51 };
52 
53 ELISTIZEH(PARA)
54 
55 // A geometric model of paragraph indentation and alignment.
56 //
57 // Measurements are in pixels. The meaning of the integer arguments changes
58 // depending upon the value of justification. Distances less than or equal
59 // to tolerance apart we take as "equivalent" for the purpose of model
60 // matching, and in the examples below, we assume tolerance is zero.
61 //
62 // justification = LEFT:
63 // margin the "ignored" margin to the left block edge.
64 // first_indent indent from the left margin to a typical first text line.
65 // body_indent indent from the left margin of a typical body text line.
66 //
67 // justification = RIGHT:
68 // margin the "ignored" margin to the right block edge.
69 // first_indent indent from the right margin to a typical first text line.
70 // body_indent indent from the right margin of a typical body text line.
71 //
72 // justification = CENTER:
73 // margin ignored
74 // first_indent ignored
75 // body_indent ignored
76 //
77 // ====== Extended example, assuming each letter is ten pixels wide: =======
78 //
79 // +--------------------------------+
80 // | Awesome | ParagraphModel(CENTER, 0, 0, 0)
81 // | Centered Title |
82 // | Paragraph Detection |
83 // | OCR TEAM |
84 // | 10 November 2010 |
85 // | |
86 // | Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
87 // |This paragraph starts at the top|
88 // |of the page and takes 3 lines. |
89 // | Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
90 // |which indicates that the first |
91 // |paragraph is not a continuation |
92 // |from a previous page, as it is |
93 // |indented just like this second |
94 // |paragraph. |
95 // | Here is a block quote. It | ParagraphModel(LEFT, 30, 0, 0)
96 // | looks like the prior text |
97 // | but it is indented more |
98 // | and is fully justified. |
99 // | So how does one deal with | ParagraphModel(LEFT, 0, 20, 0)
100 // |centered text, block quotes, |
101 // |normal paragraphs, and lists |
102 // |like what follows? |
103 // |1. Make a plan. | ParagraphModel(LEFT, 0, 0, 30)
104 // |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
105 // | looking for lines where the |
106 // | first word of the next line |
107 // | would fit on the previous |
108 // | line. |
109 // |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
110 // | Python and try it out. |
111 // |4. Determine how to fix the | ParagraphModel(LEFT, 0, 0, 30)
112 // | mistakes. |
113 // |5. Repeat. | ParagraphModel(LEFT, 0, 0, 30)
114 // | For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
115 // |you can try to identify source |
116 // |code. Ouch! |
117 // +--------------------------------+
119 public:
120  ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent,
121  int body_indent, int tolerance)
122  : justification_(justification)
123  , margin_(margin)
124  , first_indent_(first_indent)
125  , body_indent_(body_indent)
126  , tolerance_(tolerance) {
127  // Make one of {first_indent, body_indent} is 0.
128  int added_margin = first_indent;
129  if (body_indent < added_margin) {
130  added_margin = body_indent;
131  }
132  margin_ += added_margin;
133  first_indent_ -= added_margin;
134  body_indent_ -= added_margin;
135  }
136 
138  : justification_(tesseract::JUSTIFICATION_UNKNOWN)
139  , margin_(0)
140  , first_indent_(0)
141  , body_indent_(0)
142  , tolerance_(0) {}
143 
144  // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
145  // in a block of text which we are trying to model:
146  // lmargin, lindent: these add up to the distance from the leftmost ink
147  // in the text line to the surrounding text block's left
148  // edge.
149  // rmargin, rindent: these add up to the distance from the rightmost ink
150  // in the text line to the surrounding text block's right
151  // edge.
152  // The caller determines the division between "margin" and "indent", which
153  // only actually affect whether we think the line may be centered.
154  //
155  // If the amount of whitespace matches the amount of whitespace expected on
156  // the relevant side of the line (within tolerance_) we say it matches.
157 
158  // Return whether a given text line could be a first paragraph line according
159  // to this paragraph model.
160  bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
161 
162  // Return whether a given text line could be a first paragraph line according
163  // to this paragraph model.
164  bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
165 
167  return justification_;
168  }
169  int margin() const {
170  return margin_;
171  }
172  int first_indent() const {
173  return first_indent_;
174  }
175  int body_indent() const {
176  return body_indent_;
177  }
178  int tolerance() const {
179  return tolerance_;
180  }
181  bool is_flush() const {
182  return (justification_ == tesseract::JUSTIFICATION_LEFT ||
183  justification_ == tesseract::JUSTIFICATION_RIGHT) &&
184  abs(first_indent_ - body_indent_) <= tolerance_;
185  }
186 
187  // Return whether this model is likely to agree with the other model on most
188  // paragraphs they are marked.
189  bool Comparable(const ParagraphModel &other) const;
190 
191  std::string ToString() const;
192 
193 private:
194  tesseract::ParagraphJustification justification_;
195  int margin_;
196  int first_indent_;
197  int body_indent_;
198  int tolerance_;
199 };
200 
201 } // namespace tesseract
202 
203 #endif // TESSERACT_CCSTRUCT_OCRPARA_H_
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
ParagraphJustification
Definition: publictypes.h:248
@ JUSTIFICATION_LEFT
Definition: publictypes.h:250
@ JUSTIFICATION_UNKNOWN
Definition: publictypes.h:249
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:252
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
const ParagraphModel * model
Definition: ocrpara.h:40
bool has_drop_cap
Definition: ocrpara.h:50
bool is_list_item
Definition: ocrpara.h:42
bool is_very_first_or_continuation
Definition: ocrpara.h:47
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:166
bool is_flush() const
Definition: ocrpara.h:181
int body_indent() const
Definition: ocrpara.h:175
int tolerance() const
Definition: ocrpara.h:178
ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent, int body_indent, int tolerance)
Definition: ocrpara.h:120
int first_indent() const
Definition: ocrpara.h:172
#define TESS_API
Definition: export.h:34