tesseract  5.0.0
paragraphs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: paragraphs.cpp
3  * Description: Paragraph detection for tesseract.
4  * Author: David Eger
5  *
6  * (C) Copyright 2011, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "paragraphs.h"
20 
21 #include "helpers.h" // for UpdateRange, ClipToRange
22 #include "host.h" // for NearlyEqual
23 #include "mutableiterator.h" // for MutableIterator
24 #include "ocrblock.h" // for BLOCK
25 #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
26 #include "ocrrow.h" // for ROW
27 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
28 #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
29 #include "pdblock.h" // for PDBLK
30 #include "polyblk.h" // for POLY_BLOCK
31 #include "ratngs.h" // for WERD_CHOICE
32 #include "rect.h" // for TBOX
33 #include "statistc.h" // for STATS
34 #include "tprintf.h" // for tprintf
35 #include "unicharset.h" // for UNICHARSET
36 #include "werd.h" // for WERD, W_REP_CHAR
37 
38 #include <tesseract/pageiterator.h> // for PageIterator
39 #include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
40 #include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID
41 
42 #include <algorithm> // for max
43 #include <cctype> // for isspace
44 #include <cmath> // for abs
45 #include <cstdio> // for snprintf
46 #include <cstdlib> // for abs
47 #include <cstring> // for strchr, strlen
48 #include <memory> // for unique_ptr
49 
50 static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
51 static const char *const kPDF = "\u202C"; // Pop Directional Formatting
52 
53 namespace tesseract {
54 
55 // Special "weak" ParagraphModels.
57  reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
59  reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
60 
61 // Do the text and geometry of two rows support a paragraph break between them?
62 static bool LikelyParagraphStart(const RowScratchRegisters &before,
63  const RowScratchRegisters &after,
65 
66 // Given the width of a typical space between words, what is the threshold
67 // by which by which we think left and right alignments for paragraphs
68 // can vary and still be aligned.
69 static int Epsilon(int space_pix) {
70  return space_pix * 4 / 5;
71 }
72 
73 static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
74  const std::vector<RowScratchRegisters> *rows, int row_start,
75  int row_end) {
76  if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
77  tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
78  rows->size());
79  return false;
80  }
81  if (row_end - row_start < min_num_rows) {
82  if (debug_level > 1) {
83  tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
84  }
85  return false;
86  }
87  return true;
88 }
89 
90 // =============================== Debug Code ================================
91 
92 // Given a row-major matrix of unicode text and a column separator, print
93 // a formatted table. For ASCII, we get good column alignment.
94 static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
95  std::vector<int> max_col_widths;
96  for (const auto &row : rows) {
97  auto num_columns = row.size();
98  for (size_t c = 0; c < num_columns; c++) {
99  int num_unicodes = 0;
100  for (char i : row[c]) {
101  if ((i & 0xC0) != 0x80) {
102  num_unicodes++;
103  }
104  }
105  if (c >= max_col_widths.size()) {
106  max_col_widths.push_back(num_unicodes);
107  } else {
108  if (num_unicodes > max_col_widths[c]) {
109  max_col_widths[c] = num_unicodes;
110  }
111  }
112  }
113  }
114 
115  std::vector<std::string> col_width_patterns;
116  col_width_patterns.reserve(max_col_widths.size());
117  for (int max_col_width : max_col_widths) {
118  col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s");
119  }
120 
121  for (const auto &row : rows) {
122  for (unsigned c = 0; c < row.size(); c++) {
123  if (c > 0) {
124  tprintf("%s", colsep);
125  }
126  tprintf(col_width_patterns[c].c_str(), row[c].c_str());
127  }
128  tprintf("\n");
129  }
130 }
131 
132 static std::string RtlEmbed(const std::string &word, bool rtlify) {
133  if (rtlify) {
134  return std::string(kRLE) + word + std::string(kPDF);
135  }
136  return word;
137 }
138 
139 // Print the current thoughts of the paragraph detector.
140 static void PrintDetectorState(const ParagraphTheory &theory,
141  const std::vector<RowScratchRegisters> &rows) {
142  std::vector<std::vector<std::string>> output;
143  output.emplace_back();
144  output.back().push_back("#row");
145  output.back().push_back("space");
146  output.back().push_back("..");
147  output.back().push_back("lword[widthSEL]");
148  output.back().push_back("rword[widthSEL]");
150  output.back().push_back("text");
151 
152  for (unsigned i = 0; i < rows.size(); i++) {
153  output.emplace_back();
154  std::vector<std::string> &row = output.back();
155  const RowInfo &ri = *rows[i].ri_;
156  row.push_back(std::to_string(i));
157  row.push_back(std::to_string(ri.average_interword_space));
158  row.emplace_back(ri.has_leaders ? ".." : " ");
159  row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
160  (ri.lword_likely_starts_idea ? "S" : "s") +
161  (ri.lword_likely_ends_idea ? "E" : "e") +
162  (ri.lword_indicates_list_item ? "L" : "l") + "]");
163  row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
164  (ri.rword_likely_starts_idea ? "S" : "s") +
165  (ri.rword_likely_ends_idea ? "E" : "e") +
166  (ri.rword_indicates_list_item ? "L" : "l") + "]");
167  rows[i].AppendDebugInfo(theory, row);
168  row.push_back(RtlEmbed(ri.text, !ri.ltr));
169  }
170  PrintTable(output, " ");
171 
172  tprintf("Active Paragraph Models:\n");
173  unsigned m = 0;
174  for (const auto &model : theory.models()) {
175  tprintf(" %d: %s\n", ++m, model->ToString().c_str());
176  }
177 }
178 
179 static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
180  const std::vector<RowScratchRegisters> &rows) {
181  if (!should_print) {
182  return;
183  }
184  tprintf("# %s\n", phase);
185  PrintDetectorState(theory, rows);
186 }
187 
188 // Print out the text for rows[row_start, row_end)
189 static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
190  int row_end) {
191  tprintf("======================================\n");
192  for (int row = row_start; row < row_end; row++) {
193  tprintf("%s\n", rows[row].ri_->text.c_str());
194  }
195  tprintf("======================================\n");
196 }
197 
198 // ============= Brain Dead Language Model (ASCII Version) ===================
199 
200 static bool IsLatinLetter(int ch) {
201  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
202 }
203 
204 static bool IsDigitLike(int ch) {
205  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
206 }
207 
208 static bool IsOpeningPunct(int ch) {
209  return strchr("'\"({[", ch) != nullptr;
210 }
211 
212 static bool IsTerminalPunct(int ch) {
213  return strchr(":'\".?!]})", ch) != nullptr;
214 }
215 
216 // Return a pointer after consuming as much text as qualifies as roman numeral.
217 static const char *SkipChars(const char *str, const char *toskip) {
218  while (*str != '\0' && strchr(toskip, *str)) {
219  str++;
220  }
221  return str;
222 }
223 
224 static const char *SkipChars(const char *str, bool (*skip)(int)) {
225  while (*str != '\0' && skip(*str)) {
226  str++;
227  }
228  return str;
229 }
230 
231 static const char *SkipOne(const char *str, const char *toskip) {
232  if (*str != '\0' && strchr(toskip, *str)) {
233  return str + 1;
234  }
235  return str;
236 }
237 
238 // Return whether it is very likely that this is a numeral marker that could
239 // start a list item. Some examples include:
240 // A I iii. VI (2) 3.5. [C-4]
241 static bool LikelyListNumeral(const std::string &word) {
242  const char *kRomans = "ivxlmdIVXLMD";
243  const char *kDigits = "012345789";
244  const char *kOpen = "[{(";
245  const char *kSep = ":;-.,";
246  const char *kClose = "]})";
247 
248  int num_segments = 0;
249  const char *pos = word.c_str();
250  while (*pos != '\0' && num_segments < 3) {
251  // skip up to two open parens.
252  const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
253  const char *numeral_end = SkipChars(numeral_start, kRomans);
254  if (numeral_end != numeral_start) {
255  // Got Roman Numeral. Great.
256  } else {
257  numeral_end = SkipChars(numeral_start, kDigits);
258  if (numeral_end == numeral_start) {
259  // If there's a single latin letter, we can use that.
260  numeral_end = SkipChars(numeral_start, IsLatinLetter);
261  if (numeral_end - numeral_start != 1) {
262  break;
263  }
264  }
265  }
266  // We got some sort of numeral.
267  num_segments++;
268  // Skip any trailing parens or punctuation.
269  pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270  if (pos == numeral_end) {
271  break;
272  }
273  }
274  return *pos == '\0';
275 }
276 
277 static bool LikelyListMark(const std::string &word) {
278  const char *kListMarks = "0Oo*.,+.";
279  return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
280 }
281 
282 bool AsciiLikelyListItem(const std::string &word) {
283  return LikelyListMark(word) || LikelyListNumeral(word);
284 }
285 
286 // ========== Brain Dead Language Model (Tesseract Version) ================
287 
288 // Return the first Unicode Codepoint from werd[pos].
289 static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
290  if (!u || !werd || pos > werd->length()) {
291  return 0;
292  }
293  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
294 }
295 
296 // A useful helper class for finding the first j >= i so that word[j]
297 // does not have given character type.
299 public:
300  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
301  : u_(unicharset), word_(word), wordlen_(word->length()) {
302  }
303 
304  // Given an input position, return the first position >= pos not punc.
305  unsigned SkipPunc(unsigned pos);
306  // Given an input position, return the first position >= pos not digit.
307  unsigned SkipDigits(unsigned pos);
308  // Given an input position, return the first position >= pos not roman.
309  unsigned SkipRomans(unsigned pos);
310  // Given an input position, return the first position >= pos not alpha.
311  unsigned SkipAlpha(unsigned pos);
312 
313 private:
314  const UNICHARSET *u_;
315  const WERD_CHOICE *word_;
316  unsigned wordlen_;
317 };
318 
319 unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
320  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
321  pos++;
322  }
323  return pos;
324 }
325 
326 unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
327  while (pos < wordlen_ &&
328  (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
329  pos++;
330  }
331  return pos;
332 }
333 
334 unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
335  const char *kRomans = "ivxlmdIVXLMD";
336  while (pos < wordlen_) {
337  int ch = UnicodeFor(u_, word_, pos);
338  if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) {
339  break;
340  }
341  pos++;
342  }
343  return pos;
344 }
345 
346 unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
347  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
348  pos++;
349  }
350  return pos;
351 }
352 
353 static bool LikelyListMarkUnicode(int ch) {
354  if (ch < 0x80) {
355  std::string single_ch;
356  single_ch += ch;
357  return LikelyListMark(single_ch);
358  }
359  switch (ch) {
360  // TODO(eger) expand this list of unicodes as needed.
361  case 0x00B0: // degree sign
362  case 0x2022: // bullet
363  case 0x25E6: // white bullet
364  case 0x00B7: // middle dot
365  case 0x25A1: // white square
366  case 0x25A0: // black square
367  case 0x25AA: // black small square
368  case 0x2B1D: // black very small square
369  case 0x25BA: // black right-pointing pointer
370  case 0x25CF: // black circle
371  case 0x25CB: // white circle
372  return true;
373  default:
374  break; // fall through
375  }
376  return false;
377 }
378 
379 // Return whether it is very likely that this is a numeral marker that could
380 // start a list item. Some examples include:
381 // A I iii. VI (2) 3.5. [C-4]
382 static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
383  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
384  return true;
385  }
386 
387  UnicodeSpanSkipper m(u, werd);
388  int num_segments = 0;
389  unsigned pos = 0;
390  while (pos < werd->length() && num_segments < 3) {
391  auto numeral_start = m.SkipPunc(pos);
392  if (numeral_start > pos + 1) {
393  break;
394  }
395  auto numeral_end = m.SkipRomans(numeral_start);
396  if (numeral_end == numeral_start) {
397  numeral_end = m.SkipDigits(numeral_start);
398  if (numeral_end == numeral_start) {
399  // If there's a single latin letter, we can use that.
400  numeral_end = m.SkipAlpha(numeral_start);
401  if (numeral_end - numeral_start != 1) {
402  break;
403  }
404  }
405  }
406  // We got some sort of numeral.
407  num_segments++;
408  // Skip any trailing punctuation.
409  pos = m.SkipPunc(numeral_end);
410  if (pos == numeral_end) {
411  break;
412  }
413  }
414  return pos == werd->length();
415 }
416 
417 template<class T>
418 void push_back_new(std::vector<T> &vector, const T &data) {
419  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
420  vector.push_back(data);
421  }
422 }
423 
424 // ========= Brain Dead Language Model (combined entry points) ================
425 
426 // Given the leftmost word of a line either as a Tesseract unicharset + werd
427 // or a utf8 string, set the following attributes for it:
428 // is_list - this word might be a list number or bullet.
429 // starts_idea - this word is likely to start a sentence.
430 // ends_idea - this word is likely to end a sentence.
431 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
432  bool *is_list, bool *starts_idea, bool *ends_idea) {
433  *is_list = false;
434  *starts_idea = false;
435  *ends_idea = false;
436  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
437  *ends_idea = true;
438  return;
439  }
440 
441  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
442  if (UniLikelyListItem(unicharset, werd)) {
443  *is_list = true;
444  *starts_idea = true;
445  *ends_idea = true;
446  }
447  if (unicharset->get_isupper(werd->unichar_id(0))) {
448  *starts_idea = true;
449  }
450  if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
451  *starts_idea = true;
452  *ends_idea = true;
453  }
454  } else { // Assume utf8 is mostly ASCII
455  if (AsciiLikelyListItem(utf8)) {
456  *is_list = true;
457  *starts_idea = true;
458  }
459  int start_letter = utf8[0];
460  if (IsOpeningPunct(start_letter)) {
461  *starts_idea = true;
462  }
463  if (IsTerminalPunct(start_letter)) {
464  *ends_idea = true;
465  }
466  if (start_letter >= 'A' && start_letter <= 'Z') {
467  *starts_idea = true;
468  }
469  }
470 }
471 
472 // Given the rightmost word of a line either as a Tesseract unicharset + werd
473 // or a utf8 string, set the following attributes for it:
474 // is_list - this word might be a list number or bullet.
475 // starts_idea - this word is likely to start a sentence.
476 // ends_idea - this word is likely to end a sentence.
477 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
478  bool *is_list, bool *starts_idea, bool *ends_idea) {
479  *is_list = false;
480  *starts_idea = false;
481  *ends_idea = false;
482  if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
483  *ends_idea = true;
484  return;
485  }
486 
487  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
488  if (UniLikelyListItem(unicharset, werd)) {
489  *is_list = true;
490  *starts_idea = true;
491  }
492  UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
493  if (unicharset->get_ispunctuation(last_letter)) {
494  *ends_idea = true;
495  }
496  } else { // Assume utf8 is mostly ASCII
497  if (AsciiLikelyListItem(utf8)) {
498  *is_list = true;
499  *starts_idea = true;
500  }
501  int last_letter = utf8[utf8.size() - 1];
502  if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
503  *ends_idea = true;
504  }
505  }
506 }
507 
508 // =============== Implementation of RowScratchRegisters =====================
509 /* static */
510 void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
511  header.emplace_back("[lmarg,lind;rind,rmarg]");
512  header.emplace_back("model");
513 }
514 
516  std::vector<std::string> &dbg) const {
517  char s[30];
518  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_);
519  dbg.emplace_back(s);
520  std::string model_string;
521  model_string += static_cast<char>(GetLineType());
522  model_string += ":";
523 
524  int model_numbers = 0;
525  for (const auto &hypothese : hypotheses_) {
526  if (hypothese.model == nullptr) {
527  continue;
528  }
529  if (model_numbers > 0) {
530  model_string += ",";
531  }
532  if (StrongModel(hypothese.model)) {
533  model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
534  } else if (hypothese.model == kCrownLeft) {
535  model_string += "CrL";
536  } else if (hypothese.model == kCrownRight) {
537  model_string += "CrR";
538  }
539  model_numbers++;
540  }
541  if (model_numbers == 0) {
542  model_string += "0";
543  }
544 
545  dbg.push_back(model_string);
546 }
547 
549  ri_ = &row;
550  lmargin_ = 0;
551  lindent_ = row.pix_ldistance;
552  rmargin_ = 0;
553  rindent_ = row.pix_rdistance;
554 }
555 
557  if (hypotheses_.empty()) {
558  return LT_UNKNOWN;
559  }
560  bool has_start = false;
561  bool has_body = false;
562  for (const auto &hypothese : hypotheses_) {
563  switch (hypothese.ty) {
564  case LT_START:
565  has_start = true;
566  break;
567  case LT_BODY:
568  has_body = true;
569  break;
570  default:
571  tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
572  break;
573  }
574  }
575  if (has_start && has_body) {
576  return LT_MULTIPLE;
577  }
578  return has_start ? LT_START : LT_BODY;
579 }
580 
582  if (hypotheses_.empty()) {
583  return LT_UNKNOWN;
584  }
585  bool has_start = false;
586  bool has_body = false;
587  for (const auto &hypothese : hypotheses_) {
588  if (hypothese.model != model) {
589  continue;
590  }
591  switch (hypothese.ty) {
592  case LT_START:
593  has_start = true;
594  break;
595  case LT_BODY:
596  has_body = true;
597  break;
598  default:
599  tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
600  break;
601  }
602  }
603  if (has_start && has_body) {
604  return LT_MULTIPLE;
605  }
606  return has_start ? LT_START : LT_BODY;
607 }
608 
610  LineType current_lt = GetLineType();
611  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
612  tprintf("Trying to set a line to be START when it's already BODY.\n");
613  }
614  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
615  push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
616  }
617 }
618 
620  LineType current_lt = GetLineType();
621  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
622  tprintf("Trying to set a line to be BODY when it's already START.\n");
623  }
624  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
625  push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
626  }
627 }
628 
630  push_back_new(hypotheses_, LineHypothesis(LT_START, model));
631  auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
632  if (found != hypotheses_.end()) {
633  hypotheses_.erase(found);
634  }
635 }
636 
638  push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
639  auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
640  if (found != hypotheses_.end()) {
641  hypotheses_.erase(found);
642  }
643 }
644 
646  for (const auto &hypothese : hypotheses_) {
647  if (hypothese.ty == LT_START && StrongModel(hypothese.model)) {
648  push_back_new(*models, hypothese.model);
649  }
650  }
651 }
652 
654  for (const auto &hypothese : hypotheses_) {
655  if (StrongModel(hypothese.model)) {
656  push_back_new(*models, hypothese.model);
657  }
658  }
659 }
660 
662  for (const auto &hypothese : hypotheses_) {
663  if (hypothese.model != nullptr) {
664  push_back_new(*models, hypothese.model);
665  }
666  }
667 }
668 
670  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) {
671  return nullptr;
672  }
673  return hypotheses_[0].model;
674 }
675 
677  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) {
678  return nullptr;
679  }
680  return hypotheses_[0].model;
681 }
682 
683 // Discard any hypotheses whose model is not in the given list.
685  if (models.empty()) {
686  return;
687  }
688  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
689  if (!contains(models, hypotheses_[h].model)) {
690  hypotheses_.erase(hypotheses_.begin() + h);
691  }
692  }
693 }
694 
695 // ============ Geometry based Paragraph Detection Algorithm =================
696 
697 struct Cluster {
698  Cluster() : center(0), count(0) {}
699  Cluster(int cen, int num) : center(cen), count(num) {}
700 
701  int center; // The center of the cluster.
702  int count; // The number of entries within the cluster.
703 };
704 
706 public:
707  explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
708  void Add(int value) {
709  values_.push_back(value);
710  }
711  size_t size() const {
712  return values_.size();
713  }
714  void GetClusters(std::vector<Cluster> *clusters);
715 
716 private:
717  int max_cluster_width_;
718  std::vector<int> values_;
719 };
720 
721 // Return the index of the cluster closest to value.
722 static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
723  unsigned best_index = 0;
724  for (unsigned i = 0; i < clusters.size(); i++) {
725  if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {
726  best_index = i;
727  }
728  }
729  return best_index;
730 }
731 
732 void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
733  clusters->clear();
734  std::sort(values_.begin(), values_.end());
735  for (unsigned i = 0; i < values_.size();) {
736  int orig_i = i;
737  int lo = values_[i];
738  int hi = lo;
739  while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
740  hi = values_[i];
741  }
742  clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
743  }
744 }
745 
746 // Calculate left- and right-indent tab stop values seen in
747 // rows[row_start, row_end) given a tolerance of tolerance.
748 static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
749  int tolerance, std::vector<Cluster> *left_tabs,
750  std::vector<Cluster> *right_tabs) {
751  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
752  return;
753  }
754  // First pass: toss all left and right indents into clusterers.
755  SimpleClusterer initial_lefts(tolerance);
756  SimpleClusterer initial_rights(tolerance);
757  std::vector<Cluster> initial_left_tabs;
758  std::vector<Cluster> initial_right_tabs;
759  for (int i = row_start; i < row_end; i++) {
760  initial_lefts.Add((*rows)[i].lindent_);
761  initial_rights.Add((*rows)[i].rindent_);
762  }
763  initial_lefts.GetClusters(&initial_left_tabs);
764  initial_rights.GetClusters(&initial_right_tabs);
765 
766  // Second pass: cluster only lines that are not "stray"
767  // An example of a stray line is a page number -- a line whose start
768  // and end tab-stops are far outside the typical start and end tab-stops
769  // for the block.
770  // Put another way, we only cluster data from lines whose start or end
771  // tab stop is frequent.
772  SimpleClusterer lefts(tolerance);
773  SimpleClusterer rights(tolerance);
774 
775  // Outlier elimination. We might want to switch this to test outlier-ness
776  // based on how strange a position an outlier is in instead of or in addition
777  // to how rare it is. These outliers get re-added if we end up having too
778  // few tab stops, to work with, however.
779  int infrequent_enough_to_ignore = 0;
780  if (row_end - row_start >= 8) {
781  infrequent_enough_to_ignore = 1;
782  }
783  if (row_end - row_start >= 20) {
784  infrequent_enough_to_ignore = 2;
785  }
786 
787  for (int i = row_start; i < row_end; i++) {
788  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
789  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
790  if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
791  initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
792  lefts.Add((*rows)[i].lindent_);
793  rights.Add((*rows)[i].rindent_);
794  }
795  }
796  lefts.GetClusters(left_tabs);
797  rights.GetClusters(right_tabs);
798 
799  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
800  (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
801  // One side is really ragged, and the other only has one tab stop,
802  // so those "insignificant outliers" are probably important, actually.
803  // This often happens on a page of an index. Add back in the ones
804  // we omitted in the first pass.
805  for (int i = row_start; i < row_end; i++) {
806  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
807  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
808  if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
809  initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
810  lefts.Add((*rows)[i].lindent_);
811  rights.Add((*rows)[i].rindent_);
812  }
813  }
814  }
815  lefts.GetClusters(left_tabs);
816  rights.GetClusters(right_tabs);
817 
818  // If one side is almost a two-indent aligned side, and the other clearly
819  // isn't, try to prune out the least frequent tab stop from that side.
820  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
821  int to_prune = -1;
822  for (int i = left_tabs->size() - 1; i >= 0; i--) {
823  if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
824  to_prune = i;
825  }
826  }
827  if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
828  left_tabs->erase(left_tabs->begin() + to_prune);
829  }
830  }
831  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
832  int to_prune = -1;
833  for (int i = right_tabs->size() - 1; i >= 0; i--) {
834  if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
835  to_prune = i;
836  }
837  }
838  if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
839  right_tabs->erase(right_tabs->begin() + to_prune);
840  }
841  }
842 }
843 
844 // Given a paragraph model mark rows[row_start, row_end) as said model
845 // start or body lines.
846 //
847 // Case 1: model->first_indent_ != model->body_indent_
848 // Differentiating the paragraph start lines from the paragraph body lines in
849 // this case is easy, we just see how far each line is indented.
850 //
851 // Case 2: model->first_indent_ == model->body_indent_
852 // Here, we find end-of-paragraph lines by looking for "short lines."
853 // What constitutes a "short line" changes depending on whether the text
854 // ragged-right[left] or fully justified (aligned left and right).
855 //
856 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
857 // We have a new paragraph it the first word would have at the end
858 // of the previous line.
859 //
860 // Case 2b: Fully Justified. (eop_threshold > 0)
861 // We mark a line as short (end of paragraph) if the offside indent
862 // is greater than eop_threshold.
863 static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
864  const ParagraphModel *model, bool ltr, int eop_threshold) {
865  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
866  return;
867  }
868  for (int row = row_start; row < row_end; row++) {
869  bool valid_first = ValidFirstLine(rows, row, model);
870  bool valid_body = ValidBodyLine(rows, row, model);
871  if (valid_first && !valid_body) {
872  (*rows)[row].AddStartLine(model);
873  } else if (valid_body && !valid_first) {
874  (*rows)[row].AddBodyLine(model);
875  } else if (valid_body && valid_first) {
876  bool after_eop = (row == row_start);
877  if (row > row_start) {
878  if (eop_threshold > 0) {
879  if (model->justification() == JUSTIFICATION_LEFT) {
880  after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
881  } else {
882  after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
883  }
884  } else {
885  after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());
886  }
887  }
888  if (after_eop) {
889  (*rows)[row].AddStartLine(model);
890  } else {
891  (*rows)[row].AddBodyLine(model);
892  }
893  } else {
894  // Do nothing. Stray row.
895  }
896  }
897 }
898 
899 // GeometricClassifierState holds all of the information we'll use while
900 // trying to determine a paragraph model for the text lines in a block of
901 // text:
902 // + the rows under consideration [row_start, row_end)
903 // + the common left- and right-indent tab stops
904 // + does the block start out left-to-right or right-to-left
905 // Further, this struct holds the data we amass for the (single) ParagraphModel
906 // we'll assign to the text lines (assuming we get that far).
908  GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
909  int r_end)
910  : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
911  tolerance = InterwordSpace(*r, r_start, r_end);
912  CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
913  if (debug_level >= 3) {
914  tprintf(
915  "Geometry: TabStop cluster tolerance = %d; "
916  "%zu left tabs; %zu right tabs\n",
917  tolerance, left_tabs.size(), right_tabs.size());
918  }
919  ltr = (*r)[r_start].ri_->ltr;
920  }
921 
924  margin = (*rows)[row_start].lmargin_;
925  }
926 
929  margin = (*rows)[row_start].rmargin_;
930  }
931 
932  // Align tabs are the tab stops the text is aligned to.
933  const std::vector<Cluster> &AlignTabs() const {
935  return right_tabs;
936  }
937  return left_tabs;
938  }
939 
940  // Offside tabs are the tab stops opposite the tabs used to align the text.
941  //
942  // Note that for a left-to-right text which is aligned to the right such as
943  // this function comment, the offside tabs are the horizontal tab stops
944  // marking the beginning of ("Note", "this" and "marking").
945  const std::vector<Cluster> &OffsideTabs() const {
947  return left_tabs;
948  }
949  return right_tabs;
950  }
951 
952  // Return whether the i'th row extends from the leftmost left tab stop
953  // to the right most right tab stop.
954  bool IsFullRow(int i) const {
955  return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
956  ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
957  }
958 
959  int AlignsideTabIndex(int row_idx) const {
960  return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
961  }
962 
963  // Given what we know about the paragraph justification (just), would the
964  // first word of row_b have fit at the end of row_a?
965  bool FirstWordWouldHaveFit(int row_a, int row_b) {
967  }
968 
969  void PrintRows() const {
970  PrintRowRange(*rows, row_start, row_end);
971  }
972 
973  void Fail(int min_debug_level, const char *why) const {
974  if (debug_level < min_debug_level) {
975  return;
976  }
977  tprintf("# %s\n", why);
978  PrintRows();
979  }
980 
983  }
984 
985  // We print out messages with a debug level at least as great as debug_level.
986  int debug_level = 0;
987 
988  // The Geometric Classifier was asked to find a single paragraph model
989  // to fit the text rows (*rows)[row_start, row_end)
990  std::vector<RowScratchRegisters> *rows;
991  int row_start = 0;
992  int row_end = 0;
993 
994  // The amount by which we expect the text edge can vary and still be aligned.
995  int tolerance = 0;
996 
997  // Is the script in this text block left-to-right?
998  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
999  bool ltr = false;
1000 
1001  // These left and right tab stops were determined to be the common tab
1002  // stops for the given text.
1003  std::vector<Cluster> left_tabs;
1004  std::vector<Cluster> right_tabs;
1005 
1006  // These are parameters we must determine to create a ParagraphModel.
1008  int margin = 0;
1009  int first_indent = 0;
1010  int body_indent = 0;
1011 
1012  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
1013  int eop_threshold = 0;
1014 };
1015 
1016 // Given a section of text where strong textual clues did not help identifying
1017 // paragraph breaks, and for which the left and right indents have exactly
1018 // three tab stops between them, attempt to find the paragraph breaks based
1019 // solely on the outline of the text and whether the script is left-to-right.
1020 //
1021 // Algorithm Detail:
1022 // The selected rows are in the form of a rectangle except
1023 // for some number of "short lines" of the same length:
1024 //
1025 // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx
1026 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
1027 // xxxxxxxxxxxxx xxxxxxxxxxxx
1028 // xxxxxxxxxxxxx xxxxxxxxxxxx
1029 //
1030 // We have a slightly different situation if the only short
1031 // line is at the end of the excerpt.
1032 //
1033 // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx
1034 // xxxxxxxxxxxxx xxxxxxxxxxxx
1035 // xxxxxxxxxxxxx xxxxxxxxxxxx
1036 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
1037 //
1038 // We'll interpret these as follows based on the reasoning in the comment for
1039 // GeometricClassify():
1040 // [script direction: first indent, body indent]
1041 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
1042 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
1043 static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
1044  ParagraphTheory *theory) {
1045  int num_rows = s.row_end - s.row_start;
1046  int num_full_rows = 0;
1047  int last_row_full = 0;
1048  for (int i = s.row_start; i < s.row_end; i++) {
1049  if (s.IsFullRow(i)) {
1050  num_full_rows++;
1051  if (i == s.row_end - 1) {
1052  last_row_full++;
1053  }
1054  }
1055  }
1056 
1057  if (num_full_rows < 0.7 * num_rows) {
1058  s.Fail(1, "Not enough full lines to know which lines start paras.");
1059  return;
1060  }
1061 
1062  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1063  s.eop_threshold = 0;
1064 
1065  if (s.ltr) {
1067  } else {
1069  }
1070 
1071  if (debug_level > 0) {
1072  tprintf(
1073  "# Not enough variety for clear outline classification. "
1074  "Guessing these are %s aligned based on script.\n",
1075  s.ltr ? "left" : "right");
1076  s.PrintRows();
1077  }
1078 
1079  if (s.AlignTabs().size() == 2) { // case A1 or A2
1080  s.first_indent = s.AlignTabs()[1].center;
1081  s.body_indent = s.AlignTabs()[0].center;
1082  } else { // case B1 or B2
1083  if (num_rows - 1 == num_full_rows - last_row_full) {
1084  // case B2
1085  const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1086  (*s.rows)[s.row_start].AddStartLine(model);
1087  for (int i = s.row_start + 1; i < s.row_end; i++) {
1088  (*s.rows)[i].AddBodyLine(model);
1089  }
1090  return;
1091  } else {
1092  // case B1
1093  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1094  s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1095  }
1096  }
1097  const ParagraphModel *model = theory->AddModel(s.Model());
1098  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
1099  return;
1100 }
1101 
1102 // This function is called if strong textual clues were not available, but
1103 // the caller hopes that the paragraph breaks will be super obvious just
1104 // by the outline of the text.
1105 //
1106 // The particularly difficult case is figuring out what's going on if you
1107 // don't have enough short paragraph end lines to tell us what's going on.
1108 //
1109 // For instance, let's say you have the following outline:
1110 //
1111 // (A1) xxxxxxxxxxxxxxxxxxxxxx
1112 // xxxxxxxxxxxxxxxxxxxx
1113 // xxxxxxxxxxxxxxxxxxxxxx
1114 // xxxxxxxxxxxxxxxxxxxxxx
1115 //
1116 // Even if we know that the text is left-to-right and so will probably be
1117 // left-aligned, both of the following are possible texts:
1118 //
1119 // (A1a) 1. Here our list item
1120 // with two full lines.
1121 // 2. Here a second item.
1122 // 3. Here our third one.
1123 //
1124 // (A1b) so ends paragraph one.
1125 // Here starts another
1126 // paragraph we want to
1127 // read. This continues
1128 //
1129 // These examples are obvious from the text and should have been caught
1130 // by the StrongEvidenceClassify pass. However, for languages where we don't
1131 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1132 // it's worth guessing that (A1b) is the correct interpretation if there are
1133 // far more "full" lines than "short" lines.
1134 static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
1135  int row_start, int row_end, ParagraphTheory *theory) {
1136  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1137  return;
1138  }
1139  if (debug_level > 1) {
1140  tprintf("###############################################\n");
1141  tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
1142  tprintf("###############################################\n");
1143  }
1144  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1145 
1146  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1147  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1148  s.Fail(2, "Too much variety for simple outline classification.");
1149  return;
1150  }
1151  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1152  s.Fail(1, "Not enough variety for simple outline classification.");
1153  return;
1154  }
1155  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1156  GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1157  return;
1158  }
1159 
1160  // At this point, we know that one side has at least two tab stops, and the
1161  // other side has one or two tab stops.
1162  // Left to determine:
1163  // (1) Which is the body indent and which is the first line indent?
1164  // (2) Is the text fully justified?
1165 
1166  // If one side happens to have three or more tab stops, assume that side
1167  // is opposite of the aligned side.
1168  if (s.right_tabs.size() > 2) {
1170  } else if (s.left_tabs.size() > 2) {
1172  } else if (s.ltr) { // guess based on script direction
1174  } else {
1176  }
1177 
1178  if (s.AlignTabs().size() == 2) {
1179  // For each tab stop on the aligned side, how many of them appear
1180  // to be paragraph start lines? [first lines]
1181  int firsts[2] = {0, 0};
1182  // Count the first line as a likely paragraph start line.
1183  firsts[s.AlignsideTabIndex(s.row_start)]++;
1184  // For each line, if the first word would have fit on the previous
1185  // line count it as a likely paragraph start line.
1186  bool jam_packed = true;
1187  for (int i = s.row_start + 1; i < s.row_end; i++) {
1188  if (s.FirstWordWouldHaveFit(i - 1, i)) {
1189  firsts[s.AlignsideTabIndex(i)]++;
1190  jam_packed = false;
1191  }
1192  }
1193  // Make an extra accounting for the last line of the paragraph just
1194  // in case it's the only short line in the block. That is, take its
1195  // first word as typical and see if this looks like the *last* line
1196  // of a paragraph. If so, mark the *other* indent as probably a first.
1197  if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1198  firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1199  }
1200 
1201  int percent0firsts, percent1firsts;
1202  percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1203  percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1204 
1205  // TODO(eger): Tune these constants if necessary.
1206  if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1207  s.first_indent = s.AlignTabs()[1].center;
1208  s.body_indent = s.AlignTabs()[0].center;
1209  } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1210  percent1firsts + 30 < percent0firsts) {
1211  s.first_indent = s.AlignTabs()[0].center;
1212  s.body_indent = s.AlignTabs()[1].center;
1213  } else {
1214  // Ambiguous! Probably lineated (poetry)
1215  if (debug_level > 1) {
1216  tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1217  s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1218  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1219  s.AlignTabs()[0].center, percent0firsts);
1220  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1221  s.AlignTabs()[1].center, percent1firsts);
1222  s.PrintRows();
1223  }
1224  return;
1225  }
1226  } else {
1227  // There's only one tab stop for the "aligned to" side.
1228  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1229  }
1230 
1231  // At this point, we have our model.
1232  const ParagraphModel *model = theory->AddModel(s.Model());
1233 
1234  // Now all we have to do is figure out if the text is fully justified or not.
1235  // eop_threshold: default to fully justified unless we see evidence below.
1236  // See description on MarkRowsWithModel()
1237  s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1238  // If the text is not fully justified, re-set the eop_threshold to 0.
1239  if (s.AlignTabs().size() == 2) {
1240  // Paragraphs with a paragraph-start indent.
1241  for (int i = s.row_start; i < s.row_end - 1; i++) {
1242  if (ValidFirstLine(s.rows, i + 1, model) &&
1243  !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1244  s.tolerance)) {
1245  // We found a non-end-of-paragraph short line: not fully justified.
1246  s.eop_threshold = 0;
1247  break;
1248  }
1249  }
1250  } else {
1251  // Paragraphs with no paragraph-start indent.
1252  for (int i = s.row_start; i < s.row_end - 1; i++) {
1253  if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1254  !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1255  s.tolerance)) {
1256  // We found a non-end-of-paragraph short line: not fully justified.
1257  s.eop_threshold = 0;
1258  break;
1259  }
1260  }
1261  }
1262  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1263 }
1264 
1265 // =============== Implementation of ParagraphTheory =====================
1266 
1268  for (const auto &m : *models_) {
1269  if (m->Comparable(model)) {
1270  return m;
1271  }
1272  }
1273  auto *m = new ParagraphModel(model);
1274  models_->push_back(m);
1275  push_back_new(models_we_added_, m);
1276  return m;
1277 }
1278 
1280  size_t w = 0;
1281  for (size_t r = 0; r < models_->size(); r++) {
1282  ParagraphModel *m = (*models_)[r];
1283  if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
1284  delete m;
1285  } else {
1286  if (r > w) {
1287  (*models_)[w] = m;
1288  }
1289  w++;
1290  }
1291  }
1292  models_->resize(w);
1293 }
1294 
1295 // Examine rows[start, end) and try to determine if an existing non-centered
1296 // paragraph model would fit them perfectly. If so, return a pointer to it.
1297 // If not, return nullptr.
1298 const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
1299  int start, int end) const {
1300  for (const auto *model : *models_) {
1301  if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) {
1302  return model;
1303  }
1304  }
1305  return nullptr;
1306 }
1307 
1309  for (const auto *model : *models_) {
1310  if (model->justification() != JUSTIFICATION_CENTER) {
1311  push_back_new(*models, model);
1312  }
1313  }
1314 }
1315 
1316 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1317  int i = 0;
1318  for (const auto *m : *models_) {
1319  if (m == model) {
1320  return i;
1321  }
1322  i++;
1323  }
1324  return -1;
1325 }
1326 
1327 bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
1328  const ParagraphModel *model) {
1329  if (!StrongModel(model)) {
1330  tprintf("ValidFirstLine() should only be called with strong models!\n");
1331  }
1332  return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1333  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1334 }
1335 
1336 bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
1337  const ParagraphModel *model) {
1338  if (!StrongModel(model)) {
1339  tprintf("ValidBodyLine() should only be called with strong models!\n");
1340  }
1341  return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1342  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1343 }
1344 
1345 bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
1346  const ParagraphModel *model) {
1347  if (model != kCrownRight && model != kCrownLeft) {
1348  tprintf("CrownCompatible() should only be called with crown models!\n");
1349  return false;
1350  }
1351  auto &row_a = (*rows)[a];
1352  auto &row_b = (*rows)[b];
1353  if (model == kCrownRight) {
1354  return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1355  Epsilon(row_a.ri_->average_interword_space));
1356  }
1357  return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1358  Epsilon(row_a.ri_->average_interword_space));
1359 }
1360 
1361 // =============== Implementation of ParagraphModelSmearer ====================
1362 
1363 ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
1364  int row_start, int row_end, ParagraphTheory *theory)
1365  : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1366  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1367  row_start_ = 0;
1368  row_end_ = 0;
1369  return;
1370  }
1371  open_models_.resize(open_models_.size() + row_end - row_start + 2);
1372 }
1373 
1374 // see paragraphs_internal.h
1375 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1376  SetOfModels no_models;
1377  if (row_start < row_start_) {
1378  row_start = row_start_;
1379  }
1380  if (row_end > row_end_) {
1381  row_end = row_end_;
1382  }
1383 
1384  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1385  if ((*rows_)[row].ri_->num_words == 0) {
1386  OpenModels(row + 1) = no_models;
1387  } else {
1388  SetOfModels &opened = OpenModels(row);
1389  (*rows_)[row].StartHypotheses(&opened);
1390 
1391  // Which models survive the transition from row to row + 1?
1392  SetOfModels still_open;
1393  for (auto &m : opened) {
1394  if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
1395  // This is basic filtering; we check likely paragraph starty-ness down
1396  // below in Smear() -- you know, whether the first word would have fit
1397  // and such.
1398  push_back_new(still_open, m);
1399  }
1400  }
1401  OpenModels(row + 1) = still_open;
1402  }
1403  }
1404 }
1405 
1406 // see paragraphs_internal.h
1408  CalculateOpenModels(row_start_, row_end_);
1409 
1410  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1411  // we have multiple LT_START hypotheses), see if there's a model that
1412  // was recently used (an "open" model) which might model it well.
1413  for (int i = row_start_; i < row_end_; i++) {
1414  RowScratchRegisters &row = (*rows_)[i];
1415  if (row.ri_->num_words == 0) {
1416  continue;
1417  }
1418 
1419  // Step One:
1420  // Figure out if there are "open" models which are left-alined or
1421  // right-aligned. This is important for determining whether the
1422  // "first" word in a row would fit at the "end" of the previous row.
1423  bool left_align_open = false;
1424  bool right_align_open = false;
1425  for (auto &m : OpenModels(i)) {
1426  switch (m->justification()) {
1427  case JUSTIFICATION_LEFT:
1428  left_align_open = true;
1429  break;
1430  case JUSTIFICATION_RIGHT:
1431  right_align_open = true;
1432  break;
1433  default:
1434  left_align_open = right_align_open = true;
1435  }
1436  }
1437  // Step Two:
1438  // Use that knowledge to figure out if this row is likely to
1439  // start a paragraph.
1440  bool likely_start;
1441  if (i == 0) {
1442  likely_start = true;
1443  } else {
1444  if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1445  likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
1446  LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1447  } else if (left_align_open) {
1448  likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
1449  } else {
1450  likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1451  }
1452  }
1453 
1454  // Step Three:
1455  // If this text line seems like an obvious first line of an
1456  // open model, or an obvious continuation of an existing
1457  // modelled paragraph, mark it up.
1458  if (likely_start) {
1459  // Add Start Hypotheses for all Open models that fit.
1460  for (unsigned m = 0; m < OpenModels(i).size(); m++) {
1461  if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1462  row.AddStartLine(OpenModels(i)[m]);
1463  }
1464  }
1465  } else {
1466  // Add relevant body line hypotheses.
1467  SetOfModels last_line_models;
1468  if (i > 0) {
1469  (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1470  } else {
1471  theory_->NonCenteredModels(&last_line_models);
1472  }
1473  for (auto model : last_line_models) {
1474  if (ValidBodyLine(rows_, i, model)) {
1475  row.AddBodyLine(model);
1476  }
1477  }
1478  }
1479 
1480  // Step Four:
1481  // If we're still quite unsure about this line, go through all
1482  // models in our theory and see if this row could be the start
1483  // of any of our models.
1484  if (row.GetLineType() == LT_UNKNOWN ||
1485  (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1486  SetOfModels all_models;
1487  theory_->NonCenteredModels(&all_models);
1488  for (auto &all_model : all_models) {
1489  if (ValidFirstLine(rows_, i, all_model)) {
1490  row.AddStartLine(all_model);
1491  }
1492  }
1493  }
1494  // Step Five:
1495  // Since we may have updated the hypotheses about this row, we need
1496  // to recalculate the Open models for the rest of rows[i + 1, row_end)
1497  if (row.GetLineType() != LT_UNKNOWN) {
1498  CalculateOpenModels(i + 1, row_end_);
1499  }
1500  }
1501 }
1502 
1503 // ================ Main Paragraph Detection Algorithm =======================
1504 
1505 // Find out what ParagraphModels are actually used, and discard any
1506 // that are not.
1507 static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
1508  ParagraphTheory *theory) {
1509  SetOfModels used_models;
1510  for (const auto &row : rows) {
1511  row.StrongHypotheses(&used_models);
1512  }
1513  theory->DiscardUnusedModels(used_models);
1514 }
1515 
1516 // DowngradeWeakestToCrowns:
1517 // Forget any flush-{left, right} models unless we see two or more
1518 // of them in sequence.
1519 //
1520 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1521 // where the first line and body indent are the same) as having proper Models.
1522 // This is generally dangerous, since if you start imagining that flush-left
1523 // is a typical paragraph model when it is not, it will lead you to chop normal
1524 // indented paragraphs in the middle whenever a sentence happens to start on a
1525 // new line (see "This" above). What to do?
1526 // What we do is to take any paragraph which is flush left and is not
1527 // preceded by another paragraph of the same model and convert it to a "Crown"
1528 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1529 // for later. It means that the paragraph is flush, but it would be desirable
1530 // to mark it as the same model as following text if it fits. This downgrade
1531 // FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
1532 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
1533 // half-of-a-paragraph. and instead we mark it the same as normal body text.
1534 //
1535 // Implementation:
1536 //
1537 // Comb backwards through the row scratch registers, and turn any
1538 // sequences of body lines of equivalent type abutted against the beginning
1539 // or a body or start line of a different type into a crown paragraph.
1540 static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1541  std::vector<RowScratchRegisters> *rows) {
1542  int start;
1543  for (int end = rows->size(); end > 0; end = start) {
1544  // Search back for a body line of a unique type.
1545  const ParagraphModel *model = nullptr;
1546  while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1547  end--;
1548  }
1549  if (end == 0) {
1550  break;
1551  }
1552  start = end - 1;
1553  while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1554  start--; // walk back to the first line that is not the same body type.
1555  }
1556  if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
1557  NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1558  start--;
1559  }
1560  start++;
1561  // Now rows[start, end) is a sequence of unique body hypotheses of model.
1562  if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
1563  continue;
1564  }
1565  if (!StrongModel(model)) {
1566  while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
1567  start--;
1568  }
1569  }
1570  if (start == 0 || (!StrongModel(model)) ||
1571  (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1572  // crownify rows[start, end)
1573  const ParagraphModel *crown_model = model;
1574  if (StrongModel(model)) {
1575  if (model->justification() == JUSTIFICATION_LEFT) {
1576  crown_model = kCrownLeft;
1577  } else {
1578  crown_model = kCrownRight;
1579  }
1580  }
1581  (*rows)[start].SetUnknown();
1582  (*rows)[start].AddStartLine(crown_model);
1583  for (int row = start + 1; row < end; row++) {
1584  (*rows)[row].SetUnknown();
1585  (*rows)[row].AddBodyLine(crown_model);
1586  }
1587  }
1588  }
1589  DiscardUnusedModels(*rows, theory);
1590 }
1591 
1592 // Clear all hypotheses about lines [start, end) and reset margins.
1593 //
1594 // The empty space between the left of a row and the block boundary (and
1595 // similarly for the right) is split into two pieces: margin and indent.
1596 // In initial processing, we assume the block is tight and the margin for
1597 // all lines is set to zero. However, if our first pass does not yield
1598 // models for everything, it may be due to an inset paragraph like a
1599 // block-quote. In that case, we make a second pass over that unmarked
1600 // section of the page and reset the "margin" portion of the empty space
1601 // to the common amount of space at the ends of the lines under consid-
1602 // eration. This would be equivalent to percentile set to 0. However,
1603 // sometimes we have a single character sticking out in the right margin
1604 // of a text block (like the 'r' in 'for' on line 3 above), and we can
1605 // really just ignore it as an outlier. To express this, we allow the
1606 // user to specify the percentile (0..100) of indent values to use as
1607 // the common margin for each row in the run of rows[start, end).
1608 void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
1609  int end, int percentile) {
1610  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1611  return;
1612  }
1613 
1614  int lmin, lmax, rmin, rmax;
1615  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1616  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1617  for (int i = start; i < end; i++) {
1618  RowScratchRegisters &sr = (*rows)[i];
1619  sr.SetUnknown();
1620  if (sr.ri_->num_words == 0) {
1621  continue;
1622  }
1623  UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1624  UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1625  }
1626  STATS lefts(lmin, lmax + 1);
1627  STATS rights(rmin, rmax + 1);
1628  for (int i = start; i < end; i++) {
1629  RowScratchRegisters &sr = (*rows)[i];
1630  if (sr.ri_->num_words == 0) {
1631  continue;
1632  }
1633  lefts.add(sr.lmargin_ + sr.lindent_, 1);
1634  rights.add(sr.rmargin_ + sr.rindent_, 1);
1635  }
1636  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1637  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1638  for (int i = start; i < end; i++) {
1639  RowScratchRegisters &sr = (*rows)[i];
1640  int ldelta = ignorable_left - sr.lmargin_;
1641  sr.lmargin_ += ldelta;
1642  sr.lindent_ -= ldelta;
1643  int rdelta = ignorable_right - sr.rmargin_;
1644  sr.rmargin_ += rdelta;
1645  sr.rindent_ -= rdelta;
1646  }
1647 }
1648 
1649 // Return the median inter-word space in rows[row_start, row_end).
1650 int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
1651  if (row_end < row_start + 1) {
1652  return 1;
1653  }
1654  int word_height =
1655  (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1656  int word_width =
1657  (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1658  STATS spacing_widths(0, 5 + word_width);
1659  for (int i = row_start; i < row_end; i++) {
1660  if (rows[i].ri_->num_words > 1) {
1661  spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1662  }
1663  }
1664  int minimum_reasonable_space = word_height / 3;
1665  if (minimum_reasonable_space < 2) {
1666  minimum_reasonable_space = 2;
1667  }
1668  int median = spacing_widths.median();
1669  return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1670 }
1671 
1672 // Return whether the first word on the after line can fit in the space at
1673 // the end of the before line (knowing which way the text is aligned and read).
1675  tesseract::ParagraphJustification justification) {
1676  if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1677  return true;
1678  }
1679 
1680  if (justification == JUSTIFICATION_UNKNOWN) {
1681  tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1682  }
1683  int available_space;
1684  if (justification == JUSTIFICATION_CENTER) {
1685  available_space = before.lindent_ + before.rindent_;
1686  } else {
1687  available_space = before.OffsideIndent(justification);
1688  }
1689  available_space -= before.ri_->average_interword_space;
1690 
1691  if (before.ri_->ltr) {
1692  return after.ri_->lword_box.width() < available_space;
1693  }
1694  return after.ri_->rword_box.width() < available_space;
1695 }
1696 
1697 // Return whether the first word on the after line can fit in the space at
1698 // the end of the before line (not knowing which way the text goes) in a left
1699 // or right alignment.
1701  if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1702  return true;
1703  }
1704 
1705  int available_space = before.lindent_;
1706  if (before.rindent_ > available_space) {
1707  available_space = before.rindent_;
1708  }
1709  available_space -= before.ri_->average_interword_space;
1710 
1711  if (before.ri_->ltr) {
1712  return after.ri_->lword_box.width() < available_space;
1713  }
1714  return after.ri_->rword_box.width() < available_space;
1715 }
1716 
1717 static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1718  if (before.ri_->ltr) {
1719  return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;
1720  } else {
1721  return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;
1722  }
1723 }
1724 
1725 static bool LikelyParagraphStart(const RowScratchRegisters &before,
1726  const RowScratchRegisters &after,
1728  return before.ri_->num_words == 0 ||
1729  (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));
1730 }
1731 
1732 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1733 // would fit them as a single paragraph.
1734 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1735 // If the rows given could be a consistent start to a paragraph, set *consistent
1736 // true.
1737 static ParagraphModel InternalParagraphModelByOutline(
1738  const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
1739  bool *consistent) {
1740  int ltr_line_count = 0;
1741  for (int i = start; i < end; i++) {
1742  ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1743  }
1744  bool ltr = (ltr_line_count >= (end - start) / 2);
1745 
1746  *consistent = true;
1747  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1748  return ParagraphModel();
1749  }
1750 
1751  // Ensure the caller only passed us a region with a common rmargin and
1752  // lmargin.
1753  int lmargin = (*rows)[start].lmargin_;
1754  int rmargin = (*rows)[start].rmargin_;
1755  int lmin, lmax, rmin, rmax, cmin, cmax;
1756  lmin = lmax = (*rows)[start + 1].lindent_;
1757  rmin = rmax = (*rows)[start + 1].rindent_;
1758  cmin = cmax = 0;
1759  for (int i = start + 1; i < end; i++) {
1760  if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1761  tprintf("Margins don't match! Software error.\n");
1762  *consistent = false;
1763  return ParagraphModel();
1764  }
1765  UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1766  UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1767  UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1768  }
1769  int ldiff = lmax - lmin;
1770  int rdiff = rmax - rmin;
1771  int cdiff = cmax - cmin;
1772  if (rdiff > tolerance && ldiff > tolerance) {
1773  if (cdiff < tolerance * 2) {
1774  if (end - start < 3) {
1775  return ParagraphModel();
1776  }
1777  return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1778  }
1779  *consistent = false;
1780  return ParagraphModel();
1781  }
1782  if (end - start < 3) { // Don't return a model for two line paras.
1783  return ParagraphModel();
1784  }
1785 
1786  // These booleans keep us from saying something is aligned left when the body
1787  // left variance is too large.
1788  bool body_admits_left_alignment = ldiff < tolerance;
1789  bool body_admits_right_alignment = rdiff < tolerance;
1790 
1791  ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1792  (lmin + lmax) / 2, tolerance);
1793  ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1794  (rmin + rmax) / 2, tolerance);
1795 
1796  // These booleans keep us from having an indent on the "wrong side" for the
1797  // first line.
1798  bool text_admits_left_alignment = ltr || left_model.is_flush();
1799  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1800 
1801  // At least one of the edges is less than tolerance in variance.
1802  // If the other is obviously ragged, it can't be the one aligned to.
1803  // [Note the last line is included in this raggedness.]
1804  if (tolerance < rdiff) {
1805  if (body_admits_left_alignment && text_admits_left_alignment) {
1806  return left_model;
1807  }
1808  *consistent = false;
1809  return ParagraphModel();
1810  }
1811  if (tolerance < ldiff) {
1812  if (body_admits_right_alignment && text_admits_right_alignment) {
1813  return right_model;
1814  }
1815  *consistent = false;
1816  return ParagraphModel();
1817  }
1818 
1819  // At this point, we know the body text doesn't vary much on either side.
1820 
1821  // If the first line juts out oddly in one direction or the other,
1822  // that likely indicates the side aligned to.
1823  int first_left = (*rows)[start].lindent_;
1824  int first_right = (*rows)[start].rindent_;
1825 
1826  if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1827  return left_model;
1828  }
1829  if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1830  return right_model;
1831  }
1832 
1833  *consistent = false;
1834  return ParagraphModel();
1835 }
1836 
1837 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1838 // would fit them as a single paragraph. If nothing fits,
1839 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1840 // output if we're debugging.
1841 static ParagraphModel ParagraphModelByOutline(int debug_level,
1842  const std::vector<RowScratchRegisters> *rows,
1843  int start, int end, int tolerance) {
1844  bool unused_consistent;
1845  ParagraphModel retval =
1846  InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1847  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1848  tprintf("Could not determine a model for this paragraph:\n");
1849  PrintRowRange(*rows, start, end);
1850  }
1851  return retval;
1852 }
1853 
1854 // Do rows[start, end) form a single instance of the given paragraph model?
1855 bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
1856  const ParagraphModel *model) {
1857  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1858  return false;
1859  }
1860  if (!ValidFirstLine(rows, start, model)) {
1861  return false;
1862  }
1863  for (int i = start + 1; i < end; i++) {
1864  if (!ValidBodyLine(rows, i, model)) {
1865  return false;
1866  }
1867  }
1868  return true;
1869 }
1870 
1871 // Examine rows[row_start, row_end) as an independent section of text,
1872 // and mark rows that are exceptionally clear as start-of-paragraph
1873 // and paragraph-body lines.
1874 //
1875 // We presume that any lines surrounding rows[row_start, row_end) may
1876 // have wildly different paragraph models, so we don't key any data off
1877 // of those lines.
1878 //
1879 // We only take the very strongest signals, as we don't want to get
1880 // confused and marking up centered text, poetry, or source code as
1881 // clearly part of a typical paragraph.
1882 static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
1883  int row_end) {
1884  // Record patently obvious body text.
1885  for (int i = row_start + 1; i < row_end; i++) {
1886  const RowScratchRegisters &prev = (*rows)[i - 1];
1887  RowScratchRegisters &curr = (*rows)[i];
1888  tesseract::ParagraphJustification typical_justification =
1889  prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1890  if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1891  !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1892  curr.SetBodyLine();
1893  }
1894  }
1895 
1896  // Record patently obvious start paragraph lines.
1897  //
1898  // It's an extremely good signal of the start of a paragraph that
1899  // the first word would have fit on the end of the previous line.
1900  // However, applying just that signal would have us mark random
1901  // start lines of lineated text (poetry and source code) and some
1902  // centered headings as paragraph start lines. Therefore, we use
1903  // a second qualification for a paragraph start: Not only should
1904  // the first word of this line have fit on the previous line,
1905  // but also, this line should go full to the right of the block,
1906  // disallowing a subsequent word from having fit on this line.
1907 
1908  // First row:
1909  {
1910  RowScratchRegisters &curr = (*rows)[row_start];
1911  RowScratchRegisters &next = (*rows)[row_start + 1];
1913  if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1914  (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1915  curr.SetStartLine();
1916  }
1917  }
1918  // Middle rows
1919  for (int i = row_start + 1; i < row_end - 1; i++) {
1920  RowScratchRegisters &prev = (*rows)[i - 1];
1921  RowScratchRegisters &curr = (*rows)[i];
1922  RowScratchRegisters &next = (*rows)[i + 1];
1924  if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1925  LikelyParagraphStart(prev, curr, j)) {
1926  curr.SetStartLine();
1927  }
1928  }
1929  // Last row
1930  { // the short circuit at the top means we have at least two lines.
1931  RowScratchRegisters &prev = (*rows)[row_end - 2];
1932  RowScratchRegisters &curr = (*rows)[row_end - 1];
1934  if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
1935  LikelyParagraphStart(prev, curr, j)) {
1936  curr.SetStartLine();
1937  }
1938  }
1939 }
1940 
1941 // Look for sequences of a start line followed by some body lines in
1942 // rows[row_start, row_end) and create ParagraphModels for them if
1943 // they seem coherent.
1944 static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
1945  int row_start, int row_end, bool allow_flush_models,
1946  ParagraphTheory *theory) {
1947  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1948  return;
1949  }
1950 
1951  int start = row_start;
1952  while (start < row_end) {
1953  while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
1954  start++;
1955  }
1956  if (start >= row_end - 1) {
1957  break;
1958  }
1959 
1960  int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1961  int end = start;
1962  ParagraphModel last_model;
1963  bool next_consistent;
1964  do {
1965  ++end;
1966  // rows[row, end) was consistent.
1967  // If rows[row, end + 1) is not consistent,
1968  // just model rows[row, end)
1969  if (end < row_end - 1) {
1970  RowScratchRegisters &next = (*rows)[end];
1971  LineType lt = next.GetLineType();
1972  next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
1973  !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1974  } else {
1975  next_consistent = false;
1976  }
1977  if (next_consistent) {
1978  ParagraphModel next_model =
1979  InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1980  if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
1981  next_model.justification() != JUSTIFICATION_LEFT) ||
1982  (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
1983  next_model.justification() != JUSTIFICATION_RIGHT)) {
1984  next_consistent = false;
1985  }
1986  last_model = next_model;
1987  } else {
1988  next_consistent = false;
1989  }
1990  } while (next_consistent && end < row_end);
1991  // At this point, rows[start, end) looked like it could have been a
1992  // single paragraph. If we can make a good ParagraphModel for it,
1993  // do so and mark this sequence with that model.
1994  if (end > start + 1) {
1995  // emit a new paragraph if we have more than one line.
1996  const ParagraphModel *model = nullptr;
1997  ParagraphModel new_model = ParagraphModelByOutline(
1998  debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
1999  if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
2000  // couldn't create a good model, oh well.
2001  } else if (new_model.is_flush()) {
2002  if (end == start + 2) {
2003  // It's very likely we just got two paragraph starts in a row.
2004  end = start + 1;
2005  } else if (start == row_start) {
2006  // Mark this as a Crown.
2007  if (new_model.justification() == JUSTIFICATION_LEFT) {
2008  model = kCrownLeft;
2009  } else {
2010  model = kCrownRight;
2011  }
2012  } else if (allow_flush_models) {
2013  model = theory->AddModel(new_model);
2014  }
2015  } else {
2016  model = theory->AddModel(new_model);
2017  }
2018  if (model) {
2019  (*rows)[start].AddStartLine(model);
2020  for (int i = start + 1; i < end; i++) {
2021  (*rows)[i].AddBodyLine(model);
2022  }
2023  }
2024  }
2025  start = end;
2026  }
2027 }
2028 
2029 // We examine rows[row_start, row_end) and do the following:
2030 // (1) Clear all existing hypotheses for the rows being considered.
2031 // (2) Mark up any rows as exceptionally likely to be paragraph starts
2032 // or paragraph body lines as such using both geometric and textual
2033 // clues.
2034 // (3) Form models for any sequence of start + continuation lines.
2035 // (4) Smear the paragraph models to cover surrounding text.
2036 static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
2037  int row_start, int row_end, ParagraphTheory *theory) {
2038  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2039  return;
2040  }
2041 
2042  if (debug_level > 1) {
2043  tprintf("#############################################\n");
2044  tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2045  tprintf("#############################################\n");
2046  }
2047 
2048  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2049  MarkStrongEvidence(rows, row_start, row_end);
2050 
2051  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2052 
2053  // Create paragraph models.
2054  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2055 
2056  DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2057 
2058  // At this point, some rows are marked up as paragraphs with model numbers,
2059  // and some rows are marked up as either LT_START or LT_BODY. Now let's
2060  // smear any good paragraph hypotheses forward and backward.
2061  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2062  smearer.Smear();
2063 }
2064 
2065 static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
2066  int row_end, ParagraphTheory *theory) {
2067  for (int i = row_start + 1; i < row_end - 1; i++) {
2068  if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
2069  (*rows)[i + 1].ri_->has_leaders) {
2070  const ParagraphModel *model =
2071  theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2072  (*rows)[i].AddStartLine(model);
2073  }
2074  }
2075 }
2076 
2077 // Collect sequences of unique hypotheses in row registers and create proper
2078 // paragraphs for them, referencing the paragraphs in row_owners.
2079 static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
2080  std::vector<RowScratchRegisters> &rows,
2081  std::vector<PARA *> *row_owners,
2082  ParagraphTheory *theory) {
2083  int end = rows.size();
2084  int start;
2085  for (; end > 0; end = start) {
2086  start = end - 1;
2087  const ParagraphModel *model = nullptr;
2088  // TODO(eger): Be smarter about dealing with multiple hypotheses.
2089  bool single_line_paragraph = false;
2090  SetOfModels models;
2091  rows[start].NonNullHypotheses(&models);
2092  if (!models.empty()) {
2093  model = models[0];
2094  if (rows[start].GetLineType(model) != LT_BODY) {
2095  single_line_paragraph = true;
2096  }
2097  }
2098  if (model && !single_line_paragraph) {
2099  // walk back looking for more body lines and then a start line.
2100  while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2101  // do nothing
2102  }
2103  if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2104  model = nullptr;
2105  }
2106  }
2107  if (model == nullptr) {
2108  continue;
2109  }
2110  // rows[start, end) should be a paragraph.
2111  PARA *p = new PARA();
2112  if (model == kCrownLeft || model == kCrownRight) {
2113  p->is_very_first_or_continuation = true;
2114  // Crown paragraph.
2115  // If we can find an existing ParagraphModel that fits, use it,
2116  // else create a new one.
2117  for (unsigned row = end; row < rows.size(); row++) {
2118  if ((*row_owners)[row] &&
2119  (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2120  (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2121  model = (*row_owners)[row]->model;
2122  break;
2123  }
2124  }
2125  if (model == kCrownLeft) {
2126  // No subsequent model fits, so cons one up.
2127  model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,
2128  rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2129  Epsilon(rows[start].ri_->average_interword_space)));
2130  } else if (model == kCrownRight) {
2131  // No subsequent model fits, so cons one up.
2132  model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,
2133  rows[start].rmargin_ + rows[start].rmargin_, 0, 0,
2134  Epsilon(rows[start].ri_->average_interword_space)));
2135  }
2136  }
2137  rows[start].SetUnknown();
2138  rows[start].AddStartLine(model);
2139  for (int i = start + 1; i < end; i++) {
2140  rows[i].SetUnknown();
2141  rows[i].AddBodyLine(model);
2142  }
2143  p->model = model;
2144  p->has_drop_cap = rows[start].ri_->has_drop_cap;
2145  p->is_list_item = model->justification() == JUSTIFICATION_RIGHT
2146  ? rows[start].ri_->rword_indicates_list_item
2147  : rows[start].ri_->lword_indicates_list_item;
2148  for (int row = start; row < end; row++) {
2149  if ((*row_owners)[row] != nullptr) {
2150  tprintf(
2151  "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2152  "more than once!\n");
2153  delete (*row_owners)[row];
2154  }
2155  (*row_owners)[row] = p;
2156  }
2157  }
2158 }
2159 
2160 struct Interval {
2161  Interval() : begin(0), end(0) {}
2162  Interval(int b, int e) : begin(b), end(e) {}
2163 
2164  int begin;
2165  int end;
2166 };
2167 
2168 // Return whether rows[row] appears to be stranded, meaning that the evidence
2169 // for this row is very weak due to context. For instance, two lines of source
2170 // code may happen to be indented at the same tab vector as body text starts,
2171 // leading us to think they are two start-of-paragraph lines. This is not
2172 // optimal. However, we also don't want to mark a sequence of short dialog
2173 // as "weak," so our heuristic is:
2174 // (1) If a line is surrounded by lines of unknown type, it's weak.
2175 // (2) If two lines in a row are start lines for a given paragraph type, but
2176 // after that the same paragraph type does not continue, they're weak.
2177 static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
2178  SetOfModels row_models;
2179  rows[row].StrongHypotheses(&row_models);
2180 
2181  for (auto &row_model : row_models) {
2182  bool all_starts = rows[row].GetLineType();
2183  int run_length = 1;
2184  bool continues = true;
2185  for (int i = row - 1; i >= 0 && continues; i--) {
2186  SetOfModels models;
2187  rows[i].NonNullHypotheses(&models);
2188  switch (rows[i].GetLineType(row_model)) {
2189  case LT_START:
2190  run_length++;
2191  break;
2192  case LT_MULTIPLE: // explicit fall-through
2193  case LT_BODY:
2194  run_length++;
2195  all_starts = false;
2196  break;
2197  case LT_UNKNOWN: // explicit fall-through
2198  default:
2199  continues = false;
2200  }
2201  }
2202  continues = true;
2203  for (unsigned i = row + 1; i < rows.size() && continues; i++) {
2204  SetOfModels models;
2205  rows[i].NonNullHypotheses(&models);
2206  switch (rows[i].GetLineType(row_model)) {
2207  case LT_START:
2208  run_length++;
2209  break;
2210  case LT_MULTIPLE: // explicit fall-through
2211  case LT_BODY:
2212  run_length++;
2213  all_starts = false;
2214  break;
2215  case LT_UNKNOWN: // explicit fall-through
2216  default:
2217  continues = false;
2218  }
2219  }
2220  if (run_length > 2 || (!all_starts && run_length > 1)) {
2221  return false;
2222  }
2223  }
2224  return true;
2225 }
2226 
2227 // Go through rows[row_start, row_end) and gather up sequences that need better
2228 // classification.
2229 // + Sequences of non-empty rows without hypotheses.
2230 // + Crown paragraphs not immediately followed by a strongly modeled line.
2231 // + Single line paragraphs surrounded by text that doesn't match the
2232 // model.
2233 static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
2234  std::vector<Interval> *to_fix, int row_start, int row_end) {
2235  to_fix->clear();
2236  for (int i = row_start; i < row_end; i++) {
2237  bool needs_fixing = false;
2238 
2239  SetOfModels models;
2240  SetOfModels models_w_crowns;
2241  rows[i].StrongHypotheses(&models);
2242  rows[i].NonNullHypotheses(&models_w_crowns);
2243  if (models.empty() && !models_w_crowns.empty()) {
2244  // Crown paragraph. Is it followed by a modeled line?
2245  for (unsigned end = i + 1; end < rows.size(); end++) {
2246  SetOfModels end_models;
2247  SetOfModels strong_end_models;
2248  rows[end].NonNullHypotheses(&end_models);
2249  rows[end].StrongHypotheses(&strong_end_models);
2250  if (end_models.empty()) {
2251  needs_fixing = true;
2252  break;
2253  } else if (!strong_end_models.empty()) {
2254  needs_fixing = false;
2255  break;
2256  }
2257  }
2258  } else if (models.empty() && rows[i].ri_->num_words > 0) {
2259  // No models at all.
2260  needs_fixing = true;
2261  }
2262 
2263  if (!needs_fixing && !models.empty()) {
2264  needs_fixing = RowIsStranded(rows, i);
2265  }
2266 
2267  if (needs_fixing) {
2268  if (!to_fix->empty() && to_fix->back().end == i - 1) {
2269  to_fix->back().end = i;
2270  } else {
2271  to_fix->push_back(Interval(i, i));
2272  }
2273  }
2274  }
2275  // Convert inclusive intervals to half-open intervals.
2276  for (auto &i : *to_fix) {
2277  i.end = i.end + 1;
2278  }
2279 }
2280 
2281 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2282 // normalize each row_owner to point to an actual PARA, and output the
2283 // paragraphs in order onto paragraphs.
2284 void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
2285  std::vector<PARA *> &rows = *row_owners;
2286  paragraphs->clear();
2287  PARA_IT out(paragraphs);
2288  PARA *formerly_null = nullptr;
2289  for (unsigned i = 0; i < rows.size(); i++) {
2290  if (rows[i] == nullptr) {
2291  if (i == 0 || rows[i - 1] != formerly_null) {
2292  rows[i] = formerly_null = new PARA();
2293  } else {
2294  rows[i] = formerly_null;
2295  continue;
2296  }
2297  } else if (i > 0 && rows[i - 1] == rows[i]) {
2298  continue;
2299  }
2300  out.add_after_then_move(rows[i]);
2301  }
2302 }
2303 
2304 // Main entry point for Paragraph Detection Algorithm.
2305 //
2306 // Given a set of equally spaced textlines (described by row_infos),
2307 // Split them into paragraphs.
2308 //
2309 // Output:
2310 // row_owners - one pointer for each row, to the paragraph it belongs to.
2311 // paragraphs - this is the actual list of PARA objects.
2312 // models - the list of paragraph models referenced by the PARA objects.
2313 // caller is responsible for deleting the models.
2314 void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
2315  std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2316  std::vector<ParagraphModel *> *models) {
2317  ParagraphTheory theory(models);
2318 
2319  // Initialize row_owners to be a bunch of nullptr pointers.
2320  row_owners->clear();
2321  row_owners->resize(row_infos->size());
2322 
2323  // Set up row scratch registers for the main algorithm.
2324  std::vector<RowScratchRegisters> rows(row_infos->size());
2325  for (unsigned i = 0; i < row_infos->size(); i++) {
2326  rows[i].Init((*row_infos)[i]);
2327  }
2328 
2329  // Pass 1:
2330  // Detect sequences of lines that all contain leader dots (.....)
2331  // These are likely Tables of Contents. If there are three text lines in
2332  // a row with leader dots, it's pretty safe to say the middle one should
2333  // be a paragraph of its own.
2334  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2335 
2336  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2337 
2338  std::vector<Interval> leftovers;
2339  LeftoverSegments(rows, &leftovers, 0, rows.size());
2340  for (auto &leftover : leftovers) {
2341  // Pass 2a:
2342  // Find any strongly evidenced start-of-paragraph lines. If they're
2343  // followed by two lines that look like body lines, make a paragraph
2344  // model for that and see if that model applies throughout the text
2345  // (that is, "smear" it).
2346  StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2347 
2348  // Pass 2b:
2349  // If we had any luck in pass 2a, we got part of the page and didn't
2350  // know how to classify a few runs of rows. Take the segments that
2351  // didn't find a model and reprocess them individually.
2352  std::vector<Interval> leftovers2;
2353  LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2354  bool pass2a_was_useful =
2355  leftovers2.size() > 1 ||
2356  (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
2357  if (pass2a_was_useful) {
2358  for (auto &leftover2 : leftovers2) {
2359  StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2360  }
2361  }
2362  }
2363 
2364  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2365 
2366  // Pass 3:
2367  // These are the dregs for which we didn't have enough strong textual
2368  // and geometric clues to form matching models for. Let's see if
2369  // the geometric clues are simple enough that we could just use those.
2370  LeftoverSegments(rows, &leftovers, 0, rows.size());
2371  for (auto &leftover : leftovers) {
2372  GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2373  }
2374 
2375  // Undo any flush models for which there's little evidence.
2376  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2377 
2378  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2379 
2380  // Pass 4:
2381  // Take everything that's still not marked up well and clear all markings.
2382  LeftoverSegments(rows, &leftovers, 0, rows.size());
2383  for (auto &leftover : leftovers) {
2384  for (int j = leftover.begin; j < leftover.end; j++) {
2385  rows[j].SetUnknown();
2386  }
2387  }
2388 
2389  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2390 
2391  // Convert all of the unique hypothesis runs to PARAs.
2392  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2393 
2394  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2395 
2396  // Finally, clean up any dangling nullptr row paragraph parents.
2397  CanonicalizeDetectionResults(row_owners, paragraphs);
2398 }
2399 
2400 // ============ Code interfacing with the rest of Tesseract ==================
2401 
2402 static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
2403  // Set up text, lword_text, and rword_text (mostly for debug printing).
2404  std::string fake_text;
2405  PageIterator pit(static_cast<const PageIterator &>(it));
2406  bool first_word = true;
2407  if (!pit.Empty(RIL_WORD)) {
2408  do {
2409  fake_text += "x";
2410  if (first_word) {
2411  info->lword_text += "x";
2412  }
2413  info->rword_text += "x";
2414  if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2415  !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2416  fake_text += " ";
2417  info->rword_text = "";
2418  first_word = false;
2419  }
2420  } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
2421  }
2422  if (fake_text.empty()) {
2423  return;
2424  }
2425 
2426  int lspaces = info->pix_ldistance / info->average_interword_space;
2427  for (int i = 0; i < lspaces; i++) {
2428  info->text += ' ';
2429  }
2430  info->text += fake_text;
2431 
2432  // Set up lword_box, rword_box, and num_words.
2433  PAGE_RES_IT page_res_it = *it.PageResIt();
2434  WERD_RES *word_res = page_res_it.restart_row();
2435  ROW_RES *this_row = page_res_it.row();
2436 
2437  WERD_RES *lword = nullptr;
2438  WERD_RES *rword = nullptr;
2439  info->num_words = 0;
2440  do {
2441  if (word_res) {
2442  if (!lword) {
2443  lword = word_res;
2444  }
2445  if (rword != word_res) {
2446  info->num_words++;
2447  }
2448  rword = word_res;
2449  }
2450  word_res = page_res_it.forward();
2451  } while (page_res_it.row() == this_row);
2452 
2453  if (lword) {
2454  info->lword_box = lword->word->bounding_box();
2455  }
2456  if (rword) {
2457  info->rword_box = rword->word->bounding_box();
2458  }
2459 }
2460 
2461 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2462 // detector RowInfo with all relevant information from the row.
2463 static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
2464  if (it.PageResIt()->row() != nullptr) {
2465  ROW *row = it.PageResIt()->row()->row;
2466  info->pix_ldistance = row->lmargin();
2467  info->pix_rdistance = row->rmargin();
2468  info->average_interword_space =
2469  row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2470  info->pix_xheight = row->x_height();
2471  info->has_leaders = false;
2472  info->has_drop_cap = row->has_drop_cap();
2473  info->ltr = true; // set below depending on word scripts
2474  } else {
2475  info->pix_ldistance = info->pix_rdistance = 0;
2476  info->average_interword_space = 1;
2477  info->pix_xheight = 1.0;
2478  info->has_leaders = false;
2479  info->has_drop_cap = false;
2480  info->ltr = true;
2481  }
2482 
2483  info->num_words = 0;
2484  info->lword_indicates_list_item = false;
2485  info->lword_likely_starts_idea = false;
2486  info->lword_likely_ends_idea = false;
2487  info->rword_indicates_list_item = false;
2488  info->rword_likely_starts_idea = false;
2489  info->rword_likely_ends_idea = false;
2490  info->has_leaders = false;
2491  info->ltr = true;
2492 
2493  if (!after_recognition) {
2494  InitializeTextAndBoxesPreRecognition(it, info);
2495  return;
2496  }
2497  info->text = "";
2498  const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2499  int trailing_ws_idx = strlen(text.get()); // strip trailing space
2500  while (trailing_ws_idx > 0 &&
2501  // isspace() only takes ASCII
2502  isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2503  trailing_ws_idx--;
2504  }
2505  if (trailing_ws_idx > 0) {
2506  int lspaces = info->pix_ldistance / info->average_interword_space;
2507  for (int i = 0; i < lspaces; i++) {
2508  info->text += ' ';
2509  }
2510  for (int i = 0; i < trailing_ws_idx; i++) {
2511  info->text += text[i];
2512  }
2513  }
2514 
2515  if (info->text.empty()) {
2516  return;
2517  }
2518 
2519  PAGE_RES_IT page_res_it = *it.PageResIt();
2520  std::vector<WERD_RES *> werds;
2521  WERD_RES *word_res = page_res_it.restart_row();
2522  ROW_RES *this_row = page_res_it.row();
2523  int num_leaders = 0;
2524  int ltr = 0;
2525  int rtl = 0;
2526  do {
2527  if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2528  werds.push_back(word_res);
2529  ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2530  rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2531  if (word_res->word->flag(W_REP_CHAR)) {
2532  num_leaders++;
2533  }
2534  }
2535  word_res = page_res_it.forward();
2536  } while (page_res_it.row() == this_row);
2537  info->ltr = ltr >= rtl;
2538  info->has_leaders = num_leaders > 3;
2539  info->num_words = werds.size();
2540  if (!werds.empty()) {
2541  WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2542  info->lword_text = lword->best_choice->unichar_string().c_str();
2543  info->rword_text = rword->best_choice->unichar_string().c_str();
2544  info->lword_box = lword->word->bounding_box();
2545  info->rword_box = rword->word->bounding_box();
2546  LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
2547  &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2548  &info->lword_likely_ends_idea);
2549  RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
2550  &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2551  &info->rword_likely_ends_idea);
2552  }
2553 }
2554 
2555 // This is called after rows have been identified and words are recognized.
2556 // Much of this could be implemented before word recognition, but text helps
2557 // to identify bulleted lists and gives good signals for sentence boundaries.
2558 void DetectParagraphs(int debug_level, bool after_text_recognition,
2559  const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2560  // Clear out any preconceived notions.
2561  if (block_start->Empty(RIL_TEXTLINE)) {
2562  return;
2563  }
2564  BLOCK *block = block_start->PageResIt()->block()->block;
2565  block->para_list()->clear();
2566  bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2567 
2568  // Convert the Tesseract structures to RowInfos
2569  // for the paragraph detection algorithm.
2570  MutableIterator row(*block_start);
2571  if (row.Empty(RIL_TEXTLINE)) {
2572  return; // end of input already.
2573  }
2574 
2575  std::vector<RowInfo> row_infos;
2576  do {
2577  if (!row.PageResIt()->row()) {
2578  continue; // empty row.
2579  }
2580  row.PageResIt()->row()->row->set_para(nullptr);
2581  row_infos.emplace_back();
2582  RowInfo &ri = row_infos.back();
2583  InitializeRowInfo(after_text_recognition, row, &ri);
2584  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));
2585 
2586  // If we're called before text recognition, we might not have
2587  // tight block bounding boxes, so trim by the minimum on each side.
2588  if (!row_infos.empty()) {
2589  int min_lmargin = row_infos[0].pix_ldistance;
2590  int min_rmargin = row_infos[0].pix_rdistance;
2591  for (unsigned i = 1; i < row_infos.size(); i++) {
2592  if (row_infos[i].pix_ldistance < min_lmargin) {
2593  min_lmargin = row_infos[i].pix_ldistance;
2594  }
2595  if (row_infos[i].pix_rdistance < min_rmargin) {
2596  min_rmargin = row_infos[i].pix_rdistance;
2597  }
2598  }
2599  if (min_lmargin > 0 || min_rmargin > 0) {
2600  for (auto &row_info : row_infos) {
2601  row_info.pix_ldistance -= min_lmargin;
2602  row_info.pix_rdistance -= min_rmargin;
2603  }
2604  }
2605  }
2606 
2607  // Run the paragraph detection algorithm.
2608  std::vector<PARA *> row_owners;
2609  std::vector<PARA *> the_paragraphs;
2610  if (!is_image_block) {
2611  DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
2612  } else {
2613  row_owners.resize(row_infos.size());
2614  CanonicalizeDetectionResults(&row_owners, block->para_list());
2615  }
2616 
2617  // Now stitch in the row_owners into the rows.
2618  row = *block_start;
2619  for (auto &row_owner : row_owners) {
2620  while (!row.PageResIt()->row()) {
2621  row.Next(RIL_TEXTLINE);
2622  }
2623  row.PageResIt()->row()->row->set_para(row_owner);
2624  row.Next(RIL_TEXTLINE);
2625  }
2626 }
2627 
2628 } // namespace tesseract
@ W_REP_CHAR
repeated character
Definition: werd.h:40
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:51
bool StrongModel(const ParagraphModel *model)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
ParagraphJustification
Definition: publictypes.h:248
@ JUSTIFICATION_LEFT
Definition: publictypes.h:250
@ JUSTIFICATION_UNKNOWN
Definition: publictypes.h:249
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:252
@ JUSTIFICATION_CENTER
Definition: publictypes.h:251
std::vector< const ParagraphModel * > SetOfModels
int InterwordSpace(const std::vector< RowScratchRegisters > &rows, int row_start, int row_end)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool RowsFitModel(const std::vector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void RecomputeMarginsAndClearHypotheses(std::vector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:477
void CanonicalizeDetectionResults(std::vector< PARA * > *row_owners, PARA_LIST *paragraphs)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:110
int UNICHAR_ID
Definition: unichar.h:36
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:56
void push_back_new(std::vector< T > &vector, const T &data)
Definition: paragraphs.cpp:418
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:58
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:122
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:431
bool CrownCompatible(const std::vector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool AsciiLikelyListItem(const std::string &word)
Definition: paragraphs.cpp:282
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
bool contains(const std::vector< T > &data, const T &value)
Definition: helpers.h:37
bool Empty(PageIteratorLevel level) const
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
bool Next(PageIteratorLevel level) override
const PAGE_RES_IT * PageResIt() const
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
Definition: paragraphs.cpp:300
unsigned SkipDigits(unsigned pos)
Definition: paragraphs.cpp:326
unsigned SkipRomans(unsigned pos)
Definition: paragraphs.cpp:334
unsigned SkipPunc(unsigned pos)
Definition: paragraphs.cpp:319
unsigned SkipAlpha(unsigned pos)
Definition: paragraphs.cpp:346
Cluster(int cen, int num)
Definition: paragraphs.cpp:699
void GetClusters(std::vector< Cluster > *clusters)
Definition: paragraphs.cpp:732
SimpleClusterer(int max_cluster_width)
Definition: paragraphs.cpp:707
std::vector< Cluster > right_tabs
void Fail(int min_debug_level, const char *why) const
Definition: paragraphs.cpp:973
bool FirstWordWouldHaveFit(int row_a, int row_b)
Definition: paragraphs.cpp:965
std::vector< RowScratchRegisters > * rows
Definition: paragraphs.cpp:990
const std::vector< Cluster > & AlignTabs() const
Definition: paragraphs.cpp:933
const std::vector< Cluster > & OffsideTabs() const
Definition: paragraphs.cpp:945
GeometricClassifierState(int dbg_level, std::vector< RowScratchRegisters > *r, int r_start, int r_end)
Definition: paragraphs.cpp:908
ParagraphModel Model() const
Definition: paragraphs.cpp:981
tesseract::ParagraphJustification just
int AlignsideTabIndex(int row_idx) const
Definition: paragraphs.cpp:959
std::vector< Cluster > left_tabs
Interval(int b, int e)
int average_interword_space
Definition: paragraphs.h:50
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:645
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:669
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:661
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:637
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:653
static void AppendDebugHeaderFields(std::vector< std::string > &header)
Definition: paragraphs.cpp:510
void AppendDebugInfo(const ParagraphTheory &theory, std::vector< std::string > &dbg) const
Definition: paragraphs.cpp:515
int OffsideIndent(tesseract::ParagraphJustification just) const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:684
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:629
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:676
void Init(const RowInfo &row)
Definition: paragraphs.cpp:548
void NonCenteredModels(SetOfModels *models)
std::vector< ParagraphModel * > & models()
const ParagraphModel * Fits(const std::vector< RowScratchRegisters > *rows, int start, int end) const
void DiscardUnusedModels(const SetOfModels &used_models)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * AddModel(const ParagraphModel &model)
ParagraphModelSmearer(std::vector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
PARA_LIST * para_list()
Definition: ocrblock.h:119
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:45
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:59
void set_para(PARA *p)
Definition: ocrrow.h:117
ROW_RES * row() const
Definition: pageres.h:766
BLOCK_RES * block() const
Definition: pageres.h:769
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
bool IsText() const
Definition: polyblk.h:52
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
bool empty() const
Definition: ratngs.h:280
unsigned length() const
Definition: ratngs.h:283
TDimension width() const
Definition: rect.h:126
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:242
double ile(double frac) const
Definition: statistc.cpp:173
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533