tesseract  5.0.0
paragraphs_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 
12 #include <string> // for std::string
13 
14 #include "include_gunit.h" // for TEST
15 #include "log.h" // for LOG
16 
17 // ccmain
18 #include "paragraphs.h"
19 #include "paragraphs_internal.h"
20 // ccstruct
21 #include "ocrpara.h"
22 
23 namespace tesseract {
24 
25 // Functions for making monospace ASCII trial text for the paragraph detector.
30 
32  PCONT = 0, // Continuation line of a paragraph (default).
33  PSTART = 1, // First line of a paragraph.
34  PNONE = 2, // Not a paragraph line.
35 };
36 
37 struct TextAndModel {
38  const char *ascii;
40 
41  // fields corresponding to PARA (see ccstruct/ocrpara.h)
45 };
46 
47 // Imagine that the given text is typewriter ASCII with each character ten
48 // pixels wide and twenty pixels high and return an appropriate row_info.
49 void AsciiToRowInfo(const char *text, int row_number, RowInfo *info) {
50  const int kCharWidth = 10;
51  const int kLineSpace = 30;
52  info->text = text;
53  info->has_leaders = strstr(text, "...") != nullptr || strstr(text, ". . .") != nullptr;
54  info->has_drop_cap = false;
55  info->pix_ldistance = info->pix_rdistance = 0;
56  info->average_interword_space = kCharWidth;
57  info->pix_xheight = kCharWidth;
58  info->lword_text = info->rword_text = "";
59  info->ltr = true;
60 
61  std::vector<std::string> words = split(text, ' ');
62  info->num_words = words.size();
63  if (info->num_words < 1) {
64  return;
65  }
66 
67  info->lword_text = words[0].c_str();
68  info->rword_text = words[words.size() - 1].c_str();
69  int lspace = 0;
70  while (lspace < info->text.size() && text[lspace] == ' ') {
71  lspace++;
72  }
73  int rspace = 0;
74  while (rspace < info->text.size() && text[info->text.size() - rspace - 1] == ' ') {
75  rspace++;
76  }
77 
78  int top = -kLineSpace * row_number;
79  int bottom = top - kLineSpace;
80  int row_right = kCharWidth * info->text.size();
81  int lword_width = kCharWidth * info->lword_text.size();
82  int rword_width = kCharWidth * info->rword_text.size();
83  info->pix_ldistance = lspace * kCharWidth;
84  info->pix_rdistance = rspace * kCharWidth;
85  info->lword_box = TBOX(info->pix_ldistance, bottom, info->pix_ldistance + lword_width, top);
86  info->rword_box = TBOX(row_right - info->pix_rdistance - rword_width, bottom,
87  row_right - info->pix_rdistance, top);
88  LeftWordAttributes(nullptr, nullptr, info->lword_text, &info->lword_indicates_list_item,
90  RightWordAttributes(nullptr, nullptr, info->rword_text, &info->rword_indicates_list_item,
92 }
93 
94 void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo> *output) {
95  output->clear();
96  RowInfo info;
97  for (int i = 0; i < n; i++) {
98  AsciiToRowInfo(row_infos[i].ascii, i, &info);
99  output->push_back(info);
100  }
101 }
102 
103 // Given n rows of reference ground truth, evaluate whether the n rows
104 // of PARA * pointers yield the same paragraph breakpoints.
105 void EvaluateParagraphDetection(const TextAndModel *correct, int n,
106  const std::vector<PARA *> &detector_output) {
107  int incorrect_breaks = 0;
108  int missed_breaks = 0;
109  int poorly_matched_models = 0;
110  int bad_crowns = 0;
111  int bad_list_items = 0;
112  ASSERT_EQ(detector_output.size(), n);
113  for (int i = 1; i < n; i++) {
114  bool has_break = correct[i].model_type != PCONT;
115  bool detected_break = (detector_output[i - 1] != detector_output[i]);
116  if (has_break && !detected_break) {
117  missed_breaks++;
118  }
119  if (detected_break && !has_break) {
120  incorrect_breaks++;
121  }
122  if (has_break) {
123  if (correct[i].model_type == PNONE) {
124  if (detector_output[i]->model != nullptr) {
125  poorly_matched_models++;
126  }
127  } else {
128  if (correct[i].model.justification() != kUnknown &&
129  (detector_output[i]->model == nullptr ||
130  !correct[i].model.Comparable(*detector_output[i]->model))) {
131  poorly_matched_models++;
132  }
133  }
134  if (correct[i].is_very_first_or_continuation ^
135  detector_output[i]->is_very_first_or_continuation) {
136  bad_crowns++;
137  }
138  if (correct[i].is_list_item ^ detector_output[i]->is_list_item) {
139  bad_list_items++;
140  }
141  }
142  }
143  EXPECT_EQ(incorrect_breaks, 0);
144  EXPECT_EQ(missed_breaks, 0);
145  EXPECT_EQ(poorly_matched_models, 0);
146  EXPECT_EQ(bad_list_items, 0);
147  EXPECT_EQ(bad_crowns, 0);
148  if (incorrect_breaks || missed_breaks || poorly_matched_models || bad_list_items || bad_crowns) {
149  std::vector<std::string> dbg_lines;
150  dbg_lines.emplace_back("# ==========================");
151  dbg_lines.emplace_back("# Correct paragraph breaks:");
152  dbg_lines.emplace_back("# ==========================");
153  for (int i = 0; i < n; i++) {
154  if (correct[i].model_type != PCONT) {
155  std::string s = std::string(correct[i].ascii) + " # " +
156  correct[i].model.ToString() +
157  (correct[i].is_very_first_or_continuation ? " crown" : "") +
158  (correct[i].is_list_item ? " li" : "");
159  dbg_lines.push_back(s);
160  } else {
161  dbg_lines.emplace_back(correct[i].ascii);
162  }
163  }
164  dbg_lines.emplace_back("");
165  dbg_lines.emplace_back("# ==========================");
166  dbg_lines.emplace_back("# Paragraph detector output:");
167  dbg_lines.emplace_back("# ==========================");
168  for (int i = 0; i < n; i++) {
169  std::string annotation;
170  if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
171  if (detector_output[i] && detector_output[i]->model) {
172  annotation +=
173  " # " + detector_output[i]->model->ToString() +
174  (detector_output[i]->is_very_first_or_continuation ? " crown" : "") +
175  (detector_output[i]->is_list_item ? " li" : "");
176  } else {
177  annotation = " # Unmodeled paragraph.";
178  }
179  }
180  std::string s = correct[i].ascii + annotation;
181  dbg_lines.push_back(s);
182  }
183  std::string s;
184  for (auto &dbg_line : dbg_lines) {
185  s += dbg_line + "\n";
186  }
187  LOG(INFO) << "Discrepancy!\n" << s;
188  }
189 }
190 
191 void TestParagraphDetection(const TextAndModel *correct, int num_rows) {
192  std::vector<RowInfo> row_infos;
193  std::vector<PARA *> row_owners;
194  PARA_LIST paragraphs;
195  std::vector<ParagraphModel *> models;
196 
197  MakeAsciiRowInfos(correct, num_rows, &row_infos);
198  int debug_level(3);
199  tesseract::DetectParagraphs(debug_level, &row_infos, &row_owners, &paragraphs, &models);
200  EvaluateParagraphDetection(correct, num_rows, row_owners);
201  for (auto *model : models) {
202  delete model;
203  }
204 }
205 
206 TEST(ParagraphsTest, ListItemsIdentified) {
207  EXPECT_TRUE(tesseract::AsciiLikelyListItem("iii"));
208  EXPECT_TRUE(tesseract::AsciiLikelyListItem("A."));
209  EXPECT_TRUE(tesseract::AsciiLikelyListItem("B."));
210  EXPECT_TRUE(tesseract::AsciiLikelyListItem("C."));
211  EXPECT_TRUE(tesseract::AsciiLikelyListItem("1."));
212  EXPECT_TRUE(tesseract::AsciiLikelyListItem("2."));
213  EXPECT_TRUE(tesseract::AsciiLikelyListItem("3."));
214  EXPECT_TRUE(tesseract::AsciiLikelyListItem("1"));
215  EXPECT_TRUE(tesseract::AsciiLikelyListItem("2"));
216  EXPECT_TRUE(tesseract::AsciiLikelyListItem("3"));
217  EXPECT_TRUE(tesseract::AsciiLikelyListItem("[[1]]"));
218  EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-1."));
219  EXPECT_TRUE(tesseract::AsciiLikelyListItem("A-2"));
220  EXPECT_TRUE(tesseract::AsciiLikelyListItem("(A)(i)"));
221 
222  EXPECT_FALSE(tesseract::AsciiLikelyListItem("The"));
223  EXPECT_FALSE(tesseract::AsciiLikelyListItem("first"));
224  EXPECT_FALSE(tesseract::AsciiLikelyListItem("house"));
225  EXPECT_FALSE(tesseract::AsciiLikelyListItem("Oregonian."));
226  EXPECT_FALSE(tesseract::AsciiLikelyListItem("on."));
227 }
228 
230 
232  {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
233  {"This paragraph starts at the top", PCONT, PModel(), false, false},
234  {"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
235  {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
236  {"which indicates that the first ", PCONT, PModel(), false, false},
237  {"paragraph is not a continuation ", PCONT, PModel(), false, false},
238  {"from a previous page, as it is ", PCONT, PModel(), false, false},
239  {"indented just like this second ", PCONT, PModel(), false, false},
240  {"paragraph. ", PCONT, PModel(), false, false},
241 };
242 
243 TEST(ParagraphsTest, TestSimpleParagraphDetection) {
245 }
246 
248  {"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
249  {"of the page and takes two lines.", PCONT, PModel(), false, false},
250  {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
251  {"which indicates that the first ", PCONT, PModel(), false, false},
252  {"paragraph is a continuation from", PCONT, PModel(), false, false},
253  {"a previous page, as it is ", PCONT, PModel(), false, false},
254  {"indented just like this second ", PCONT, PModel(), false, false},
255  {"paragraph. ", PCONT, PModel(), false, false},
256 };
257 
258 TEST(ParagraphsTest, TestFewCluesWithCrown) {
260 }
261 
263  {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
264  {"often not indented as the rest ", PCONT, PModel(), false, false},
265  {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
266  {"less it should be counted as the", PCONT, PModel(), false, false},
267  {"same type of paragraph. ", PCONT, PModel(), false, false},
268  {" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
269  {"graphs are both indented two ", PCONT, PModel(), false, false},
270  {"spaces. ", PCONT, PModel(), false, false},
271  {" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
272  {"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false},
273 };
274 
275 TEST(ParagraphsTest, TestCrownParagraphDetection) {
277 }
278 
280  {"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
281  {"flush left paragraphs (those", PCONT, PModel(), false, false},
282  {"with no body indent) are not", PCONT, PModel(), false, false},
283  {"actually crowns. ", PCONT, PModel(), false, false},
284  {"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
285  {"also flush left aligned. Usual-", PCONT, PModel(), false, false},
286  {"ly, these paragraphs are set", PCONT, PModel(), false, false},
287  {"apart vertically by some white-", PCONT, PModel(), false, false},
288  {"space, but you can also detect", PCONT, PModel(), false, false},
289  {"them by observing the big empty", PCONT, PModel(), false, false},
290  {"space at the ends of the para-", PCONT, PModel(), false, false},
291  {"graphs. ", PCONT, PModel(), false, false},
292 };
293 
294 TEST(ParagraphsText, TestRealFlushLeftParagraphs) {
296 }
297 
299  {"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
300  {"continuation. It flows from", PCONT, PModel(), false, false},
301  {"line to line, using the full", PCONT, PModel(), false, false},
302  {"column width with no clear", PCONT, PModel(), false, false},
303  {"paragraph break, because it", PCONT, PModel(), false, false},
304  {"actually doesn't have one. It", PCONT, PModel(), false, false},
305  {"is the middle of one monster", PCONT, PModel(), false, false},
306  {"paragraph continued from the", PCONT, PModel(), false, false},
307  {"previous page and continuing", PCONT, PModel(), false, false},
308  {"onto the next page. There-", PCONT, PModel(), false, false},
309  {"fore, it ends up getting", PCONT, PModel(), false, false},
310  {"marked as a crown and then", PCONT, PModel(), false, false},
311  {"getting re-marked as any ex-", PCONT, PModel(), false, false},
312  {"isting model. Not great, but", PCONT, PModel(), false, false},
313 };
314 
315 TEST(ParagraphsTest, TestSingleFullPageContinuation) {
316  const TextAndModel *correct = kSingleFullPageContinuation;
317  int num_rows = countof(kSingleFullPageContinuation);
318  std::vector<RowInfo> row_infos;
319  std::vector<PARA *> row_owners;
320  PARA_LIST paragraphs;
321  std::vector<ParagraphModel *> models;
322  models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));
323  MakeAsciiRowInfos(correct, num_rows, &row_infos);
324  tesseract::DetectParagraphs(3, &row_infos, &row_owners, &paragraphs, &models);
325  EvaluateParagraphDetection(correct, num_rows, row_owners);
326  for (auto *model : models) {
327  delete model;
328  }
329 }
330 
332  {"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
333  {" uncommon in Left-to-Right", PCONT, PModel(), false, false},
334  {" languages, but they do", PCONT, PModel(), false, false},
335  {" exist.", PCONT, PModel(), false, false},
336  {" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
337  {" horribly tiny paragraphs in", PCONT, PModel(), false, false},
338  {" tables on which we have no", PCONT, PModel(), false, false},
339  {" chance anyways.", PCONT, PModel(), false, false},
340 };
341 
342 TEST(ParagraphsTest, TestRightAlignedParagraph) {
344 }
345 
347  {" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
348  {"obvious paragraph text, you might", PCONT, PModel(), false, false},
349  {"find short exchanges of dialogue ", PCONT, PModel(), false, false},
350  {"between characters. ", PCONT, PModel(), false, false},
351  {" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
352  {" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
353  {" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
354  {" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
355  {"mark a new paragraph whenever one", PCONT, PModel(), false, false},
356  {"of the statistics (left, right or", PCONT, PModel(), false, false},
357  {"center) changes from one text-", PCONT, PModel(), false, false},
358  {"line to the next. Such an", PCONT, PModel(), false, false},
359  {"approach would misclassify the", PCONT, PModel(), false, false},
360  {"tiny paragraphs above as a single", PCONT, PModel(), false, false},
361  {"paragraph. ", PCONT, PModel(), false, false},
362 };
363 
364 TEST(ParagraphsTest, TestTinyParagraphs) {
366 }
367 
369  {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
370  {" Centered Title ", PCONT, PModel(), false, false},
371  {" Paragraph Detection ", PCONT, PModel(), false, false},
372  {" OCR TEAM ", PCONT, PModel(), false, false},
373  {" 10 November 2010 ", PCONT, PModel(), false, false},
374  {" ", PNONE, PModel(), false, false},
375  {" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
376  {"This paragraph starts at the top", PCONT, PModel(), false, false},
377  {"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
378  {" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
379  {"which indicates that the first ", PCONT, PModel(), false, false},
380  {"paragraph is not a continuation ", PCONT, PModel(), false, false},
381  {"from a previous page, as it is ", PCONT, PModel(), false, false},
382  {"indented just like this second ", PCONT, PModel(), false, false},
383  {"paragraph. ", PCONT, PModel(), false, false},
384  {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},
385  {" looks like the prior text ", PCONT, PModel(), false, false},
386  {" but it is indented more ", PCONT, PModel(), false, false},
387  {" and is fully justified. ", PCONT, PModel(), false, false},
388  {" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
389  {"centered text, block quotes, ", PCONT, PModel(), false, false},
390  {"normal paragraphs, and lists ", PCONT, PModel(), false, false},
391  {"like what follows? ", PCONT, PModel(), false, false},
392  {"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
393  {"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
394  {" looking for lines where the ", PCONT, PModel(), false, false},
395  {" first word of the next line ", PCONT, PModel(), false, false},
396  {" would fit on the previous ", PCONT, PModel(), false, false},
397  {" line. ", PCONT, PModel(), false, false},
398  {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
399  {" Python and try it out. ", PCONT, PModel(), false, false},
400  {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
401  {" mistakes. ", PCONT, PModel(), false, false},
402  {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
403  {" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
404  {"you can try to identify source ", PCONT, PModel(), false, false},
405  {"code. Ouch! ", PCONT, PModel(), false, false},
406 };
407 
408 TEST(ParagraphsTest, TestComplexPage1) {
410 }
411 
412 // The same as above, but wider.
414  {" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
415  {" Centered Title ", PCONT, PModel(), false, false},
416  {" Paragraph Detection ", PCONT, PModel(), false, false},
417  {" OCR TEAM ", PCONT, PModel(), false, false},
418  {" 10 November 2010 ", PCONT, PModel(), false, false},
419  {" ", PNONE, PModel(), false, false},
420  {" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
421  {"This paragraph starts at the top of", PCONT, PModel(), false, false},
422  {"the page and takes 3 lines. ", PCONT, PModel(), false, false},
423  {" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
424  {"which indicates that the first ", PCONT, PModel(), false, false},
425  {"paragraph is not a continuation ", PCONT, PModel(), false, false},
426  {"from a previous page, as it is in- ", PCONT, PModel(), false, false},
427  {"dented just like this second para- ", PCONT, PModel(), false, false},
428  {"graph. ", PCONT, PModel(), false, false},
429  {" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0), true, false},
430  {" looks like the prior text ", PCONT, PModel(), false, false},
431  {" but it is indented more ", PCONT, PModel(), false, false},
432  {" and is fully justified. ", PCONT, PModel(), false, false},
433  {" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
434  {"ed text, block quotes, normal para-", PCONT, PModel(), false, false},
435  {"graphs, and lists like what follow?", PCONT, PModel(), false, false},
436  {"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!!
437  {"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
438  {" looking for lines where the ", PCONT, PModel(), false, false},
439  {" first word of the next line ", PCONT, PModel(), false, false},
440  {" would fit on the previous line. ", PCONT, PModel(), false, false},
441  {"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
442  {" Python and try it out. ", PCONT, PModel(), false, false},
443  {"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
444  {" mistakes. ", PCONT, PModel(), false, false},
445  {"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, true},
446  {" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
447  {"you can try to identify source ", PCONT, PModel(), false, false},
448  {"code. Ouch! ", PCONT, PModel(), false, false},
449 };
450 
451 TEST(ParagraphsTest, TestComplexPage2) {
453 }
454 
456  {"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
457  {"often not indented as the rest ", PCONT, PModel(), false, false},
458  {"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
459  {"less it should be counted as the", PCONT, PModel(), false, false},
460  {"same type of paragraph. ", PCONT, PModel(), false, false},
461  {" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
462  {"should suffice. ", PCONT, PModel(), false, false},
463  {" 1235 ", PNONE, PModel(), false, false},
464 };
465 
466 TEST(ParagraphsTest, TestSubtleCrown) {
468 }
469 
470 TEST(ParagraphsTest, TestStrayLineInBlock) {
472 }
473 
475  {" Defined contribution plans cover employees in Australia, New", PSTART,
476  PModel(kLeft, 0, 50, 0, 0), false, false},
477  {"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false,
478  false},
479  {"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(),
480  false, false},
481  {"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false,
482  false},
483  {"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false,
484  false},
485  {"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false,
486  false},
487  {"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false,
488  false},
489  {"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false,
490  false},
491  {" In addition to providing pension benefits, the Company pro- ", PSTART,
492  PModel(kLeft, 0, 50, 0, 0), false, false},
493  {"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false,
494  false},
495  {"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false,
496  false},
497  {"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false,
498  false},
499  {"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false,
500  false},
501  {"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false,
502  false},
503  {"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false,
504  false},
505  {"and life insurance benefits in the year incurred. ", PCONT, PModel(), false,
506  false},
507  {" The U.S. plan covering the parent company is the largest plan.", PSTART,
508  PModel(kLeft, 0, 50, 0, 0), false, false},
509  {"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false,
510  false},
511  {"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false,
512  false},
513  {"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false,
514  false},
515  {"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(),
516  false, false},
517  {"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false,
518  false},
519  {"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false,
520  false},
521  {"credited service. The Company has the ability to change these ", PCONT, PModel(), false,
522  false},
523  {"benefits at any time. ", PCONT, PModel(), false,
524  false},
525  {" Effective October 1993, the Company amended its health ", PSTART,
526  PModel(kLeft, 0, 50, 0, 0), false, false},
527  {"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false,
528  false},
529  {"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false,
530  false},
531  {"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false,
532  false},
533  {"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false,
534  false},
535  {"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false,
536  false},
537  {"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false,
538  false},
539  {"for 1994 by approximately $83. ", PCONT, PModel(), false,
540  false},
541 };
542 
543 TEST(ParagraphsTest, TestUnlvInsurance) {
545 }
546 
547 // The basic outcome we want for something with a bunch of leader dots is that
548 // we group each logical entry as a separate item. Without knowledge of
549 // leaders, we would most likely mark the text below as a simple right aligned
550 // paragraph or two.
551 // This example comes from Volume 9886293, Page 5
553  {"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
554  {" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
555  {" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
556  {" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
557  {" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
558  {" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
559  {" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
560  {" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
561  {" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
562  {" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
563  {" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
564  {" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
565  {" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
566 };
567 
568 TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
570 }
571 
573  {" A typical page of a programming book may contain", PSTART, PModel(kLeft, 0, 20, 0, 0),
574  false, false},
575  {"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false},
576  {"being described in prose. Such examples should be", PCONT, PModel(), false, false},
577  {"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false},
578  {"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false},
579  {"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false},
580  {"source code would lead to a bad reading experience", PCONT, PModel(), false, false},
581  {"when the text is re-flowed. ", PCONT, PModel(), false, false},
582  {" Let's show this by describing the function fact-", PSTART, PModel(kLeft, 0, 20, 0, 0),
583  false, false},
584  {"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false},
585  {"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false},
586  {"that the typical C implementation will only work ", PCONT, PModel(), false, false},
587  {"for values less than about 12: ", PCONT, PModel(), false, false},
588  {" ", PNONE, PModel(), false, false},
589  {" # Naive implementation in C ", PCONT, PModel(), false, false},
590  {" int factorial(int n) { ", PCONT, PModel(), false, false},
591  {" if (n < 2) ", PCONT, PModel(), false, false},
592  {" return 1; ", PCONT, PModel(), false, false},
593  {" return n * factorial(n - 1); ", PCONT, PModel(), false, false},
594  {" } ", PCONT, PModel(), false, false},
595  {" ", PCONT, PModel(), false, false},
596  {" The C programming language does not have built- ", PSTART, PModel(kLeft, 0, 20, 0, 0),
597  false, false},
598  {"in support for detecting integer overflow, so this", PCONT, PModel(), false, false},
599  {"naive implementation simply returns random values ", PCONT, PModel(), false, false},
600  {"if even a moderate sized n is provided. ", PCONT, PModel(), false, false},
601 };
602 
603 TEST(ParagraphsTest, NotDistractedBySourceCode) {
605 }
606 
608  {"royal palm which are called guano and in it there was a bed, a", PSTART,
609  PModel(kLeft, 0, 50, 0, 0), false, false},
610  {"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(),
611  false, false},
612  {"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(),
613  false, false},
614  {"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(),
615  false, false},
616  {"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(),
617  false, false},
618  {"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(),
619  false, false},
620  {"wife on the wall but he had taken it down because it made him too", PCONT, PModel(),
621  false, false},
622  {"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(),
623  false, false},
624  {"shirt. ", PCONT, PModel(),
625  false, false},
626  {" \"What do you have to eat?\" the boy asked. ", PSTART,
627  PModel(kLeft, 0, 50, 0, 0), false, false},
628  {" \"A pot of yellow rice with fish. Do you want some?\" ", PSTART,
629  PModel(kLeft, 0, 50, 0, 0), false, false},
630  {" \"No. I will eat at home. Do you want me to make the fire?\" ", PSTART,
631  PModel(kLeft, 0, 50, 0, 0), false, false},
632  {" \"No. I will make it later on. Or I may eat the rice cold.\" ", PSTART,
633  PModel(kLeft, 0, 50, 0, 0), false, false},
634  {" \"May I take the cast net?\" ", PSTART,
635  PModel(kLeft, 0, 50, 0, 0), false, false},
636  {" \"Of course.\" ", PSTART,
637  PModel(kLeft, 0, 50, 0, 0), false, false},
638  {" There was no cast net and the boy remembered when they had", PSTART,
639  PModel(kLeft, 0, 50, 0, 0), false, false},
640  {"sold it. But they went through this fiction every day. There was no", PCONT, PModel(),
641  false, false},
642  {"pot of yellow rice and fish and the boy knew this too. "
643  " ",
644  PCONT, PModel(), false, false},
645  {" \"Eighty-five is a lucky number,\" the old man said. \"How", PSTART,
646  PModel(kLeft, 0, 50, 0, 0), false, false},
647  {"would you like to see me bring one in that dressed out over a "
648  "thou-",
649  PCONT, PModel(), false, false},
650  {"sand pounds? "
651  " ",
652  PCONT, PModel(), false, false},
653  {" \"I'll get the cast net and go for sardines. Will you sit in the "
654  "sun",
655  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
656  {"in the doorway?\" "
657  " ",
658  PCONT, PModel(), false, false},
659  {" \"Yes. I have yesterday's paper and I will read the baseball.\" ", PSTART,
660  PModel(kLeft, 0, 50, 0, 0), false, false},
661  {" The boy did not know whether yesterday's paper was a fiction", PSTART,
662  PModel(kLeft, 0, 50, 0, 0), false, false},
663  {"too. But the old man brought it out from under the bed. ", PCONT, PModel(),
664  false, false},
665  {" \"Pedrico gave it to me at the bodega,\" he explained. "
666  " ",
667  PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
668  {" \"I'll be back when I have the sardines. I'll keep yours and mine", PSTART,
669  PModel(kLeft, 0, 50, 0, 0), false, false},
670  {"together on ice and we can share them in the morning. When I", PCONT, PModel(),
671  false, false},
672  {"come back you can tell me about the baseball.\" ", PCONT, PModel(),
673  false, false},
674  {" \"The Yankees cannot lose.\" ", PSTART,
675  PModel(kLeft, 0, 50, 0, 0), false, false},
676  {" \"But I fear the Indians of Cleveland.\" ", PSTART,
677  PModel(kLeft, 0, 50, 0, 0), false, false},
678  {" \"Have faith in the Yankees my son. Think of the great Di-", PSTART,
679  PModel(kLeft, 0, 50, 0, 0), false, false},
680  {"Maggio.\" ", PCONT, PModel(),
681  false, false},
682  {" \"I fear both the Tigers of Detroit and the Indians of Cleve-", PSTART,
683  PModel(kLeft, 0, 50, 0, 0), false, false},
684  {"land.\" ", PCONT, PModel(),
685  false, false}};
686 
687 TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
689 }
690 
692  {"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
693  {"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
694  {"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
695  {" 138 ", PCONT, PModel(), false, false},
696  {"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
697  {"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
698  {"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
699  {" 145 ", PCONT, PModel(), false, false},
700  {"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
701  {"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
702  {"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
703  {"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
704  {"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
705  {"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
706  {"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
707  {"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
708  {"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
709  {"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
710  {"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
711  {"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
712  {"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
713  {"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
714  {"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
715  {"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
716  {"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
717  {" 85 ", PCONT, PModel(), false, false},
718  {"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
719  {"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
720  {"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
721  {"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
722  {"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}};
723 
724 TEST(ParagraphsTest, IndexPageTest) {
726 }
727 
728 // TODO(eger): Add some right-to-left examples, and fix the algorithm as needed.
729 
730 } // namespace tesseract
@ TBOX
@ LOG
@ INFO
Definition: log.h:28
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:41
const ParagraphJustification kRight
const TextAndModel kComplexPage2[]
const TextAndModel kFlushLeftParagraphs[]
const TextAndModel kTextWithSourceCode[]
const TextAndModel kFewCluesWithCrown[]
ParagraphJustification
Definition: publictypes.h:248
@ JUSTIFICATION_LEFT
Definition: publictypes.h:250
@ JUSTIFICATION_UNKNOWN
Definition: publictypes.h:249
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:252
@ JUSTIFICATION_CENTER
Definition: publictypes.h:251
const TextAndModel kOldManAndSea[]
const TextAndModel kUnlvRep3AO[]
const TextAndModel kSubtleCrown[]
const TextAndModel kNewZealandIndex[]
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:42
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:477
void AsciiToRowInfo(const char *text, int row_number, RowInfo *info)
const TextAndModel kTableOfContents[]
ParagraphModel PModel
const ParagraphJustification kUnknown
const TextAndModel kSingleFullPageContinuation[]
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:431
bool AsciiLikelyListItem(const std::string &word)
Definition: paragraphs.cpp:282
const TextAndModel kTinyParagraphs[]
const TextAndModel kCrownedParagraph[]
const TextAndModel kComplexPage1[]
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
const ParagraphJustification kLeft
void EvaluateParagraphDetection(const TextAndModel *correct, int n, const std::vector< PARA * > &detector_output)
const TextAndModel kRightAligned[]
const TextAndModel kTwoSimpleParagraphs[]
const ParagraphJustification kCenter
void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector< RowInfo > *output)
void TestParagraphDetection(const TextAndModel *correct, int num_rows)
TEST(TesseractInstanceTest, TestMultipleTessInstances)
bool lword_likely_ends_idea
Definition: paragraphs.h:71
bool rword_likely_ends_idea
Definition: paragraphs.h:75
int average_interword_space
Definition: paragraphs.h:50
bool rword_likely_starts_idea
Definition: paragraphs.h:74
std::string rword_text
Definition: paragraphs.h:57
std::string text
Definition: paragraphs.h:41
std::string lword_text
Definition: paragraphs.h:56
bool lword_indicates_list_item
Definition: paragraphs.h:69
bool rword_indicates_list_item
Definition: paragraphs.h:73
bool lword_likely_starts_idea
Definition: paragraphs.h:70
bool Comparable(const ParagraphModel &other) const
Definition: ocrpara.cpp:73
std::string ToString() const
Definition: ocrpara.cpp:85
TextModelInputType model_type