tesseract  5.0.0
resultiterator.cpp
Go to the documentation of this file.
1 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 //
8 // (C) Copyright 2011, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
22 
23 #include "pageres.h"
24 #include "tesseractclass.h"
25 #include "unicharset.h"
26 
27 #include <allheaders.h>
28 
29 #include <set>
30 #include <vector>
31 
32 static const char *const kLRM = "\u200E"; // Left-to-Right Mark
33 static const char *const kRLM = "\u200F"; // Right-to-Left Mark
34 
35 namespace tesseract {
36 
38  in_minor_direction_ = false;
39  at_beginning_of_minor_run_ = false;
40  preserve_interword_spaces_ = false;
41 
42  auto *p = ParamUtils::FindParam<BoolParam>(
43  "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
44  if (p != nullptr) {
45  preserve_interword_spaces_ = (bool)(*p);
46  }
47 
48  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
49  MoveToLogicalStartOfTextline();
50 }
51 
53  return new ResultIterator(resit);
54 }
55 
57  return current_paragraph_is_ltr_;
58 }
59 
60 bool ResultIterator::CurrentParagraphIsLtr() const {
61  if (!it_->word()) {
62  return true; // doesn't matter.
63  }
64  LTRResultIterator it(*this);
65  it.RestartParagraph();
66  // Try to figure out the ltr-ness of the paragraph. The rules below
67  // make more sense in the context of a difficult paragraph example.
68  // Here we denote {ltr characters, RTL CHARACTERS}:
69  //
70  // "don't go in there!" DAIS EH
71  // EHT OTNI DEPMUJ FELSMIH NEHT DNA
72  // .GNIDLIUB GNINRUB
73  //
74  // On the first line, the left-most word is LTR and the rightmost word
75  // is RTL. Thus, we are better off taking the majority direction for
76  // the whole paragraph contents. So instead of "the leftmost word is LTR"
77  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
78  // would not do: Typically an RTL paragraph would *not* start with an LTR
79  // word. So our heuristics are as follows:
80  //
81  // (1) If the first text line has an RTL word in the left-most position
82  // it is RTL.
83  // (2) If the first text line has an LTR word in the right-most position
84  // it is LTR.
85  // (3) If neither of the above is true, take the majority count for the
86  // paragraph -- if there are more rtl words, it is RTL. If there
87  // are more LTR words, it's LTR.
88  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
89  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
90  int num_ltr, num_rtl;
91  num_rtl = leftmost_rtl ? 1 : 0;
92  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
93  for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
94  it.Next(RIL_WORD)) {
95  StrongScriptDirection dir = it.WordDirection();
96  rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
97  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
98  num_ltr += rightmost_ltr ? 1 : 0;
99  }
100  if (leftmost_rtl) {
101  return false;
102  }
103  if (rightmost_ltr) {
104  return true;
105  }
106  // First line is ambiguous. Take statistics on the whole paragraph.
107  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {
108  do {
109  StrongScriptDirection dir = it.WordDirection();
110  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
111  num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
112  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
113  }
114  return num_ltr >= num_rtl;
115 }
116 
117 const int ResultIterator::kMinorRunStart = -1;
118 const int ResultIterator::kMinorRunEnd = -2;
119 const int ResultIterator::kComplexWord = -3;
120 
121 void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
122  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
123  blob_indices->clear();
124  if (Empty(RIL_WORD)) {
125  return;
126  }
127  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
128  // Easy! just return the blobs in order;
129  for (int i = 0; i < word_length_; i++) {
130  blob_indices->push_back(i);
131  }
132  return;
133  }
134 
135  // The blobs are in left-to-right order, but the current reading context
136  // is right-to-left.
137  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
138  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
139  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
140  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
141  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
142  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
143  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
144 
145  // Step 1: Scan for and mark European Number sequences
146  // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
147  std::vector<int> letter_types;
148  letter_types.reserve(word_length_);
149  for (int i = 0; i < word_length_; i++) {
150  letter_types.push_back(it_->word()->SymbolDirection(i));
151  }
152  // Convert a single separtor sandwiched between two EN's into an EN.
153  for (int i = 0; i + 2 < word_length_; i++) {
154  if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
155  (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {
156  letter_types[i + 1] = U_EURO_NUM;
157  }
158  }
159  // Scan for sequences of European Number Terminators around ENs and convert
160  // them to ENs.
161  for (int i = 0; i < word_length_; i++) {
162  if (letter_types[i] == U_EURO_NUM_TERM) {
163  int j = i + 1;
164  while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
165  j++;
166  }
167  if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
168  // The sequence [i..j] should be converted to all European Numbers.
169  for (int k = i; k < j; k++) {
170  letter_types[k] = U_EURO_NUM;
171  }
172  }
173  j = i - 1;
174  while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
175  j--;
176  }
177  if (j > -1 && letter_types[j] == U_EURO_NUM) {
178  // The sequence [j..i] should be converted to all European Numbers.
179  for (int k = j; k <= i; k++) {
180  letter_types[k] = U_EURO_NUM;
181  }
182  }
183  }
184  }
185  // Step 2: Convert all remaining types to either L or R.
186  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
187  // All other are R.
188  for (int i = 0; i < word_length_;) {
189  int ti = letter_types[i];
190  if (ti == U_LTR || ti == U_EURO_NUM) {
191  // Left to right sequence; scan to the end of it.
192  int last_good = i;
193  for (int j = i + 1; j < word_length_; j++) {
194  int tj = letter_types[j];
195  if (tj == U_LTR || tj == U_EURO_NUM) {
196  last_good = j;
197  } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
198  // do nothing.
199  } else {
200  break;
201  }
202  }
203  // [i..last_good] is the L sequence
204  for (int k = i; k <= last_good; k++) {
205  letter_types[k] = U_LTR;
206  }
207  i = last_good + 1;
208  } else {
209  letter_types[i] = U_RTL;
210  i++;
211  }
212  }
213 
214  // At this point, letter_types is entirely U_LTR or U_RTL.
215  for (int i = word_length_ - 1; i >= 0;) {
216  if (letter_types[i] == U_RTL) {
217  blob_indices->push_back(i);
218  i--;
219  } else {
220  // left to right sequence. scan to the beginning.
221  int j = i - 1;
222  for (; j >= 0 && letter_types[j] != U_RTL; j--) {
223  } // pass
224  // Now (j, i] is LTR
225  for (int k = j + 1; k <= i; k++) {
226  blob_indices->push_back(k);
227  }
228  i = j;
229  }
230  }
231  ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_));
232 }
233 
234 static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
235  for (auto dir : dirs) {
236  switch (dir) {
237  case DIR_NEUTRAL:
238  tprintf("N ");
239  break;
240  case DIR_LEFT_TO_RIGHT:
241  tprintf("L ");
242  break;
243  case DIR_RIGHT_TO_LEFT:
244  tprintf("R ");
245  break;
246  case DIR_MIX:
247  tprintf("Z ");
248  break;
249  default:
250  tprintf("? ");
251  break;
252  }
253  }
254  tprintf("\n");
255 }
256 
257 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
258  std::vector<int> *word_indices) const {
259  std::vector<StrongScriptDirection> directions;
260  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
261 }
262 
263 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
264  std::vector<StrongScriptDirection> *dirs_arg,
265  std::vector<int> *word_indices) const {
266  std::vector<StrongScriptDirection> dirs;
267  std::vector<StrongScriptDirection> *directions;
268  directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
269  directions->clear();
270 
271  // A LTRResultIterator goes strictly left-to-right word order.
272  LTRResultIterator ltr_it(resit);
273  ltr_it.RestartRow();
274  if (ltr_it.Empty(RIL_WORD)) {
275  return;
276  }
277  do {
278  directions->push_back(ltr_it.WordDirection());
279  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
280 
281  word_indices->clear();
282  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
283 }
284 
285 void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
286  const std::vector<StrongScriptDirection> &word_dirs,
287  std::vector<int> *reading_order) {
288  reading_order->clear();
289  if (word_dirs.empty()) {
290  return;
291  }
292 
293  // Take all of the runs of minor direction words and insert them
294  // in reverse order.
295  int minor_direction, major_direction, major_step, start, end;
296  if (paragraph_is_ltr) {
297  start = 0;
298  end = word_dirs.size();
299  major_step = 1;
300  major_direction = DIR_LEFT_TO_RIGHT;
301  minor_direction = DIR_RIGHT_TO_LEFT;
302  } else {
303  start = word_dirs.size() - 1;
304  end = -1;
305  major_step = -1;
306  major_direction = DIR_RIGHT_TO_LEFT;
307  minor_direction = DIR_LEFT_TO_RIGHT;
308  // Special rule: if there are neutral words at the right most side
309  // of a line adjacent to a left-to-right word in the middle of the
310  // line, we interpret the end of the line as a single LTR sequence.
311  if (word_dirs[start] == DIR_NEUTRAL) {
312  int neutral_end = start;
313  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
314  neutral_end--;
315  }
316  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
317  // LTR followed by neutrals.
318  // Scan for the beginning of the minor left-to-right run.
319  int left = neutral_end;
320  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
321  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
322  left = i;
323  }
324  }
325  reading_order->push_back(kMinorRunStart);
326  for (unsigned i = left; i < word_dirs.size(); i++) {
327  reading_order->push_back(i);
328  if (word_dirs[i] == DIR_MIX) {
329  reading_order->push_back(kComplexWord);
330  }
331  }
332  reading_order->push_back(kMinorRunEnd);
333  start = left - 1;
334  }
335  }
336  }
337  for (int i = start; i != end;) {
338  if (word_dirs[i] == minor_direction) {
339  int j = i;
340  while (j != end && word_dirs[j] != major_direction) {
341  j += major_step;
342  }
343  if (j == end) {
344  j -= major_step;
345  }
346  while (j != i && word_dirs[j] != minor_direction) {
347  j -= major_step;
348  }
349  // [j..i] is a minor direction run.
350  reading_order->push_back(kMinorRunStart);
351  for (int k = j; k != i; k -= major_step) {
352  reading_order->push_back(k);
353  }
354  reading_order->push_back(i);
355  reading_order->push_back(kMinorRunEnd);
356  i = j + major_step;
357  } else {
358  reading_order->push_back(i);
359  if (word_dirs[i] == DIR_MIX) {
360  reading_order->push_back(kComplexWord);
361  }
362  i += major_step;
363  }
364  }
365 }
366 
367 int ResultIterator::LTRWordIndex() const {
368  int this_word_index = 0;
369  LTRResultIterator textline(*this);
370  textline.RestartRow();
371  while (!textline.PositionedAtSameWord(it_)) {
372  this_word_index++;
373  textline.Next(RIL_WORD);
374  }
375  return this_word_index;
376 }
377 
378 void ResultIterator::MoveToLogicalStartOfWord() {
379  if (word_length_ == 0) {
380  BeginWord(0);
381  return;
382  }
383  std::vector<int> blob_order;
384  CalculateBlobOrder(&blob_order);
385  if (blob_order.empty() || blob_order[0] == 0) {
386  return;
387  }
388  BeginWord(blob_order[0]);
389 }
390 
391 bool ResultIterator::IsAtFinalSymbolOfWord() const {
392  if (!it_->word()) {
393  return true;
394  }
395  std::vector<int> blob_order;
396  CalculateBlobOrder(&blob_order);
397  return blob_order.empty() || blob_order.back() == blob_index_;
398 }
399 
400 bool ResultIterator::IsAtFirstSymbolOfWord() const {
401  if (!it_->word()) {
402  return true;
403  }
404  std::vector<int> blob_order;
405  CalculateBlobOrder(&blob_order);
406  return blob_order.empty() || blob_order[0] == blob_index_;
407 }
408 
409 void ResultIterator::AppendSuffixMarks(std::string *text) const {
410  if (!it_->word()) {
411  return;
412  }
413  bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
414  // scan forward to see what meta-information the word ordering algorithm
415  // left us.
416  // If this word is at the *end* of a minor run, insert the other
417  // direction's mark; else if this was a complex word, insert the
418  // current reading order's mark.
419  std::vector<int> textline_order;
420  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
421  int this_word_index = LTRWordIndex();
422  size_t i = 0;
423  for (const auto word_index : textline_order) {
424  if (word_index == this_word_index) {
425  break;
426  }
427  i++;
428  }
429  if (i == textline_order.size()) {
430  return;
431  }
432 
433  int last_non_word_mark = 0;
434  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
435  last_non_word_mark = textline_order[i];
436  }
437  if (last_non_word_mark == kComplexWord) {
438  *text += reading_direction_is_ltr ? kLRM : kRLM;
439  } else if (last_non_word_mark == kMinorRunEnd) {
440  if (current_paragraph_is_ltr_) {
441  *text += kLRM;
442  } else {
443  *text += kRLM;
444  }
445  }
446 }
447 
448 void ResultIterator::MoveToLogicalStartOfTextline() {
449  std::vector<int> word_indices;
450  RestartRow();
451  CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this),
452  &word_indices);
453  unsigned i = 0;
454  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
455  if (word_indices[i] == kMinorRunStart) {
456  in_minor_direction_ = true;
457  } else if (word_indices[i] == kMinorRunEnd) {
458  in_minor_direction_ = false;
459  }
460  }
461  if (in_minor_direction_) {
462  at_beginning_of_minor_run_ = true;
463  }
464  if (i >= word_indices.size()) {
465  return;
466  }
467  int first_word_index = word_indices[i];
468  for (int j = 0; j < first_word_index; j++) {
470  }
471  MoveToLogicalStartOfWord();
472 }
473 
476  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
477  in_minor_direction_ = false;
478  at_beginning_of_minor_run_ = false;
479  MoveToLogicalStartOfTextline();
480 }
481 
483  if (it_->block() == nullptr) {
484  return false; // already at end!
485  }
486  switch (level) {
487  case RIL_BLOCK: // explicit fall-through
488  case RIL_PARA: // explicit fall-through
489  case RIL_TEXTLINE:
490  if (!PageIterator::Next(level)) {
491  return false;
492  }
494  // if we've advanced to a new paragraph,
495  // recalculate current_paragraph_is_ltr_
496  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
497  }
498  in_minor_direction_ = false;
499  MoveToLogicalStartOfTextline();
500  return it_->block() != nullptr;
501  case RIL_SYMBOL: {
502  std::vector<int> blob_order;
503  CalculateBlobOrder(&blob_order);
504  unsigned next_blob = 0;
505  while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
506  next_blob++;
507  }
508  next_blob++;
509  if (next_blob < blob_order.size()) {
510  // we're in the same word; simply advance one blob.
511  BeginWord(blob_order[next_blob]);
512  at_beginning_of_minor_run_ = false;
513  return true;
514  }
515  level = RIL_WORD; // we've fallen through to the next word.
516  }
517  // Fall through.
518  case RIL_WORD: // explicit fall-through.
519  {
520  if (it_->word() == nullptr) {
521  return Next(RIL_BLOCK);
522  }
523  std::vector<int> word_indices;
524  int this_word_index = LTRWordIndex();
525  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
526  int final_real_index = word_indices.size() - 1;
527  while (final_real_index > 0 && word_indices[final_real_index] < 0) {
528  final_real_index--;
529  }
530  for (int i = 0; i < final_real_index; i++) {
531  if (word_indices[i] == this_word_index) {
532  int j = i + 1;
533  for (; j < final_real_index && word_indices[j] < 0; j++) {
534  if (word_indices[j] == kMinorRunStart) {
535  in_minor_direction_ = true;
536  }
537  if (word_indices[j] == kMinorRunEnd) {
538  in_minor_direction_ = false;
539  }
540  }
541  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
542  // awesome, we move to word_indices[j]
543  if (BidiDebug(3)) {
544  tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
545  }
547  for (int k = 0; k < word_indices[j]; k++) {
549  }
550  MoveToLogicalStartOfWord();
551  return true;
552  }
553  }
554  if (BidiDebug(3)) {
555  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
556  }
557  // we're going off the end of the text line.
558  return Next(RIL_TEXTLINE);
559  }
560  }
561  ASSERT_HOST(false); // shouldn't happen.
562  return false;
563 }
564 
566  if (it_->block() == nullptr) {
567  return false; // Already at the end!
568  }
569  if (it_->word() == nullptr) {
570  return true; // In an image block.
571  }
572  if (level == RIL_SYMBOL) {
573  return true; // Always at beginning of a symbol.
574  }
575 
576  bool at_word_start = IsAtFirstSymbolOfWord();
577  if (level == RIL_WORD) {
578  return at_word_start;
579  }
580 
581  ResultIterator line_start(*this);
582  // move to the first word in the line...
583  line_start.MoveToLogicalStartOfTextline();
584 
585  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
586  if (level == RIL_TEXTLINE) {
587  return at_textline_start;
588  }
589 
590  // now we move to the left-most word...
591  line_start.RestartRow();
592  bool at_block_start =
593  at_textline_start && line_start.it_->block() != line_start.it_->prev_block();
594  if (level == RIL_BLOCK) {
595  return at_block_start;
596  }
597 
598  bool at_para_start =
599  at_block_start || (at_textline_start && line_start.it_->row()->row->para() !=
600  line_start.it_->prev_row()->row->para());
601  if (level == RIL_PARA) {
602  return at_para_start;
603  }
604 
605  ASSERT_HOST(false); // shouldn't happen.
606  return false;
607 }
608 
615  if (Empty(element)) {
616  return true; // Already at the end!
617  }
618  // The result is true if we step forward by element and find we are
619 // at the end of the page or at beginning of *all* levels in:
620  // [level, element).
621  // When there is more than one level difference between element and level,
622  // we could for instance move forward one symbol and still be at the first
623  // word on a line, so we also have to be at the first symbol in a word.
624  ResultIterator next(*this);
625  next.Next(element);
626  if (next.Empty(element)) {
627  return true; // Reached the end of the page.
628  }
629  while (element > level) {
630  element = static_cast<PageIteratorLevel>(element - 1);
631  if (!next.IsAtBeginningOf(element)) {
632  return false;
633  }
634  }
635  return true;
636 }
637 
638 // Returns the number of blanks before the current word.
640  if (CurrentParagraphIsLtr()) {
642  }
643  return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
644 }
645 
651  if (it_->word() == nullptr) {
652  return nullptr; // Already at the end!
653  }
654  std::string text;
655  switch (level) {
656  case RIL_BLOCK: {
657  ResultIterator pp(*this);
658  do {
659  pp.AppendUTF8ParagraphText(&text);
660  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
661  } break;
662  case RIL_PARA:
663  AppendUTF8ParagraphText(&text);
664  break;
665  case RIL_TEXTLINE: {
666  ResultIterator it(*this);
667  it.MoveToLogicalStartOfTextline();
668  it.IterateAndAppendUTF8TextlineText(&text);
669  } break;
670  case RIL_WORD:
671  AppendUTF8WordText(&text);
672  break;
673  case RIL_SYMBOL: {
674  bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
675  if (at_beginning_of_minor_run_) {
676  text += reading_direction_is_ltr ? kLRM : kRLM;
677  }
678  text = it_->word()->BestUTF8(blob_index_, false);
679  if (IsAtFinalSymbolOfWord()) {
680  AppendSuffixMarks(&text);
681  }
682  } break;
683  }
684  int length = text.length() + 1;
685  char *result = new char[length];
686  strncpy(result, text.c_str(), length);
687  return result;
688 }
689 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
691  if (it_->word() != nullptr) {
692  return &it_->word()->segmented_timesteps;
693  } else {
694  return nullptr;
695  }
696 }
697 
698 std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
699  const {
700  if (it_->word() != nullptr) {
701  return &it_->word()->CTC_symbol_choices;
702  } else {
703  return nullptr;
704  }
705 }
706 
707 void ResultIterator::AppendUTF8WordText(std::string *text) const {
708  if (!it_->word()) {
709  return;
710  }
711  ASSERT_HOST(it_->word()->best_choice != nullptr);
712  bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
713  if (at_beginning_of_minor_run_) {
714  *text += reading_direction_is_ltr ? kLRM : kRLM;
715  }
716 
717  std::vector<int> blob_order;
718  CalculateBlobOrder(&blob_order);
719  for (int i : blob_order) {
720  *text += it_->word()->BestUTF8(i, false);
721  }
722  AppendSuffixMarks(text);
723 }
724 
725 void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
726  if (Empty(RIL_WORD)) {
727  Next(RIL_WORD);
728  return;
729  }
730  if (BidiDebug(1)) {
731  std::vector<int> textline_order;
732  std::vector<StrongScriptDirection> dirs;
733  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);
734  tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
735  current_paragraph_is_ltr_ ? "ltr" : "rtl");
736  PrintScriptDirs(dirs);
737  tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
738  current_paragraph_is_ltr_ ? "ltr" : "rtl");
739  for (int i : textline_order) {
740  tprintf("%d ", i);
741  }
742  tprintf("\n");
743  }
744 
745  int words_appended = 0;
746  do {
747  int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0);
748  for (int i = 0; i < numSpaces; ++i) {
749  *text += " ";
750  }
751  AppendUTF8WordText(text);
752  words_appended++;
753  if (BidiDebug(2)) {
754  tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
755  }
756  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
757  if (BidiDebug(1)) {
758  tprintf("%d words printed\n", words_appended);
759  }
760  *text += line_separator_;
761  // If we just finished a paragraph, add an extra newline.
762  if (IsAtBeginningOf(RIL_PARA)) {
763  *text += paragraph_separator_;
764  }
765 }
766 
767 void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {
768  ResultIterator it(*this);
769  it.RestartParagraph();
770  it.MoveToLogicalStartOfTextline();
771  if (it.Empty(RIL_WORD)) {
772  return;
773  }
774  do {
775  it.IterateAndAppendUTF8TextlineText(text);
776  } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
777 }
778 
779 bool ResultIterator::BidiDebug(int min_level) const {
780  int debug_level = 1;
781  auto *p = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
783  if (p != nullptr) {
784  debug_level = (int32_t)(*p);
785  }
786  return debug_level >= min_level;
787 }
788 
789 } // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
StrongScriptDirection
Definition: unichar.h:43
@ DIR_MIX
Definition: unichar.h:47
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
@ DIR_NEUTRAL
Definition: unichar.h:44
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:36
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
bool Empty(PageIteratorLevel level) const
void BeginWord(int offset)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
static const int kMinorRunEnd
static const int kMinorRunStart
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
bool Next(PageIteratorLevel level) override
virtual std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * GetRawLSTMTimesteps() const
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
ResultIterator(const LTRResultIterator &resit)
static const int kComplexWord
PARA * para() const
Definition: ocrrow.h:120
WERD_CHOICE * best_choice
Definition: pageres.h:239
const char * BestUTF8(unsigned blob_index, bool in_rtl_context) const
Definition: pageres.h:361
bool UnicharsInReadingOrder() const
Definition: pageres.h:435
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:224
UNICHARSET::Direction SymbolDirection(unsigned blob_index) const
Definition: pageres.h:387
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:222
BLOCK_RES * prev_block() const
Definition: pageres.h:760
ROW_RES * prev_row() const
Definition: pageres.h:757
ROW_RES * row() const
Definition: pageres.h:766
WERD_RES * word() const
Definition: pageres.h:763
BLOCK_RES * block() const
Definition: pageres.h:769
uint8_t space() const
Definition: werd.h:100
ParamsVectors * params()
Definition: ccutil.h:53
std::vector< BoolParam * > bool_params
Definition: params.h:47
std::vector< IntParam * > int_params
Definition: params.h:46