tesseract  5.0.0
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
tesseract::LTRResultIterator tesseract::PageIterator tesseract::MutableIterator

Public Member Functions

 ~ResultIterator () override=default
 
void Begin () override
 
bool Next (PageIteratorLevel level) override
 
bool IsAtBeginningOf (PageIteratorLevel level) const override
 
bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const override
 
int BlanksBeforeWord () const
 
virtual char * GetUTF8Text (PageIteratorLevel level) const
 
virtual std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * GetRawLSTMTimesteps () const
 
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices () const
 
bool ParagraphIsLtr () const
 
- Public Member Functions inherited from tesseract::LTRResultIterator
 LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
 ~LTRResultIterator () override
 
char * GetUTF8Text (PageIteratorLevel level) const
 
void SetLineSeparator (const char *new_line)
 
void SetParagraphSeparator (const char *new_para)
 
float Confidence (PageIteratorLevel level) const
 
const char * WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
 
const char * WordRecognitionLanguage () const
 
StrongScriptDirection WordDirection () const
 
bool WordIsFromDictionary () const
 
int BlanksBeforeWord () const
 
bool WordIsNumeric () const
 
bool HasBlamerInfo () const
 
const void * GetParamsTrainingBundle () const
 
const char * GetBlamerDebug () const
 
const char * GetBlamerMisadaptionDebug () const
 
bool HasTruthString () const
 
bool EquivalentToTruth (const char *str) const
 
char * WordTruthUTF8Text () const
 
char * WordNormedUTF8Text () const
 
const char * WordLattice (int *lattice_size) const
 
bool SymbolIsSuperscript () const
 
bool SymbolIsSubscript () const
 
bool SymbolIsDropcap () const
 
- Public Member Functions inherited from tesseract::PageIterator
 PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~PageIterator ()
 
 PageIterator (const PageIterator &src)
 
const PageIterator & operator= (const PageIterator &src)
 
bool PositionedAtSameWord (const PAGE_RES_IT *other) const
 
virtual void RestartParagraph ()
 
bool IsWithinFirstTextlineOfParagraph () const
 
virtual void RestartRow ()
 
int Cmp (const PageIterator &other) const
 
void SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots)
 
bool BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBox (PageIteratorLevel level, int padding, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool Empty (PageIteratorLevel level) const
 
PolyBlockType BlockType () const
 
Pta * BlockPolygon () const
 
Pix * GetBinaryImage (PageIteratorLevel level) const
 
Pix * GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
 
bool Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
 
void RowAttributes (float *row_height, float *descenders, float *ascenders) const
 
void Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
 
void ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const
 
bool SetWordBlamerBundle (BlamerBundle *blamer_bundle)
 

Static Public Member Functions

static ResultIterator * StartOfParagraph (const LTRResultIterator &resit)
 
static void CalculateTextlineOrder (bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
 

Static Public Attributes

static const int kMinorRunStart = -1
 
static const int kMinorRunEnd = -2
 
static const int kComplexWord = -3
 

Protected Member Functions

 ResultIterator (const LTRResultIterator &resit)
 
- Protected Member Functions inherited from tesseract::PageIterator
void BeginWord (int offset)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LTRResultIterator
const char * line_separator_
 
const char * paragraph_separator_
 
- Protected Attributes inherited from tesseract::PageIterator
PAGE_RES * page_res_
 
Tesseract * tesseract_
 
PAGE_RES_IT * it_
 
WERD * word_
 
int word_length_
 
int blob_index_
 
C_BLOB_IT * cblob_it_
 
bool include_upper_dots_
 
bool include_lower_dots_
 
int scale_
 
int scaled_yres_
 
int rect_left_
 
int rect_top_
 
int rect_width_
 
int rect_height_
 

Detailed Description

Definition at line 34 of file resultiterator.h.

Constructor & Destructor Documentation

◆ ~ResultIterator()

tesseract::ResultIterator::~ResultIterator ( )
override default

ResultIterator is copy constructible! The default copy constructor works just fine for us.

◆ ResultIterator()

tesseract::ResultIterator::ResultIterator ( const LTRResultIterator & resit)
explicit protected

We presume the data associated with the given iterator will outlive us. NB: This is protected because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 37 of file resultiterator.cpp.

37  : LTRResultIterator(resit) {
38  in_minor_direction_ = false;
39  at_beginning_of_minor_run_ = false;
40  preserve_interword_spaces_ = false;
41 
42  auto *p = ParamUtils::FindParam<BoolParam>(
43  "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
44  if (p != nullptr) {
45  preserve_interword_spaces_ = (bool)(*p);
46  }
47 
48  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
49  MoveToLogicalStartOfTextline();
50 }
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:36
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
ParamsVectors * params()
Definition: ccutil.h:53
std::vector< BoolParam * > bool_params
Definition: params.h:47

Member Function Documentation

◆ Begin()

void tesseract::ResultIterator::Begin ( )
override virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 474 of file resultiterator.cpp.

474  {
476  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
477  in_minor_direction_ = false;
478  at_beginning_of_minor_run_ = false;
479  MoveToLogicalStartOfTextline();
480 }

◆ BlanksBeforeWord()

int tesseract::ResultIterator::BlanksBeforeWord ( ) const

Definition at line 639 of file resultiterator.cpp.

639  {
640  if (CurrentParagraphIsLtr()) {
642  }
643  return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
644 }
bool IsAtBeginningOf(PageIteratorLevel level) const override

◆ CalculateTextlineOrder()

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const std::vector< StrongScriptDirection > &  word_dirs,
std::vector< int > *  reading_order 
)
static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values:

- kMinorRunStart: Start of minor direction text.
- kMinorRunEnd: End of minor direction text.
- kComplexWord: The next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

- Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
- Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
- Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 285 of file resultiterator.cpp.

287  {
288  reading_order->clear();
289  if (word_dirs.empty()) {
290  return;
291  }
292 
293  // Take all of the runs of minor direction words and insert them
294  // in reverse order.
295  int minor_direction, major_direction, major_step, start, end;
296  if (paragraph_is_ltr) {
297  start = 0;
298  end = word_dirs.size();
299  major_step = 1;
300  major_direction = DIR_LEFT_TO_RIGHT;
301  minor_direction = DIR_RIGHT_TO_LEFT;
302  } else {
303  start = word_dirs.size() - 1;
304  end = -1;
305  major_step = -1;
306  major_direction = DIR_RIGHT_TO_LEFT;
307  minor_direction = DIR_LEFT_TO_RIGHT;
308  // Special rule: if there are neutral words at the right most side
309  // of a line adjacent to a left-to-right word in the middle of the
310  // line, we interpret the end of the line as a single LTR sequence.
311  if (word_dirs[start] == DIR_NEUTRAL) {
312  int neutral_end = start;
313  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
314  neutral_end--;
315  }
316  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
317  // LTR followed by neutrals.
318  // Scan for the beginning of the minor left-to-right run.
319  int left = neutral_end;
320  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
321  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
322  left = i;
323  }
324  }
325  reading_order->push_back(kMinorRunStart);
326  for (unsigned i = left; i < word_dirs.size(); i++) {
327  reading_order->push_back(i);
328  if (word_dirs[i] == DIR_MIX) {
329  reading_order->push_back(kComplexWord);
330  }
331  }
332  reading_order->push_back(kMinorRunEnd);
333  start = left - 1;
334  }
335  }
336  }
337  for (int i = start; i != end;) {
338  if (word_dirs[i] == minor_direction) {
339  int j = i;
340  while (j != end && word_dirs[j] != major_direction) {
341  j += major_step;
342  }
343  if (j == end) {
344  j -= major_step;
345  }
346  while (j != i && word_dirs[j] != minor_direction) {
347  j -= major_step;
348  }
349  // [j..i] is a minor direction run.
350  reading_order->push_back(kMinorRunStart);
351  for (int k = j; k != i; k -= major_step) {
352  reading_order->push_back(k);
353  }
354  reading_order->push_back(i);
355  reading_order->push_back(kMinorRunEnd);
356  i = j + major_step;
357  } else {
358  reading_order->push_back(i);
359  if (word_dirs[i] == DIR_MIX) {
360  reading_order->push_back(kComplexWord);
361  }
362  i += major_step;
363  }
364  }
365 }
@ DIR_MIX
Definition: unichar.h:47
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:45
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:46
@ DIR_NEUTRAL
Definition: unichar.h:44
static const int kMinorRunEnd
static const int kMinorRunStart
static const int kComplexWord

◆ GetBestLSTMSymbolChoices()

std::vector< std::vector< std::pair< const char *, float > > > * tesseract::ResultIterator::GetBestLSTMSymbolChoices ( ) const
virtual

Definition at line 698 of file resultiterator.cpp.

699  {
700  if (it_->word() != nullptr) {
701  return &it_->word()->CTC_symbol_choices;
702  } else {
703  return nullptr;
704  }
705 }
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:224
WERD_RES * word() const
Definition: pageres.h:763

◆ GetRawLSTMTimesteps()

std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * tesseract::ResultIterator::GetRawLSTMTimesteps ( ) const
virtual

Returns the LSTM choices for every LSTM timestep for the current word.

Definition at line 690 of file resultiterator.cpp.

690  {
691  if (it_->word() != nullptr) {
692  return &it_->word()->segmented_timesteps;
693  } else {
694  return nullptr;
695  }
696 }
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:222

◆ GetUTF8Text()

char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const
virtual

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

Definition at line 650 of file resultiterator.cpp.

650  {
651  if (it_->word() == nullptr) {
652  return nullptr; // Already at the end!
653  }
654  std::string text;
655  switch (level) {
656  case RIL_BLOCK: {
657  ResultIterator pp(*this);
658  do {
659  pp.AppendUTF8ParagraphText(&text);
660  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
661  } break;
662  case RIL_PARA:
663  AppendUTF8ParagraphText(&text);
664  break;
665  case RIL_TEXTLINE: {
666  ResultIterator it(*this);
667  it.MoveToLogicalStartOfTextline();
668  it.IterateAndAppendUTF8TextlineText(&text);
669  } break;
670  case RIL_WORD:
671  AppendUTF8WordText(&text);
672  break;
673  case RIL_SYMBOL: {
674  bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
675  if (at_beginning_of_minor_run_) {
676  text += reading_direction_is_ltr ? kLRM : kRLM;
677  }
678  text = it_->word()->BestUTF8(blob_index_, false);
679  if (IsAtFinalSymbolOfWord()) {
680  AppendSuffixMarks(&text);
681  }
682  } break;
683  }
684  int length = text.length() + 1;
685  char *result = new char[length];
686  strncpy(result, text.c_str(), length);
687  return result;
688 }
ResultIterator(const LTRResultIterator &resit)
const char * BestUTF8(unsigned blob_index, bool in_rtl_context) const
Definition: pageres.h:361
BLOCK_RES * block() const
Definition: pageres.h:769

◆ IsAtBeginningOf()

bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const
override virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to PageIterator's left-to-right, top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.

Reimplemented from tesseract::PageIterator.

Definition at line 565 of file resultiterator.cpp.

565  {
566  if (it_->block() == nullptr) {
567  return false; // Already at the end!
568  }
569  if (it_->word() == nullptr) {
570  return true; // In an image block.
571  }
572  if (level == RIL_SYMBOL) {
573  return true; // Always at beginning of a symbol.
574  }
575 
576  bool at_word_start = IsAtFirstSymbolOfWord();
577  if (level == RIL_WORD) {
578  return at_word_start;
579  }
580 
581  ResultIterator line_start(*this);
582  // move to the first word in the line...
583  line_start.MoveToLogicalStartOfTextline();
584 
585  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
586  if (level == RIL_TEXTLINE) {
587  return at_textline_start;
588  }
589 
590  // now we move to the left-most word...
591  line_start.RestartRow();
592  bool at_block_start =
593  at_textline_start && line_start.it_->block() != line_start.it_->prev_block();
594  if (level == RIL_BLOCK) {
595  return at_block_start;
596  }
597 
598  bool at_para_start =
599  at_block_start || (at_textline_start && line_start.it_->row()->row->para() !=
600  line_start.it_->prev_row()->row->para());
601  if (level == RIL_PARA) {
602  return at_para_start;
603  }
604 
605  ASSERT_HOST(false); // shouldn't happen.
606  return false;
607 }
#define ASSERT_HOST(x)
Definition: errcode.h:59

◆ IsAtFinalElement()

bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const
override virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 614 of file resultiterator.cpp.

614  {
615  if (Empty(element)) {
616  return true; // Already at the end!
617  }
618  // The result is true if we step forward by element and find we are
619  // at the the end of the page or at beginning of *all* levels in:
620  // [level, element).
621  // When there is more than one level difference between element and level,
622  // we could for instance move forward one symbol and still be at the first
623  // word on a line, so we also have to be at the first symbol in a word.
624  ResultIterator next(*this);
625  next.Next(element);
626  if (next.Empty(element)) {
627  return true; // Reached the end of the page.
628  }
629  while (element > level) {
630  element = static_cast<PageIteratorLevel>(element - 1);
631  if (!next.IsAtBeginningOf(element)) {
632  return false;
633  }
634  }
635  return true;
636 }
bool Empty(PageIteratorLevel level) const

◆ Next()

bool tesseract::ResultIterator::Next ( PageIteratorLevel  level)
override virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 482 of file resultiterator.cpp.

482  {
483  if (it_->block() == nullptr) {
484  return false; // already at end!
485  }
486  switch (level) {
487  case RIL_BLOCK: // explicit fall-through
488  case RIL_PARA: // explicit fall-through
489  case RIL_TEXTLINE:
490  if (!PageIterator::Next(level)) {
491  return false;
492  }
494  // if we've advanced to a new paragraph,
495  // recalculate current_paragraph_is_ltr_
496  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
497  }
498  in_minor_direction_ = false;
499  MoveToLogicalStartOfTextline();
500  return it_->block() != nullptr;
501  case RIL_SYMBOL: {
502  std::vector<int> blob_order;
503  CalculateBlobOrder(&blob_order);
504  unsigned next_blob = 0;
505  while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
506  next_blob++;
507  }
508  next_blob++;
509  if (next_blob < blob_order.size()) {
510  // we're in the same word; simply advance one blob.
511  BeginWord(blob_order[next_blob]);
512  at_beginning_of_minor_run_ = false;
513  return true;
514  }
515  level = RIL_WORD; // we've fallen through to the next word.
516  }
517  // Fall through.
518  case RIL_WORD: // explicit fall-through.
519  {
520  if (it_->word() == nullptr) {
521  return Next(RIL_BLOCK);
522  }
523  std::vector<int> word_indices;
524  int this_word_index = LTRWordIndex();
525  CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
526  int final_real_index = word_indices.size() - 1;
527  while (final_real_index > 0 && word_indices[final_real_index] < 0) {
528  final_real_index--;
529  }
530  for (int i = 0; i < final_real_index; i++) {
531  if (word_indices[i] == this_word_index) {
532  int j = i + 1;
533  for (; j < final_real_index && word_indices[j] < 0; j++) {
534  if (word_indices[j] == kMinorRunStart) {
535  in_minor_direction_ = true;
536  }
537  if (word_indices[j] == kMinorRunEnd) {
538  in_minor_direction_ = false;
539  }
540  }
541  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
542  // awesome, we move to word_indices[j]
543  if (BidiDebug(3)) {
544  tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
545  }
547  for (int k = 0; k < word_indices[j]; k++) {
549  }
550  MoveToLogicalStartOfWord();
551  return true;
552  }
553  }
554  if (BidiDebug(3)) {
555  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
556  }
557  // we're going off the end of the text line.
558  return Next(RIL_TEXTLINE);
559  }
560  }
561  ASSERT_HOST(false); // shouldn't happen.
562  return false;
563 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
void BeginWord(int offset)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
bool Next(PageIteratorLevel level) override

◆ ParagraphIsLtr()

bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 56 of file resultiterator.cpp.

56  {
57  return current_paragraph_is_ltr_;
58 }

◆ StartOfParagraph()

ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator & resit)
static

Definition at line 52 of file resultiterator.cpp.

52  {
53  return new ResultIterator(resit);
54 }

Member Data Documentation

◆ kComplexWord

const int tesseract::ResultIterator::kComplexWord = -3
static

Definition at line 138 of file resultiterator.h.

◆ kMinorRunEnd

const int tesseract::ResultIterator::kMinorRunEnd = -2
static

Definition at line 137 of file resultiterator.h.

◆ kMinorRunStart

const int tesseract::ResultIterator::kMinorRunStart = -1
static

Definition at line 136 of file resultiterator.h.


The documentation for this class was generated from the following files: