tesseract  5.0.0
tesseract::WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for tesseract::WERD_CHOICE:
tesseract::ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
bool empty () const
 
unsigned length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const std::vector< UNICHAR_ID > & unichar_ids () const
 
UNICHAR_ID unichar_id (unsigned index) const
 
unsigned state (unsigned index) const
 
ScriptPos BlobPosition (unsigned index) const
 
float rating () const
 
float certainty () const
 
float certainty (unsigned index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uint8_t permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (unsigned index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (unsigned index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, unsigned index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uint8_t perm)
 
void set_length (unsigned len)
 
void double_the_size ()
 Make more space in unichar_id_ and fragment_lengths_ arrays. More...
 
void init (unsigned reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, unsigned index)
 
void set_blob_choice (unsigned index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (unsigned index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (unsigned index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (unsigned *start_core, unsigned *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (unsigned start, unsigned end) const
 
void string_and_lengths (std::string *word_str, std::string *word_lengths_str) const
 
std::string debug_string () const
 
bool ContainsAnyNonSpaceDelimited () const
 
bool IsAllSpaces () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
std::string & unichar_string ()
 
const std::string & unichar_string () const
 
const std::string & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word, int debug=0)
 
void SetAllScriptPositions (ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
unsigned TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from tesseract::ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uint8_t permuter)
 
static ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 254 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset)
inline

Definition at line 259 of file ratngs.h.

259  : unicharset_(unicharset) {
260  this->init(8);
261  }
void init(unsigned reserved)
Definition: ratngs.h:382
const UNICHARSET * unicharset() const
Definition: ratngs.h:277

◆ WERD_CHOICE() [2/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset,
int  reserved 
)
inline

Definition at line 262 of file ratngs.h.

262  : unicharset_(unicharset) {
263  this->init(reserved);
264  }

◆ WERD_CHOICE() [3/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter,
const UNICHARSET &  unicharset 
)
inline

Definition at line 265 of file ratngs.h.

267  : unicharset_(&unicharset) {
268  this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);
269  }

◆ WERD_CHOICE() [4/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET &  unicharset 
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not nullptr.

Definition at line 213 of file ratngs.cpp.

214  : unicharset_(&unicharset) {
215  std::vector<UNICHAR_ID> encoding;
216  std::vector<char> lengths;
217  std::string cleaned = unicharset.CleanupString(src_string);
218  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) {
219  lengths.push_back('\0');
220  std::string src_lengths = &lengths[0];
221  this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
222  } else { // There must have been an invalid unichar in the string.
223  this->init(8);
224  this->make_bad();
225  }
226 }
@ NO_PERM
Definition: ratngs.h:232
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:415
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265

◆ WERD_CHOICE() [5/5]

tesseract::WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE &  word)
inline

Definition at line 271 of file ratngs.h.

271  : ELIST_LINK(word), unicharset_(word.unicharset_) {
272  this->init(word.length());
273  this->operator=(word);
274  }
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:499

◆ ~WERD_CHOICE()

tesseract::WERD_CHOICE::~WERD_CHOICE ( )
default

Member Function Documentation

◆ adjust_factor()

float tesseract::WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 286 of file ratngs.h.

286  {
287  return adjust_factor_;
288  }

◆ append_unichar_id()

void tesseract::WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 447 of file ratngs.cpp.

448  {
449  if (length_ == reserved_) {
450  this->double_the_size();
451  }
452  this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty);
453 }
float certainty() const
Definition: ratngs.h:311
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:295
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:424
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:368
float rating() const
Definition: ratngs.h:308

◆ append_unichar_id_space_allocated()

void tesseract::WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 424 of file ratngs.h.

425  {
426  assert(reserved_ > length_);
427  length_++;
428  this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);
429  }
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:340

◆ blob_choices()

BLOB_CHOICE_LIST * tesseract::WERD_CHOICE::blob_choices ( unsigned  index,
MATRIX *  ratings 
) const

Definition at line 274 of file ratngs.cpp.

274  {
275  MATRIX_COORD coord = MatrixCoord(index);
276  BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
277  if (result == nullptr) {
278  result = new BLOB_CHOICE_LIST;
279  ratings->put(coord.col, coord.row, result);
280  }
281  return result;
282 }
MATRIX_COORD MatrixCoord(unsigned index) const
Definition: ratngs.cpp:286

◆ BlobPosition()

ScriptPos tesseract::WERD_CHOICE::BlobPosition ( unsigned  index) const
inline

Definition at line 302 of file ratngs.h.

302  {
303  if (index >= length_) {
304  return SP_NORMAL;
305  }
306  return script_pos_[index];
307  }
@ SP_NORMAL
Definition: ratngs.h:250

◆ certainty() [1/2]

float tesseract::WERD_CHOICE::certainty ( ) const
inline

Definition at line 311 of file ratngs.h.

311  {
312  return certainty_;
313  }

◆ certainty() [2/2]

float tesseract::WERD_CHOICE::certainty ( unsigned  index) const
inline

Definition at line 314 of file ratngs.h.

314  {
315  return certainties_[index];
316  }

◆ contains_unichar_id()

bool tesseract::WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 309 of file ratngs.cpp.

309  {
310  for (unsigned i = 0; i < length_; ++i) {
311  if (unichar_ids_[i] == unichar_id) {
312  return true;
313  }
314  }
315  return false;
316 }

◆ ContainsAnyNonSpaceDelimited()

bool tesseract::WERD_CHOICE::ContainsAnyNonSpaceDelimited ( ) const
inline

Definition at line 484 of file ratngs.h.

484  {
485  for (unsigned i = 0; i < length_; ++i) {
486  if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
487  return true;
488  }
489  }
490  return false;
491  }
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:669

◆ dangerous_ambig_found()

bool tesseract::WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 344 of file ratngs.h.

344  {
345  return dangerous_ambig_found_;
346  }

◆ debug_string()

std::string tesseract::WERD_CHOICE::debug_string ( ) const
inline

Definition at line 475 of file ratngs.h.

475  {
476  std::string word_str;
477  for (unsigned i = 0; i < length_; ++i) {
478  word_str += unicharset_->debug_str(unichar_ids_[i]);
479  word_str += " ";
480  }
481  return word_str;
482  }
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331

◆ DisplaySegmentation()

void tesseract::WERD_CHOICE::DisplaySegmentation ( TWERD *  word)

Definition at line 728 of file ratngs.cpp.

728  {
729  // Number of different colors to draw with.
730  const int kNumColors = 6;
731  static ScrollView *segm_window = nullptr;
732  // Check the state against the static prev_drawn_state.
733  static std::vector<int> prev_drawn_state;
734  bool already_done = prev_drawn_state.size() == length_;
735  if (!already_done) {
736  prev_drawn_state.clear();
737  prev_drawn_state.resize(length_);
738  }
739  for (unsigned i = 0; i < length_; ++i) {
740  if (prev_drawn_state[i] != state_[i]) {
741  already_done = false;
742  }
743  prev_drawn_state[i] = state_[i];
744  }
745  if (already_done || word->blobs.empty()) {
746  return;
747  }
748 
749  // Create the window if needed.
750  if (segm_window == nullptr) {
751  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true);
752  } else {
753  segm_window->Clear();
754  }
755 
756  TBOX bbox;
757  int blob_index = 0;
758  for (unsigned c = 0; c < length_; ++c) {
759  auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
760  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
761  TBLOB *blob = word->blobs[blob_index];
762  bbox += blob->bounding_box();
763  blob->plot(segm_window, color, color);
764  }
765  }
766  segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom());
767  segm_window->Update();
768  segm_window->Wait();
769 }
@ TBOX

◆ double_the_size()

void tesseract::WERD_CHOICE::double_the_size ( )
inline

Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays by doubling reserved_ (or setting it to 1 when it is 0).

Definition at line 368 of file ratngs.h.

368  {
369  if (reserved_ > 0) {
370  reserved_ *= 2;
371  } else {
372  reserved_ = 1;
373  }
374  unichar_ids_.resize(reserved_);
375  script_pos_.resize(reserved_);
376  state_.resize(reserved_);
377  certainties_.resize(reserved_);
378  }

◆ empty()

bool tesseract::WERD_CHOICE::empty ( ) const
inline

Definition at line 280 of file ratngs.h.

280  {
281  return length_ == 0;
282  }

◆ GetNonSuperscriptSpan()

void tesseract::WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 378 of file ratngs.cpp.

378  {
379  int end = length();
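380  while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
381  BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
382  end--;
383  }
384  int start = 0;
385  while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&
386  BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
387  start++;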
388  }
389  *pstart = start;
390  *pend = end;
391 }
@ SP_SUPERSCRIPT
Definition: ratngs.h:250
unsigned length() const
Definition: ratngs.h:283
ScriptPos BlobPosition(unsigned index) const
Definition: ratngs.h:302
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

◆ GetTopScriptID()

int tesseract::WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 631 of file ratngs.cpp.

631  {
632  unsigned max_script = unicharset_->get_script_table_size();
633  std::vector<unsigned> sid(max_script);
634  for (unsigned x = 0; x < length_; ++x) {
635  int script_id = unicharset_->get_script(unichar_id(x));
636  sid[script_id]++;
637  }
638  if (unicharset_->han_sid() != unicharset_->null_sid()) {
639  // Add the Hiragana & Katakana counts to Han and zero them out.
640  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
641  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
642  sid[unicharset_->hiragana_sid()] = 0;
643  }
644  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
645  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
646  sid[unicharset_->katakana_sid()] = 0;
647  }
648  }
649  // Note that high script ID overrides lower one on a tie, thus biasing
650  // towards non-Common script (if sorted that way in unicharset file).
651  unsigned max_sid = 0;
652  for (unsigned x = 1; x < max_script; x++) {
653  if (sid[x] >= sid[max_sid]) {
654  max_sid = x;
655  }
656  }
657  if (sid[max_sid] < length_ / 2) {
658  max_sid = unicharset_->null_sid();
659  }
660  return max_sid;
661 }
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int han_sid() const
Definition: unicharset.h:932
int get_script_table_size() const
Definition: unicharset.h:882
int hiragana_sid() const
Definition: unicharset.h:935
int null_sid() const
Definition: unicharset.h:917
int katakana_sid() const
Definition: unicharset.h:938

◆ has_rtl_unichar_id()

bool tesseract::WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 411 of file ratngs.cpp.

411  {
412  for (unsigned i = 0; i < length_; ++i) {
413  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
414  if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
415  return true;
416  }
417  }
418  return false;
419 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:713

◆ init() [1/2]

void tesseract::WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 238 of file ratngs.cpp.

239  {
240  int src_string_len = strlen(src_string);
241  if (src_string_len == 0) {
242  this->init(8);
243  } else {
244  this->init(src_lengths ? strlen(src_lengths) : src_string_len);
245  length_ = reserved_;
246  int offset = 0;
247  for (unsigned i = 0; i < length_; ++i) {
248  int unichar_length = src_lengths ? src_lengths[i] : 1;
249  unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
250  state_[i] = 1;
251  certainties_[i] = src_certainty;
252  offset += unichar_length;
253  }
254  }
255  adjust_factor_ = 1.0f;
256  rating_ = src_rating;
257  certainty_ = src_certainty;
258  permuter_ = src_permuter;
259  dangerous_ambig_found_ = false;
260 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

◆ init() [2/2]

void tesseract::WERD_CHOICE::init ( unsigned  reserved)
inline

Initializes WERD_CHOICE - reserves `reserved` slots in the unichar_ids_, script_pos_, state_ and certainties_ arrays (or clears them when reserved is 0). Sets length_ to 0 and the other values to default (blank) values.

Definition at line 382 of file ratngs.h.

382  {
383  reserved_ = reserved;
384  if (reserved > 0) {
385  unichar_ids_.resize(reserved);
386  script_pos_.resize(reserved);
387  state_.resize(reserved);
388  certainties_.resize(reserved);
389  } else {
390  unichar_ids_.clear();
391  script_pos_.clear();
392  state_.clear();
393  certainties_.clear();
394  }
395  length_ = 0;
396  adjust_factor_ = 1.0f;
397  rating_ = 0.0;
398  certainty_ = FLT_MAX;
399  min_x_height_ = 0.0f;
400  max_x_height_ = FLT_MAX;
401  permuter_ = NO_PERM;
402  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
403  dangerous_ambig_found_ = false;
404  }

◆ IsAllSpaces()

bool tesseract::WERD_CHOICE::IsAllSpaces ( ) const
inline

Definition at line 493 of file ratngs.h.

493  {
494  for (unsigned i = 0; i < length_; ++i) {
495  if (unichar_ids_[i] != UNICHAR_SPACE) {
496  return false;
497  }
498  }
499  return true;
500  }
@ UNICHAR_SPACE
Definition: unicharset.h:36

◆ length()

unsigned tesseract::WERD_CHOICE::length ( ) const
inline

Definition at line 283 of file ratngs.h.

283  {
284  return length_;
285  }

◆ make_bad()

void tesseract::WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 415 of file ratngs.h.

415  {
416  length_ = 0;
417  rating_ = kBadRating;
418  certainty_ = -FLT_MAX;
419  }
static const float kBadRating
Definition: ratngs.h:256

◆ MatrixCoord()

MATRIX_COORD tesseract::WERD_CHOICE::MatrixCoord ( unsigned  index) const

Definition at line 286 of file ratngs.cpp.

286  {
287  int col = 0;
288  for (unsigned i = 0; i < index; ++i) {
289  col += state_[i];
290  }
291  int row = col + state_[index] - 1;
292  return MATRIX_COORD(col, row);
293 }

◆ max_x_height()

float tesseract::WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 320 of file ratngs.h.

320  {
321  return max_x_height_;
322  }

◆ min_x_height()

float tesseract::WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 317 of file ratngs.h.

317  {
318  return min_x_height_;
319  }

◆ operator+=()

WERD_CHOICE & tesseract::WERD_CHOICE::operator+= ( const WERD_CHOICE &  second)

WERD_CHOICE::operator+=

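Concatenates a second word choice onto the end of this one. The ratings are added, the certainty is the minimum of the two, and the adjust factor is the maximum of the two. If this word's permuter is NO_PERM it takes the second word's permuter; otherwise, if the second word's permuter is not NO_PERM and differs from this one's, the permuter is set to COMPOUND_PERM.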

Definition at line 462 of file ratngs.cpp.

462  {
463  ASSERT_HOST(unicharset_ == second.unicharset_);
464  while (reserved_ < length_ + second.length()) {
465  this->double_the_size();
466  }
467  const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
468  for (unsigned i = 0; i < second.length(); ++i) {
469  unichar_ids_[length_ + i] = other_unichar_ids[i];
470  state_[length_ + i] = second.state_[i];
471  certainties_[length_ + i] = second.certainties_[i];
472  script_pos_[length_ + i] = second.BlobPosition(i);
473  }
474  length_ += second.length();
475  if (second.adjust_factor_ > adjust_factor_) {
476  adjust_factor_ = second.adjust_factor_;
477  }
478  rating_ += second.rating(); // add ratings
479  if (second.certainty() < certainty_) { // take min
480  certainty_ = second.certainty();
481  }
482  if (second.dangerous_ambig_found_) {
483  dangerous_ambig_found_ = true;
484  }
485  if (permuter_ == NO_PERM) {
486  permuter_ = second.permuter();
487  } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) {
488  permuter_ = COMPOUND_PERM;
489  }
490  return *this;
491 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
@ COMPOUND_PERM
Definition: ratngs.h:244

◆ operator=()

WERD_CHOICE & tesseract::WERD_CHOICE::operator= ( const WERD_CHOICE &  source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 499 of file ratngs.cpp.

499  {
500  while (reserved_ < source.length()) {
501  this->double_the_size();
502  }
503 
504  unicharset_ = source.unicharset_;
505  const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
506  for (unsigned i = 0; i < source.length(); ++i) {
507  unichar_ids_[i] = other_unichar_ids[i];
508  state_[i] = source.state_[i];
509  certainties_[i] = source.certainties_[i];
510  script_pos_[i] = source.BlobPosition(i);
511  }
512  length_ = source.length();
513  adjust_factor_ = source.adjust_factor_;
514  rating_ = source.rating();
515  certainty_ = source.certainty();
516  min_x_height_ = source.min_x_height();
517  max_x_height_ = source.max_x_height();
518  permuter_ = source.permuter();
519  dangerous_ambig_found_ = source.dangerous_ambig_found_;
520  return *this;
521 }

◆ permuter()

uint8_t tesseract::WERD_CHOICE::permuter ( ) const
inline

Definition at line 327 of file ratngs.h.

327  {
328  return permuter_;
329  }

◆ permuter_name() [1/2]

const char * tesseract::WERD_CHOICE::permuter_name ( ) const

Definition at line 267 of file ratngs.cpp.

267  {
268  return kPermuterTypeNames[permuter_];
269 }

◆ permuter_name() [2/2]

const char * tesseract::WERD_CHOICE::permuter_name ( uint8_t  permuter)
static

Definition at line 189 of file ratngs.cpp.

189  {
190  return kPermuterTypeNames[permuter];
191 }
uint8_t permuter() const
Definition: ratngs.h:327

◆ print() [1/2]

void tesseract::WERD_CHOICE::print ( ) const
inline

Definition at line 557 of file ratngs.h.

557  {
558  this->print("");
559  }
void print() const
Definition: ratngs.h:557

◆ print() [2/2]

void tesseract::WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 689 of file ratngs.cpp.

689  {
690  tprintf("%s : ", msg);
691  for (unsigned i = 0; i < length_; ++i) {
692  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
693  }
694  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
695  adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
696  tprintf("pos");
697  for (unsigned i = 0; i < length_; ++i) {
698  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
699  }
700  tprintf("\nstr");
701  for (unsigned i = 0; i < length_; ++i) {
702  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
703  }
704  tprintf("\nstate:");
705  for (unsigned i = 0; i < length_; ++i) {
706  tprintf("\t%d ", state_[i]);
707  }
708  tprintf("\nC");
709  for (unsigned i = 0; i < length_; ++i) {
710  tprintf("\t%.3f", certainties_[i]);
711  }
712  tprintf("\n");
713 }
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:193
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ print_state()

void tesseract::WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 716 of file ratngs.cpp.

716  {
717  tprintf("%s", msg);
718  for (unsigned i = 0; i < length_; ++i) {
719  tprintf(" %d", state_[i]);
720  }
721  tprintf("\n");
722 }

◆ punct_stripped()

void tesseract::WERD_CHOICE::punct_stripped ( unsigned *  start,
unsigned *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 367 of file ratngs.cpp.

367  {
368  *start = 0;
369  *end = length();
370  while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
371  (*start)++;
372  }
373  while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
374  (*end)--;
375  }
376 }

◆ rating()

float tesseract::WERD_CHOICE::rating ( ) const
inline

Definition at line 308 of file ratngs.h.

308  {
309  return rating_;
310  }

◆ remove_last_unichar_id()

void tesseract::WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 451 of file ratngs.h.

451  {
452  --length_;
453  }

◆ remove_unichar_id()

void tesseract::WERD_CHOICE::remove_unichar_id ( unsigned  index)
inline

Definition at line 454 of file ratngs.h.

454  {
455  this->remove_unichar_ids(index, 1);
456  }
void remove_unichar_ids(unsigned index, int num)
Definition: ratngs.cpp:325

◆ remove_unichar_ids()

void tesseract::WERD_CHOICE::remove_unichar_ids ( unsigned  start,
int  num 
)

remove_unichar_ids

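Removes num unichar ids starting at index start from unichar_ids_ (and the parallel script_pos_, state_ and certainties_ arrays) and updates length_ to reflect this change. The states of the removed entries are first accumulated into a neighbouring entry to account for the merged blobs. Note: this function does not modify rating_ and certainty_.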

Definition at line 325 of file ratngs.cpp.

325  {
326  ASSERT_HOST(start + num <= length_);
327  // Accumulate the states to account for the merged blobs.
328  for (int i = 0; i < num; ++i) {
329  if (start > 0) {
330  state_[start - 1] += state_[start + i];
331  } else if (start + num < length_) {
332  state_[start + num] += state_[start + i];
333  }
334  }
335  for (unsigned i = start; i + num < length_; ++i) {
336  unichar_ids_[i] = unichar_ids_[i + num];
337  script_pos_[i] = script_pos_[i + num];
338  state_[i] = state_[i + num];
339  certainties_[i] = certainties_[i + num];
340  }
341  length_ -= num;
342 }

◆ reverse_and_mirror_unichar_ids()

void tesseract::WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 349 of file ratngs.cpp.

349  {
350  for (unsigned i = 0; i < length_ / 2; ++i) {
351  UNICHAR_ID tmp_id = unichar_ids_[i];
352  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
353  unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
354  }
355  if (length_ % 2 != 0) {
356  unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]);
357  }
358 }
int UNICHAR_ID
Definition: unichar.h:36
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:722

◆ ScriptPositionOf()

ScriptPos tesseract::WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET &  unicharset,
const TBOX &  blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 599 of file ratngs.cpp.

600  {
601  ScriptPos retval = tesseract::SP_NORMAL;
602  int top = blob_box.top();
603  int bottom = blob_box.bottom();
604  int min_bottom, max_bottom, min_top, max_top;
605  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
606 
607  int sub_thresh_top = min_top - kMinSubscriptOffset;
608  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
609  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
610  if (bottom <= kMaxDropCapBottom) {
611  retval = tesseract::SP_DROPCAP;
612  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
613  retval = tesseract::SP_SUBSCRIPT;
614  } else if (bottom > sup_thresh_bot) {
615  retval = tesseract::SP_SUPERSCRIPT;
616  }
617 
618  if (print_debug) {
619  const char *pos = ScriptPosToString(retval);
620  tprintf(
621  "%s Character %s[bot:%d top: %d] "
622  "bot_range[%d,%d] top_range[%d, %d] "
623  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
624  pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top,
625  max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot);
626  }
627  return retval;
628 }
const int kMaxDropCapBottom
Definition: ratngs.cpp:43
@ SP_SUBSCRIPT
Definition: ratngs.h:250
@ SP_DROPCAP
Definition: ratngs.h:250
const int kMinSubscriptOffset
Definition: ratngs.cpp:39
const int kMinSuperscriptOffset
Definition: ratngs.cpp:41
const int kBlnBaselineOffset
Definition: normalis.h:34
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586

◆ set_adjust_factor()

void tesseract::WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 289 of file ratngs.h.

289  {
290  adjust_factor_ = factor;
291  }

◆ set_blob_choice()

void tesseract::WERD_CHOICE::set_blob_choice ( unsigned  index,
int  blob_count,
const BLOB_CHOICE *  blob_choice 
)

Definition at line 297 of file ratngs.cpp.

297  {
298  unichar_ids_[index] = blob_choice->unichar_id();
299  script_pos_[index] = tesseract::SP_NORMAL;
300  state_[index] = blob_count;
301  certainties_[index] = blob_choice->certainty();
302 }

◆ set_certainty()

void tesseract::WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 353 of file ratngs.h.

353  {
354  certainty_ = new_val;
355  }

◆ set_dangerous_ambig_found_()

void tesseract::WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 347 of file ratngs.h.

347  {
348  dangerous_ambig_found_ = value;
349  }

◆ set_length()

void tesseract::WERD_CHOICE::set_length ( unsigned  len)
inline

Definition at line 362 of file ratngs.h.

362  {
363  ASSERT_HOST(reserved_ >= len);
364  length_ = len;
365  }

◆ set_permuter()

void tesseract::WERD_CHOICE::set_permuter ( uint8_t  perm)
inline

Definition at line 356 of file ratngs.h.

356  {
357  permuter_ = perm;
358  }

◆ set_rating()

void tesseract::WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 350 of file ratngs.h.

350  {
351  rating_ = new_val;
352  }

◆ set_unichar_id() [1/2]

void tesseract::WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
unsigned  index 
)
inline

Definition at line 433 of file ratngs.h.

434  {
435  assert(index < length_);
436  unichar_ids_[index] = unichar_id;
437  state_[index] = blob_count;
438  certainties_[index] = certainty;
439  script_pos_[index] = SP_NORMAL;
440  rating_ += rating;
441  if (certainty < certainty_) {
442  certainty_ = certainty;
443  }
444  }

◆ set_unichar_id() [2/2]

void tesseract::WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
unsigned  index 
)
inline

Definition at line 340 of file ratngs.h.

340  {
341  assert(index < length_);
342  unichar_ids_[index] = unichar_id;
343  }

◆ set_unichars_in_script_order()

bool tesseract::WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 505 of file ratngs.h.

505  {
506  return unichars_in_script_order_ = in_script_order;
507  }

◆ set_x_heights()

void tesseract::WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 323 of file ratngs.h.

323  {
324  min_x_height_ = min_height;
325  max_x_height_ = max_height;
326  }

◆ SetAllScriptPositions()

void tesseract::WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 592 of file ratngs.cpp.

592  {
593  for (unsigned i = 0; i < length_; ++i) {
594  script_pos_[i] = position;
595  }
596 }

◆ SetScriptPositions()

void tesseract::WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD word,
int  debug = 0 
)

Definition at line 528 of file ratngs.cpp.

528  {
529  // Initialize to normal.
530  for (unsigned i = 0; i < length_; ++i) {
531  script_pos_[i] = tesseract::SP_NORMAL;
532  }
533  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
534  return;
535  }
536 
537  unsigned position_counts[4] = {0, 0, 0, 0};
538 
539  int chunk_index = 0;
540  for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
541  TBLOB *tblob = word->blobs[chunk_index];
542  int uni_id = unichar_id(blob_index);
543  TBOX blob_box = tblob->bounding_box();
544  if (!state_.empty()) {
545  for (int i = 1; i < state_[blob_index]; ++i) {
546  ++chunk_index;
547  tblob = word->blobs[chunk_index];
548  blob_box += tblob->bounding_box();
549  }
550  }
551  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id);
552  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
553  script_pos_[blob_index] = tesseract::SP_NORMAL;
554  }
555  position_counts[script_pos_[blob_index]]++;
556  }
557  // If almost everything looks like a superscript or subscript,
558  // we most likely just got the baseline wrong.
559  if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||
560  4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {
561  if (debug >= 2) {
562  tprintf(
563  "Most characters of %s are subscript or superscript.\n"
564  "That seems wrong, so I'll assume we got the baseline wrong\n",
565  unichar_string().c_str());
566  }
567  for (unsigned i = 0; i < length_; i++) {
568  ScriptPos sp = script_pos_[i];
569  if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
570  ASSERT_HOST(position_counts[sp] > 0);
571  position_counts[sp]--;
572  position_counts[tesseract::SP_NORMAL]++;
573  script_pos_[i] = tesseract::SP_NORMAL;
574  }
575  }
576  }
577 
578  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
579  tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
580  int chunk_index = 0;
581  for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {
582  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
583  TBLOB *tblob = word->blobs[chunk_index];
584  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
585  }
586  chunk_index += state_.empty() ? 1 : state_[blob_index];
587  }
588  }
589 }
unsigned TotalOfStates() const
Definition: ratngs.cpp:676
static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:599
std::string & unichar_string()
Definition: ratngs.h:515

◆ shallow_copy()

WERD_CHOICE tesseract::WERD_CHOICE::shallow_copy ( unsigned  start,
unsigned  end 
) const

Definition at line 393 of file ratngs.cpp.

393  {
394  ASSERT_HOST(start <= length_);
395  ASSERT_HOST(end <= length_);
396  if (end < start) {
397  end = start;
398  }
399  WERD_CHOICE retval(unicharset_, end - start);
400  for (auto i = start; i < end; i++) {
401  retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
402  }
403  return retval;
404 }
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:259

◆ state()

unsigned tesseract::WERD_CHOICE::state ( unsigned  index) const
inline

Definition at line 299 of file ratngs.h.

299  {
300  return state_[index];
301  }

◆ string_and_lengths()

void tesseract::WERD_CHOICE::string_and_lengths ( std::string *  word_str,
std::string *  word_lengths_str 
) const

string_and_lengths

Populates the given word_str with unichars from unichar_ids, and word_lengths_str with the corresponding unichar lengths.

Definition at line 427 of file ratngs.cpp.

427  {
428  *word_str = "";
429  if (word_lengths_str != nullptr) {
430  *word_lengths_str = "";
431  }
432  for (unsigned i = 0; i < length_; ++i) {
433  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
434  *word_str += ch;
435  if (word_lengths_str != nullptr) {
436  *word_lengths_str += (char)strlen(ch);
437  }
438  }
439 }
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:287

◆ TotalOfStates()

unsigned tesseract::WERD_CHOICE::TotalOfStates ( ) const

Definition at line 676 of file ratngs.cpp.

676  {
677  unsigned total_chunks = 0;
678  for (unsigned i = 0; i < length_; ++i) {
679  total_chunks += state_[i];
680  }
681  return total_chunks;
682 }

◆ unichar_id()

UNICHAR_ID tesseract::WERD_CHOICE::unichar_id ( unsigned  index) const
inline

Definition at line 295 of file ratngs.h.

295  {
296  assert(index < length_);
297  return unichar_ids_[index];
298  }

◆ unichar_ids()

const std::vector<UNICHAR_ID>& tesseract::WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 292 of file ratngs.h.

292  {
293  return unichar_ids_;
294  }

◆ unichar_lengths()

const std::string& tesseract::WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 529 of file ratngs.h.

529  {
530  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
531  return unichar_lengths_;
532  }
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427

◆ unichar_string() [1/2]

std::string& tesseract::WERD_CHOICE::unichar_string ( )
inline

Definition at line 515 of file ratngs.h.

515  {
516  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
517  return unichar_string_;
518  }

◆ unichar_string() [2/2]

const std::string& tesseract::WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 522 of file ratngs.h.

522  {
523  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
524  return unichar_string_;
525  }

◆ unichars_in_script_order()

bool tesseract::WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 509 of file ratngs.h.

509  {
510  return unichars_in_script_order_;
511  }

◆ unicharset()

const UNICHARSET* tesseract::WERD_CHOICE::unicharset ( ) const
inline

Definition at line 277 of file ratngs.h.

277  {
278  return unicharset_;
279  }

◆ UpdateStateForSplit()

void tesseract::WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 664 of file ratngs.cpp.

664  {
665  int total_chunks = 0;
666  for (unsigned i = 0; i < length_; ++i) {
667  total_chunks += state_[i];
668  if (total_chunks > blob_position) {
669  ++state_[i];
670  return;
671  }
672  }
673 }

Member Data Documentation

◆ kBadRating

const float tesseract::WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 256 of file ratngs.h.


The documentation for this class was generated from the following files: