tesseract  5.0.0
tesseract::TrainingSampleSet Class Reference

#include <trainingsampleset.h>

Public Member Functions

 TrainingSampleSet (const FontInfoTable &fontinfo_table)
 
 ~TrainingSampleSet ()
 
bool Serialize (FILE *fp) const
 
bool DeSerialize (bool swap, FILE *fp)
 
int num_samples () const
 
int num_raw_samples () const
 
int NumFonts () const
 
const UNICHARSET & unicharset () const
 
int charsetsize () const
 
const FontInfoTable & fontinfo_table () const
 
void LoadUnicharset (const char *filename)
 
int AddSample (const char *unichar, TrainingSample *sample)
 
void AddSample (int unichar_id, TrainingSample *sample)
 
int NumClassSamples (int font_id, int class_id, bool randomize) const
 
const TrainingSample * GetSample (int index) const
 
const TrainingSample * GetSample (int font_id, int class_id, int index) const
 
TrainingSample * MutableSample (int font_id, int class_id, int index)
 
std::string SampleToString (const TrainingSample &sample) const
 
const BitVector & GetCloudFeatures (int font_id, int class_id) const
 
const std::vector< int > & GetCanonicalFeatures (int font_id, int class_id) const
 
float UnicharDistance (const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
 
float ClusterDistance (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
 
float ComputeClusterDistance (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
 
int ReliablySeparable (int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
 
int GlobalSampleIndex (int font_id, int class_id, int index) const
 
const TrainingSample * GetCanonicalSample (int font_id, int class_id) const
 
float GetCanonicalDist (int font_id, int class_id) const
 
TrainingSample * mutable_sample (int index)
 
TrainingSample * extract_sample (int index)
 
void IndexFeatures (const IntFeatureSpace &feature_space)
 
void KillSample (TrainingSample *sample)
 
void DeleteDeadSamples ()
 
void OrganizeByFontAndClass ()
 
void SetupFontIdMap ()
 
void ComputeCanonicalSamples (const IntFeatureMap &map, bool debug)
 
void ReplicateAndRandomizeSamples ()
 
void ComputeCanonicalFeatures ()
 
void ComputeCloudFeatures (int feature_space_size)
 
void AddAllFontsForClass (int class_id, Shape *shape) const
 
void DisplaySamplesWithFeature (int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
 

Detailed Description

Definition at line 41 of file trainingsampleset.h.

Constructor & Destructor Documentation

◆ TrainingSampleSet()

tesseract::TrainingSampleSet::TrainingSampleSet ( const FontInfoTable &  fontinfo_table)
explicit

Definition at line 86 of file trainingsampleset.cpp.

87  : num_raw_samples_(0)
88  , unicharset_size_(0)
89  , font_class_array_(nullptr)
90  , fontinfo_table_(font_table) {}

◆ ~TrainingSampleSet()

tesseract::TrainingSampleSet::~TrainingSampleSet ( )

Definition at line 92 of file trainingsampleset.cpp.

92  {
93  for (auto sample : samples_) {
94  delete sample;
95  }
96  delete font_class_array_;
97 }

Member Function Documentation

◆ AddAllFontsForClass()

void tesseract::TrainingSampleSet::AddAllFontsForClass ( int  class_id,
Shape *  shape 
) const

Definition at line 789 of file trainingsampleset.cpp.

789  {
790  for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
791  const int font_id = font_id_map_.CompactToSparse(f);
792  shape->AddToShape(class_id, font_id);
793  }
794 }
int CompactSize() const
Definition: indexmapbidi.h:63
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:55

◆ AddSample() [1/2]

int tesseract::TrainingSampleSet::AddSample ( const char *  unichar,
TrainingSample *  sample 
)

Definition at line 170 of file trainingsampleset.cpp.

170  {
171  if (!unicharset_.contains_unichar(unichar)) {
172  unicharset_.unichar_insert(unichar);
173  if (unicharset_.size() > MAX_NUM_CLASSES) {
174  tprintf(
175  "Error: Size of unicharset in TrainingSampleSet::AddSample is "
176  "greater than MAX_NUM_CLASSES\n");
177  return -1;
178  }
179  }
180  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
181  AddSample(char_id, sample);
182  return char_id;
183 }
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UNICHAR_ID
Definition: unichar.h:36
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
int AddSample(const char *unichar, TrainingSample *sample)

◆ AddSample() [2/2]

void tesseract::TrainingSampleSet::AddSample ( int  unichar_id,
TrainingSample *  sample 
)

Definition at line 187 of file trainingsampleset.cpp.

187  {
188  sample->set_class_id(unichar_id);
189  samples_.push_back(sample);
190  num_raw_samples_ = samples_.size();
191  unicharset_size_ = unicharset_.size();
192 }

◆ charsetsize()

int tesseract::TrainingSampleSet::charsetsize ( ) const
inline

Definition at line 65 of file trainingsampleset.h.

65  {
66  return unicharset_size_;
67  }

◆ ClusterDistance()

float tesseract::TrainingSampleSet::ClusterDistance ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap &  feature_map 
)

Definition at line 337 of file trainingsampleset.cpp.

338  {
339  ASSERT_HOST(font_class_array_ != nullptr);
340  int font_index1 = font_id_map_.SparseToCompact(font_id1);
341  int font_index2 = font_id_map_.SparseToCompact(font_id2);
342  if (font_index1 < 0 || font_index2 < 0) {
343  return 0.0f;
344  }
345  FontClassInfo &fc_info = (*font_class_array_)(font_index1, class_id1);
346  if (font_id1 == font_id2) {
347  // Special case cache for speed.
348  if (fc_info.unichar_distance_cache.empty()) {
349  fc_info.unichar_distance_cache.resize(unicharset_size_, -1.0f);
350  }
351  if (fc_info.unichar_distance_cache[class_id2] < 0) {
352  // Distance has to be calculated.
353  float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);
354  fc_info.unichar_distance_cache[class_id2] = result;
355  // Copy to the symmetric cache entry.
356  FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
357  if (fc_info2.unichar_distance_cache.empty()) {
358  fc_info2.unichar_distance_cache.resize(unicharset_size_, -1.0f);
359  }
360  fc_info2.unichar_distance_cache[class_id1] = result;
361  }
362  return fc_info.unichar_distance_cache[class_id2];
363  } else if (class_id1 == class_id2) {
364  // Another special-case cache for equal class-id.
365  if (fc_info.font_distance_cache.empty()) {
366  fc_info.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f);
367  }
368  if (fc_info.font_distance_cache[font_index2] < 0) {
369  // Distance has to be calculated.
370  float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);
371  fc_info.font_distance_cache[font_index2] = result;
372  // Copy to the symmetric cache entry.
373  FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
374  if (fc_info2.font_distance_cache.empty()) {
375  fc_info2.font_distance_cache.resize(font_id_map_.CompactSize(), -1.0f);
376  }
377  fc_info2.font_distance_cache[font_index1] = result;
378  }
379  return fc_info.font_distance_cache[font_index2];
380  }
381  // Both font and class are different. Linear search for class_id2/font_id2
382  // in what is a hopefully short list of distances.
383  size_t cache_index = 0;
384  while (cache_index < fc_info.distance_cache.size() &&
385  (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
386  fc_info.distance_cache[cache_index].font_id != font_id2)) {
387  ++cache_index;
388  }
389  if (cache_index == fc_info.distance_cache.size()) {
390  // Distance has to be calculated.
391  float result = ComputeClusterDistance(font_id1, class_id1, font_id2, class_id2, feature_map);
392  FontClassDistance fc_dist = {class_id2, font_id2, result};
393  fc_info.distance_cache.push_back(fc_dist);
394  // Copy to the symmetric cache entry. We know it isn't there already, as
395  // we always copy to the symmetric entry.
396  FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
397  fc_dist.unichar_id = class_id1;
398  fc_dist.font_id = font_id1;
399  fc_info2.distance_cache.push_back(fc_dist);
400  }
401  return fc_info.distance_cache[cache_index].distance;
402 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:140
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const

◆ ComputeCanonicalFeatures()

void tesseract::TrainingSampleSet::ComputeCanonicalFeatures ( )

Definition at line 746 of file trainingsampleset.cpp.

746  {
747  ASSERT_HOST(font_class_array_ != nullptr);
748  const int font_size = font_id_map_.CompactSize();
749  for (int font_index = 0; font_index < font_size; ++font_index) {
750  const int font_id = font_id_map_.CompactToSparse(font_index);
751  for (int c = 0; c < unicharset_size_; ++c) {
752  int num_samples = NumClassSamples(font_id, c, false);
753  if (num_samples == 0) {
754  continue;
755  }
756  const TrainingSample *sample = GetCanonicalSample(font_id, c);
757  FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
758  fcinfo.canonical_features = sample->indexed_features();
759  }
760  }
761 }
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
int NumClassSamples(int font_id, int class_id, bool randomize) const

◆ ComputeCanonicalSamples()

void tesseract::TrainingSampleSet::ComputeCanonicalSamples ( const IntFeatureMap &  map,
bool  debug 
)

Definition at line 619 of file trainingsampleset.cpp.

619  {
620  ASSERT_HOST(font_class_array_ != nullptr);
621  IntFeatureDist f_table;
622  if (debug) {
623  tprintf("feature table size %d\n", map.sparse_size());
624  }
625  f_table.Init(&map);
626  int worst_s1 = 0;
627  int worst_s2 = 0;
628  double global_worst_dist = 0.0;
629  // Compute distances independently for each font and char index.
630  int font_size = font_id_map_.CompactSize();
631  for (int font_index = 0; font_index < font_size; ++font_index) {
632  int font_id = font_id_map_.CompactToSparse(font_index);
633  for (int c = 0; c < unicharset_size_; ++c) {
634  int samples_found = 0;
635  FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
636  if (fcinfo.samples.empty() || (kTestChar >= 0 && c != kTestChar)) {
637  fcinfo.canonical_sample = -1;
638  fcinfo.canonical_dist = 0.0f;
639  if (debug) {
640  tprintf("Skipping class %d\n", c);
641  }
642  continue;
643  }
644  // The canonical sample will be the one with the min_max_dist, which
645  // is the sample with the lowest maximum distance to all other samples.
646  double min_max_dist = 2.0;
647  // We keep track of the farthest apart pair (max_s1, max_s2) which
648  // are max_max_dist apart, so we can see how bad the variability is.
649  double max_max_dist = 0.0;
650  int max_s1 = 0;
651  int max_s2 = 0;
652  fcinfo.canonical_sample = fcinfo.samples[0];
653  fcinfo.canonical_dist = 0.0f;
654  for (auto s1 : fcinfo.samples) {
655  const std::vector<int> &features1 = samples_[s1]->indexed_features();
656  f_table.Set(features1, features1.size(), true);
657  double max_dist = 0.0;
658  // Run the full squared-order search for similar samples. It is still
659  // reasonably fast because f_table.FeatureDistance is fast, but we
660  // may have to reconsider if we start playing with too many samples
661  // of a single char/font.
662  for (int s2 : fcinfo.samples) {
663  if (samples_[s2]->class_id() != c || samples_[s2]->font_id() != font_id || s2 == s1) {
664  continue;
665  }
666  std::vector<int> features2 = samples_[s2]->indexed_features();
667  double dist = f_table.FeatureDistance(features2);
668  if (dist > max_dist) {
669  max_dist = dist;
670  if (dist > max_max_dist) {
671  max_max_dist = dist;
672  max_s1 = s1;
673  max_s2 = s2;
674  }
675  }
676  }
677  // Using Set(..., false) is far faster than re initializing, due to
678  // the sparseness of the feature space.
679  f_table.Set(features1, features1.size(), false);
680  samples_[s1]->set_max_dist(max_dist);
681  ++samples_found;
682  if (max_dist < min_max_dist) {
683  fcinfo.canonical_sample = s1;
684  fcinfo.canonical_dist = max_dist;
685  }
686  UpdateRange(max_dist, &min_max_dist, &max_max_dist);
687  }
688  if (max_max_dist > global_worst_dist) {
689  // Keep a record of the worst pair over all characters/fonts too.
690  global_worst_dist = max_max_dist;
691  worst_s1 = max_s1;
692  worst_s2 = max_s2;
693  }
694  if (debug) {
695  tprintf(
696  "Found %d samples of class %d=%s, font %d, "
697  "dist range [%g, %g], worst pair= %s, %s\n",
698  samples_found, c, unicharset_.debug_str(c).c_str(), font_index, min_max_dist,
699  max_max_dist, SampleToString(*samples_[max_s1]).c_str(),
700  SampleToString(*samples_[max_s2]).c_str());
701  }
702  }
703  }
704  if (debug) {
705  tprintf("Global worst dist = %g, between sample %d and %d\n", global_worst_dist, worst_s1,
706  worst_s2);
707  }
708 }
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:122
const int kTestChar
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331
std::string SampleToString(const TrainingSample &sample) const

◆ ComputeCloudFeatures()

void tesseract::TrainingSampleSet::ComputeCloudFeatures ( int  feature_space_size)

Definition at line 765 of file trainingsampleset.cpp.

765  {
766  ASSERT_HOST(font_class_array_ != nullptr);
767  int font_size = font_id_map_.CompactSize();
768  for (int font_index = 0; font_index < font_size; ++font_index) {
769  int font_id = font_id_map_.CompactToSparse(font_index);
770  for (int c = 0; c < unicharset_size_; ++c) {
771  int num_samples = NumClassSamples(font_id, c, false);
772  if (num_samples == 0) {
773  continue;
774  }
775  FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
776  fcinfo.cloud_features.Init(feature_space_size);
777  for (int s = 0; s < num_samples; ++s) {
778  const TrainingSample *sample = GetSample(font_id, c, s);
779  const std::vector<int> &sample_features = sample->indexed_features();
780  for (int sample_feature : sample_features) {
781  fcinfo.cloud_features.SetBit(sample_feature);
782  }
783  }
784  }
785  }
786 }
const TrainingSample * GetSample(int index) const

◆ ComputeClusterDistance()

float tesseract::TrainingSampleSet::ComputeClusterDistance ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap &  feature_map 
) const

Definition at line 405 of file trainingsampleset.cpp.

407  {
408  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2, feature_map, false);
409  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1, feature_map, false);
410  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
411  denominator += GetCanonicalFeatures(font_id2, class_id2).size();
412  return static_cast<float>(dist) / denominator;
413 }
const std::vector< int > & GetCanonicalFeatures(int font_id, int class_id) const
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const

◆ DeleteDeadSamples()

void tesseract::TrainingSampleSet::DeleteDeadSamples ( )

Definition at line 540 of file trainingsampleset.cpp.

540  {
541  using namespace std::placeholders; // for _1
542  auto old_it = samples_.begin();
543  for (; old_it < samples_.end(); ++old_it) {
544  if (*old_it == nullptr || (*old_it)->class_id() < 0) {
545  break;
546  }
547  }
548  auto new_it = old_it;
549  for (; old_it < samples_.end(); ++old_it) {
550  if (*old_it == nullptr || (*old_it)->class_id() < 0) {
551  delete *old_it;
552  } else {
553  *new_it = *old_it;
554  ++new_it;
555  }
556  }
557  samples_.resize(new_it - samples_.begin() + 1);
558  num_raw_samples_ = samples_.size();
559  // Samples must be re-organized now we have deleted a few.
560 }

◆ DeSerialize()

bool tesseract::TrainingSampleSet::DeSerialize ( bool  swap,
FILE *  fp 
)

Definition at line 124 of file trainingsampleset.cpp.

124  {
125  if (!tesseract::DeSerialize(swap, fp, samples_)) {
126  return false;
127  }
128  num_raw_samples_ = samples_.size();
129  if (!unicharset_.load_from_file(fp)) {
130  return false;
131  }
132  if (!font_id_map_.DeSerialize(swap, fp)) {
133  return false;
134  }
135  delete font_class_array_;
136  font_class_array_ = nullptr;
137  int8_t not_null;
138  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) {
139  return false;
140  }
141  if (not_null) {
142  FontClassInfo empty;
143  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(1, 1, empty);
144  if (!font_class_array_->DeSerializeClasses(swap, fp)) {
145  return false;
146  }
147  }
148  unicharset_size_ = unicharset_.size();
149  return true;
150 }
bool DeSerialize(bool swap, FILE *fp, std::vector< T > &data)
Definition: helpers.h:220
bool DeSerializeClasses(bool swap, FILE *fp)
Definition: matrix.h:223
bool DeSerialize(bool swap, FILE *fp)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391

◆ DisplaySamplesWithFeature()

void tesseract::TrainingSampleSet::DisplaySamplesWithFeature ( int  f_index,
const Shape &  shape,
const IntFeatureSpace &  feature_space,
ScrollView::Color  color,
ScrollView *  window 
) const

Definition at line 800 of file trainingsampleset.cpp.

803  {
804  for (int s = 0; s < num_raw_samples(); ++s) {
805  const TrainingSample *sample = GetSample(s);
806  if (shape.ContainsUnichar(sample->class_id())) {
807  std::vector<int> indexed_features;
808  space.IndexAndSortFeatures(sample->features(), sample->num_features(), &indexed_features);
809  for (int indexed_feature : indexed_features) {
810  if (indexed_feature == f_index) {
811  sample->DisplayFeatures(color, window);
812  }
813  }
814  }
815  }
816 }

◆ extract_sample()

TrainingSample* tesseract::TrainingSampleSet::extract_sample ( int  index)
inline

Definition at line 157 of file trainingsampleset.h.

157  {
158  TrainingSample *sample = samples_[index];
159  samples_[index] = nullptr;
160  return sample;
161  }

◆ fontinfo_table()

const FontInfoTable& tesseract::TrainingSampleSet::fontinfo_table ( ) const
inline

Definition at line 68 of file trainingsampleset.h.

68  {
69  return fontinfo_table_;
70  }

◆ GetCanonicalDist()

float tesseract::TrainingSampleSet::GetCanonicalDist ( int  font_id,
int  class_id 
) const

Definition at line 513 of file trainingsampleset.cpp.

513  {
514  ASSERT_HOST(font_class_array_ != nullptr);
515  int font_index = font_id_map_.SparseToCompact(font_id);
516  if (font_index < 0) {
517  return 0.0f;
518  }
519  if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) {
520  return (*font_class_array_)(font_index, class_id).canonical_dist;
521  } else {
522  return 0.0f;
523  }
524 }

◆ GetCanonicalFeatures()

const std::vector< int > & tesseract::TrainingSampleSet::GetCanonicalFeatures ( int  font_id,
int  class_id 
) const

Definition at line 263 of file trainingsampleset.cpp.

263  {
264  int font_index = font_id_map_.SparseToCompact(font_id);
265  ASSERT_HOST(font_index >= 0);
266  return (*font_class_array_)(font_index, class_id).canonical_features;
267 }

◆ GetCanonicalSample()

const TrainingSample * tesseract::TrainingSampleSet::GetCanonicalSample ( int  font_id,
int  class_id 
) const

Definition at line 501 of file trainingsampleset.cpp.

501  {
502  ASSERT_HOST(font_class_array_ != nullptr);
503  int font_index = font_id_map_.SparseToCompact(font_id);
504  if (font_index < 0) {
505  return nullptr;
506  }
507  const int sample_index = (*font_class_array_)(font_index, class_id).canonical_sample;
508  return sample_index >= 0 ? samples_[sample_index] : nullptr;
509 }

◆ GetCloudFeatures()

const BitVector & tesseract::TrainingSampleSet::GetCloudFeatures ( int  font_id,
int  class_id 
) const

Definition at line 256 of file trainingsampleset.cpp.

256  {
257  int font_index = font_id_map_.SparseToCompact(font_id);
258  ASSERT_HOST(font_index >= 0);
259  return (*font_class_array_)(font_index, class_id).cloud_features;
260 }

◆ GetSample() [1/2]

const TrainingSample * tesseract::TrainingSampleSet::GetSample ( int  font_id,
int  class_id,
int  index 
) const

Definition at line 223 of file trainingsampleset.cpp.

223  {
224  ASSERT_HOST(font_class_array_ != nullptr);
225  int font_index = font_id_map_.SparseToCompact(font_id);
226  if (font_index < 0) {
227  return nullptr;
228  }
229  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
230  return samples_[sample_index];
231 }

◆ GetSample() [2/2]

const TrainingSample * tesseract::TrainingSampleSet::GetSample ( int  index) const

Definition at line 217 of file trainingsampleset.cpp.

217  {
218  return samples_[index];
219 }

◆ GlobalSampleIndex()

int tesseract::TrainingSampleSet::GlobalSampleIndex ( int  font_id,
int  class_id,
int  index 
) const

Definition at line 490 of file trainingsampleset.cpp.

490  {
491  ASSERT_HOST(font_class_array_ != nullptr);
492  int font_index = font_id_map_.SparseToCompact(font_id);
493  if (font_index < 0) {
494  return -1;
495  }
496  return (*font_class_array_)(font_index, class_id).samples[index];
497 }

◆ IndexFeatures()

void tesseract::TrainingSampleSet::IndexFeatures ( const IntFeatureSpace &  feature_space)

Definition at line 527 of file trainingsampleset.cpp.

527  {
528  for (auto &sample : samples_) {
529  sample->IndexFeatures(feature_space);
530  }
531 }

◆ KillSample()

void tesseract::TrainingSampleSet::KillSample ( TrainingSample *  sample)

Definition at line 535 of file trainingsampleset.cpp.

535  {
536  sample->set_sample_index(-1);
537 }

◆ LoadUnicharset()

void tesseract::TrainingSampleSet::LoadUnicharset ( const char *  filename)

Definition at line 153 of file trainingsampleset.cpp.

153  {
154  if (!unicharset_.load_from_file(filename)) {
155  tprintf(
156  "Failed to load unicharset from file %s\n"
157  "Building unicharset from scratch...\n",
158  filename);
159  unicharset_.clear();
160  // Add special characters as they were removed by the clear.
161  UNICHARSET empty;
162  unicharset_.AppendOtherUnicharset(empty);
163  }
164  unicharset_size_ = unicharset_.size();
165 }
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:454

◆ mutable_sample()

TrainingSample* tesseract::TrainingSampleSet::mutable_sample ( int  index)
inline

Definition at line 153 of file trainingsampleset.h.

153  {
154  return samples_[index];
155  }

◆ MutableSample()

TrainingSample * tesseract::TrainingSampleSet::MutableSample ( int  font_id,
int  class_id,
int  index 
)

Definition at line 235 of file trainingsampleset.cpp.

235  {
236  ASSERT_HOST(font_class_array_ != nullptr);
237  int font_index = font_id_map_.SparseToCompact(font_id);
238  if (font_index < 0) {
239  return nullptr;
240  }
241  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
242  return samples_[sample_index];
243 }

◆ num_raw_samples()

int tesseract::TrainingSampleSet::num_raw_samples ( ) const
inline

Definition at line 56 of file trainingsampleset.h.

56  {
57  return num_raw_samples_;
58  }

◆ num_samples()

int tesseract::TrainingSampleSet::num_samples ( ) const
inline

Definition at line 53 of file trainingsampleset.h.

53  {
54  return samples_.size();
55  }

◆ NumClassSamples()

int tesseract::TrainingSampleSet::NumClassSamples ( int  font_id,
int  class_id,
bool  randomize 
) const

Definition at line 198 of file trainingsampleset.cpp.

198  {
199  ASSERT_HOST(font_class_array_ != nullptr);
200  if (font_id < 0 || class_id < 0 || font_id >= font_id_map_.SparseSize() ||
201  class_id >= unicharset_size_) {
202  // There are no samples because the font or class doesn't exist.
203  return 0;
204  }
205  int font_index = font_id_map_.SparseToCompact(font_id);
206  if (font_index < 0) {
207  return 0; // The font has no samples.
208  }
209  if (randomize) {
210  return (*font_class_array_)(font_index, class_id).samples.size();
211  } else {
212  return (*font_class_array_)(font_index, class_id).num_raw_samples;
213  }
214 }
int SparseSize() const override
Definition: indexmapbidi.h:144

◆ NumFonts()

int tesseract::TrainingSampleSet::NumFonts ( ) const
inline

Definition at line 59 of file trainingsampleset.h.

59  {
60  return font_id_map_.SparseSize();
61  }

◆ OrganizeByFontAndClass()

void tesseract::TrainingSampleSet::OrganizeByFontAndClass ( )

Definition at line 563 of file trainingsampleset.cpp.

563  {
564  // Font indexes are sparse, so we used a map to compact them, so we can
565  // have an efficient 2-d array of fonts and character classes.
566  SetupFontIdMap();
567  int compact_font_size = font_id_map_.CompactSize();
568  // Get a 2-d array of generic vectors.
569  delete font_class_array_;
570  FontClassInfo empty;
571  font_class_array_ =
572  new GENERIC_2D_ARRAY<FontClassInfo>(compact_font_size, unicharset_size_, empty);
573  for (size_t s = 0; s < samples_.size(); ++s) {
574  int font_id = samples_[s]->font_id();
575  int class_id = samples_[s]->class_id();
576  if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
577  tprintf("Font id = %d/%d, class id = %d/%d on sample %zu\n", font_id,
578  font_id_map_.SparseSize(), class_id, unicharset_size_, s);
579  }
580  ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
581  ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
582  int font_index = font_id_map_.SparseToCompact(font_id);
583  (*font_class_array_)(font_index, class_id).samples.push_back(s);
584  }
585  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
586  // between the raw samples and the replicated ones.
587  for (int f = 0; f < compact_font_size; ++f) {
588  for (int c = 0; c < unicharset_size_; ++c) {
589  (*font_class_array_)(f, c).num_raw_samples = (*font_class_array_)(f, c).samples.size();
590  }
591  }
592  // This is the global number of samples and also marks the boundary between
593  // real and replicated samples.
594  num_raw_samples_ = samples_.size();
595 }

◆ ReliablySeparable()

int tesseract::TrainingSampleSet::ReliablySeparable ( int  font_id1,
int  class_id1,
int  font_id2,
int  class_id2,
const IntFeatureMap &  feature_map,
bool  thorough 
) const

Definition at line 451 of file trainingsampleset.cpp.

452  {
453  int result = 0;
454  const TrainingSample *sample2 = GetCanonicalSample(font_id2, class_id2);
455  if (sample2 == nullptr) {
456  return 0; // There are no canonical features.
457  }
458  const std::vector<int> &canonical2 = GetCanonicalFeatures(font_id2, class_id2);
459  const BitVector &cloud1 = GetCloudFeatures(font_id1, class_id1);
460  if (cloud1.empty()) {
461  return canonical2.size(); // There are no cloud features.
462  }
463 
464  // Find a canonical2 feature that is not in cloud1.
465  for (int feature : canonical2) {
466  if (cloud1[feature]) {
467  continue;
468  }
469  // Gather the near neighbours of f.
470  std::vector<int> good_features;
471  AddNearFeatures(feature_map, feature, 1, &good_features);
472  // Check that none of the good_features are in the cloud.
473  bool found = false;
474  for (auto good_f : good_features) {
475  if (cloud1[good_f]) {
476  found = true;
477  break;
478  }
479  }
480  if (found) {
481  continue; // Found one in the cloud.
482  }
483  ++result;
484  }
485  return result;
486 }
const BitVector & GetCloudFeatures(int font_id, int class_id) const

◆ ReplicateAndRandomizeSamples()

void tesseract::TrainingSampleSet::ReplicateAndRandomizeSamples ( )

Definition at line 715 of file trainingsampleset.cpp.

715  {
716  ASSERT_HOST(font_class_array_ != nullptr);
717  int font_size = font_id_map_.CompactSize();
718  for (int font_index = 0; font_index < font_size; ++font_index) {
719  for (int c = 0; c < unicharset_size_; ++c) {
720  FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
721  int sample_count = fcinfo.samples.size();
722  int min_samples = 2 * std::max(kSampleRandomSize, sample_count);
723  if (sample_count > 0 && sample_count < min_samples) {
724  int base_count = sample_count;
725  for (int base_index = 0; sample_count < min_samples; ++sample_count) {
726  int src_index = fcinfo.samples[base_index++];
727  if (base_index >= base_count) {
728  base_index = 0;
729  }
730  TrainingSample *sample =
731  samples_[src_index]->RandomizedCopy(sample_count % kSampleRandomSize);
732  int sample_index = samples_.size();
733  sample->set_sample_index(sample_index);
734  samples_.push_back(sample);
735  fcinfo.samples.push_back(sample_index);
736  }
737  }
738  }
739  }
740 }

◆ SampleToString()

std::string tesseract::TrainingSampleSet::SampleToString ( const TrainingSample &  sample) const

Definition at line 247 of file trainingsampleset.cpp.

247  {
248  std::string boxfile_str;
249  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()), sample.bounding_box(),
250  sample.page_num(), boxfile_str);
251  return std::string(fontinfo_table_.at(sample.font_id()).name) + " " + boxfile_str;
252 }
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str)
Definition: boxread.cpp:273
T & at(int index) const
Definition: genericvector.h:93
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

◆ Serialize()

bool tesseract::TrainingSampleSet::Serialize ( FILE *  fp) const

Definition at line 100 of file trainingsampleset.cpp.

100  {
101  if (!tesseract::Serialize(fp, samples_)) {
102  return false;
103  }
104  if (!unicharset_.save_to_file(fp)) {
105  return false;
106  }
107  if (!font_id_map_.Serialize(fp)) {
108  return false;
109  }
110  int8_t not_null = font_class_array_ != nullptr;
111  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) {
112  return false;
113  }
114  if (not_null) {
115  if (!font_class_array_->SerializeClasses(fp)) {
116  return false;
117  }
118  }
119  return true;
120 }
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
bool SerializeClasses(FILE *fp) const
Definition: matrix.h:204
bool Serialize(FILE *fp) const
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361

◆ SetupFontIdMap()

void tesseract::TrainingSampleSet::SetupFontIdMap ( )

Definition at line 599 of file trainingsampleset.cpp.

599  {
600  // Number of samples for each font_id.
601  std::vector<int> font_counts;
602  for (auto &sample : samples_) {
603  const int font_id = sample->font_id();
604  while (font_id >= font_counts.size()) {
605  font_counts.push_back(0);
606  }
607  ++font_counts[font_id];
608  }
609  font_id_map_.Init(font_counts.size(), false);
610  for (size_t f = 0; f < font_counts.size(); ++f) {
611  font_id_map_.SetMap(f, font_counts[f] > 0);
612  }
613  font_id_map_.Setup();
614 }
void Init(int size, bool all_mapped)
void SetMap(int sparse_index, bool mapped)

◆ UnicharDistance()

float tesseract::TrainingSampleSet::UnicharDistance ( const UnicharAndFonts &  uf1,
const UnicharAndFonts &  uf2,
bool  matched_fonts,
const IntFeatureMap &  feature_map 
)

Definition at line 273 of file trainingsampleset.cpp.

274  {
275  int num_fonts1 = uf1.font_ids.size();
276  int c1 = uf1.unichar_id;
277  int num_fonts2 = uf2.font_ids.size();
278  int c2 = uf2.unichar_id;
279  double dist_sum = 0.0;
280  int dist_count = 0;
281  const bool debug = false;
282  if (matched_fonts) {
283  // Compute distances only where fonts match.
284  for (int i = 0; i < num_fonts1; ++i) {
285  int f1 = uf1.font_ids[i];
286  for (int j = 0; j < num_fonts2; ++j) {
287  int f2 = uf2.font_ids[j];
288  if (f1 == f2) {
289  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
290  ++dist_count;
291  }
292  }
293  }
294  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
295  // Small enough sets to compute all the distances.
296  for (int i = 0; i < num_fonts1; ++i) {
297  int f1 = uf1.font_ids[i];
298  for (int j = 0; j < num_fonts2; ++j) {
299  int f2 = uf2.font_ids[j];
300  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
301  if (debug) {
302  tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2,
303  ClusterDistance(f1, c1, f2, c2, feature_map));
304  }
305  ++dist_count;
306  }
307  }
308  } else {
309  // Subsample distances, using the largest set once, and stepping through
310  // the smaller set so as to ensure that all the pairs are different.
311  int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
312  int index = 0;
313  int num_samples = std::max(num_fonts1, num_fonts2);
314  for (int i = 0; i < num_samples; ++i, index += increment) {
315  int f1 = uf1.font_ids[i % num_fonts1];
316  int f2 = uf2.font_ids[index % num_fonts2];
317  if (debug) {
318  tprintf("Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2,
319  ClusterDistance(f1, c1, f2, c2, feature_map));
320  }
321  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
322  ++dist_count;
323  }
324  }
325  if (dist_count == 0) {
326  if (matched_fonts) {
327  return UnicharDistance(uf1, uf2, false, feature_map);
328  }
329  return 0.0f;
330  }
331  return dist_sum / dist_count;
332 }
const int kPrime2
const int kPrime1
const int kSquareLimit
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)

◆ unicharset()

const UNICHARSET& tesseract::TrainingSampleSet::unicharset ( ) const
inline

Definition at line 62 of file trainingsampleset.h.

62  {
63  return unicharset_;
64  }

The documentation for this class was generated from the following files: