tesseract  5.0.0
tesseract::MasterTrainer Class Reference

#include <mastertrainer.h>

Public Member Functions

 MasterTrainer (NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
 
 ~MasterTrainer ()
 
bool Serialize (FILE *fp) const
 
void LoadUnicharset (const char *filename)
 
void SetFeatureSpace (const IntFeatureSpace &fs)
 
void ReadTrainingSamples (const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
 
void AddSample (bool verification, const char *unichar_str, TrainingSample *sample)
 
void LoadPageImages (const char *filename)
 
void PostLoadCleanup ()
 
void PreTrainingSetup ()
 
void SetupMasterShapes ()
 
void IncludeJunk ()
 
void ReplicateAndRandomizeSamplesIfRequired ()
 
bool LoadFontInfo (const char *filename)
 
bool LoadXHeights (const char *filename)
 
bool AddSpacingInfo (const char *filename)
 
int GetFontInfoId (const char *font_name)
 
int GetBestMatchingFontInfoId (const char *filename)
 
const std::string & GetTRFileName (int index) const
 
void SetupFlatShapeTable (ShapeTable *shape_table)
 
CLUSTERER * SetupForClustering (const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
 
void WriteInttempAndPFFMTable (const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
 
const UNICHARSET & unicharset () const
 
TrainingSampleSet * GetSamples ()
 
const ShapeTable & master_shapes () const
 
void DebugCanonical (const char *unichar_str1, const char *unichar_str2)
 
void DisplaySamples (const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
 
void TestClassifierVOld (bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
 
void TestClassifierOnSamples (CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, std::string *report_string)
 
double TestClassifier (CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, std::string *report_string)
 
float ShapeDistance (const ShapeTable &shapes, int s1, int s2)
 

Detailed Description

Definition at line 66 of file mastertrainer.h.

Constructor & Destructor Documentation

◆ MasterTrainer()

tesseract::MasterTrainer::MasterTrainer ( NormalizationMode  norm_mode,
bool  shape_analysis,
bool  replicate_samples,
int  debug_level 
)

Definition at line 52 of file mastertrainer.cpp.

54  : norm_mode_(norm_mode),
55  samples_(fontinfo_table_),
56  junk_samples_(fontinfo_table_),
57  verify_samples_(fontinfo_table_),
58  charsetsize_(0),
59  enable_shape_analysis_(shape_analysis),
60  enable_replication_(replicate_samples),
61  fragments_(nullptr),
62  prev_unichar_id_(-1),
63  debug_level_(debug_level) {}

◆ ~MasterTrainer()

tesseract::MasterTrainer::~MasterTrainer ( )

Definition at line 65 of file mastertrainer.cpp.

65  {
66  delete[] fragments_;
67  for (auto &page_image : page_images_) {
68  page_image.destroy();
69  }
70 }

Member Function Documentation

◆ AddSample()

void tesseract::MasterTrainer::AddSample ( bool  verification,
const char *  unichar_str,
TrainingSample *  sample 
)

Definition at line 193 of file mastertrainer.cpp.

194  {
195  if (verification) {
196  verify_samples_.AddSample(unichar, sample);
197  prev_unichar_id_ = -1;
198  } else if (unicharset_.contains_unichar(unichar)) {
199  if (prev_unichar_id_ >= 0) {
200  fragments_[prev_unichar_id_] = -1;
201  }
202  prev_unichar_id_ = samples_.AddSample(unichar, sample);
203  if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0) {
204  flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
205  }
206  } else {
207  const int junk_id = junk_samples_.AddSample(unichar, sample);
208  if (prev_unichar_id_ >= 0) {
209  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar);
210  if (frag != nullptr && frag->is_natural()) {
211  if (fragments_[prev_unichar_id_] == 0) {
212  fragments_[prev_unichar_id_] = junk_id;
213  } else if (fragments_[prev_unichar_id_] != junk_id) {
214  fragments_[prev_unichar_id_] = -1;
215  }
216  }
217  delete frag;
218  }
219  prev_unichar_id_ = -1;
220  }
221 }
static CHAR_FRAGMENT * parse_from_string(const char *str)
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
unsigned AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:351
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:400
int AddSample(const char *unichar, TrainingSample *sample)

◆ AddSpacingInfo()

bool tesseract::MasterTrainer::AddSpacingInfo ( const char *  filename)

Definition at line 463 of file mastertrainer.cpp.

463  {
464  FILE *fontinfo_file = fopen(filename, "rb");
465  if (fontinfo_file == nullptr) {
466  return true; // We silently ignore missing files!
467  }
468  // Find the fontinfo_id.
469  int fontinfo_id = GetBestMatchingFontInfoId(filename);
470  if (fontinfo_id < 0) {
471  tprintf("No font found matching fontinfo filename %s\n", filename);
472  fclose(fontinfo_file);
473  return false;
474  }
475  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
476  // TODO(rays) scale should probably be a double, but keep as an int for now
477  // to duplicate current behavior.
478  int scale = kBlnXHeight / xheights_[fontinfo_id];
479  int num_unichars;
480  char uch[UNICHAR_LEN];
481  char kerned_uch[UNICHAR_LEN];
482  int x_gap, x_gap_before, x_gap_after, num_kerned;
483  ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
484  FontInfo *fi = &fontinfo_table_.at(fontinfo_id);
485  fi->init_spacing(unicharset_.size());
486  FontSpacingInfo *spacing = nullptr;
487  for (int l = 0; l < num_unichars; ++l) {
488  if (tfscanf(fontinfo_file, "%s %d %d %d", uch, &x_gap_before, &x_gap_after,
489  &num_kerned) != 4) {
490  tprintf("Bad format of font spacing file %s\n", filename);
491  fclose(fontinfo_file);
492  return false;
493  }
494  bool valid = unicharset_.contains_unichar(uch);
495  if (valid) {
496  spacing = new FontSpacingInfo();
497  spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale);
498  spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale);
499  }
500  for (int k = 0; k < num_kerned; ++k) {
501  if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
502  tprintf("Bad format of font spacing file %s\n", filename);
503  fclose(fontinfo_file);
504  delete spacing;
505  return false;
506  }
507  if (!valid || !unicharset_.contains_unichar(kerned_uch)) {
508  continue;
509  }
510  spacing->kerned_unichar_ids.push_back(
511  unicharset_.unichar_to_id(kerned_uch));
512  spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale));
513  }
514  if (valid) {
515  fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
516  }
517  }
518  fclose(fontinfo_file);
519  return true;
520 }
#define UNICHAR_LEN
Definition: unichar.h:33
#define ASSERT_HOST(x)
Definition: errcode.h:59
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:189
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kBlnXHeight
Definition: normalis.h:33
void init_spacing(int unicharset_size)
Definition: fontinfo.h:79
T & at(int index) const
Definition: genericvector.h:93
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
int GetBestMatchingFontInfoId(const char *filename)

◆ DebugCanonical()

void tesseract::MasterTrainer::DebugCanonical ( const char *  unichar_str1,
const char *  unichar_str2 
)

Definition at line 692 of file mastertrainer.cpp.

693  {
694  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
695  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
696  if (class_id2 == INVALID_UNICHAR_ID) {
697  class_id2 = class_id1;
698  }
699  if (class_id1 == INVALID_UNICHAR_ID) {
700  tprintf("No unicharset entry found for %s\n", unichar_str1);
701  return;
702  } else {
703  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n", class_id1,
704  unichar_str1, class_id2, unichar_str2);
705  }
706  int num_fonts = samples_.NumFonts();
707  const IntFeatureMap &feature_map = feature_map_;
708  // Iterate the fonts to get the similarity with other fonts of the same
709  // class.
710  tprintf(" ");
711  for (int f = 0; f < num_fonts; ++f) {
712  if (samples_.NumClassSamples(f, class_id2, false) == 0) {
713  continue;
714  }
715  tprintf("%6d", f);
716  }
717  tprintf("\n");
718  for (int f1 = 0; f1 < num_fonts; ++f1) {
719  // Map the features of the canonical_sample.
720  if (samples_.NumClassSamples(f1, class_id1, false) == 0) {
721  continue;
722  }
723  tprintf("%4d ", f1);
724  for (int f2 = 0; f2 < num_fonts; ++f2) {
725  if (samples_.NumClassSamples(f2, class_id2, false) == 0) {
726  continue;
727  }
728  float dist =
729  samples_.ClusterDistance(f1, class_id1, f2, class_id2, feature_map);
730  tprintf(" %5.3f", dist);
731  }
732  tprintf("\n");
733  }
734  // Build a fake ShapeTable containing all the sample types.
735  ShapeTable shapes(unicharset_);
736  for (int f = 0; f < num_fonts; ++f) {
737  if (samples_.NumClassSamples(f, class_id1, true) > 0) {
738  shapes.AddShape(class_id1, f);
739  }
740  if (class_id1 != class_id2 &&
741  samples_.NumClassSamples(f, class_id2, true) > 0) {
742  shapes.AddShape(class_id2, f);
743  }
744  }
745 }
int NumClassSamples(int font_id, int class_id, bool randomize) const
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)

◆ DisplaySamples()

void tesseract::MasterTrainer::DisplaySamples ( const char *  unichar_str1,
int  cloud_font,
const char *  unichar_str2,
int  canonical_font 
)

Definition at line 758 of file mastertrainer.cpp.

760  {
761  const IntFeatureMap &feature_map = feature_map_;
762  const IntFeatureSpace &feature_space = feature_map.feature_space();
763  ScrollView *f_window = CreateFeatureSpaceWindow("Features", 100, 500);
764  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,
765  f_window);
766  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
767  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
768  const TrainingSample *sample =
769  samples_.GetCanonicalSample(canonical_font, class_id2);
770  for (uint32_t f = 0; f < sample->num_features(); ++f) {
771  RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
772  }
773  }
774  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
775  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
776  const BitVector &cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
777  for (int f = 0; f < cloud.size(); ++f) {
778  if (cloud[f]) {
779  INT_FEATURE_STRUCT feature = feature_map.InverseIndexFeature(f);
780  RenderIntFeature(f_window, &feature, ScrollView::GREEN);
781  }
782  }
783  }
784  f_window->Update();
785  ScrollView *s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
786  SVEventType ev_type;
787  do {
788  SVEvent *ev;
789  // Wait until a click or popup event.
790  ev = f_window->AwaitEvent(SVET_ANY);
791  ev_type = ev->type;
792  if (ev_type == SVET_CLICK) {
793  int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
794  if (feature_index >= 0) {
795  // Iterate samples and display those with the feature.
796  Shape shape;
797  shape.AddToShape(class_id1, cloud_font);
798  s_window->Clear();
799  samples_.DisplaySamplesWithFeature(feature_index, shape, feature_space,
800  ScrollView::GREEN, s_window);
801  s_window->Update();
802  }
803  }
804  delete ev;
805  } while (ev_type != SVET_DESTROY);
806 }
@ SVET_DESTROY
Definition: scrollview.h:53
@ SVET_CLICK
Definition: scrollview.h:55
@ character
Definition: mfoutline.h:53
@ baseline
Definition: mfoutline.h:53
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1622
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1500
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:887
@ NM_BASELINE
Definition: normalis.h:47
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
const UNICHARSET & unicharset() const
const BitVector & GetCloudFeatures(int font_id, int class_id) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const

◆ GetBestMatchingFontInfoId()

int tesseract::MasterTrainer::GetBestMatchingFontInfoId ( const char *  filename)

Definition at line 535 of file mastertrainer.cpp.

535  {
536  int fontinfo_id = -1;
537  int best_len = 0;
538  for (int f = 0; f < fontinfo_table_.size(); ++f) {
539  if (strstr(filename, fontinfo_table_.at(f).name) != nullptr) {
540  int len = strlen(fontinfo_table_.at(f).name);
541  // Use the longest matching length in case a substring of a font matched.
542  if (len > best_len) {
543  best_len = len;
544  fontinfo_id = f;
545  }
546  }
547  }
548  return fontinfo_id;
549 }
unsigned size() const
Definition: genericvector.h:74

◆ GetFontInfoId()

int tesseract::MasterTrainer::GetFontInfoId ( const char *  font_name)

Definition at line 524 of file mastertrainer.cpp.

524  {
525  FontInfo fontinfo;
526  // We are only borrowing the string, so it is OK to const cast it.
527  fontinfo.name = const_cast<char *>(font_name);
528  fontinfo.properties = 0; // Not used to lookup in the table
529  fontinfo.universal_id = 0;
530  return fontinfo_table_.get_index(fontinfo);
531 }
int get_index(const T &object) const

◆ GetSamples()

TrainingSampleSet* tesseract::MasterTrainer::GetSamples ( )
inline

Definition at line 181 of file mastertrainer.h.

181  {
182  return &samples_;
183  }

◆ GetTRFileName()

const std::string& tesseract::MasterTrainer::GetTRFileName ( int  index) const
inline

Definition at line 157 of file mastertrainer.h.

157  {
158  return tr_filenames_[index];
159  }

◆ IncludeJunk()

void tesseract::MasterTrainer::IncludeJunk ( )

Definition at line 339 of file mastertrainer.cpp.

339  {
340  // Get ids of fragments in junk_samples_ that replace the dead chars.
341  const UNICHARSET &junk_set = junk_samples_.unicharset();
342  const UNICHARSET &sample_set = samples_.unicharset();
343  int num_junks = junk_samples_.num_samples();
344  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
345  for (int s = 0; s < num_junks; ++s) {
346  TrainingSample *sample = junk_samples_.mutable_sample(s);
347  int junk_id = sample->class_id();
348  const char *junk_utf8 = junk_set.id_to_unichar(junk_id);
349  int sample_id = sample_set.unichar_to_id(junk_utf8);
350  if (sample_id == INVALID_UNICHAR_ID) {
351  sample_id = 0;
352  }
353  sample->set_class_id(sample_id);
354  junk_samples_.extract_sample(s);
355  samples_.AddSample(sample_id, sample);
356  }
357  junk_samples_.DeleteDeadSamples();
358  samples_.OrganizeByFontAndClass();
359 }
UNICHAR_ID class_id() const
TrainingSample * extract_sample(int index)
TrainingSample * mutable_sample(int index)

◆ LoadFontInfo()

bool tesseract::MasterTrainer::LoadFontInfo ( const char *  filename)

Definition at line 379 of file mastertrainer.cpp.

379  {
380  FILE *fp = fopen(filename, "rb");
381  if (fp == nullptr) {
382  fprintf(stderr, "Failed to load font_properties from %s\n", filename);
383  return false;
384  }
385  int italic, bold, fixed, serif, fraktur;
386  while (!feof(fp)) {
387  FontInfo fontinfo;
388  char *font_name = new char[1024];
389  fontinfo.name = font_name;
390  fontinfo.properties = 0;
391  fontinfo.universal_id = 0;
392  if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
393  &fixed, &serif, &fraktur) != 6) {
394  delete[] font_name;
395  continue;
396  }
397  fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) +
398  (serif << 3) + (fraktur << 4);
399  if (fontinfo_table_.get_index(fontinfo) < 0) {
400  // fontinfo not in table.
401  fontinfo_table_.push_back(fontinfo);
402  } else {
403  delete[] font_name;
404  }
405  }
406  fclose(fp);
407  return true;
408 }

◆ LoadPageImages()

void tesseract::MasterTrainer::LoadPageImages ( const char *  filename)

Definition at line 226 of file mastertrainer.cpp.

226  {
227  size_t offset = 0;
228  int page;
229  Image pix;
230  for (page = 0;; page++) {
231  pix = pixReadFromMultipageTiff(filename, &offset);
232  if (!pix) {
233  break;
234  }
235  page_images_.push_back(pix);
236  if (!offset) {
237  break;
238  }
239  }
240  tprintf("Loaded %d page images from %s\n", page, filename);
241 }

◆ LoadUnicharset()

void tesseract::MasterTrainer::LoadUnicharset ( const char *  filename)

Definition at line 111 of file mastertrainer.cpp.

111  {
112  if (!unicharset_.load_from_file(filename)) {
113  tprintf(
114  "Failed to load unicharset from file %s\n"
115  "Building unicharset for training from scratch...\n",
116  filename);
117  unicharset_.clear();
118  UNICHARSET initialized;
119  // Add special characters, as they were removed by the clear, but the
120  // default constructor puts them in.
121  unicharset_.AppendOtherUnicharset(initialized);
122  }
123  charsetsize_ = unicharset_.size();
124  delete[] fragments_;
125  fragments_ = new int[charsetsize_];
126  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
127  samples_.LoadUnicharset(filename);
128  junk_samples_.LoadUnicharset(filename);
129  verify_samples_.LoadUnicharset(filename);
130 }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:454
void LoadUnicharset(const char *filename)

◆ LoadXHeights()

bool tesseract::MasterTrainer::LoadXHeights ( const char *  filename)

Definition at line 412 of file mastertrainer.cpp.

412  {
413  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
414  xheights_.clear();
415  xheights_.resize(fontinfo_table_.size(), -1);
416  if (filename == nullptr) {
417  return true;
418  }
419  FILE *f = fopen(filename, "rb");
420  if (f == nullptr) {
421  fprintf(stderr, "Failed to load font xheights from %s\n", filename);
422  return false;
423  }
424  tprintf("Reading x-heights from %s ...\n", filename);
425  FontInfo fontinfo;
426  fontinfo.properties = 0; // Not used to lookup in the table.
427  fontinfo.universal_id = 0;
428  char buffer[1024];
429  int xht;
430  int total_xheight = 0;
431  int xheight_count = 0;
432  while (!feof(f)) {
433  if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2) {
434  continue;
435  }
436  buffer[1023] = '\0';
437  fontinfo.name = buffer;
438  auto fontinfo_id = fontinfo_table_.get_index(fontinfo);
439  if (fontinfo_id < 0) {
440  // fontinfo not in table.
441  continue;
442  }
443  xheights_[fontinfo_id] = xht;
444  total_xheight += xht;
445  ++xheight_count;
446  }
447  if (xheight_count == 0) {
448  fprintf(stderr, "No valid xheights in %s!\n", filename);
449  fclose(f);
450  return false;
451  }
452  int mean_xheight = DivRounded(total_xheight, xheight_count);
453  for (int i = 0; i < fontinfo_table_.size(); ++i) {
454  if (xheights_[i] < 0) {
455  xheights_[i] = mean_xheight;
456  }
457  }
458  fclose(f);
459  return true;
460 } // LoadXHeights
int DivRounded(int a, int b)
Definition: helpers.h:167

◆ master_shapes()

const ShapeTable& tesseract::MasterTrainer::master_shapes ( ) const
inline

Definition at line 184 of file mastertrainer.h.

184  {
185  return master_shapes_;
186  }

◆ PostLoadCleanup()

void tesseract::MasterTrainer::PostLoadCleanup ( )

Definition at line 248 of file mastertrainer.cpp.

248  {
249  if (debug_level_ > 0) {
250  tprintf("PostLoadCleanup...\n");
251  }
252  if (enable_shape_analysis_) {
253  ReplaceFragmentedSamples();
254  }
255  SampleIterator sample_it;
256  sample_it.Init(nullptr, nullptr, true, &verify_samples_);
257  sample_it.NormalizeSamples();
258  verify_samples_.OrganizeByFontAndClass();
259 
260  samples_.IndexFeatures(feature_space_);
261  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
262  // against current training.
263  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
264  samples_.OrganizeByFontAndClass();
265  if (debug_level_ > 0) {
266  tprintf("ComputeCanonicalSamples...\n");
267  }
268  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
269 }
void IndexFeatures(const IntFeatureSpace &feature_space)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)

◆ PreTrainingSetup()

void tesseract::MasterTrainer::PreTrainingSetup ( )

Definition at line 274 of file mastertrainer.cpp.

274  {
275  if (debug_level_ > 0) {
276  tprintf("PreTrainingSetup...\n");
277  }
278  samples_.IndexFeatures(feature_space_);
279  samples_.ComputeCanonicalFeatures();
280  if (debug_level_ > 0) {
281  tprintf("ComputeCloudFeatures...\n");
282  }
283  samples_.ComputeCloudFeatures(feature_space_.Size());
284 }
void ComputeCloudFeatures(int feature_space_size)

◆ ReadTrainingSamples()

void tesseract::MasterTrainer::ReadTrainingSamples ( const char *  page_name,
const FEATURE_DEFS_STRUCT &  feature_defs,
bool  verification 
)

Definition at line 136 of file mastertrainer.cpp.

138  {
139  char buffer[2048];
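140  const int int_feature_type =
141  ShortNameToFeatureType(feature_defs, kIntFeatureType);
142  const int micro_feature_type =
143  ShortNameToFeatureType(feature_defs, kMicroFeatureType);
144  const int cn_feature_type =
145  ShortNameToFeatureType(feature_defs, kCNFeatureType);
146  const int geo_feature_type =
147  ShortNameToFeatureType(feature_defs, kGeoFeatureType);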
148 
149  FILE *fp = fopen(page_name, "rb");
150  if (fp == nullptr) {
151  tprintf("Failed to open tr file: %s\n", page_name);
152  return;
153  }
154  tr_filenames_.emplace_back(page_name);
155  while (fgets(buffer, sizeof(buffer), fp) != nullptr) {
156  if (buffer[0] == '\n') {
157  continue;
158  }
159 
160  char *space = strchr(buffer, ' ');
161  if (space == nullptr) {
162  tprintf("Bad format in tr file, reading fontname, unichar\n");
163  continue;
164  }
165  *space++ = '\0';
166  int font_id = GetFontInfoId(buffer);
167  if (font_id < 0) {
168  font_id = 0;
169  }
170  int page_number;
171  std::string unichar;
172  TBOX bounding_box;
173  if (!ParseBoxFileStr(space, &page_number, unichar, &bounding_box)) {
174  tprintf("Bad format in tr file, reading box coords\n");
175  continue;
176  }
177  auto char_desc = ReadCharDescription(feature_defs, fp);
178  auto *sample = new TrainingSample;
179  sample->set_font_id(font_id);
180  sample->set_page_num(page_number + page_images_.size());
181  sample->set_bounding_box(bounding_box);
182  sample->ExtractCharDesc(int_feature_type, micro_feature_type,
183  cn_feature_type, geo_feature_type, char_desc);
184  AddSample(verification, unichar.c_str(), sample);
185  delete char_desc;
186  }
187  charsetsize_ = unicharset_.size();
188  fclose(fp);
189 }
@ TBOX
const char *const kCNFeatureType
Definition: featdefs.cpp:34
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:203
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:198
CHAR_DESC_STRUCT * ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:172
const char *const kGeoFeatureType
Definition: featdefs.cpp:36
const char *const kIntFeatureType
Definition: featdefs.cpp:35
FEATURE_DEFS_STRUCT feature_defs
const char *const kMicroFeatureType
Definition: featdefs.cpp:33
int GetFontInfoId(const char *font_name)
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)

◆ ReplicateAndRandomizeSamplesIfRequired()

void tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired ( )

Definition at line 366 of file mastertrainer.cpp.

366  {
367  if (enable_replication_) {
368  if (debug_level_ > 0) {
369  tprintf("ReplicateAndRandomize...\n");
370  }
371  verify_samples_.ReplicateAndRandomizeSamples();
372  samples_.ReplicateAndRandomizeSamples();
373  samples_.IndexFeatures(feature_space_);
374  }
375 }

◆ Serialize()

bool tesseract::MasterTrainer::Serialize ( FILE *  fp) const

Definition at line 75 of file mastertrainer.cpp.

75  {
76  uint32_t value = norm_mode_;
77  if (!tesseract::Serialize(fp, &value)) {
78  return false;
79  }
80  if (!unicharset_.save_to_file(fp)) {
81  return false;
82  }
83  if (!feature_space_.Serialize(fp)) {
84  return false;
85  }
86  if (!samples_.Serialize(fp)) {
87  return false;
88  }
89  if (!junk_samples_.Serialize(fp)) {
90  return false;
91  }
92  if (!verify_samples_.Serialize(fp)) {
93  return false;
94  }
95  if (!master_shapes_.Serialize(fp)) {
96  return false;
97  }
98  if (!flat_shapes_.Serialize(fp)) {
99  return false;
100  }
101  if (!fontinfo_table_.Serialize(fp)) {
102  return false;
103  }
104  if (!tesseract::Serialize(fp, xheights_)) {
105  return false;
106  }
107  return true;
108 }
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
TESS_API bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:55
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
bool Serialize(FILE *fp) const
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250

◆ SetFeatureSpace()

void tesseract::MasterTrainer::SetFeatureSpace ( const IntFeatureSpace &  fs)
inline

Definition at line 79 of file mastertrainer.h.

79  {
80  feature_space_ = fs;
81  feature_map_.Init(fs);
82  }
void Init(const IntFeatureSpace &feature_space)

◆ SetupFlatShapeTable()

void tesseract::MasterTrainer::SetupFlatShapeTable ( ShapeTable *  shape_table)

Definition at line 552 of file mastertrainer.cpp.

552  {
553  // To exactly mimic the results of the previous implementation, the shapes
554  // must be clustered in order the fonts arrived, and reverse order of the
555  // characters within each font.
556  // Get a list of the fonts in the order they appeared.
557  std::vector<int> active_fonts;
558  int num_shapes = flat_shapes_.NumShapes();
559  for (int s = 0; s < num_shapes; ++s) {
560  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
561  unsigned f = 0;
562  for (f = 0; f < active_fonts.size(); ++f) {
563  if (active_fonts[f] == font) {
564  break;
565  }
566  }
567  if (f == active_fonts.size()) {
568  active_fonts.push_back(font);
569  }
570  }
571  // For each font in order, add all the shapes with that font in reverse order.
572  int num_fonts = active_fonts.size();
573  for (int f = 0; f < num_fonts; ++f) {
574  for (int s = num_shapes - 1; s >= 0; --s) {
575  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
576  if (font == active_fonts[f]) {
577  shape_table->AddShape(flat_shapes_.GetShape(s));
578  }
579  }
580  }
581 }
int size() const
Definition: shapetable.h:169
void AddShape(const Shape &other)
Definition: shapetable.cpp:123
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292
unsigned NumShapes() const
Definition: shapetable.h:248

◆ SetupForClustering()

CLUSTERER * tesseract::MasterTrainer::SetupForClustering ( const ShapeTable &  shape_table,
const FEATURE_DEFS_STRUCT &  feature_defs,
int  shape_id,
int *  num_samples 
)

Definition at line 585 of file mastertrainer.cpp.

587  {
588  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
589  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
590  ASSERT_HOST(num_params == (int)MicroFeatureParameter::MFCount);
591  CLUSTERER *clusterer = MakeClusterer(
592  num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
593 
594  // We want to iterate over the samples of just the one shape.
595  IndexMapBiDi shape_map;
596  shape_map.Init(shape_table.NumShapes(), false);
597  shape_map.SetMap(shape_id, true);
598  shape_map.Setup();
599  // Reverse the order of the samples to match the previous behavior.
600  std::vector<const TrainingSample *> sample_ptrs;
601  SampleIterator it;
602  it.Init(&shape_map, &shape_table, false, &samples_);
603  for (it.Begin(); !it.AtEnd(); it.Next()) {
604  sample_ptrs.push_back(&it.GetSample());
605  }
606  uint32_t sample_id = 0;
607  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
608  const TrainingSample *sample = sample_ptrs[i];
609  uint32_t num_features = sample->num_micro_features();
610  for (uint32_t f = 0; f < num_features; ++f) {
611  MakeSample(clusterer, sample->micro_features()[f].data(), sample_id);
612  }
613  ++sample_id;
614  }
615  *num_samples = sample_id;
616  return clusterer;
617 }
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:1441
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
Definition: cluster.cpp:1492
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:43
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54

◆ SetupMasterShapes()

void tesseract::MasterTrainer::SetupMasterShapes ( )

Definition at line 288 of file mastertrainer.cpp.

288  {
289  tprintf("Building master shape table\n");
290  const int num_fonts = samples_.NumFonts();
291 
292  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
293  ShapeTable char_shapes_end_fragment(samples_.unicharset());
294  ShapeTable char_shapes(samples_.unicharset());
295  for (int c = 0; c < samples_.charsetsize(); ++c) {
296  ShapeTable shapes(samples_.unicharset());
297  for (int f = 0; f < num_fonts; ++f) {
298  if (samples_.NumClassSamples(f, c, true) > 0) {
299  shapes.AddShape(c, f);
300  }
301  }
302  ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
303 
304  const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
305 
306  if (fragment == nullptr) {
307  char_shapes.AppendMasterShapes(shapes, nullptr);
308  } else if (fragment->is_beginning()) {
309  char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr);
310  } else if (fragment->is_ending()) {
311  char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr);
312  } else {
313  char_shapes.AppendMasterShapes(shapes, nullptr);
314  }
315  }
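316  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,
317  &char_shapes_begin_fragment);
318  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr);
319  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,
320  &char_shapes_end_fragment);
321  char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr);
322  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,
323  &char_shapes);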
324  master_shapes_.AppendMasterShapes(char_shapes, nullptr);
325  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().c_str());
326 }
const float kFontMergeDistance
const int kMinClusteredShapes
const int kMaxUnicharsPerCluster
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769
std::string SummaryStr() const
Definition: shapetable.cpp:325
void AppendMasterShapes(const ShapeTable &other, std::vector< int > *shape_map)
Definition: shapetable.cpp:683

◆ ShapeDistance()

float tesseract::MasterTrainer::ShapeDistance ( const ShapeTable &  shapes,
int  s1,
int  s2 
)

Definition at line 869 of file mastertrainer.cpp.

869  {
870  const IntFeatureMap &feature_map = feature_map_;
871  const Shape &shape1 = shapes.GetShape(s1);
872  const Shape &shape2 = shapes.GetShape(s2);
873  int num_chars1 = shape1.size();
874  int num_chars2 = shape2.size();
875  float dist_sum = 0.0f;
876  int dist_count = 0;
877  if (num_chars1 > 1 || num_chars2 > 1) {
878  // In the multi-char case try to optimize the calculation by computing
879  // distances between characters of matching font where possible.
880  for (int c1 = 0; c1 < num_chars1; ++c1) {
881  for (int c2 = 0; c2 < num_chars2; ++c2) {
882  dist_sum +=
883  samples_.UnicharDistance(shape1[c1], shape2[c2], true, feature_map);
884  ++dist_count;
885  }
886  }
887  } else {
888  // In the single unichar case, there is little alternative, but to compute
889  // the squared-order distance between pairs of fonts.
890  dist_sum =
891  samples_.UnicharDistance(shape1[0], shape2[0], false, feature_map);
892  ++dist_count;
893  }
894  return dist_sum / dist_count;
895 }
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)

◆ TestClassifier()

double tesseract::MasterTrainer::TestClassifier ( CountTypes  error_mode,
int  report_level,
bool  replicate_samples,
TrainingSampleSet * samples,
ShapeClassifier * test_classifier,
std::string *  report_string 
)

Definition at line 843 of file mastertrainer.cpp.

847  {
848  SampleIterator sample_it;
849  sample_it.Init(nullptr, nullptr, replicate_samples, samples);
850  if (report_level > 0) {
851  int num_samples = 0;
852  for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next()) {
853  ++num_samples;
854  }
855  tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
856  sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
857  test_classifier->GetShapeTable()->NumShapes(), num_samples);
858  tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
859  }
860  double unichar_error = 0.0;
861  ErrorCounter::ComputeErrorRate(test_classifier, report_level, error_mode,
862  fontinfo_table_, page_images_, &sample_it,
863  &unichar_error, nullptr, report_string);
864  return unichar_error;
865 }
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, std::string *fonts_report)

◆ TestClassifierOnSamples()

void tesseract::MasterTrainer::TestClassifierOnSamples ( CountTypes  error_mode,
int  report_level,
bool  replicate_samples,
ShapeClassifier * test_classifier,
std::string *  report_string 
)

Definition at line 821 of file mastertrainer.cpp.

825  {
826  TestClassifier(error_mode, report_level, replicate_samples, &samples_,
827  test_classifier, report_string);
828 }
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, std::string *report_string)

◆ TestClassifierVOld()

void tesseract::MasterTrainer::TestClassifierVOld ( bool  replicate_samples,
ShapeClassifier * test_classifier,
ShapeClassifier * old_classifier 
)

Definition at line 809 of file mastertrainer.cpp.

811  {
812  SampleIterator sample_it;
813  sample_it.Init(nullptr, nullptr, replicate_samples, &samples_);
814  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
815  CT_UNICHAR_TOPN_ERR, fontinfo_table_,
816  page_images_, &sample_it);
817 }
@ CT_UNICHAR_TOPN_ERR
Definition: errorcounter.h:76
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it)

◆ unicharset()

const UNICHARSET& tesseract::MasterTrainer::unicharset ( ) const
inline

Definition at line 178 of file mastertrainer.h.

178  {
179  return samples_.unicharset();
180  }

◆ WriteInttempAndPFFMTable()

void tesseract::MasterTrainer::WriteInttempAndPFFMTable ( const UNICHARSET & unicharset,
const UNICHARSET & shape_set,
const ShapeTable & shape_table,
CLASS_STRUCT * float_classes,
const char *  inttemp_file,
const char *  pffmtable_file 
)

Definition at line 623 of file mastertrainer.cpp.

628  {
629  auto *classify = new tesseract::Classify();
630  // Move the fontinfo table to classify.
631  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
632  INT_TEMPLATES_STRUCT *int_templates =
633  classify->CreateIntTemplates(float_classes, shape_set);
634  FILE *fp = fopen(inttemp_file, "wb");
635  if (fp == nullptr) {
636  tprintf("Error, failed to open file \"%s\"\n", inttemp_file);
637  } else {
638  classify->WriteIntTemplates(fp, int_templates, shape_set);
639  fclose(fp);
640  }
641  // Now write pffmtable. This is complicated by the fact that the adaptive
642  // classifier still wants one indexed by unichar-id, but the static
643  // classifier needs one indexed by its shape class id.
644  // We put the shapetable_cutoffs in a vector, and compute the
645  // unicharset cutoffs along the way.
646  std::vector<uint16_t> shapetable_cutoffs;
647  std::vector<uint16_t> unichar_cutoffs(unicharset.size());
648  /* then write out each class */
649  for (int i = 0; i < int_templates->NumClasses; ++i) {
650  INT_CLASS_STRUCT *Class = ClassForClassId(int_templates, i);
651  // Todo: Test with min instead of max
652  // int MaxLength = LengthForConfigId(Class, 0);
653  uint16_t max_length = 0;
654  for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
655  // Todo: Test with min instead of max
656  // if (LengthForConfigId (Class, config_id) < MaxLength)
657  uint16_t length = Class->ConfigLengths[config_id];
658  if (length > max_length) {
659  max_length = Class->ConfigLengths[config_id];
660  }
661  int shape_id = float_classes[i].font_set.at(config_id);
662  const Shape &shape = shape_table.GetShape(shape_id);
663  for (int c = 0; c < shape.size(); ++c) {
664  int unichar_id = shape[c].unichar_id;
665  if (length > unichar_cutoffs[unichar_id]) {
666  unichar_cutoffs[unichar_id] = length;
667  }
668  }
669  }
670  shapetable_cutoffs.push_back(max_length);
671  }
672  fp = fopen(pffmtable_file, "wb");
673  if (fp == nullptr) {
674  tprintf("Error, failed to open file \"%s\"\n", pffmtable_file);
675  } else {
676  tesseract::Serialize(fp, shapetable_cutoffs);
677  for (int c = 0; c < unicharset.size(); ++c) {
678  const char *unichar = unicharset.id_to_unichar(c);
679  if (strcmp(unichar, " ") == 0) {
680  unichar = "NULL";
681  }
682  fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
683  }
684  fclose(fp);
685  }
686  delete int_templates;
687  delete classify;
688 }
#define ClassForClassId(T, c)
Definition: intproto.h:156
TESS_API void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:116
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
const UNICHARSET & unicharset() const

The documentation for this class was generated from the following files: