tesseract  5.0.0
mftraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: mftraining.cpp
3  ** Purpose: Separates training pages into files for each character.
4  ** Strips from files only the features and their parameters of
5  ** the feature type mf.
6  ** Author: Dan Johnson
7  ** Revisment: Christy Russon
8  **
9  ** (c) Copyright Hewlett-Packard Company, 1988.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  ******************************************************************************/
20 /*----------------------------------------------------------------------------
21  Include Files and Type Defines
22 ----------------------------------------------------------------------------*/
23 
24 #define _USE_MATH_DEFINES // for M_PI
25 #ifdef HAVE_CONFIG_H
26 # include "config_auto.h"
27 #endif
28 
29 #include <cmath> // for M_PI
30 #include <cstdio>
31 #include <cstring>
32 
33 #include "classify.h"
34 #include "cluster.h"
35 #include "clusttool.h"
36 #include "commontraining.h"
37 #include "featdefs.h"
38 #include "fontinfo.h"
39 #include "indexmapbidi.h"
40 #include "intproto.h"
41 #include "mastertrainer.h"
42 #include "mergenf.h"
43 #include "mf.h"
44 #include "ocrfeatures.h"
45 #include "oldlist.h"
46 #include "protos.h"
47 #include "shapetable.h"
48 #include "tprintf.h"
49 #include "unicity_table.h"
50 
51 using namespace tesseract;
52 
53 /*----------------------------------------------------------------------------
54  Public Code
55 -----------------------------------------------------------------------------*/
56 #ifndef GRAPHICS_DISABLED
57 static void DisplayProtoList(const char *ch, LIST protolist) {
58  auto window = std::make_unique<ScrollView>("Char samples", 50, 200, 520, 520, 260, 260, true);
59  LIST proto = protolist;
60  iterate(proto) {
61  auto *prototype = reinterpret_cast<PROTOTYPE *>(proto->first_node());
62  if (prototype->Significant) {
63  window->Pen(ScrollView::GREEN);
64  } else if (prototype->NumSamples == 0) {
65  window->Pen(ScrollView::BLUE);
66  } else if (prototype->Merged) {
67  window->Pen(ScrollView::MAGENTA);
68  } else {
69  window->Pen(ScrollView::RED);
70  }
71  float x = CenterX(prototype->Mean);
72  float y = CenterY(prototype->Mean);
73  double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
74  auto dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
75  auto dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
76  window->SetCursor((x - dx) * 256, (y - dy) * 256);
77  window->DrawTo((x + dx) * 256, (y + dy) * 256);
78  if (prototype->Significant) {
79  tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototype->NumSamples);
80  } else if (prototype->NumSamples > 0 && !prototype->Merged) {
81  tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototype->NumSamples);
82  }
83  }
84  window->Update();
85 }
86 #endif // !GRAPHICS_DISABLED
87 
88 // Helper to run clustering on a single config.
89 // Mostly copied from the old mftraining, but with renamed variables.
90 static LIST ClusterOneConfig(int shape_id, const char *class_label, LIST mf_classes,
91  const ShapeTable &shape_table, MasterTrainer *trainer) {
92  int num_samples;
93  CLUSTERER *clusterer =
94  trainer->SetupForClustering(shape_table, feature_defs, shape_id, &num_samples);
95  Config.MagicSamples = num_samples;
96  LIST proto_list = ClusterSamples(clusterer, &Config);
97  CleanUpUnusedData(proto_list);
98 
99  // Merge protos where reasonable to make more of them significant by
100  // representing almost all samples of the class/font.
101  MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
102 #ifndef GRAPHICS_DISABLED
103  if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) {
104  DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
105  }
106 #endif // !GRAPHICS_DISABLED
107  // Delete the protos that will not be used in the inttemp output file.
108  proto_list = RemoveInsignificantProtos(proto_list, true, false, clusterer->SampleSize);
109  FreeClusterer(clusterer);
110  MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
111  if (merge_class == nullptr) {
112  merge_class = new MERGE_CLASS_NODE(class_label);
113  mf_classes = push(mf_classes, merge_class);
114  }
115  int config_id = AddConfigToClass(merge_class->Class);
116  merge_class->Class->font_set.push_back(shape_id);
117  LIST proto_it = proto_list;
118  iterate(proto_it) {
119  auto *prototype = reinterpret_cast<PROTOTYPE *>(proto_it->first_node());
120  // See if proto can be approximated by existing proto.
121  int p_id = FindClosestExistingProto(merge_class->Class, merge_class->NumMerged, prototype);
122  if (p_id == NO_PROTO) {
123  // Need to make a new proto, as it doesn't match anything.
124  p_id = AddProtoToClass(merge_class->Class);
125  MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
126  merge_class->NumMerged[p_id] = 1;
127  } else {
128  PROTO_STRUCT dummy_proto;
129  MakeNewFromOld(&dummy_proto, prototype);
130  // Merge with the similar proto.
131  ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
132  static_cast<float>(merge_class->NumMerged[p_id]), 1.0,
133  ProtoIn(merge_class->Class, p_id));
134  merge_class->NumMerged[p_id]++;
135  }
136  AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
137  }
138  FreeProtoList(&proto_list);
139  return mf_classes;
140 }
141 
142 // Helper to setup the config map.
143 // Setup an index mapping from the shapes in the shape table to the classes
144 // that will be trained. In keeping with the original design, each shape
145 // with the same list of unichars becomes a different class and the configs
146 // represent the different combinations of fonts.
147 static void SetupConfigMap(ShapeTable *shape_table, IndexMapBiDi *config_map) {
148  int num_configs = shape_table->NumShapes();
149  config_map->Init(num_configs, true);
150  config_map->Setup();
151  for (int c1 = 0; c1 < num_configs; ++c1) {
152  // Only process ids that are not already merged.
153  if (config_map->SparseToCompact(c1) == c1) {
154  Shape *shape1 = shape_table->MutableShape(c1);
155  // Find all the subsequent shapes that are equal.
156  for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
157  if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
158  config_map->Merge(c1, c2);
159  }
160  }
161  }
162  }
163  config_map->CompleteMerges();
164 }
165 
193 int main(int argc, char **argv) {
194  tesseract::CheckSharedLibraryVersion();
195 
196  ParseArguments(&argc, &argv);
197 
198  ShapeTable *shape_table = nullptr;
199  std::string file_prefix;
200  // Load the training data.
201  auto trainer = tesseract::LoadTrainingData(argv + 1, false, &shape_table, file_prefix);
202  if (trainer == nullptr) {
203  return 1; // Failed.
204  }
205 
206  // Setup an index mapping from the shapes in the shape table to the classes
207  // that will be trained. In keeping with the original design, each shape
208  // with the same list of unichars becomes a different class and the configs
209  // represent the different combinations of fonts.
210  IndexMapBiDi config_map;
211  SetupConfigMap(shape_table, &config_map);
212 
213  WriteShapeTable(file_prefix, *shape_table);
214  // If the shape_table is flat, then either we didn't run shape clustering, or
215  // it did nothing, so we just output the trainer's unicharset.
216  // Otherwise shape_set will hold a fake unicharset with an entry for each
217  // shape in the shape table, and we will output that instead.
218  UNICHARSET shape_set;
219  const UNICHARSET *unicharset = &trainer->unicharset();
220  // If we ran shapeclustering (and it worked) then at least one shape will
221  // have multiple unichars, so we have to build a fake unicharset.
222  if (shape_table->AnyMultipleUnichars()) {
223  unicharset = &shape_set;
224  // Now build a fake unicharset for the compact shape space to keep the
225  // output modules happy that we are doing things correctly.
226  int num_shapes = config_map.CompactSize();
227  for (int s = 0; s < num_shapes; ++s) {
228  char shape_label[14];
229  snprintf(shape_label, sizeof(shape_label), "sh%04d", s);
230  shape_set.unichar_insert(shape_label);
231  }
232  }
233 
234  // Now train each config separately.
235  int num_configs = shape_table->NumShapes();
236  LIST mf_classes = NIL_LIST;
237  for (int s = 0; s < num_configs; ++s) {
238  int unichar_id, font_id;
239  if (unicharset == &shape_set) {
240  // Using fake unichar_ids from the config_map/shape_set.
241  unichar_id = config_map.SparseToCompact(s);
242  } else {
243  // Get the real unichar_id from the shape table/unicharset.
244  shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
245  }
246  const char *class_label = unicharset->id_to_unichar(unichar_id);
247  mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer.get());
248  }
249  std::string inttemp_file = file_prefix;
250  inttemp_file += "inttemp";
251  std::string pffmtable_file = file_prefix;
252  pffmtable_file += "pffmtable";
253  CLASS_STRUCT *float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
254  // Now write the inttemp and pffmtable.
255  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes,
256  inttemp_file.c_str(), pffmtable_file.c_str());
257  for (int c = 0; c < unicharset->size(); ++c) {
258  FreeClassFields(&float_classes[c]);
259  }
260  delete[] float_classes;
261  FreeLabeledClassList(mf_classes);
262  delete shape_table;
263  printf("Done!\n");
264  if (!FLAGS_test_ch.empty()) {
265  // If we are displaying debug window(s), wait for the user to look at them.
266  printf("Hit return to exit...\n");
267  while (getchar() != '\n') {
268  ;
269  }
270  }
271  return 0;
272 } /* main */
#define AddProtoToConfig(Pid, Config)
Definition: protos.h:61
#define ProtoIn(Class, Pid)
Definition: protos.h:70
#define iterate(l)
Definition: oldlist.h:91
#define NIL_LIST
Definition: oldlist.h:75
#define NO_PROTO
Definition: matchdefs.h:41
void ComputeMergedProto(PROTO_STRUCT *p1, PROTO_STRUCT *p2, float w1, float w2, PROTO_STRUCT *MergedProto)
Definition: mergenf.cpp:130
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype)
Definition: mergenf.cpp:158
void MakeNewFromOld(PROTO_STRUCT *New, PROTOTYPE *Old)
Definition: mergenf.cpp:194
#define CenterX(M)
Definition: mergenf.h:48
#define CenterY(M)
Definition: mergenf.h:49
#define LengthOf(M)
Definition: mergenf.h:50
#define OrientationOf(M)
Definition: mergenf.h:51
int main(int argc, char **argv)
Definition: mftraining.cpp:193
MERGE_CLASS FindClass(LIST List, const std::string &Label)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void ParseArguments(int *argc, char ***argv)
int AddConfigToClass(CLASS_TYPE Class)
Definition: protos.cpp:49
int AddProtoToClass(CLASS_TYPE Class)
Definition: protos.cpp:82
CLUSTERCONFIG Config
void FreeClassFields(CLASS_TYPE Class)
Definition: protos.cpp:131
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1598
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
void CleanUpUnusedData(LIST ProtoList)
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:1576
LIST push(LIST list, void *element)
Definition: oldlist.cpp:178
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:1544
void FreeLabeledClassList(LIST ClassList)
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:73
int CompactSize() const
Definition: indexmapbidi.h:63
void Init(int size, bool all_mapped)
bool Merge(int compact_index1, int compact_index2)
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:140
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
size_t size() const
Definition: unicharset.h:355
int16_t SampleSize
Definition: cluster.h:92
std::vector< BIT_VECTOR > Configurations
Definition: protos.h:46
UnicityTable< int > font_set
Definition: protos.h:47
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:222
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:458
unsigned NumShapes() const
Definition: shapetable.h:248
Shape * MutableShape(unsigned shape_id)
Definition: shapetable.h:295
void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:420
list_rec * first_node()
Definition: oldlist.h:107
tesseract::CLASS_TYPE Class
int NumMerged[MAX_NUM_PROTOS]
const UNICHARSET & unicharset() const
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)