tesseract  5.0.0
commontraining.h
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15 #define TESSERACT_TRAINING_COMMONTRAINING_H_
16 
17 #ifdef HAVE_CONFIG_H
18 # include "config_auto.h"
19 #endif
20 
21 #include "commandlineflags.h"
22 #include "export.h"
23 #include "tprintf.h"
24 
25 #include <tesseract/baseapi.h>
26 
27 #include <memory>
28 
29 namespace tesseract {
30 
31 TESS_COMMON_TRAINING_API
32 void ParseArguments(int *argc, char ***argv);
33 
34 // Check whether the shared tesseract library is the right one.
35 // This function must be inline because otherwise it would be part of
36 // the shared library, so it could not compare the versions.
// On the failure path the mismatch is reported through tprintf and the
// process is terminated with exit(1). The body is compiled only when
// HAVE_CONFIG_H is defined; without it the function is empty.
// NOTE(review): this listing skips original lines 39 and 43 (presumably the
// version comparison that opens the block closed on line 45, and the two
// arguments for the %s placeholders) -- confirm against the real header.
37 static inline void CheckSharedLibraryVersion() {
38 #ifdef HAVE_CONFIG_H
40  tprintf(
41  "ERROR: shared library version mismatch (was %s, expected %s)\n"
42  "Did you use a wrong shared tesseract library?\n",
44  exit(1);
45  }
46 #endif
47 }
48 
49 } // namespace tesseract
50 
51 #ifndef DISABLED_LEGACY_ENGINE
52 
53 # include "cluster.h"
54 # include "featdefs.h"
55 # include "intproto.h"
56 # include "oldlist.h"
57 
58 namespace tesseract {
59 
60 class Classify;
61 class MasterTrainer;
62 class ShapeTable;
63 
65 // Globals ///////////////////////////////////////////////////////////////////
67 
68 TESS_COMMON_TRAINING_API
69 extern FEATURE_DEFS_STRUCT feature_defs;
70 
71 // Must be defined in the file that "implements" commonTraining facilities.
72 TESS_COMMON_TRAINING_API
73 extern CLUSTERCONFIG Config;
74 
76 // Structs ///////////////////////////////////////////////////////////////////
// NOTE(review): the line that opens this struct is not part of this listing;
// the constructor name suggests the type is LABELEDLISTNODE -- confirm.
// Constructor: initializes Label with a copy of the given C string.
82  LABELEDLISTNODE(const char *label) : Label(label) {
83  }
84  std::string Label;
// Default-initialized to 0.
85  int SampleCount = 0;
// Default-initialized to nullptr.
87  LIST List = nullptr;
88 };
90 
// NOTE(review): only the tail of this struct is present in the listing (the
// opening lines and some members were dropped). It is presumably
// MERGE_CLASS_NODE -- confirm against the real header.
93  }
94  std::string Label;
97 };
99 
101 // Functions /////////////////////////////////////////////////////////////////
103 
104 // Helper loads shape table from the given file.
105 ShapeTable *LoadShapeTable(const std::string &file_prefix);
106 // Helper to write the shape_table.
107 TESS_COMMON_TRAINING_API
108 void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table);
109 
110 // Creates a MasterTrainer and loads the training data into it:
111 // Initializes feature_defs and IntegerFX.
112 // Loads the shape_table if shape_table != nullptr.
113 // Loads initial unicharset from -U command-line option.
114 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
115 // Loads font info from -F option.
116 // Loads xheights from -X option.
117 // Loads samples from .tr files in remaining command-line args.
118 // Deletes outliers and computes canonical samples.
119 // If FLAGS_output_trainer is set, saves the trainer for future use.
120 // Computes canonical and cloud features.
121 // If shape_table is not nullptr, but failed to load, make a fake flat one,
122 // as shape clustering was not run.
123 TESS_COMMON_TRAINING_API
124 std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
125  ShapeTable **shape_table, std::string &file_prefix);
126 
127 LABELEDLIST FindList(tesseract::LIST List, const std::string &Label);
128 
129 TESS_COMMON_TRAINING_API
131  const char *feature_name, int max_samples,
132  tesseract::UNICHARSET *unicharset, FILE *file,
133  tesseract::LIST *training_samples);
134 
135 void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory,
136  tesseract::LIST CharList, const char *program_feature_type);
137 
138 TESS_COMMON_TRAINING_API
139 void FreeTrainingSamples(tesseract::LIST CharList);
140 
141 TESS_COMMON_TRAINING_API
142 void FreeLabeledList(LABELEDLIST LabeledList);
143 
144 TESS_COMMON_TRAINING_API
145 void FreeLabeledClassList(tesseract::LIST ClassListList);
146 
147 TESS_COMMON_TRAINING_API
149  LABELEDLIST CharSample, const char *program_feature_type);
150 
151 TESS_COMMON_TRAINING_API
152 tesseract::LIST RemoveInsignificantProtos(tesseract::LIST ProtoList, bool KeepSigProtos,
153  bool KeepInsigProtos, int N);
154 
155 TESS_COMMON_TRAINING_API
156 void CleanUpUnusedData(tesseract::LIST ProtoList);
157 
158 TESS_COMMON_TRAINING_API
159 void MergeInsignificantProtos(tesseract::LIST ProtoList, const char *label,
161 
162 TESS_COMMON_TRAINING_API
163 MERGE_CLASS FindClass(tesseract::LIST List, const std::string &Label);
164 
165 TESS_COMMON_TRAINING_API
167  tesseract::LIST LabeledClassList);
168 
169 void Normalize(float *Values);
170 
171 TESS_COMMON_TRAINING_API
172 void FreeNormProtoList(tesseract::LIST CharList);
173 
174 TESS_COMMON_TRAINING_API
175 void AddToNormProtosList(tesseract::LIST *NormProtoList, tesseract::LIST ProtoList, const std::string &CharName);
176 
177 TESS_COMMON_TRAINING_API
178 int NumberOfProtos(tesseract::LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);
179 
181 
182 } // namespace tesseract
183 
184 #endif // ndef DISABLED_LEGACY_ENGINE
185 
186 #endif // TESSERACT_TRAINING_COMMONTRAINING_H_
#define TESSERACT_VERSION_STR
Definition: version.h:32
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, tesseract::LIST CharList, const char *program_feature_type)
MERGE_CLASS FindClass(LIST List, const std::string &Label)
void Normalize(float *Values)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
void FreeLabeledList(LABELEDLIST LabeledList)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
ShapeTable * LoadShapeTable(const std::string &file_prefix)
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
void FreeTrainingSamples(LIST CharList)
void CleanUpUnusedData(LIST ProtoList)
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:145
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void FreeLabeledClassList(LIST ClassList)
LABELEDLIST FindList(LIST List, const std::string &Label)
void allocNormProtos()
static const char * Version()
Definition: baseapi.cpp:238
LABELEDLISTNODE(const char *label)
MERGE_CLASS_NODE(const char *label)
tesseract::CLASS_TYPE Class
int NumMerged[MAX_NUM_PROTOS]