tesseract  5.0.0
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cntraining.cpp
3  ** Purpose: Generates a normproto and pffmtable.
4  ** Author: Dan Johnson
5  ** Revision: Christy Russon
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 /*----------------------------------------------------------------------------
20  Include Files and Type Defines
21 ----------------------------------------------------------------------------*/
22 #include <tesseract/unichar.h>
23 #include <cmath>
24 #include <cstdio>
25 #include <cstring>
26 #include "cluster.h"
27 #include "clusttool.h"
28 #include "commontraining.h"
29 #include "featdefs.h"
30 #include "ocrfeatures.h"
31 #include "oldlist.h"
32 
33 #define PROGRAM_FEATURE_TYPE "cn"
34 
35 using namespace tesseract;
36 
37 /*----------------------------------------------------------------------------
38  Private Function Prototypes
39 ----------------------------------------------------------------------------*/
40 
// Writes the "normproto" file (inside Directory when one is given) from the
// labeled proto list; defined further down in this file.
41 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
42  const FEATURE_DESC_STRUCT *feature_desc);
43 
// Writes the prototypes of ProtoList to File, filtered by their Significant
// flag according to the two boolean switches; defined further down in this file.
44 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
45  bool WriteInsigProtos);
46 
47 /*----------------------------------------------------------------------------
48  Global Data Definitions and Declarations
49 ----------------------------------------------------------------------------*/
50 /* Default clustering parameters for this tool; main() copies them into the global Config before the command line is parsed. */
51 // The numeric initializers correspond to the flags -M 0.025 -B 0.05 -I 0.8 -C 1e-3 (field order presumably follows CLUSTERCONFIG; confirm in cluster.h).
52 static const CLUSTERCONFIG CNConfig = {elliptical, 0.025, 0.05, 0.8, 1e-3, 0};
53 
54 /*----------------------------------------------------------------------------
55  Public Code
56 ----------------------------------------------------------------------------*/
57 
103 int main(int argc, char *argv[]) {
104  tesseract::CheckSharedLibraryVersion();
105 
106  // Set the global Config parameters before parsing the command line.
107  Config = CNConfig;
108 
109  LIST CharList = NIL_LIST;
110  CLUSTERER *Clusterer = nullptr;
111  LIST ProtoList = NIL_LIST;
112  LIST NormProtoList = NIL_LIST;
113  LIST pCharList;
114  LABELEDLIST CharSample;
115  FEATURE_DEFS_STRUCT FeatureDefs;
116  InitFeatureDefs(&FeatureDefs);
117 
118  ParseArguments(&argc, &argv);
119  int num_fonts = 0;
120  for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {
121  printf("Reading %s ...\n", PageName);
122  FILE *TrainingPage = fopen(PageName, "rb");
123  ASSERT_HOST(TrainingPage);
124  if (TrainingPage) {
125  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList);
126  fclose(TrainingPage);
127  ++num_fonts;
128  }
129  }
130  printf("Clustering ...\n");
131  // To allow an individual font to form a separate cluster,
132  // reduce the min samples:
133  // Config.MinSamples = 0.5 / num_fonts;
134  pCharList = CharList;
135  // The norm protos will count the source protos, so we keep them here in
136  // freeable_protos, so they can be freed later.
137  std::vector<LIST> freeable_protos;
138  iterate(pCharList) {
139  // Cluster
140  CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());
141  Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
142  if (Clusterer == nullptr) { // To avoid a SIGSEGV
143  fprintf(stderr, "Error: nullptr clusterer!\n");
144  return 1;
145  }
146  float SavedMinSamples = Config.MinSamples;
147  // To disable the tendency to produce a single cluster for all fonts,
148  // make MagicSamples an impossible to achieve number:
149  // Config.MagicSamples = CharSample->SampleCount * 10;
150  Config.MagicSamples = CharSample->SampleCount;
151  while (Config.MinSamples > 0.001) {
152  ProtoList = ClusterSamples(Clusterer, &Config);
153  if (NumberOfProtos(ProtoList, true, false) > 0) {
154  break;
155  } else {
156  Config.MinSamples *= 0.95;
157  printf(
158  "0 significant protos for %s."
159  " Retrying clustering with MinSamples = %f%%\n",
160  CharSample->Label.c_str(), Config.MinSamples);
161  }
162  }
163  Config.MinSamples = SavedMinSamples;
164  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
165  freeable_protos.push_back(ProtoList);
166  FreeClusterer(Clusterer);
167  }
168  FreeTrainingSamples(CharList);
169  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
170  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);
171  FreeNormProtoList(NormProtoList);
172  for (auto &freeable_proto : freeable_protos) {
173  FreeProtoList(&freeable_proto);
174  }
175  printf("\n");
176  return 0;
177 } // main
178 
179 /*----------------------------------------------------------------------------
180  Private Code
181 ----------------------------------------------------------------------------*/
182 
183 /*----------------------------------------------------------------------------*/
192 static void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
193  const FEATURE_DESC_STRUCT *feature_desc) {
194  FILE *File;
195  LABELEDLIST LabeledProto;
196  int N;
197 
198  std::string Filename = "";
199  if (Directory != nullptr && Directory[0] != '\0') {
200  Filename += Directory;
201  Filename += "/";
202  }
203  Filename += "normproto";
204  printf("\nWriting %s ...", Filename.c_str());
205  File = fopen(Filename.c_str(), "wb");
206  ASSERT_HOST(File);
207  fprintf(File, "%0d\n", feature_desc->NumParams);
208  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
209  iterate(LabeledProtoList) {
210  LabeledProto = reinterpret_cast<LABELEDLIST>(LabeledProtoList->first_node());
211  N = NumberOfProtos(LabeledProto->List, true, false);
212  if (N < 1) {
213  printf(
214  "\nError! Not enough protos for %s: %d protos"
215  " (%d significant protos"
216  ", %d insignificant protos)\n",
217  LabeledProto->Label.c_str(), N, NumberOfProtos(LabeledProto->List, true, false),
218  NumberOfProtos(LabeledProto->List, false, true));
219  exit(1);
220  }
221  fprintf(File, "\n%s %d\n", LabeledProto->Label.c_str(), N);
222  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
223  }
224  fclose(File);
225 
226 } // WriteNormProtos
227 
228 /*-------------------------------------------------------------------------*/
229 
230 static void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, bool WriteSigProtos,
231  bool WriteInsigProtos) {
232  PROTOTYPE *Proto;
233 
234  // write prototypes
235  iterate(ProtoList) {
236  Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
237  if ((Proto->Significant && WriteSigProtos) || (!Proto->Significant && WriteInsigProtos)) {
238  WritePrototype(File, N, Proto);
239  }
240  }
241 } // WriteProtos
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define iterate(l)
Definition: oldlist.h:91
#define NIL_LIST
Definition: oldlist.h:75
int main(int argc, char *argv[])
Definition: cntraining.cpp:103
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:33
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:203
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:244
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:87
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:271
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1598
void FreeTrainingSamples(LIST CharList)
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:1576
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
@ elliptical
Definition: cluster.h:53
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:1544
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:43
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54
list_rec * first_node()
Definition: oldlist.h:107