tesseract  5.0.0
combine_tessdata.cpp
Go to the documentation of this file.
1 // File: combine_tessdata.cpp
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "commontraining.h" // CheckSharedLibraryVersion
21 #include "lstmrecognizer.h"
22 #include "tessdatamanager.h"
23 
24 #include <cerrno>
25 #include <iostream> // std::cout
26 
27 using namespace tesseract;
28 
29 static int list_components(TessdataManager &tm, const char *filename) {
30  // Initialize TessdataManager with the data in the given traineddata file.
31  if (filename != nullptr && !tm.Init(filename)) {
32  tprintf("Failed to read %s\n", filename);
33  return EXIT_FAILURE;
34  }
35  tm.Directory();
36  return EXIT_SUCCESS;
37 }
38 
39 static int list_network(TessdataManager &tm, const char *filename) {
40  if (filename != nullptr && !tm.Init(filename)) {
41  tprintf("Failed to read %s\n", filename);
42  return EXIT_FAILURE;
43  }
46  tesseract::LSTMRecognizer recognizer;
47  if (!recognizer.DeSerialize(&tm, &fp)) {
48  tprintf("Failed to deserialize LSTM in %s!\n", filename);
49  return EXIT_FAILURE;
50  }
51  std::cout << "LSTM: network=" << recognizer.GetNetwork()
52  << ", int_mode=" << recognizer.IsIntMode()
53  << ", recoding=" << recognizer.IsRecoding()
54  << ", iteration=" << recognizer.training_iteration()
55  << ", sample_iteration=" << recognizer.sample_iteration()
56  << ", null_char=" << recognizer.null_char()
57  << ", learning_rate=" << recognizer.learning_rate()
58  << ", momentum=" << recognizer.GetMomentum()
59  << ", adam_beta=" << recognizer.GetAdamBeta() << '\n';
60 
61  std::cout << "Layer Learning Rates: ";
62  auto layers = recognizer.EnumerateLayers();
63  for (const auto &id : layers) {
64  auto layer = recognizer.GetLayer(id);
65  std::cout << id << "(" << layer->name() << ")"
66  << "=" << recognizer.GetLayerLearningRate(id)
67  << (layers[layers.size() - 1] != id ? ", " : "");
68  }
69  std::cout << "\n";
70  }
71  return EXIT_SUCCESS;
72 }
73 
74 // Main program to combine/extract/overwrite tessdata components
75 // in [lang].traineddata files.
76 //
77 // To combine all the individual tessdata components (unicharset, DAWGs,
78 // classifier templates, ambiguities, language configs) located at, say,
79 // /home/$USER/temp/eng.* run:
80 //
81 // combine_tessdata /home/$USER/temp/eng.
82 //
83 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
84 //
85 // Specify option -e if you would like to extract individual components
86 // from a combined traineddata file. For example, to extract language config
87 // file and the unicharset from tessdata/eng.traineddata run:
88 //
89 // combine_tessdata -e tessdata/eng.traineddata
90 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
91 //
92 // The desired config file and unicharset will be written to
93 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
94 //
95 // Specify option -o to overwrite individual components of the given
96 // [lang].traineddata file. For example, to overwrite language config
97 // and unichar ambiguities files in tessdata/eng.traineddata use:
98 //
99 // combine_tessdata -o tessdata/eng.traineddata
100 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
101 //
102 // As a result, tessdata/eng.traineddata will contain the new language config
103 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
104 //
105 // Note: the file names of the files to extract to and to overwrite from should
106 // have the appropriate file suffixes (extensions) indicating their tessdata
107 // component type (.unicharset for the unicharset, .unicharambigs for unichar
108 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
109 //
110 // Specify option -u to unpack all the components to the specified path:
111 //
112 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
113 //
114 // This will create /home/$USER/temp/eng.* files with individual tessdata
115 // components from tessdata/eng.traineddata.
116 //
117 int main(int argc, char **argv) {
118  tesseract::CheckSharedLibraryVersion();
119 
120  int i;
122  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
123  printf("%s\n", tesseract::TessBaseAPI::Version());
124  return EXIT_SUCCESS;
125  } else if (argc == 2) {
126  printf("Combining tessdata files\n");
127  std::string lang = argv[1];
128  char *last = &argv[1][strlen(argv[1]) - 1];
129  if (*last != '.') {
130  lang += '.';
131  }
132  std::string output_file = lang;
133  output_file += kTrainedDataSuffix;
134  if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
135  printf("Error combining tessdata files into %s\n", output_file.c_str());
136  } else {
137  printf("Output %s created successfully.\n", output_file.c_str());
138  }
139  } else if (argc >= 4 &&
140  (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) {
141  // Initialize TessdataManager with the data in the given traineddata file.
142  if (!tm.Init(argv[2])) {
143  tprintf("Failed to read %s\n", argv[2]);
144  return EXIT_FAILURE;
145  }
146  printf("Extracting tessdata components from %s\n", argv[2]);
147  if (strcmp(argv[1], "-e") == 0) {
148  for (i = 3; i < argc; ++i) {
149  errno = 0;
150  if (tm.ExtractToFile(argv[i])) {
151  printf("Wrote %s\n", argv[i]);
152  } else if (errno == 0) {
153  printf(
154  "Not extracting %s, since this component"
155  " is not present\n",
156  argv[i]);
157  return EXIT_FAILURE;
158  } else {
159  printf("Error, could not extract %s: %s\n", argv[i], strerror(errno));
160  return EXIT_FAILURE;
161  }
162  }
163  } else { // extract all the components
164  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
165  std::string filename = argv[3];
166  char *last = &argv[3][strlen(argv[3]) - 1];
167  if (*last != '.') {
168  filename += '.';
169  }
170  filename += tesseract::kTessdataFileSuffixes[i];
171  errno = 0;
172  if (tm.ExtractToFile(filename.c_str())) {
173  printf("Wrote %s\n", filename.c_str());
174  } else if (errno != 0) {
175  printf("Error, could not extract %s: %s\n", filename.c_str(),
176  strerror(errno));
177  return EXIT_FAILURE;
178  }
179  }
180  }
181  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
182  // Rename the current traineddata file to a temporary name.
183  const char *new_traineddata_filename = argv[2];
184  std::string traineddata_filename = new_traineddata_filename;
185  traineddata_filename += ".__tmp__";
186  if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
187  tprintf("Failed to create a temporary file %s\n",
188  traineddata_filename.c_str());
189  return EXIT_FAILURE;
190  }
191 
192  // Initialize TessdataManager with the data in the given traineddata file.
193  tm.Init(traineddata_filename.c_str());
194 
195  // Write the updated traineddata file.
196  tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);
197  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
198  if (!tm.Init(argv[2])) {
199  tprintf("Failed to read %s\n", argv[2]);
200  return EXIT_FAILURE;
201  }
202  tesseract::TFile fp;
203  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
204  tprintf("No LSTM Component found in %s!\n", argv[2]);
205  return EXIT_FAILURE;
206  }
207  tesseract::LSTMRecognizer recognizer;
208  if (!recognizer.DeSerialize(&tm, &fp)) {
209  tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
210  return EXIT_FAILURE;
211  }
212  recognizer.ConvertToInt();
213  std::vector<char> lstm_data;
214  fp.OpenWrite(&lstm_data);
215  ASSERT_HOST(recognizer.Serialize(&tm, &fp));
216  tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
217  lstm_data.size());
218  if (!tm.SaveFile(argv[2], nullptr)) {
219  tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
220  return EXIT_FAILURE;
221  }
222  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
223  return list_components(tm, argv[2]);
224  } else if (argc == 3 && strcmp(argv[1], "-l") == 0) {
225  return list_network(tm, argv[2]);
226  } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) {
227  int result = list_components(tm, argv[2]);
228  if (result == EXIT_SUCCESS) {
229  result = list_network(tm, nullptr);
230  }
231  return result;
232  } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) {
233  int result = list_network(tm, argv[2]);
234  if (result == EXIT_SUCCESS) {
235  result = list_components(tm, nullptr);
236  }
237  return result;
238  } else {
239  printf(
240  "Usage for combining tessdata components:\n"
241  " %s language_data_path_prefix\n"
242  " (e.g. %s tessdata/eng.)\n\n",
243  argv[0], argv[0]);
244  printf(
245  "Usage for extracting tessdata components:\n"
246  " %s -e traineddata_file [output_component_file...]\n"
247  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
248  argv[0], argv[0]);
249  printf(
250  "Usage for overwriting tessdata components:\n"
251  " %s -o traineddata_file [input_component_file...]\n"
252  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
253  argv[0], argv[0]);
254  printf(
255  "Usage for unpacking all tessdata components:\n"
256  " %s -u traineddata_file output_path_prefix\n"
257  " (e.g. %s -u eng.traineddata tmp/eng.)\n\n",
258  argv[0], argv[0]);
259  printf(
260  "Usage for listing the network information\n"
261  " %s -l traineddata_file\n"
262  " (e.g. %s -l eng.traineddata)\n\n",
263  argv[0], argv[0]);
264  printf(
265  "Usage for listing directory of components:\n"
266  " %s -d traineddata_file\n\n",
267  argv[0]);
268  printf(
269  "Usage for compacting LSTM component to int:\n"
270  " %s -c traineddata_file\n",
271  argv[0]);
272  return 1;
273  }
274  tm.Directory();
275  return EXIT_SUCCESS;
276 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
int main(int argc, char **argv)
LIST last(LIST var_list)
Definition: oldlist.cpp:153
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_NUM_ENTRIES
static const char * Version()
Definition: baseapi.cpp:238
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
void OverwriteEntry(TessdataType type, const char *data, int size)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
bool Init(const char *data_file_name)
const char * GetNetwork() const
std::vector< std::string > EnumerateLayers() const
float GetLayerLearningRate(const std::string &id) const
bool Serialize(const TessdataManager *mgr, TFile *fp) const
Network * GetLayer(const std::string &id) const
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
const std::string & name() const
Definition: network.h:140