tesseract  5.0.0
tessdatamanager.h
Go to the documentation of this file.
1 // File: tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 
22 #include <tesseract/baseapi.h> // FileReader
23 #include <string> // std::string
24 #include <vector> // std::vector
25 #include "serialis.h" // FileWriter
26 
27 static const char kTrainedDataSuffix[] = "traineddata";
28 
29 // When adding new tessdata types and file suffixes, please make sure to
30 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
31 static const char kLangConfigFileSuffix[] = "config";
32 static const char kUnicharsetFileSuffix[] = "unicharset";
33 static const char kAmbigsFileSuffix[] = "unicharambigs";
34 static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
35 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
36 static const char kNormProtoFileSuffix[] = "normproto";
37 static const char kPuncDawgFileSuffix[] = "punc-dawg";
38 static const char kSystemDawgFileSuffix[] = "word-dawg";
39 static const char kNumberDawgFileSuffix[] = "number-dawg";
40 static const char kFreqDawgFileSuffix[] = "freq-dawg";
41 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs"; // deprecated
42 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset"; // deprecated
43 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg"; // deprecated
44 static const char kShapeTableFileSuffix[] = "shapetable";
45 static const char kBigramDawgFileSuffix[] = "bigram-dawg";
46 static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
47 static const char kParamsModelFileSuffix[] = "params-model";
48 static const char kLSTMModelFileSuffix[] = "lstm";
49 static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
50 static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
51 static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
52 static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
53 static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
54 static const char kVersionFileSuffix[] = "version";
55 
56 namespace tesseract {
57 
69  TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
70  TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
71  TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
83 
85 };
86 
91 static const char *const kTessdataFileSuffixes[] = { // Table of file suffixes; the trailing number on each row is its array index.
92  kLangConfigFileSuffix, // 0
93  kUnicharsetFileSuffix, // 1
94  kAmbigsFileSuffix, // 2
95  kBuiltInTemplatesFileSuffix, // 3
96  kBuiltInCutoffsFileSuffix, // 4
97  kNormProtoFileSuffix, // 5
98  kPuncDawgFileSuffix, // 6
99  kSystemDawgFileSuffix, // 7
100  kNumberDawgFileSuffix, // 8
101  kFreqDawgFileSuffix, // 9
102  kFixedLengthDawgsFileSuffix, // 10 // deprecated
103  kCubeUnicharsetFileSuffix, // 11 // deprecated
104  kCubeSystemDawgFileSuffix, // 12 // deprecated
105  kShapeTableFileSuffix, // 13
106  kBigramDawgFileSuffix, // 14
107  kUnambigDawgFileSuffix, // 15
108  kParamsModelFileSuffix, // 16
109  kLSTMModelFileSuffix, // 17
110  kLSTMPuncDawgFileSuffix, // 18
111  kLSTMSystemDawgFileSuffix, // 19
112  kLSTMNumberDawgFileSuffix, // 20
113  kLSTMUnicharsetFileSuffix, // 21
114  kLSTMRecoderFileSuffix, // 22
115  kVersionFileSuffix, // 23
116 }; // NOTE(review): indices are expected to line up with the TessdataType enumerators (10-12 match the visible ones) — keep both in sync.
117 
125 static const int kMaxNumTessdataEntries = 1000; // NOTE(review): presumably an upper bound on the number of entries accepted from a traineddata file — confirm against the implementation.
126 
128 public:
129  TessdataManager();
130  explicit TessdataManager(FileReader reader);
131 
132  ~TessdataManager() = default;
133 
134  bool swap() const { // Accessor for swap_ (true if the bytes need swapping).
135  return swap_;
136  }
137  bool is_loaded() const { // Accessor for is_loaded_ (true if the file has been loaded).
138  return is_loaded_;
139  }
140 
141  // Lazily loads from the given filename. Won't actually read the file
142  // until it needs it.
143  void LoadFileLater(const char *data_file_name);
148  bool Init(const char *data_file_name);
149  // Loads from the given memory buffer as if a file, remembering name as some
150  // arbitrary source id for caching.
151  bool LoadMemBuffer(const char *name, const char *data, int size);
152  // Overwrites a single entry of the given type.
153  void OverwriteEntry(TessdataType type, const char *data, int size);
154 
155  // Saves to the given filename.
156  bool SaveFile(const char *filename, FileWriter writer) const;
157  // Serializes to the given vector.
158  void Serialize(std::vector<char> *data) const;
159  // Resets to the initial state, keeping the reader.
160  void Clear();
161 
162  // Prints a directory of contents.
163  void Directory() const;
164 
165  // Returns true if the component requested is present.
167  return !entries_[type].empty();
168  }
169  // Opens the given TFile pointer to the given component type.
170  // Returns false in case of failure.
171  bool GetComponent(TessdataType type, TFile *fp);
172  // As non-const version except it can't load the component if not already
173  // loaded.
174  bool GetComponent(TessdataType type, TFile *fp) const;
175 
176  // Returns the current version string.
177  std::string VersionString() const;
178  // Sets the version string to the given v_str.
179  void SetVersionString(const std::string &v_str);
180 
181  // Returns true if the base Tesseract components are present, i.e. both the
181  // TESSDATA_UNICHARSET and TESSDATA_INTTEMP entries are non-empty.
182  bool IsBaseAvailable() const {
183  return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184  }
185 
186  // Returns true if the LSTM components are present, i.e. the TESSDATA_LSTM
186  // entry is non-empty.
187  bool IsLSTMAvailable() const {
188  return !entries_[TESSDATA_LSTM].empty();
189  }
190 
191  // Returns a const reference to data_file_name_, the name of the underlying
191  // data file.
192  const std::string &GetDataFileName() const {
193  return data_file_name_;
194  }
195 
201  bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);
202 
208  bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,
209  int num_new_components);
210 
221  bool ExtractToFile(const char *filename);
222 
223 private:
224  // Use libarchive.
225  bool LoadArchiveFile(const char *filename);
226 
233  static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);
234 
239  static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);
240 
241  // Name of file it came from.
242  std::string data_file_name_;
243  // Function to load the file when we need it.
244  FileReader reader_;
245  // True if the file has been loaded.
246  bool is_loaded_;
247  // True if the bytes need swapping.
248  bool swap_;
249  // Contents of each element of the traineddata file.
250  std::vector<char> entries_[TESSDATA_NUM_ENTRIES];
251 };
252 
253 } // namespace tesseract
254 
255 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:48
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_UNICHARSET
@ TESSDATA_CUBE_SYSTEM_DAWG
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_CUBE_UNICHARSET
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_LSTM_RECODER
@ TESSDATA_LANG_CONFIG
@ TESSDATA_LSTM_NUMBER_DAWG
@ TESSDATA_NUM_ENTRIES
@ TESSDATA_SHAPE_TABLE
@ TESSDATA_FIXED_LENGTH_DAWGS
@ TESSDATA_SYSTEM_DAWG
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:63
const std::string & GetDataFileName() const
bool IsComponentAvailable(TessdataType type) const
#define TESS_API
Definition: export.h:34