tesseract  5.0.0
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config_auto.h"
21 #endif
22 
23 #include "tessdatamanager.h"
24 
25 #include <cstdio>
26 #include <string>
27 
28 #if defined(HAVE_LIBARCHIVE)
29 # include <archive.h>
30 # include <archive_entry.h>
31 #endif
32 
33 #include <tesseract/version.h>
34 #include "errcode.h"
35 #include "helpers.h"
36 #include "params.h"
37 #include "serialis.h"
38 #include "tprintf.h"
39 
40 namespace tesseract {
41 
42 TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
44 }
45 
47  : reader_(reader), is_loaded_(false), swap_(false) {
49 }
50 
51 // Lazily loads from the the given filename. Won't actually read the file
52 // until it needs it.
53 void TessdataManager::LoadFileLater(const char *data_file_name) {
54  Clear();
55  data_file_name_ = data_file_name;
56 }
57 
58 #if defined(HAVE_LIBARCHIVE)
59 bool TessdataManager::LoadArchiveFile(const char *filename) {
60  bool result = false;
61  archive *a = archive_read_new();
62  if (a != nullptr) {
63  archive_read_support_filter_all(a);
64  archive_read_support_format_all(a);
65  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
66  archive_entry *ae;
67  while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
68  const char *component = archive_entry_pathname(ae);
69  if (component != nullptr) {
70  TessdataType type;
71  if (TessdataTypeFromFileName(component, &type)) {
72  int64_t size = archive_entry_size(ae);
73  if (size > 0) {
74  entries_[type].resize(size);
75  if (archive_read_data(a, &entries_[type][0], size) == size) {
76  is_loaded_ = true;
77  }
78  }
79  }
80  }
81  }
82  result = is_loaded_;
83  }
84  archive_read_free(a);
85  }
86  return result;
87 }
88 #endif
89 
90 bool TessdataManager::Init(const char *data_file_name) {
91  std::vector<char> data;
92  if (reader_ == nullptr) {
93 #if defined(HAVE_LIBARCHIVE)
94  if (LoadArchiveFile(data_file_name)) {
95  return true;
96  }
97 #endif
98  if (!LoadDataFromFile(data_file_name, &data)) {
99  return false;
100  }
101  } else {
102  if (!(*reader_)(data_file_name, &data)) {
103  return false;
104  }
105  }
106  return LoadMemBuffer(data_file_name, &data[0], data.size());
107 }
108 
109 // Loads from the given memory buffer as if a file.
110 bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) {
111  // TODO: This method supports only the proprietary file format.
112  Clear();
113  data_file_name_ = name;
114  TFile fp;
115  fp.Open(data, size);
116  uint32_t num_entries;
117  if (!fp.DeSerialize(&num_entries)) {
118  return false;
119  }
120  swap_ = num_entries > kMaxNumTessdataEntries;
121  fp.set_swap(swap_);
122  if (swap_) {
123  ReverseN(&num_entries, sizeof(num_entries));
124  }
125  if (num_entries > kMaxNumTessdataEntries) {
126  return false;
127  }
128  // TODO: optimize (no init required).
129  std::vector<int64_t> offset_table(num_entries);
130  if (!fp.DeSerialize(&offset_table[0], num_entries)) {
131  return false;
132  }
133  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
134  if (offset_table[i] >= 0) {
135  int64_t entry_size = size - offset_table[i];
136  unsigned j = i + 1;
137  while (j < num_entries && offset_table[j] == -1) {
138  ++j;
139  }
140  if (j < num_entries) {
141  entry_size = offset_table[j] - offset_table[i];
142  }
143  entries_[i].resize(entry_size);
144  if (!fp.DeSerialize(&entries_[i][0], entry_size)) {
145  return false;
146  }
147  }
148  }
149  if (entries_[TESSDATA_VERSION].empty()) {
150  SetVersionString("Pre-4.0.0");
151  }
152  is_loaded_ = true;
153  return true;
154 }
155 
156 // Overwrites a single entry of the given type.
157 void TessdataManager::OverwriteEntry(TessdataType type, const char *data, int size) {
158  is_loaded_ = true;
159  entries_[type].resize(size);
160  memcpy(&entries_[type][0], data, size);
161 }
162 
163 // Saves to the given filename.
164 bool TessdataManager::SaveFile(const char *filename, FileWriter writer) const {
165  // TODO: This method supports only the proprietary file format.
166  ASSERT_HOST(is_loaded_);
167  std::vector<char> data;
168  Serialize(&data);
169  if (writer == nullptr) {
170  return SaveDataToFile(data, filename);
171  } else {
172  return (*writer)(data, filename);
173  }
174 }
175 
176 // Serializes to the given vector.
177 void TessdataManager::Serialize(std::vector<char> *data) const {
178  // TODO: This method supports only the proprietary file format.
179  ASSERT_HOST(is_loaded_);
180  // Compute the offset_table and total size.
181  int64_t offset_table[TESSDATA_NUM_ENTRIES];
182  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
183  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
184  if (entries_[i].empty()) {
185  offset_table[i] = -1;
186  } else {
187  offset_table[i] = offset;
188  offset += entries_[i].size();
189  }
190  }
191  data->resize(offset, 0);
192  int32_t num_entries = TESSDATA_NUM_ENTRIES;
193  TFile fp;
194  fp.OpenWrite(data);
195  fp.Serialize(&num_entries);
196  fp.Serialize(&offset_table[0], countof(offset_table));
197  for (const auto &entry : entries_) {
198  if (!entry.empty()) {
199  fp.Serialize(&entry[0], entry.size());
200  }
201  }
202 }
203 
204 // Resets to the initial state, keeping the reader.
206  for (auto &entry : entries_) {
207  entry.clear();
208  }
209  is_loaded_ = false;
210 }
211 
212 // Prints a directory of contents.
214  tprintf("Version:%s\n", VersionString().c_str());
215  auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
216  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
217  if (!entries_[i].empty()) {
218  tprintf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(),
219  offset);
220  offset += entries_[i].size();
221  }
222  }
223 }
224 
225 // Opens the given TFile pointer to the given component type.
226 // Returns false in case of failure.
228  if (!is_loaded_ && !Init(data_file_name_.c_str())) {
229  return false;
230  }
231  const TessdataManager *const_this = this;
232  return const_this->GetComponent(type, fp);
233 }
234 
235 // As non-const version except it can't load the component if not already
236 // loaded.
238  ASSERT_HOST(is_loaded_);
239  if (entries_[type].empty()) {
240  return false;
241  }
242  fp->Open(&entries_[type][0], entries_[type].size());
243  fp->set_swap(swap_);
244  return true;
245 }
246 
247 // Returns the current version string.
248 std::string TessdataManager::VersionString() const {
249  return std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size());
250 }
251 
252 // Sets the version string to the given v_str.
253 void TessdataManager::SetVersionString(const std::string &v_str) {
254  entries_[TESSDATA_VERSION].resize(v_str.size());
255  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
256 }
257 
258 bool TessdataManager::CombineDataFiles(const char *language_data_path_prefix,
259  const char *output_filename) {
260  // Load individual tessdata components from files.
261  for (auto filesuffix : kTessdataFileSuffixes) {
262  TessdataType type;
263  ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
264  std::string filename = language_data_path_prefix;
265  filename += filesuffix;
266  FILE *fp = fopen(filename.c_str(), "rb");
267  if (fp != nullptr) {
268  fclose(fp);
269  if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
270  tprintf("Load of file %s failed!\n", filename.c_str());
271  return false;
272  }
273  }
274  }
275  is_loaded_ = true;
276 
277  // Make sure that the required components are present.
278  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
279  tprintf(
280  "Error: traineddata file must contain at least (a unicharset file"
281  "and inttemp) OR an lstm file.\n");
282  return false;
283  }
284  // Write updated data to the output traineddata file.
285  return SaveFile(output_filename, nullptr);
286 }
287 
288 bool TessdataManager::OverwriteComponents(const char *new_traineddata_filename,
289  char **component_filenames, int num_new_components) {
290  // Open the files with the new components.
291  // TODO: This method supports only the proprietary file format.
292  for (int i = 0; i < num_new_components; ++i) {
293  TessdataType type;
294  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
295  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
296  tprintf("Failed to read component file:%s\n", component_filenames[i]);
297  return false;
298  }
299  }
300  }
301 
302  // Write updated data to the output traineddata file.
303  return SaveFile(new_traineddata_filename, nullptr);
304 }
305 
306 bool TessdataManager::ExtractToFile(const char *filename) {
308  ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
309  if (entries_[type].empty()) {
310  return false;
311  }
312  return SaveDataToFile(entries_[type], filename);
313 }
314 
315 bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) {
316  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
317  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
318  *type = static_cast<TessdataType>(i);
319  return true;
320  }
321  }
322 #if !defined(NDEBUG)
323  tprintf(
324  "TessdataManager can't determine which tessdata"
325  " component is represented by %s\n",
326  suffix);
327 #endif
328  return false;
329 }
330 
331 bool TessdataManager::TessdataTypeFromFileName(const char *filename, TessdataType *type) {
332  // Get the file suffix (extension)
333  const char *suffix = strrchr(filename, '.');
334  if (suffix == nullptr || *(++suffix) == '\0') {
335  return false;
336  }
337  return TessdataTypeFromFileSuffix(suffix, type);
338 }
339 
340 } // namespace tesseract
#define TESSERACT_VERSION_STR
Definition: version.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:59
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:189
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:48
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_NUM_ENTRIES
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:42
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:63
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
bool Serialize(const std::string &data)
Definition: serialis.cpp:107
void set_swap(bool value)
Definition: serialis.h:83
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
void LoadFileLater(const char *data_file_name)
bool LoadMemBuffer(const char *name, const char *data, int size)
bool Init(const char *data_file_name)
void Serialize(std::vector< char > *data) const