tesseract  5.0.0
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()=default
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const char *filename, FileWriter writer) const
 
void Serialize (std::vector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
std::string VersionString () const
 
void SetVersionString (const std::string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const std::string & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Detailed Description

Definition at line 127 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )

Definition at line 42 of file tessdatamanager.cpp.

42  : reader_(nullptr), is_loaded_(false), swap_(false) {
44 }
#define TESSERACT_VERSION_STR
Definition: version.h:32
void SetVersionString(const std::string &v_str)

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
explicit

Definition at line 46 of file tessdatamanager.cpp.

47  : reader_(reader), is_loaded_(false), swap_(false) {
49 }

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
default

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 205 of file tessdatamanager.cpp.

205  {
206  for (auto &entry : entries_) {
207  entry.clear();
208  }
209  is_loaded_ = false;
210 }

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 258 of file tessdatamanager.cpp.

259  {
260  // Load individual tessdata components from files.
261  for (auto filesuffix : kTessdataFileSuffixes) {
262  TessdataType type;
263  ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
264  std::string filename = language_data_path_prefix;
265  filename += filesuffix;
266  FILE *fp = fopen(filename.c_str(), "rb");
267  if (fp != nullptr) {
268  fclose(fp);
269  if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
270  tprintf("Load of file %s failed!\n", filename.c_str());
271  return false;
272  }
273  }
274  }
275  is_loaded_ = true;
276 
277  // Make sure that the required components are present.
278  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
279  tprintf(
280  "Error: traineddata file must contain at least (a unicharset file"
281  "and inttemp) OR an lstm file.\n");
282  return false;
283  }
284  // Write updated data to the output traineddata file.
285  return SaveFile(output_filename, nullptr);
286 }
#define ASSERT_HOST(x)
Definition: errcode.h:59
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
bool SaveFile(const char *filename, FileWriter writer) const

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 213 of file tessdatamanager.cpp.

213  {
214  tprintf("Version:%s\n", VersionString().c_str());
215  auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
216  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
217  if (!entries_[i].empty()) {
218  tprintf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(),
219  offset);
220  offset += entries_[i].size();
221  }
222  }
223 }
@ TESSDATA_NUM_ENTRIES
std::string VersionString() const

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts the tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager, and writes the extracted component to the file indicated by that file name. E.g. if the filename given is somepath/somelang.unicharset, the unicharset component will be extracted from the data loaded into the TessdataManager and written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 306 of file tessdatamanager.cpp.

306  {
308  ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
309  if (entries_[type].empty()) {
310  return false;
311  }
312  return SaveDataToFile(entries_[type], filename);
313 }
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 227 of file tessdatamanager.cpp.

227  {
228  if (!is_loaded_ && !Init(data_file_name_.c_str())) {
229  return false;
230  }
231  const TessdataManager *const_this = this;
232  return const_this->GetComponent(type, fp);
233 }
bool Init(const char *data_file_name)

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 237 of file tessdatamanager.cpp.

237  {
238  ASSERT_HOST(is_loaded_);
239  if (entries_[type].empty()) {
240  return false;
241  }
242  fp->Open(&entries_[type][0], entries_[type].size());
243  fp->set_swap(swap_);
244  return true;
245 }

◆ GetDataFileName()

const std::string& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 192 of file tessdatamanager.h.

192  {
193  return data_file_name_;
194  }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 90 of file tessdatamanager.cpp.

90  {
91  std::vector<char> data;
92  if (reader_ == nullptr) {
93 #if defined(HAVE_LIBARCHIVE)
94  if (LoadArchiveFile(data_file_name)) {
95  return true;
96  }
97 #endif
98  if (!LoadDataFromFile(data_file_name, &data)) {
99  return false;
100  }
101  } else {
102  if (!(*reader_)(data_file_name, &data)) {
103  return false;
104  }
105  }
106  return LoadMemBuffer(data_file_name, &data[0], data.size());
107 }
bool LoadMemBuffer(const char *name, const char *data, int size)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 137 of file tessdatamanager.h.

137  {
138  return is_loaded_;
139  }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 182 of file tessdatamanager.h.

182  {
183  return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184  }

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 166 of file tessdatamanager.h.

166  {
167  return !entries_[type].empty();
168  }

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 187 of file tessdatamanager.h.

187  {
188  return !entries_[TESSDATA_LSTM].empty();
189  }

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 53 of file tessdatamanager.cpp.

53  {
54  Clear();
55  data_file_name_ = data_file_name;
56 }

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 110 of file tessdatamanager.cpp.

110  {
111  // TODO: This method supports only the proprietary file format.
112  Clear();
113  data_file_name_ = name;
114  TFile fp;
115  fp.Open(data, size);
116  uint32_t num_entries;
117  if (!fp.DeSerialize(&num_entries)) {
118  return false;
119  }
120  swap_ = num_entries > kMaxNumTessdataEntries;
121  fp.set_swap(swap_);
122  if (swap_) {
123  ReverseN(&num_entries, sizeof(num_entries));
124  }
125  if (num_entries > kMaxNumTessdataEntries) {
126  return false;
127  }
128  // TODO: optimize (no init required).
129  std::vector<int64_t> offset_table(num_entries);
130  if (!fp.DeSerialize(&offset_table[0], num_entries)) {
131  return false;
132  }
133  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
134  if (offset_table[i] >= 0) {
135  int64_t entry_size = size - offset_table[i];
136  unsigned j = i + 1;
137  while (j < num_entries && offset_table[j] == -1) {
138  ++j;
139  }
140  if (j < num_entries) {
141  entry_size = offset_table[j] - offset_table[i];
142  }
143  entries_[i].resize(entry_size);
144  if (!fp.DeSerialize(&entries_[i][0], entry_size)) {
145  return false;
146  }
147  }
148  }
149  if (entries_[TESSDATA_VERSION].empty()) {
150  SetVersionString("Pre-4.0.0");
151  }
152  is_loaded_ = true;
153  return true;
154 }
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:189

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data file with which the class was initialized, overwrites the components specified by component_filenames, and writes the updated traineddata file to new_traineddata_filename.

Definition at line 288 of file tessdatamanager.cpp.

289  {
290  // Open the files with the new components.
291  // TODO: This method supports only the proprietary file format.
292  for (int i = 0; i < num_new_components; ++i) {
293  TessdataType type;
294  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
295  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
296  tprintf("Failed to read component file:%s\n", component_filenames[i]);
297  return false;
298  }
299  }
300  }
301 
302  // Write updated data to the output traineddata file.
303  return SaveFile(new_traineddata_filename, nullptr);
304 }

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 157 of file tessdatamanager.cpp.

157  {
158  is_loaded_ = true;
159  entries_[type].resize(size);
160  memcpy(&entries_[type][0], data, size);
161 }

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const char *  filename,
FileWriter  writer 
) const

Definition at line 164 of file tessdatamanager.cpp.

164  {
165  // TODO: This method supports only the proprietary file format.
166  ASSERT_HOST(is_loaded_);
167  std::vector<char> data;
168  Serialize(&data);
169  if (writer == nullptr) {
170  return SaveDataToFile(data, filename);
171  } else {
172  return (*writer)(data, filename);
173  }
174 }
void Serialize(std::vector< char > *data) const

◆ Serialize()

void tesseract::TessdataManager::Serialize ( std::vector< char > *  data) const

Definition at line 177 of file tessdatamanager.cpp.

177  {
178  // TODO: This method supports only the proprietary file format.
179  ASSERT_HOST(is_loaded_);
180  // Compute the offset_table and total size.
181  int64_t offset_table[TESSDATA_NUM_ENTRIES];
182  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
183  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
184  if (entries_[i].empty()) {
185  offset_table[i] = -1;
186  } else {
187  offset_table[i] = offset;
188  offset += entries_[i].size();
189  }
190  }
191  data->resize(offset, 0);
192  int32_t num_entries = TESSDATA_NUM_ENTRIES;
193  TFile fp;
194  fp.OpenWrite(data);
195  fp.Serialize(&num_entries);
196  fp.Serialize(&offset_table[0], countof(offset_table));
197  for (const auto &entry : entries_) {
198  if (!entry.empty()) {
199  fp.Serialize(&entry[0], entry.size());
200  }
201  }
202 }
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:42

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const std::string &  v_str)

Definition at line 253 of file tessdatamanager.cpp.

253  {
254  entries_[TESSDATA_VERSION].resize(v_str.size());
255  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
256 }

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 134 of file tessdatamanager.h.

134  {
135  return swap_;
136  }

◆ VersionString()

std::string tesseract::TessdataManager::VersionString ( ) const

Definition at line 248 of file tessdatamanager.cpp.

248  {
249  return std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size());
250 }

The documentation for this class was generated from the following files: