tesseract  5.0.0
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25 
26 #include <regex> // for std::regex_match
27 
28 #include "control.h"
29 #include "matchdefs.h"
30 #include "pageres.h"
31 #include "params.h"
32 #include "stopper.h"
33 #include "tesseractclass.h"
34 #include "tessvars.h"
35 #include "tprintf.h"
36 #ifndef DISABLED_LEGACY_ENGINE
37 # include "chop.h"
38 # include "intmatcher.h"
39 # include "reject.h"
40 #endif
41 #include "lstmrecognizer.h"
42 
43 namespace tesseract {
44 
45 // Read a "config" file containing a set of variable, value pairs.
46 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
47 // and also accepts a relative or absolute path name.
48 void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
49  std::string path = datadir;
50  path += "configs/";
51  path += filename;
52  FILE *fp;
53  if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
54  fclose(fp);
55  } else {
56  path = datadir;
57  path += "tessconfigs/";
58  path += filename;
59  if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
60  fclose(fp);
61  } else {
62  path = filename;
63  }
64  }
65  ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
66 }
67 
68 // Returns false if a unicharset file for the specified language was not found
69 // or was invalid.
70 // This function initializes TessdataManager. After TessdataManager is
71 // no longer needed, TessdataManager::End() should be called.
72 //
73 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
74 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
75 // from the language-specific config file (stored in [lang].traineddata), from
76 // the config files specified on the command line or left as the default
77 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
78 bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
79  const std::string &language, OcrEngineMode oem,
80  char **configs, int configs_size,
81  const std::vector<std::string> *vars_vec,
82  const std::vector<std::string> *vars_values,
83  bool set_only_non_debug_params, TessdataManager *mgr) {
84  // Set the language data path prefix
85  lang = !language.empty() ? language : "eng";
89 
90  // Initialize TessdataManager.
91  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
92  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
93  tprintf("Error opening data file %s\n", tessdata_path.c_str());
94  tprintf(
95  "Please make sure the TESSDATA_PREFIX environment variable is set"
96  " to your \"tessdata\" directory.\n");
97  return false;
98  }
99 #ifdef DISABLED_LEGACY_ENGINE
100  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
101 #else
102  if (oem == OEM_DEFAULT) {
103  // Set the engine mode from availability, which can then be overridden by
104  // the config file when we read it below.
105  if (!mgr->IsLSTMAvailable()) {
106  tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
107  } else if (!mgr->IsBaseAvailable()) {
108  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
109  } else {
110  tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
111  }
112  }
113 #endif // ndef DISABLED_LEGACY_ENGINE
114 
115  // If a language specific config file (lang.config) exists, load it in.
116  TFile fp;
117  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
119  }
120 
121  SetParamConstraint set_params_constraint =
123  // Load tesseract variables from config files. This is done after loading
124  // language-specific variables from [lang].traineddata file, so that custom
125  // config files can override values in [lang].traineddata file.
126  for (int i = 0; i < configs_size; ++i) {
127  read_config_file(configs[i], set_params_constraint);
128  }
129 
130  // Set params specified in vars_vec (done after setting params from config
131  // files, so that params in vars_vec can override those from files).
132  if (vars_vec != nullptr && vars_values != nullptr) {
133  for (unsigned i = 0; i < vars_vec->size(); ++i) {
134  if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
135  set_params_constraint, this->params())) {
136  tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
137  }
138  }
139  }
140 
141  if (!tessedit_write_params_to_file.empty()) {
142  FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
143  if (params_file != nullptr) {
144  ParamUtils::PrintParams(params_file, this->params());
145  fclose(params_file);
146  } else {
147  tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
148  }
149  }
150 
151 #ifndef DISABLED_LEGACY_ENGINE
152  // Determine which ocr engine(s) should be loaded and used for recognition.
153  if (oem != OEM_DEFAULT) {
154  tessedit_ocr_engine_mode.set_value(oem);
155  }
156 #endif
157 
158  // If we are only loading the config file (and so not planning on doing any
159  // recognition) then there's nothing else do here.
160  if (tessedit_init_config_only) {
161  return true;
162  }
163 
164 // The various OcrEngineMode settings (see tesseract/publictypes.h) determine
165 // which engine-specific data files need to be loaded. If LSTM_ONLY is
166 // requested, the base Tesseract files are *Not* required.
167 #ifdef DISABLED_LEGACY_ENGINE
168  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
169 #else
170  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
171  tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
172 #endif // ndef DISABLED_LEGACY_ENGINE
174  lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
175  ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
176  } else {
177  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
178  tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
179  }
180  }
181 
182  // Load the unicharset
183  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
184  // Avoid requiring a unicharset when we aren't running base tesseract.
185  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
186  }
187 #ifndef DISABLED_LEGACY_ENGINE
188  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
189  tprintf(
190  "Error: Tesseract (legacy) engine requested, but components are "
191  "not present in %s!!\n",
192  tessdata_path.c_str());
193  return false;
194  }
195 #endif // ndef DISABLED_LEGACY_ENGINE
196  if (unicharset.size() > MAX_NUM_CLASSES) {
197  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
198  return false;
199  }
200  right_to_left_ = unicharset.major_right_to_left();
201 
202 #ifndef DISABLED_LEGACY_ENGINE
203 
204  // Setup initial unichar ambigs table and read universal ambigs.
205  UNICHARSET encoder_unicharset;
206  encoder_unicharset.CopyFrom(unicharset);
207  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
208  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
209 
210  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
211  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
212  use_ambigs_for_adaption, &unicharset);
213  }
214 
215  // Init ParamsModel.
216  // Load pass1 and pass2 weights (for now these two sets are the same, but in
217  // the future separate sets of weights can be generated).
219  language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
220  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
221  if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
222  return false;
223  }
224  }
225  }
226 #endif // ndef DISABLED_LEGACY_ENGINE
227 
228  return true;
229 }
230 
// Helper performs a linear scan and reports whether any element of str_list
// compares equal to str. An empty list yields false.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto candidate = str_list.begin();
  const auto last = str_list.end();
  while (candidate != last && *candidate != str) {
    ++candidate;
  }
  return candidate != last;
}
240 
241 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
242 // Langs with no prefix get appended to to_load, provided they
243 // are not in there already.
244 // Langs with ~ prefix get appended to not_to_load, provided they are not in
245 // there already.
246 void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
247  std::vector<std::string> *not_to_load) {
248  std::string remains(lang_str);
249  // Look whether the model file uses a prefix which must be applied to
250  // included model files as well.
251  std::regex e("(.*)/[^/]*");
252  std::cmatch cm;
253  std::string prefix;
254  if (std::regex_match(lang.c_str(), cm, e, std::regex_constants::match_default)) {
255  // A prefix was found.
256  prefix = cm[1].str() + "/";
257  }
258  while (!remains.empty()) {
259  // Find the start of the lang code and which vector to add to.
260  const char *start = remains.c_str();
261  while (*start == '+') {
262  ++start;
263  }
264  std::vector<std::string> *target = to_load;
265  if (*start == '~') {
266  target = not_to_load;
267  ++start;
268  }
269  // Find the index of the end of the lang code in string start.
270  int end = strlen(start);
271  const char *plus = strchr(start, '+');
272  if (plus != nullptr && plus - start < end) {
273  end = plus - start;
274  }
275  std::string lang_code(start);
276  lang_code.resize(end);
277  std::string next(start + end);
278  remains = next;
279  lang_code = prefix + lang_code;
280  // Check whether lang_code is already in the target vector and add.
281  if (!IsStrInList(lang_code, *target)) {
282  target->push_back(lang_code);
283  }
284  }
285 }
286 
287 // Initialize for potentially a set of languages defined by the language
288 // string and recursively any additional languages required by any language
289 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
290 // See init_tesseract_internal for args.
291 int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
292  const std::string &language, OcrEngineMode oem, char **configs,
293  int configs_size, const std::vector<std::string> *vars_vec,
294  const std::vector<std::string> *vars_values,
295  bool set_only_non_debug_params, TessdataManager *mgr) {
296  std::vector<std::string> langs_to_load;
297  std::vector<std::string> langs_not_to_load;
298  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
299 
300  for (auto *lang : sub_langs_) {
301  delete lang;
302  }
303 
304  // Set the basename, compute the data directory.
305  main_setup(arg0, textbase);
306 
307  sub_langs_.clear();
308  // Find the first loadable lang and load into this.
309  // Add any languages that this language requires
310  bool loaded_primary = false;
311  // Load the rest into sub_langs_.
312  // A range based for loop does not work here because langs_to_load
313  // might be changed in the loop when a new submodel is found.
314  for (auto &lang_to_load : langs_to_load) {
315  if (!IsStrInList(lang_to_load, langs_not_to_load)) {
316  const char *lang_str = lang_to_load.c_str();
317  Tesseract *tess_to_init;
318  if (!loaded_primary) {
319  tess_to_init = this;
320  } else {
321  tess_to_init = new Tesseract;
322  tess_to_init->main_setup(arg0, textbase);
323  }
324 
325  int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
326  configs_size, vars_vec, vars_values,
327  set_only_non_debug_params, mgr);
328  // Forget that language, but keep any reader we were given.
329  mgr->Clear();
330 
331  if (!loaded_primary) {
332  if (result < 0) {
333  tprintf("Failed loading language '%s'\n", lang_str);
334  } else {
335  ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
336  &langs_not_to_load);
337  loaded_primary = true;
338  }
339  } else {
340  if (result < 0) {
341  tprintf("Failed loading language '%s'\n", lang_str);
342  delete tess_to_init;
343  } else {
344  sub_langs_.push_back(tess_to_init);
345  // Add any languages that this language requires
346  ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
347  &langs_not_to_load);
348  }
349  }
350  }
351  }
352  if (!loaded_primary && !langs_to_load.empty()) {
353  tprintf("Tesseract couldn't load any languages!\n");
354  return -1; // Couldn't load any language!
355  }
356 #ifndef DISABLED_LEGACY_ENGINE
357  if (!sub_langs_.empty()) {
358  // In multilingual mode word ratings have to be directly comparable,
359  // so use the same language model weights for all languages:
360  // use the primary language's params model if
361  // tessedit_use_primary_params_model is set,
362  // otherwise use default language model weights.
363  if (tessedit_use_primary_params_model) {
364  for (auto &sub_lang : sub_langs_) {
365  sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
366  }
367  tprintf("Using params model of the primary language\n");
368  } else {
369  this->language_model_->getParamsModel().Clear();
370  for (auto &sub_lang : sub_langs_) {
371  sub_lang->language_model_->getParamsModel().Clear();
372  }
373  }
374  }
375 
377 #endif // ndef DISABLED_LEGACY_ENGINE
378  return 0;
379 }
380 
381 // Common initialization for a single language.
382 // arg0 is the datapath for the tessdata directory, which could be the
383 // path of the tessdata directory with no trailing /, or (if tessdata
384 // lives in the same directory as the executable, the path of the executable,
385 // hence the name arg0.
386 // textbase is an optional output file basename (used only for training)
387 // language is the language code to load.
388 // oem controls which engine(s) will operate on the image
389 // configs (argv) is an array of config filenames to load variables from.
390 // May be nullptr.
391 // configs_size (argc) is the number of elements in configs.
392 // vars_vec is an optional vector of variables to set.
393 // vars_values is an optional corresponding vector of values for the variables
394 // in vars_vec.
395 // If set_only_non_debug_params is true, only params that do not contain
396 // "debug" in the name will be set.
397 int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
398  const std::string &language, OcrEngineMode oem,
399  char **configs, int configs_size,
400  const std::vector<std::string> *vars_vec,
401  const std::vector<std::string> *vars_values,
402  bool set_only_non_debug_params, TessdataManager *mgr) {
403  if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
404  vars_values, set_only_non_debug_params, mgr)) {
405  return -1;
406  }
407  if (tessedit_init_config_only) {
408  return 0;
409  }
410  // If only LSTM will be used, skip loading Tesseract classifier's
411  // pre-trained templates and dictionary.
412  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
413  program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
414  return 0; // Normal exit
415 }
416 
417 #ifndef DISABLED_LEGACY_ENGINE
418 
419 // Helper builds the all_fonts table by adding new fonts from new_fonts.
420 static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
421  UnicityTable<FontInfo> *all_fonts) {
422  for (int i = 0; i < new_fonts.size(); ++i) {
423  // UnicityTable uniques as we go.
424  all_fonts->push_back(new_fonts.at(i));
425  }
426 }
427 
428 // Helper assigns an id to lang_fonts using the index in all_fonts table.
429 static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
430  for (int i = 0; i < lang_fonts->size(); ++i) {
431  auto index = all_fonts.get_index(lang_fonts->at(i));
432  lang_fonts->at(i).universal_id = index;
433  }
434 }
435 
436 // Set the universal_id member of each font to be unique among all
437 // instances of the same font loaded.
439  // Note that we can get away with bitwise copying FontInfo in
440  // all_fonts, as it is a temporary structure and we avoid setting the
441  // delete callback.
442  UnicityTable<FontInfo> all_fonts;
443 
444  // Create the universal ID table.
445  CollectFonts(get_fontinfo_table(), &all_fonts);
446  for (auto &sub_lang : sub_langs_) {
447  CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
448  }
449  // Assign ids from the table to each font table.
450  AssignIds(all_fonts, &get_fontinfo_table());
451  for (auto &sub_lang : sub_langs_) {
452  AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
453  }
454  font_table_size_ = all_fonts.size();
455 }
456 
457 #endif // ndef DISABLED_LEGACY_ENGINE
458 
460  end_recog();
461 }
462 
463 /* Define command type identifiers */
464 
466 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:268
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266
SetParamConstraint
Definition: params.h:38
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:41
@ SET_PARAM_CONSTRAINT_NONE
Definition: params.h:39
@ RECOG_PSEUDO
Definition: pgedit.cpp:67
@ ACTION_2_CMD_EVENT
Definition: tessedit.cpp:465
@ RECOG_WERDS
Definition: pgedit.cpp:66
@ ACTION_1_CMD_EVENT
Definition: tessedit.cpp:465
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_LANG_CONFIG
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:291
void ParseLanguageString(const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
Definition: tessedit.cpp:246
void SetupUniversalFontIds()
Definition: tessedit.cpp:438
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:48
int init_tesseract_internal(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:397
bool init_tesseract_lang_data(const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:78
int size() const
Return the size used.
Definition: unicity_table.h:51
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:73
const T & at(int id) const
Return the object from an id.
Definition: unicity_table.h:56
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:64
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:51
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:72
std::string language_data_path_prefix
Definition: ccutil.h:60
UNICHARSET unicharset
Definition: ccutil.h:61
std::string lang
Definition: ccutil.h:59
ParamsVectors * params()
Definition: ccutil.h:53
std::string datadir
Definition: ccutil.h:57
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:63
void main_setup(const std::string &argv0, const std::string &basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:40
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:51
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:81
bool GetComponent(TessdataType type, TFile *fp)
bool IsComponentAvailable(TessdataType type) const
bool Init(const char *data_file_name)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
bool major_right_to_left() const
Definition: unicharset.cpp:983
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:438
size_t size() const
Definition: unicharset.h:355
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324
const UNICHARSET & GetUnicharset() const
bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr)
void program_editup(const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:39
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382