36 #ifndef DISABLED_LEGACY_ENGINE
53 if ((fp = fopen(path.c_str(),
"rb")) !=
nullptr) {
57 path +=
"tessconfigs/";
59 if ((fp = fopen(path.c_str(),
"rb")) !=
nullptr) {
80 char **configs,
int configs_size,
81 const std::vector<std::string> *vars_vec,
82 const std::vector<std::string> *vars_values,
85 lang = !language.empty() ? language :
"eng";
93 tprintf(
"Error opening data file %s\n", tessdata_path.c_str());
95 "Please make sure the TESSDATA_PREFIX environment variable is set"
96 " to your \"tessdata\" directory.\n");
99 #ifdef DISABLED_LEGACY_ENGINE
126 for (
int i = 0; i < configs_size; ++i) {
132 if (vars_vec !=
nullptr && vars_values !=
nullptr) {
133 for (
unsigned i = 0; i < vars_vec->size(); ++i) {
135 set_params_constraint, this->
params())) {
136 tprintf(
"Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
141 if (!tessedit_write_params_to_file.empty()) {
142 FILE *params_file = fopen(tessedit_write_params_to_file.c_str(),
"wb");
143 if (params_file !=
nullptr) {
147 tprintf(
"Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
151 #ifndef DISABLED_LEGACY_ENGINE
154 tessedit_ocr_engine_mode.set_value(oem);
160 if (tessedit_init_config_only) {
167 #ifdef DISABLED_LEGACY_ENGINE
175 ASSERT_HOST(lstm_recognizer_->
Load(this->params(), lstm_use_matrix ? language :
"", mgr));
177 tprintf(
"Error: LSTM requested, but not present!! Loading tesseract.\n");
187 #ifndef DISABLED_LEGACY_ENGINE
190 "Error: Tesseract (legacy) engine requested, but components are "
191 "not present in %s!!\n",
192 tessdata_path.c_str());
197 tprintf(
"Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
202 #ifndef DISABLED_LEGACY_ENGINE
232 static bool IsStrInList(
const std::string &str,
const std::vector<std::string> &str_list) {
233 for (
const auto &i : str_list) {
247 std::vector<std::string> *not_to_load) {
248 std::string remains(lang_str);
251 std::regex e(
"(.*)/[^/]*");
254 if (std::regex_match(
lang.c_str(), cm, e, std::regex_constants::match_default)) {
256 prefix = cm[1].str() +
"/";
258 while (!remains.empty()) {
260 const char *start = remains.c_str();
261 while (*start ==
'+') {
264 std::vector<std::string> *target = to_load;
266 target = not_to_load;
270 int end = strlen(start);
271 const char *plus = strchr(start,
'+');
272 if (plus !=
nullptr && plus - start < end) {
275 std::string lang_code(start);
276 lang_code.resize(end);
277 std::string next(start + end);
279 lang_code = prefix + lang_code;
281 if (!IsStrInList(lang_code, *target)) {
282 target->push_back(lang_code);
292 const std::string &language,
OcrEngineMode oem,
char **configs,
293 int configs_size,
const std::vector<std::string> *vars_vec,
294 const std::vector<std::string> *vars_values,
296 std::vector<std::string> langs_to_load;
297 std::vector<std::string> langs_not_to_load;
300 for (
auto *
lang : sub_langs_) {
310 bool loaded_primary =
false;
314 for (
auto &lang_to_load : langs_to_load) {
315 if (!IsStrInList(lang_to_load, langs_not_to_load)) {
316 const char *lang_str = lang_to_load.c_str();
318 if (!loaded_primary) {
326 configs_size, vars_vec, vars_values,
327 set_only_non_debug_params, mgr);
331 if (!loaded_primary) {
333 tprintf(
"Failed loading language '%s'\n", lang_str);
337 loaded_primary =
true;
341 tprintf(
"Failed loading language '%s'\n", lang_str);
344 sub_langs_.push_back(tess_to_init);
352 if (!loaded_primary && !langs_to_load.empty()) {
353 tprintf(
"Tesseract couldn't load any languages!\n");
356 #ifndef DISABLED_LEGACY_ENGINE
357 if (!sub_langs_.empty()) {
363 if (tessedit_use_primary_params_model) {
364 for (
auto &sub_lang : sub_langs_) {
365 sub_lang->language_model_->getParamsModel().Copy(this->
language_model_->getParamsModel());
367 tprintf(
"Using params model of the primary language\n");
370 for (
auto &sub_lang : sub_langs_) {
371 sub_lang->language_model_->getParamsModel().Clear();
399 char **configs,
int configs_size,
400 const std::vector<std::string> *vars_vec,
401 const std::vector<std::string> *vars_values,
404 vars_values, set_only_non_debug_params, mgr)) {
407 if (tessedit_init_config_only) {
417 #ifndef DISABLED_LEGACY_ENGINE
422 for (
int i = 0; i < new_fonts.
size(); ++i) {
430 for (
int i = 0; i < lang_fonts->size(); ++i) {
431 auto index = all_fonts.get_index(lang_fonts->at(i));
432 lang_fonts->at(i).universal_id = index;
446 for (
auto &sub_lang : sub_langs_) {
447 CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
451 for (
auto &sub_lang : sub_langs_) {
452 AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
454 font_table_size_ = all_fonts.
size();
@ OEM_TESSERACT_LSTM_COMBINED
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
@ SET_PARAM_CONSTRAINT_NONE
void tprintf(const char *format,...)
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
void ParseLanguageString(const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
void SetupUniversalFontIds()
void read_config_file(const char *filename, SetParamConstraint constraint)
int init_tesseract_internal(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
bool init_tesseract_lang_data(const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
int size() const
Return the size used.
int push_back(T object)
Add an element in the table.
const T & at(int id) const
Return the object from an id.
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
std::string language_data_path_prefix
UnicharAmbigs unichar_ambigs
void main_setup(const std::string &argv0, const std::string &basename)
CCUtil::main_setup - set location of tessdata and name of image.
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
bool IsLSTMAvailable() const
bool GetComponent(TessdataType type, TFile *fp)
bool IsBaseAvailable() const
bool IsComponentAvailable(TessdataType type) const
bool Init(const char *data_file_name)
bool load_from_file(const char *const filename, bool skip_fragments)
bool major_right_to_left() const
void CopyFrom(const UNICHARSET &src)
UnicityTable< FontInfo > & get_fontinfo_table()
const UNICHARSET & GetUnicharset() const
bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr)
void program_editup(const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
std::unique_ptr< LanguageModel > language_model_