tesseract 5.0.0
combine_lang_model.cpp File Reference
#include "commandlineflags.h"
#include "commontraining.h"
#include "lang_model_helpers.h"
#include "tprintf.h"
#include "unicharset_training_utils.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main(int argc, char **argv)

Definition at line 41 of file combine_lang_model.cpp.

41  {
42  // Sets properties on the input unicharset file, and writes:
43  // rootdir/lang/lang.charset_size=ddd.txt
44  // rootdir/lang/lang.traineddata
45  // rootdir/lang/lang.unicharset
46  // If the 3 word lists are provided, the dawgs are also added
47  // to the traineddata file.
48  // The output unicharset and charset_size files are just for
49  // human readability.
50  tesseract::CheckSharedLibraryVersion();
51  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
52 
53  // If these reads fail, we get a warning message and an empty list of words.
54  std::vector<std::string> words = split(tesseract::ReadFile(FLAGS_words.c_str()), '\n');
55  std::vector<std::string> puncs = split(tesseract::ReadFile(FLAGS_puncs.c_str()), '\n');
56  std::vector<std::string> numbers = split(tesseract::ReadFile(FLAGS_numbers.c_str()), '\n');
57  // Load the input unicharset
58  UNICHARSET unicharset;
59  if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
60  tprintf("Failed to load unicharset from %s\n", FLAGS_input_unicharset.c_str());
61  return 1;
62  }
63  tprintf("Loaded unicharset of size %zu from file %s\n", unicharset.size(),
64  FLAGS_input_unicharset.c_str());
65 
66  // Set unichar properties
67  tprintf("Setting unichar properties\n");
68  tesseract::SetupBasicProperties(/*report_errors*/ true,
69  /*decompose (NFD)*/ false, &unicharset);
70  tprintf("Setting script properties\n");
71  tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);
72  // Combine everything into a traineddata file.
73  return tesseract::CombineLangModel(unicharset, FLAGS_script_dir.c_str(),
74  FLAGS_version_str.c_str(), FLAGS_output_dir.c_str(),
75  FLAGS_lang.c_str(), FLAGS_pass_through_recoder, words, puncs,
76  numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
77  /*writer*/ nullptr);
78 }
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:41
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
std::string ReadFile(const std::string &filename, FileReader reader)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const std::vector< std::string > &words, const std::vector< std::string > &puncs, const std::vector< std::string > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355