tesseract  5.0.0
baseapi.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 2006, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #define _USE_MATH_DEFINES // for M_PI
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25 
26 #include "boxword.h" // for BoxWord
27 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST
28 #include "dawg_cache.h" // for DawgCache
29 #include "dict.h" // for Dict
30 #include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
31 #include "environ.h" // for l_uint8
32 #ifndef DISABLED_LEGACY_ENGINE
33 #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
34 #endif // ndef DISABLED_LEGACY_ENGINE
35 #include "errcode.h" // for ASSERT_HOST
36 #include "helpers.h" // for IntCastRounded, chomp_string
37 #include "host.h" // for MAX_PATH
38 #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
39 #ifndef DISABLED_LEGACY_ENGINE
40 # include "intfx.h" // for INT_FX_RESULT_STRUCT
41 #endif
42 #include "mutableiterator.h" // for MutableIterator
43 #include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight
44 #if defined(USE_OPENCL)
45 # include "openclwrapper.h" // for OpenclDevice
46 #endif
47 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
48 #include "paragraphs.h" // for DetectParagraphs
49 #include "params.h" // for BoolParam, IntParam, DoubleParam, Stri...
50 #include "pdblock.h" // for PDBLK
51 #include "points.h" // for FCOORD
52 #include "polyblk.h" // for POLY_BLOCK
53 #include "rect.h" // for TBOX
54 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
55 #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
56 #include "tesseractclass.h" // for Tesseract
57 #include "tprintf.h" // for tprintf
58 #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
59 #include "thresholder.h" // for ImageThresholder
60 
61 #include <tesseract/baseapi.h>
62 #include <tesseract/ocrclass.h> // for ETEXT_DESC
63 #include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId...
64 #include <tesseract/renderer.h> // for TessResultRenderer
65 #include <tesseract/resultiterator.h> // for ResultIterator
66 
67 #include <cmath> // for round, M_PI
68 #include <cstdint> // for int32_t
69 #include <cstring> // for strcmp, strcpy
70 #include <fstream> // for size_t
71 #include <iostream> // for std::cin
72 #include <locale> // for std::locale::classic
73 #include <memory> // for std::unique_ptr
74 #include <set> // for std::pair
75 #include <sstream> // for std::stringstream
76 #include <vector> // for std::vector
77 
78 #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
79 #ifdef HAVE_LIBCURL
80 # include <curl/curl.h>
81 #endif
82 
83 #ifdef __linux__
84 # include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
85 #endif
86 
87 #if defined(_WIN32)
88 # include <fcntl.h>
89 # include <io.h>
90 #else
91 # include <dirent.h> // for closedir, opendir, readdir, DIR, dirent
92 # include <libgen.h>
93 # include <sys/stat.h> // for stat, S_IFDIR
94 # include <sys/types.h>
95 # include <unistd.h>
96 #endif // _WIN32
97 
98 namespace tesseract {
99 
100 static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
101 static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
102 
104 const int kMinRectSize = 10;
106 const char kTesseractReject = '~';
108 const char kUNLVReject = '~';
110 const char kUNLVSuspect = '^';
114 static const char *kOldVarsFile = "failed_vars.txt";
115 
116 #ifndef DISABLED_LEGACY_ENGINE
121 static const char *kInputFile = "noname.tif";
122 static const char kUnknownFontName[] = "UnknownFont";
123 
124 static STRING_VAR(classify_font_name, kUnknownFontName,
125  "Default font name to be used in training");
126 
127 // Finds the name of the training font and returns it in fontname, by cutting
128 // it out based on the expectation that the filename is of the form:
129 // /path/to/dir/[lang].[fontname].exp[num]
130 // The [lang], [fontname] and [num] fields should not have '.' characters.
131 // If the global parameter classify_font_name is set, its value is used instead.
132 static void ExtractFontName(const char* filename, std::string* fontname) {
133  *fontname = classify_font_name;
134  if (*fontname == kUnknownFontName) {
135  // filename is expected to be of the form [lang].[fontname].exp[num]
136  // The [lang], [fontname] and [num] fields should not have '.' characters.
137  const char *basename = strrchr(filename, '/');
138  const char *firstdot = strchr(basename ? basename : filename, '.');
139  const char *lastdot = strrchr(filename, '.');
140  if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
141  ++firstdot;
142  *fontname = firstdot;
143  fontname->resize(lastdot - firstdot);
144  }
145  }
146 }
147 #endif
148 
149 /* Add all available languages recursively.
150  */
151 static void addAvailableLanguages(const std::string &datadir, const std::string &base,
152  std::vector<std::string> *langs) {
153  auto base2 = base;
154  if (!base2.empty()) {
155  base2 += "/";
156  }
157  const size_t extlen = sizeof(kTrainedDataSuffix);
158 #ifdef _WIN32
159  WIN32_FIND_DATA data;
160  HANDLE handle = FindFirstFile((datadir + base2 + "*").c_str(), &data);
161  if (handle != INVALID_HANDLE_VALUE) {
162  BOOL result = TRUE;
163  for (; result;) {
164  char *name = data.cFileName;
165  // Skip '.', '..', and hidden files
166  if (name[0] != '.') {
167  if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == FILE_ATTRIBUTE_DIRECTORY) {
168  addAvailableLanguages(datadir, base2 + name, langs);
169  } else {
170  size_t len = strlen(name);
171  if (len > extlen && name[len - extlen] == '.' &&
172  strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
173  name[len - extlen] = '\0';
174  langs->push_back(base2 + name);
175  }
176  }
177  }
178  result = FindNextFile(handle, &data);
179  }
180  FindClose(handle);
181  }
182 #else // _WIN32
183  DIR *dir = opendir((datadir + base).c_str());
184  if (dir != nullptr) {
185  dirent *de;
186  while ((de = readdir(dir))) {
187  char *name = de->d_name;
188  // Skip '.', '..', and hidden files
189  if (name[0] != '.') {
190  struct stat st;
191  if (stat((datadir + base2 + name).c_str(), &st) == 0 && (st.st_mode & S_IFDIR) == S_IFDIR) {
192  addAvailableLanguages(datadir, base2 + name, langs);
193  } else {
194  size_t len = strlen(name);
195  if (len > extlen && name[len - extlen] == '.' &&
196  strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
197  name[len - extlen] = '\0';
198  langs->push_back(base2 + name);
199  }
200  }
201  }
202  }
203  closedir(dir);
204  }
205 #endif
206 }
207 
209  : tesseract_(nullptr)
210  , osd_tesseract_(nullptr)
211  , equ_detect_(nullptr)
212  , reader_(nullptr)
213  ,
214  // thresholder_ is initialized to nullptr here, but will be set before use
215  // by: A constructor of a derived API or created
216  // implicitly when used in InternalSetImage.
217  thresholder_(nullptr)
218  , paragraph_models_(nullptr)
219  , block_list_(nullptr)
220  , page_res_(nullptr)
221  , last_oem_requested_(OEM_DEFAULT)
222  , recognition_done_(false)
223  , rect_left_(0)
224  , rect_top_(0)
225  , rect_width_(0)
226  , rect_height_(0)
227  , image_width_(0)
228  , image_height_(0) {
229 }
230 
232  End();
233 }
234 
238 const char *TessBaseAPI::Version() {
239  return TESSERACT_VERSION_STR;
240 }
241 
249 size_t TessBaseAPI::getOpenCLDevice(void **data) {
250 #ifdef USE_OPENCL
251  ds_device device = OpenclDevice::getDeviceSelection();
252  if (device.type == DS_DEVICE_OPENCL_DEVICE) {
253  *data = new cl_device_id;
254  memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
255  return sizeof(cl_device_id);
256  }
257 #endif
258 
259  *data = nullptr;
260  return 0;
261 }
262 
267 void TessBaseAPI::SetInputName(const char *name) {
268  input_file_ = name ? name : "";
269 }
270 
272 void TessBaseAPI::SetOutputName(const char *name) {
273  output_file_ = name ? name : "";
274 }
275 
276 bool TessBaseAPI::SetVariable(const char *name, const char *value) {
277  if (tesseract_ == nullptr) {
278  tesseract_ = new Tesseract;
279  }
281  tesseract_->params());
282 }
283 
284 bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
285  if (tesseract_ == nullptr) {
286  tesseract_ = new Tesseract;
287  }
289 }
290 
291 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
292  auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
294  if (p == nullptr) {
295  return false;
296  }
297  *value = (int32_t)(*p);
298  return true;
299 }
300 
301 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
302  auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
304  if (p == nullptr) {
305  return false;
306  }
307  *value = bool(*p);
308  return true;
309 }
310 
311 const char *TessBaseAPI::GetStringVariable(const char *name) const {
312  auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,
314  return (p != nullptr) ? p->c_str() : nullptr;
315 }
316 
317 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
318  auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
320  if (p == nullptr) {
321  return false;
322  }
323  *value = (double)(*p);
324  return true;
325 }
326 
328 bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
329  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
330 }
331 
332 #ifndef DISABLED_LEGACY_ENGINE
333 
335 void TessBaseAPI::PrintFontsTable(FILE *fp) const {
336  const int fontinfo_size = tesseract_->get_fontinfo_table().size();
337  for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
338  FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
339  fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
340  " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
341  font_index, font.name,
342  font.is_italic() ? "true" : "false",
343  font.is_bold() ? "true" : "false",
344  font.is_fixed_pitch() ? "true" : "false",
345  font.is_serif() ? "true" : "false",
346  font.is_fraktur() ? "true" : "false");
347  }
348 }
349 
350 #endif
351 
353 void TessBaseAPI::PrintVariables(FILE *fp) const {
355 }
356 
365 int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
366  int configs_size, const std::vector<std::string> *vars_vec,
367  const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
368  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
369  set_only_non_debug_params, nullptr);
370 }
371 
372 // In-memory version reads the traineddata file directly from the given
373 // data[data_size] array. Also implements the version with a datapath in data,
374 // flagged by data_size = 0.
375 int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
376  char **configs, int configs_size, const std::vector<std::string> *vars_vec,
377  const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
378  FileReader reader) {
379  if (language == nullptr) {
380  language = "";
381  }
382  if (data == nullptr) {
383  data = "";
384  }
385  std::string datapath = data_size == 0 ? data : language;
386  // If the datapath, OcrEngineMode or the language have changed - start again.
387  // Note that the language_ field stores the last requested language that was
388  // initialized successfully, while tesseract_->lang stores the language
389  // actually used. They differ only if the requested language was nullptr, in
390  // which case tesseract_->lang is set to the Tesseract default ("eng").
391  if (tesseract_ != nullptr &&
392  (datapath_.empty() || language_.empty() || datapath_ != datapath ||
393  last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
394  delete tesseract_;
395  tesseract_ = nullptr;
396  }
397 #ifdef USE_OPENCL
398  OpenclDevice od;
399  od.InitEnv();
400 #endif
401  bool reset_classifier = true;
402  if (tesseract_ == nullptr) {
403  reset_classifier = false;
404  tesseract_ = new Tesseract;
405  if (reader != nullptr) {
406  reader_ = reader;
407  }
409  if (data_size != 0) {
410  mgr.LoadMemBuffer(language, data, data_size);
411  }
412  if (tesseract_->init_tesseract(datapath.c_str(), output_file_.c_str(), language, oem, configs,
413  configs_size, vars_vec, vars_values, set_only_non_debug_params,
414  &mgr) != 0) {
415  return -1;
416  }
417  }
418 
419  // Update datapath and language requested for the last valid initialization.
420  datapath_ = datapath;
421  if (datapath_.empty() && !tesseract_->datadir.empty()) {
423  }
424 
425  language_ = language;
427 
428 #ifndef DISABLED_LEGACY_ENGINE
429  // For same language and datapath, just reset the adaptive classifier.
430  if (reset_classifier) {
432  }
433 #endif // ndef DISABLED_LEGACY_ENGINE
434  return 0;
435 }
436 
446  return language_.c_str();
447 }
448 
454 void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
455  langs->clear();
456  if (tesseract_ != nullptr) {
457  langs->push_back(tesseract_->lang);
458  int num_subs = tesseract_->num_sub_langs();
459  for (int i = 0; i < num_subs; ++i) {
460  langs->push_back(tesseract_->get_sub_lang(i)->lang);
461  }
462  }
463 }
464 
468 void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
469  langs->clear();
470  if (tesseract_ != nullptr) {
471  addAvailableLanguages(tesseract_->datadir, "", langs);
472  std::sort(langs->begin(), langs->end());
473  }
474 }
475 
481  if (tesseract_ == nullptr) {
482  tesseract_ = new Tesseract;
483 #ifndef DISABLED_LEGACY_ENGINE
485 #endif
486  }
487 }
488 
494 void TessBaseAPI::ReadConfigFile(const char *filename) {
496 }
497 
499 void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
501 }
502 
509  if (tesseract_ == nullptr) {
510  tesseract_ = new Tesseract;
511  }
512  tesseract_->tessedit_pageseg_mode.set_value(mode);
513 }
514 
517  if (tesseract_ == nullptr) {
518  return PSM_SINGLE_BLOCK;
519  }
520  return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));
521 }
522 
536 char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
537  int bytes_per_line, int left, int top, int width, int height) {
538  if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {
539  return nullptr; // Nothing worth doing.
540  }
541 
542  // Since this original api didn't give the exact size of the image,
543  // we have to invent a reasonable value.
544  int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
545  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,
546  bytes_per_line);
547  SetRectangle(left, top, width, height);
548 
549  return GetUTF8Text();
550 }
551 
552 #ifndef DISABLED_LEGACY_ENGINE
558  if (tesseract_ == nullptr) {
559  return;
560  }
563 }
564 #endif // ndef DISABLED_LEGACY_ENGINE
565 
573 void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
574  int bytes_per_pixel, int bytes_per_line) {
575  if (InternalSetImage()) {
576  thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
578  }
579 }
580 
582  if (thresholder_) {
584  } else {
585  tprintf("Please call SetImage before SetSourceResolution.\n");
586  }
587 }
588 
597 void TessBaseAPI::SetImage(Pix *pix) {
598  if (InternalSetImage()) {
599  if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
600  // remove alpha channel from png
601  Pix *p1 = pixRemoveAlpha(pix);
602  pixSetSpp(p1, 3);
603  (void)pixCopy(pix, p1);
604  pixDestroy(&p1);
605  }
606  thresholder_->SetImage(pix);
608  }
609 }
610 
616 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
617  if (thresholder_ == nullptr) {
618  return;
619  }
620  thresholder_->SetRectangle(left, top, width, height);
621  ClearResults();
622 }
623 
629  if (tesseract_ == nullptr || thresholder_ == nullptr) {
630  return nullptr;
631  }
632  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
633  return nullptr;
634  }
635  return tesseract_->pix_binary().clone();
636 }
637 
643 Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
644  return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
645 }
646 
655 Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
656  int **blockids, int **paraids) {
657  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
658 }
659 
668 Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
669  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
670 }
671 
677 Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
678  return GetComponentImages(RIL_WORD, true, pixa, nullptr);
679 }
680 
688  return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
689 }
690 
699 Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
700  const int raw_padding, Pixa **pixa, int **blockids,
701  int **paraids) {
702  /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());
703  if (page_it == nullptr) {
704  page_it.reset(AnalyseLayout());
705  }
706  if (page_it == nullptr) {
707  return nullptr; // Failed.
708  }
709 
710  // Count the components to get a size for the arrays.
711  int component_count = 0;
712  int left, top, right, bottom;
713 
714  if (raw_image) {
715  // Get bounding box in original raw image with padding.
716  do {
717  if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&
718  (!text_only || PTIsTextType(page_it->BlockType()))) {
719  ++component_count;
720  }
721  } while (page_it->Next(level));
722  } else {
723  // Get bounding box from binarized imaged. Note that this could be
724  // differently scaled from the original image.
725  do {
726  if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
727  (!text_only || PTIsTextType(page_it->BlockType()))) {
728  ++component_count;
729  }
730  } while (page_it->Next(level));
731  }
732 
733  Boxa *boxa = boxaCreate(component_count);
734  if (pixa != nullptr) {
735  *pixa = pixaCreate(component_count);
736  }
737  if (blockids != nullptr) {
738  *blockids = new int[component_count];
739  }
740  if (paraids != nullptr) {
741  *paraids = new int[component_count];
742  }
743 
744  int blockid = 0;
745  int paraid = 0;
746  int component_index = 0;
747  page_it->Begin();
748  do {
749  bool got_bounding_box;
750  if (raw_image) {
751  got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
752  } else {
753  got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
754  }
755  if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {
756  Box *lbox = boxCreate(left, top, right - left, bottom - top);
757  boxaAddBox(boxa, lbox, L_INSERT);
758  if (pixa != nullptr) {
759  Pix *pix = nullptr;
760  if (raw_image) {
761  pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);
762  } else {
763  pix = page_it->GetBinaryImage(level);
764  }
765  pixaAddPix(*pixa, pix, L_INSERT);
766  pixaAddBox(*pixa, lbox, L_CLONE);
767  }
768  if (paraids != nullptr) {
769  (*paraids)[component_index] = paraid;
770  if (page_it->IsAtFinalElement(RIL_PARA, level)) {
771  ++paraid;
772  }
773  }
774  if (blockids != nullptr) {
775  (*blockids)[component_index] = blockid;
776  if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
777  ++blockid;
778  paraid = 0;
779  }
780  }
781  ++component_index;
782  }
783  } while (page_it->Next(level));
784  return boxa;
785 }
786 
788  if (thresholder_ == nullptr) {
789  return 0;
790  }
791  return thresholder_->GetScaleFactor();
792 }
793 
810  return AnalyseLayout(false);
811 }
812 
813 PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
814  if (FindLines() == 0) {
815  if (block_list_->empty()) {
816  return nullptr; // The page was empty.
817  }
818  page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
819  DetectParagraphs(false);
823  }
824  return nullptr;
825 }
826 
832  if (tesseract_ == nullptr) {
833  return -1;
834  }
835  if (FindLines() != 0) {
836  return -1;
837  }
838  delete page_res_;
839  if (block_list_->empty()) {
841  return 0; // Empty page.
842  }
843 
845  recognition_done_ = true;
846 #ifndef DISABLED_LEGACY_ENGINE
847  if (tesseract_->tessedit_resegment_from_line_boxes) {
849  } else if (tesseract_->tessedit_resegment_from_boxes) {
851  } else
852 #endif // ndef DISABLED_LEGACY_ENGINE
853  {
854  page_res_ =
856  }
857 
858  if (page_res_ == nullptr) {
859  return -1;
860  }
861 
862  if (tesseract_->tessedit_train_line_recognizer) {
864  return -1;
865  }
867  return 0;
868  }
869 #ifndef DISABLED_LEGACY_ENGINE
870  if (tesseract_->tessedit_make_boxes_from_boxes) {
872  return 0;
873  }
874 #endif // ndef DISABLED_LEGACY_ENGINE
875 
876  int result = 0;
877  if (tesseract_->interactive_display_mode) {
878 #ifndef GRAPHICS_DISABLED
880 #endif // !GRAPHICS_DISABLED
881  // The page_res is invalid after an interactive session, so cleanup
882  // in a way that lets us continue to the next page without crashing.
883  delete page_res_;
884  page_res_ = nullptr;
885  return -1;
886 #ifndef DISABLED_LEGACY_ENGINE
887  } else if (tesseract_->tessedit_train_from_boxes) {
888  std::string fontname;
889  ExtractFontName(output_file_.c_str(), &fontname);
891  } else if (tesseract_->tessedit_ambigs_training) {
892  FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
893  // OCR the page segmented into words by tesseract.
895  training_output_file);
896  fclose(training_output_file);
897 #endif // ndef DISABLED_LEGACY_ENGINE
898  } else {
899  // Now run the main recognition.
900  bool wait_for_text = true;
901  GetBoolVariable("paragraph_text_based", &wait_for_text);
902  if (!wait_for_text) {
903  DetectParagraphs(false);
904  }
905  if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
906  if (wait_for_text) {
907  DetectParagraphs(true);
908  }
909  } else {
910  result = -1;
911  }
912  }
913  return result;
914 }
915 
916 // Takes ownership of the input pix.
919 }
920 
922  return tesseract_->pix_original();
923 }
924 
926  if (!input_file_.empty()) {
927  return input_file_.c_str();
928  }
929  return nullptr;
930 }
931 
933  return tesseract_->datadir.c_str();
934 }
935 
937  if (thresholder_ == nullptr)
938  return -1;
940 }
941 
942 // If flist exists, get data from there. Otherwise get data from buf.
943 // Seems convoluted, but is the easiest way I know of to meet multiple
944 // goals. Support streaming from stdin, and also work on platforms
945 // lacking fmemopen.
946 // TODO: check different logic for flist/buf and simplify.
947 bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
948  int timeout_millisec, TessResultRenderer *renderer,
949  int tessedit_page_number) {
950  if (!flist && !buf) {
951  return false;
952  }
953  unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
954  char pagename[MAX_PATH];
955 
956  std::vector<std::string> lines;
957  if (!flist) {
958  std::string line;
959  for (const auto ch : *buf) {
960  if (ch == '\n') {
961  lines.push_back(line);
962  line.clear();
963  } else {
964  line.push_back(ch);
965  }
966  }
967  if (!line.empty()) {
968  // Add last line without terminating LF.
969  lines.push_back(line);
970  }
971  if (lines.empty()) {
972  return false;
973  }
974  }
975 
976  // Skip to the requested page number.
977  for (unsigned i = 0; i < page; i++) {
978  if (flist) {
979  if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
980  break;
981  }
982  }
983  }
984 
985  // Begin producing output
986  if (renderer && !renderer->BeginDocument(document_title.c_str())) {
987  return false;
988  }
989 
990  // Loop over all pages - or just the requested one
991  while (true) {
992  if (flist) {
993  if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
994  break;
995  }
996  } else {
997  if (page >= lines.size()) {
998  break;
999  }
1000  snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
1001  }
1002  chomp_string(pagename);
1003  Pix *pix = pixRead(pagename);
1004  if (pix == nullptr) {
1005  tprintf("Image file %s cannot be read!\n", pagename);
1006  return false;
1007  }
1008  tprintf("Page %u : %s\n", page, pagename);
1009  bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
1010  pixDestroy(&pix);
1011  if (!r) {
1012  return false;
1013  }
1014  if (tessedit_page_number >= 0) {
1015  break;
1016  }
1017  ++page;
1018  }
1019 
1020  // Finish producing output
1021  if (renderer && !renderer->EndDocument()) {
1022  return false;
1023  }
1024  return true;
1025 }
1026 
1027 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
1028  const char *retry_config, int timeout_millisec,
1029  TessResultRenderer *renderer,
1030  int tessedit_page_number) {
1031  Pix *pix = nullptr;
1032  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1033  size_t offset = 0;
1034  for (;; ++page) {
1035  if (tessedit_page_number >= 0) {
1036  page = tessedit_page_number;
1037  pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
1038  } else {
1039  pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1040  : pixReadFromMultipageTiff(filename, &offset);
1041  }
1042  if (pix == nullptr) {
1043  break;
1044  }
1045  if (offset || page > 0) {
1046  // Only print page number for multipage TIFF file.
1047  tprintf("Page %d\n", page + 1);
1048  }
1049  auto page_string = std::to_string(page);
1050  SetVariable("applybox_page", page_string.c_str());
1051  bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
1052  pixDestroy(&pix);
1053  if (!r) {
1054  return false;
1055  }
1056  if (tessedit_page_number >= 0) {
1057  break;
1058  }
1059  if (!offset) {
1060  break;
1061  }
1062  }
1063  return true;
1064 }
1065 
1066 // Master ProcessPages calls ProcessPagesInternal and then does any post-
1067 // processing required due to being in a training mode.
1068 bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
1069  TessResultRenderer *renderer) {
1070  bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1071 #ifndef DISABLED_LEGACY_ENGINE
1072  if (result) {
1073  if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
1074  tprintf("Write of TR file failed: %s\n", output_file_.c_str());
1075  return false;
1076  }
1077  }
1078 #endif // ndef DISABLED_LEGACY_ENGINE
1079  return result;
1080 }
1081 
1082 #ifdef HAVE_LIBCURL
// Write callback handed to libcurl via CURLOPT_WRITEFUNCTION: appends the
// size * nmemb bytes at `contents` to the std::string that `userp` points to
// and returns the number of bytes appended.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  auto *sink = static_cast<std::string *>(userp);
  const auto *bytes = static_cast<const char *>(contents);
  sink->append(bytes, byte_count);
  return byte_count;
}
1089 #endif
1090 
1091 // In the ideal scenario, Tesseract will start working on data as soon
1092 // as it can. For example, if you stream a filelist through stdin, we
1093 // should start the OCR process as soon as the first filename is
1094 // available. This is particularly useful when hooking Tesseract up to
1095 // slow hardware such as a book scanning machine.
1096 //
1097 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1098 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1099 // impractical. So we support a command line flag to explicitly
1100 // identify the scenario that really matters: filelists on
1101 // stdin. We'll still do our best if the user likes pipes.
1102 bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
1103  int timeout_millisec, TessResultRenderer *renderer) {
1104  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1105  if (stdInput) {
1106 #ifdef WIN32
1107  if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1108  tprintf("ERROR: cin to binary: %s", strerror(errno));
1109 #endif // WIN32
1110  }
1111 
1112  if (stream_filelist) {
1113  return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
1114  tesseract_->tessedit_page_number);
1115  }
1116 
1117  // At this point we are officially in autodection territory.
1118  // That means any data in stdin must be buffered, to make it
1119  // seekable.
1120  std::string buf;
1121  const l_uint8 *data = nullptr;
1122  if (stdInput) {
1123  buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
1124  data = reinterpret_cast<const l_uint8 *>(buf.data());
1125  } else if (strstr(filename, "://") != nullptr) {
1126  // Get image or image list by URL.
1127 #ifdef HAVE_LIBCURL
1128  CURL *curl = curl_easy_init();
1129  if (curl == nullptr) {
1130  fprintf(stderr, "Error, curl_easy_init failed\n");
1131  return false;
1132  } else {
1133  CURLcode curlcode;
1134  auto error = [curl, &curlcode](const char *function) {
1135  fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
1136  curl_easy_cleanup(curl);
1137  return false;
1138  };
1139  curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
1140  if (curlcode != CURLE_OK) {
1141  return error("curl_easy_setopt");
1142  }
1143  curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
1144  if (curlcode != CURLE_OK) {
1145  return error("curl_easy_setopt");
1146  }
1147  curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
1148  if (curlcode != CURLE_OK) {
1149  return error("curl_easy_setopt");
1150  }
1151  curlcode = curl_easy_perform(curl);
1152  if (curlcode != CURLE_OK) {
1153  return error("curl_easy_perform");
1154  }
1155  curl_easy_cleanup(curl);
1156  data = reinterpret_cast<const l_uint8 *>(buf.data());
1157  }
1158 #else
1159  fprintf(stderr, "Error, this tesseract has no URL support\n");
1160  return false;
1161 #endif
1162  } else {
1163  // Check whether the input file can be read.
1164  if (FILE *file = fopen(filename, "rb")) {
1165  fclose(file);
1166  } else {
1167  fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
1168  return false;
1169  }
1170  }
1171 
1172  // Here is our autodetection
1173  int format;
1174  int r =
1175  (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
1176 
1177  // Maybe we have a filelist
1178  if (r != 0 || format == IFF_UNKNOWN) {
1179  std::string s;
1180  if (data != nullptr) {
1181  s = buf.c_str();
1182  } else {
1183  std::ifstream t(filename);
1184  std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
1185  s = u.c_str();
1186  }
1187  return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
1188  tesseract_->tessedit_page_number);
1189  }
1190 
1191  // Maybe we have a TIFF which is potentially multipage
1192  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
1193  format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1194 #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1195  format == IFF_TIFF_JPEG ||
1196 #endif
1197  format == IFF_TIFF_ZIP);
1198 
1199  // Fail early if we can, before producing any output
1200  Pix *pix = nullptr;
1201  if (!tiff) {
1202  pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
1203  if (pix == nullptr) {
1204  return false;
1205  }
1206  }
1207 
1208  // Begin the output
1209  if (renderer && !renderer->BeginDocument(document_title.c_str())) {
1210  pixDestroy(&pix);
1211  return false;
1212  }
1213 
1214  // Produce output
1215  r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
1216  renderer, tesseract_->tessedit_page_number)
1217  : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
1218 
1219  // Clean up memory as needed
1220  pixDestroy(&pix);
1221 
1222  // End the output
1223  if (!r || (renderer && !renderer->EndDocument())) {
1224  return false;
1225  }
1226  return true;
1227 }
1228 
1229 bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
1230  const char *retry_config, int timeout_millisec,
1231  TessResultRenderer *renderer) {
1232  SetInputName(filename);
1233  SetImage(pix);
1234  bool failed = false;
1235 
1236  if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
1237  // Disabled character recognition
1238  if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) {
1239  failed = true;
1240  }
1241  } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
1242  failed = FindLines() != 0;
1243  } else if (timeout_millisec > 0) {
1244  // Running with a timeout.
1245  ETEXT_DESC monitor;
1246  monitor.cancel = nullptr;
1247  monitor.cancel_this = nullptr;
1248  monitor.set_deadline_msecs(timeout_millisec);
1249 
1250  // Now run the main recognition.
1251  failed = Recognize(&monitor) < 0;
1252  } else {
1253  // Normal layout and character recognition with no timeout.
1254  failed = Recognize(nullptr) < 0;
1255  }
1256 
1257  if (tesseract_->tessedit_write_images) {
1258  Pix *page_pix = GetThresholdedImage();
1259  std::string output_filename = output_file_ + ".processed";
1260  if (page_index > 0) {
1261  output_filename += std::to_string(page_index);
1262  }
1263  output_filename += ".tif";
1264  pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
1265  pixDestroy(&page_pix);
1266  }
1267 
1268  if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1269  // Save current config variables before switching modes.
1270  FILE *fp = fopen(kOldVarsFile, "wb");
1271  if (fp == nullptr) {
1272  tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1273  } else {
1274  PrintVariables(fp);
1275  fclose(fp);
1276  }
1277  // Switch to alternate mode for retry.
1278  ReadConfigFile(retry_config);
1279  SetImage(pix);
1280  Recognize(nullptr);
1281  // Restore saved config variables.
1282  ReadConfigFile(kOldVarsFile);
1283  }
1284 
1285  if (renderer && !failed) {
1286  failed = !renderer->AddImage(this);
1287  }
1288 
1289  return !failed;
1290 }
1291 
1297  if (tesseract_ == nullptr || page_res_ == nullptr) {
1298  return nullptr;
1299  }
1303 }
1304 
1314  if (tesseract_ == nullptr || page_res_ == nullptr) {
1315  return nullptr;
1316  }
1320 }
1321 
1331  if (tesseract_ == nullptr || page_res_ == nullptr) {
1332  return nullptr;
1333  }
1337 }
1338 
1341  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1342  return nullptr;
1343  }
1344  std::string text("");
1345  const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
1346  do {
1347  if (it->Empty(RIL_PARA)) {
1348  continue;
1349  }
1350  const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1351  text += para_text.get();
1352  } while (it->Next(RIL_PARA));
1353  char *result = new char[text.length() + 1];
1354  strncpy(result, text.c_str(), text.length() + 1);
1355  return result;
1356 }
1357 
1358 static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
1359  int left, top, right, bottom;
1360  it->BoundingBox(level, &left, &top, &right, &bottom);
1361  text += "\t" + std::to_string(left);
1362  text += "\t" + std::to_string(top);
1363  text += "\t" + std::to_string(right - left);
1364  text += "\t" + std::to_string(bottom - top);
1365 }
1366 
1372 char *TessBaseAPI::GetTSVText(int page_number) {
1373  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
1374  return nullptr;
1375  }
1376 
1377  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1378  int page_id = page_number + 1; // we use 1-based page numbers.
1379 
1380  int page_num = page_id;
1381  int block_num = 0;
1382  int par_num = 0;
1383  int line_num = 0;
1384  int word_num = 0;
1385 
1386  std::string tsv_str;
1387  tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
1388  tsv_str += "\t" + std::to_string(block_num);
1389  tsv_str += "\t" + std::to_string(par_num);
1390  tsv_str += "\t" + std::to_string(line_num);
1391  tsv_str += "\t" + std::to_string(word_num);
1392  tsv_str += "\t" + std::to_string(rect_left_);
1393  tsv_str += "\t" + std::to_string(rect_top_);
1394  tsv_str += "\t" + std::to_string(rect_width_);
1395  tsv_str += "\t" + std::to_string(rect_height_);
1396  tsv_str += "\t-1\t\n";
1397 
1398  const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
1399  while (!res_it->Empty(RIL_BLOCK)) {
1400  if (res_it->Empty(RIL_WORD)) {
1401  res_it->Next(RIL_WORD);
1402  continue;
1403  }
1404 
1405  // Add rows for any new block/paragraph/textline.
1406  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1407  block_num++;
1408  par_num = 0;
1409  line_num = 0;
1410  word_num = 0;
1411  tsv_str += "2\t" + std::to_string(page_num); // level 2 - block
1412  tsv_str += "\t" + std::to_string(block_num);
1413  tsv_str += "\t" + std::to_string(par_num);
1414  tsv_str += "\t" + std::to_string(line_num);
1415  tsv_str += "\t" + std::to_string(word_num);
1416  AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
1417  tsv_str += "\t-1\t\n"; // end of row for block
1418  }
1419  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1420  par_num++;
1421  line_num = 0;
1422  word_num = 0;
1423  tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph
1424  tsv_str += "\t" + std::to_string(block_num);
1425  tsv_str += "\t" + std::to_string(par_num);
1426  tsv_str += "\t" + std::to_string(line_num);
1427  tsv_str += "\t" + std::to_string(word_num);
1428  AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
1429  tsv_str += "\t-1\t\n"; // end of row for para
1430  }
1431  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1432  line_num++;
1433  word_num = 0;
1434  tsv_str += "4\t" + std::to_string(page_num); // level 4 - line
1435  tsv_str += "\t" + std::to_string(block_num);
1436  tsv_str += "\t" + std::to_string(par_num);
1437  tsv_str += "\t" + std::to_string(line_num);
1438  tsv_str += "\t" + std::to_string(word_num);
1439  AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
1440  tsv_str += "\t-1\t\n"; // end of row for line
1441  }
1442 
1443  // Now, process the word...
1444  int left, top, right, bottom;
1445  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1446  word_num++;
1447  tsv_str += "5\t" + std::to_string(page_num); // level 5 - word
1448  tsv_str += "\t" + std::to_string(block_num);
1449  tsv_str += "\t" + std::to_string(par_num);
1450  tsv_str += "\t" + std::to_string(line_num);
1451  tsv_str += "\t" + std::to_string(word_num);
1452  tsv_str += "\t" + std::to_string(left);
1453  tsv_str += "\t" + std::to_string(top);
1454  tsv_str += "\t" + std::to_string(right - left);
1455  tsv_str += "\t" + std::to_string(bottom - top);
1456  tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
1457  tsv_str += "\t";
1458 
1459  // Increment counts if at end of block/paragraph/textline.
1460  if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
1461  lcnt++;
1462  }
1463  if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) {
1464  pcnt++;
1465  }
1466  if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {
1467  bcnt++;
1468  }
1469 
1470  do {
1471  tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1472  res_it->Next(RIL_SYMBOL);
1473  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1474  tsv_str += "\n"; // end of row
1475  wcnt++;
1476  }
1477 
1478  char *ret = new char[tsv_str.length() + 1];
1479  strcpy(ret, tsv_str.c_str());
1480  return ret;
1481 }
1482 
1484 const int kNumbersPerBlob = 5;
1489 const int kBytesPerNumber = 5;
1497 const int kBytesPer64BitNumber = 20;
1505 
1512 char *TessBaseAPI::GetBoxText(int page_number) {
1513  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1514  return nullptr;
1515  }
1516  int blob_count;
1517  int utf8_length = TextLength(&blob_count);
1518  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
1519  char *result = new char[total_length];
1520  result[0] = '\0';
1521  int output_length = 0;
1523  do {
1524  int left, top, right, bottom;
1525  if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1526  const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1527  // Tesseract uses space for recognition failure. Fix to a reject
1528  // character, kTesseractReject so we don't create illegal box files.
1529  for (int i = 0; text[i] != '\0'; ++i) {
1530  if (text[i] == ' ') {
1531  text[i] = kTesseractReject;
1532  }
1533  }
1534  snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
1535  text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
1536  output_length += strlen(result + output_length);
1537  // Just in case...
1538  if (output_length + kMaxBytesPerLine > total_length) {
1539  break;
1540  }
1541  }
1542  } while (it->Next(RIL_SYMBOL));
1543  delete it;
1544  return result;
1545 }
1546 
1552 const int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0};
1554 const int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0};
1555 
1562  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1563  return nullptr;
1564  }
1565  bool tilde_crunch_written = false;
1566  bool last_char_was_newline = true;
1567  bool last_char_was_tilde = false;
1568 
1569  int total_length = TextLength(nullptr);
1570  PAGE_RES_IT page_res_it(page_res_);
1571  char *result = new char[total_length];
1572  char *ptr = result;
1573  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
1574  WERD_RES *word = page_res_it.word();
1575  // Process the current word.
1576  if (word->unlv_crunch_mode != CR_NONE) {
1577  if (word->unlv_crunch_mode != CR_DELETE &&
1578  (!tilde_crunch_written ||
1579  (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
1580  !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
1581  if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
1582  !word->word->flag(W_FUZZY_SP)) {
1583  /* Write a space to separate from preceding good text */
1584  *ptr++ = ' ';
1585  last_char_was_tilde = false;
1586  }
1587  if (!last_char_was_tilde) {
1588  // Write a reject char.
1589  last_char_was_tilde = true;
1590  *ptr++ = kUNLVReject;
1591  tilde_crunch_written = true;
1592  last_char_was_newline = false;
1593  }
1594  }
1595  } else {
1596  // NORMAL PROCESSING of non tilde crunched words.
1597  tilde_crunch_written = false;
1599  const char *wordstr = word->best_choice->unichar_string().c_str();
1600  const auto &lengths = word->best_choice->unichar_lengths();
1601  int length = lengths.length();
1602  int i = 0;
1603  int offset = 0;
1604 
1605  if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
1606  // Prevent adjacent tilde across words - we know that adjacent tildes
1607  // within words have been removed.
1608  // Skip the first character.
1609  offset = lengths[i++];
1610  }
1611  if (i < length && wordstr[offset] != 0) {
1612  if (!last_char_was_newline) {
1613  *ptr++ = ' ';
1614  } else {
1615  last_char_was_newline = false;
1616  }
1617  for (; i < length; offset += lengths[i++]) {
1618  if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
1619  *ptr++ = kUNLVReject;
1620  last_char_was_tilde = true;
1621  } else {
1622  if (word->reject_map[i].rejected()) {
1623  *ptr++ = kUNLVSuspect;
1624  }
1625  UNICHAR ch(wordstr + offset, lengths[i]);
1626  int uni_ch = ch.first_uni();
1627  for (int j = 0; kUniChs[j] != 0; ++j) {
1628  if (kUniChs[j] == uni_ch) {
1629  uni_ch = kLatinChs[j];
1630  break;
1631  }
1632  }
1633  if (uni_ch <= 0xff) {
1634  *ptr++ = static_cast<char>(uni_ch);
1635  last_char_was_tilde = false;
1636  } else {
1637  *ptr++ = kUNLVReject;
1638  last_char_was_tilde = true;
1639  }
1640  }
1641  }
1642  }
1643  }
1644  if (word->word->flag(W_EOL) && !last_char_was_newline) {
1645  /* Add a new line output */
1646  *ptr++ = '\n';
1647  tilde_crunch_written = false;
1648  last_char_was_newline = true;
1649  last_char_was_tilde = false;
1650  }
1651  }
1652  *ptr++ = '\n';
1653  *ptr = '\0';
1654  return result;
1655 }
1656 
1657 #ifndef DISABLED_LEGACY_ENGINE
1658 
1668 bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
1669  const char **script_name, float *script_conf) {
1670  OSResults osr;
1671 
1672  bool osd = DetectOS(&osr);
1673  if (!osd) {
1674  return false;
1675  }
1676 
1677  int orient_id = osr.best_result.orientation_id;
1678  int script_id = osr.get_best_script(orient_id);
1679  if (orient_conf) {
1680  *orient_conf = osr.best_result.oconfidence;
1681  }
1682  if (orient_deg) {
1683  *orient_deg = orient_id * 90; // convert quadrant to degrees
1684  }
1685 
1686  if (script_name) {
1687  const char *script = osr.unicharset->get_script_from_script_id(script_id);
1688 
1689  *script_name = script;
1690  }
1691 
1692  if (script_conf) {
1693  *script_conf = osr.best_result.sconfidence;
1694  }
1695 
1696  return true;
1697 }
1698 
1704 char *TessBaseAPI::GetOsdText(int page_number) {
1705  int orient_deg;
1706  float orient_conf;
1707  const char *script_name;
1708  float script_conf;
1709 
1710  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {
1711  return nullptr;
1712  }
1713 
1714  // clockwise rotation needed to make the page upright
1715  int rotate = OrientationIdToValue(orient_deg / 90);
1716 
1717  std::stringstream stream;
1718  // Use "C" locale (needed for float values orient_conf and script_conf).
1719  stream.imbue(std::locale::classic());
1720  // Use fixed notation with 2 digits after the decimal point for float values.
1721  stream.precision(2);
1722  stream << std::fixed << "Page number: " << page_number << "\n"
1723  << "Orientation in degrees: " << orient_deg << "\n"
1724  << "Rotate: " << rotate << "\n"
1725  << "Orientation confidence: " << orient_conf << "\n"
1726  << "Script: " << script_name << "\n"
1727  << "Script confidence: " << script_conf << "\n";
1728  const std::string &text = stream.str();
1729  char *result = new char[text.length() + 1];
1730  strcpy(result, text.c_str());
1731  return result;
1732 }
1733 
1734 #endif // ndef DISABLED_LEGACY_ENGINE
1735 
1738  int *conf = AllWordConfidences();
1739  if (!conf) {
1740  return 0;
1741  }
1742  int sum = 0;
1743  int *pt = conf;
1744  while (*pt >= 0) {
1745  sum += *pt++;
1746  }
1747  if (pt != conf) {
1748  sum /= pt - conf;
1749  }
1750  delete[] conf;
1751  return sum;
1752 }
1753 
1756  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1757  return nullptr;
1758  }
1759  int n_word = 0;
1760  PAGE_RES_IT res_it(page_res_);
1761  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1762  n_word++;
1763  }
1764 
1765  int *conf = new int[n_word + 1];
1766  n_word = 0;
1767  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1768  WERD_RES *word = res_it.word();
1769  WERD_CHOICE *choice = word->best_choice;
1770  int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1771  // This is the eq for converting Tesseract confidence to 1..100
1772  if (w_conf < 0) {
1773  w_conf = 0;
1774  }
1775  if (w_conf > 100) {
1776  w_conf = 100;
1777  }
1778  conf[n_word++] = w_conf;
1779  }
1780  conf[n_word] = -1;
1781  return conf;
1782 }
1783 
1784 #ifndef DISABLED_LEGACY_ENGINE
1795 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
1796  int debug = 0;
1797  GetIntVariable("applybox_debug", &debug);
1798  bool success = true;
1799  PageSegMode current_psm = GetPageSegMode();
1800  SetPageSegMode(mode);
1801  SetVariable("classify_enable_learning", "0");
1802  const std::unique_ptr<const char[]> text(GetUTF8Text());
1803  if (debug) {
1804  tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1805  }
1806  if (text != nullptr) {
1807  PAGE_RES_IT it(page_res_);
1808  WERD_RES *word_res = it.word();
1809  if (word_res != nullptr) {
1810  word_res->word->set_text(wordstr);
1811  // Check to see if text matches wordstr.
1812  int w = 0;
1813  int t;
1814  for (t = 0; text[t] != '\0'; ++t) {
1815  if (text[t] == '\n' || text[t] == ' ') {
1816  continue;
1817  }
1818  while (wordstr[w] == ' ') {
1819  ++w;
1820  }
1821  if (text[t] != wordstr[w]) {
1822  break;
1823  }
1824  ++w;
1825  }
1826  if (text[t] != '\0' || wordstr[w] != '\0') {
1827  // No match.
1828  delete page_res_;
1829  std::vector<TBOX> boxes;
1833  PAGE_RES_IT pr_it(page_res_);
1834  if (pr_it.word() == nullptr) {
1835  success = false;
1836  } else {
1837  word_res = pr_it.word();
1838  }
1839  } else {
1840  word_res->BestChoiceToCorrectText();
1841  }
1842  if (success) {
1843  tesseract_->EnableLearning = true;
1844  tesseract_->LearnWord(nullptr, word_res);
1845  }
1846  } else {
1847  success = false;
1848  }
1849  } else {
1850  success = false;
1851  }
1852  SetPageSegMode(current_psm);
1853  return success;
1854 }
1855 #endif // ndef DISABLED_LEGACY_ENGINE
1856 
1864  if (thresholder_ != nullptr) {
1865  thresholder_->Clear();
1866  }
1867  ClearResults();
1868  if (tesseract_ != nullptr) {
1869  SetInputImage(nullptr);
1870  }
1871 }
1872 
1880  Clear();
1881  delete thresholder_;
1882  thresholder_ = nullptr;
1883  delete page_res_;
1884  page_res_ = nullptr;
1885  delete block_list_;
1886  block_list_ = nullptr;
1887  if (paragraph_models_ != nullptr) {
1888  for (auto model : *paragraph_models_) {
1889  delete model;
1890  }
1891  delete paragraph_models_;
1892  paragraph_models_ = nullptr;
1893  }
1894 #ifndef DISABLED_LEGACY_ENGINE
1895  if (osd_tesseract_ == tesseract_) {
1896  osd_tesseract_ = nullptr;
1897  }
1898  delete osd_tesseract_;
1899  osd_tesseract_ = nullptr;
1900  delete equ_detect_;
1901  equ_detect_ = nullptr;
1902 #endif // ndef DISABLED_LEGACY_ENGINE
1903  delete tesseract_;
1904  tesseract_ = nullptr;
1905  input_file_.clear();
1906  output_file_.clear();
1907  datapath_.clear();
1908  language_.clear();
1909 }
1910 
1911 // Clear any library-level memory caches.
1912 // There are a variety of expensive-to-load constant data structures (mostly
1913 // language dictionaries) that are cached globally -- surviving the Init()
1914 // and End() of individual TessBaseAPI's. This function allows the clearing
1915 // of these caches.
1918 }
1919 
1924 int TessBaseAPI::IsValidWord(const char *word) const {
1925  return tesseract_->getDict().valid_word(word);
1926 }
1927 // Returns true if utf8_character is defined in the UniCharset.
1928 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
1929  return tesseract_->unicharset.contains_unichar(utf8_character);
1930 }
1931 
1932 // TODO(rays) Obsolete this function and replace with a more aptly named
1933 // function that returns image coordinates rather than tesseract coordinates.
1934 bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
1935  const std::unique_ptr<const PageIterator> it(AnalyseLayout());
1936  if (it == nullptr) {
1937  return false;
1938  }
1939  int x1, x2, y1, y2;
1940  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1941  // Calculate offset and slope (NOTE: Kind of ugly)
1942  if (x2 <= x1) {
1943  x2 = x1 + 1;
1944  }
1945  // Convert the point pair to slope/offset of the baseline (in image coords.)
1946  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1947  *out_offset = static_cast<int>(y1 - *out_slope * x1);
1948  // Get the y-coord of the baseline at the left and right edges of the
1949  // textline's bounding box.
1950  int left, top, right, bottom;
1951  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1952  return false;
1953  }
1954  int left_y = IntCastRounded(*out_slope * left + *out_offset);
1955  int right_y = IntCastRounded(*out_slope * right + *out_offset);
1956  // Shift the baseline down so it passes through the nearest bottom-corner
1957  // of the textline's bounding box. This is the difference between the y
1958  // at the lowest (max) edge of the box and the actual box bottom.
1959  *out_offset += bottom - std::max(left_y, right_y);
1960  // Switch back to bottom-up tesseract coordinates. Requires negation of
1961  // the slope and height - offset for the offset.
1962  *out_slope = -*out_slope;
1963  *out_offset = rect_height_ - *out_offset;
1964 
1965  return true;
1966 }
1967 
1970  if (tesseract_ != nullptr) {
1972  }
1973 }
1974 
1984  if (tesseract_ != nullptr) {
1986  // Set it for the sublangs too.
1987  int num_subs = tesseract_->num_sub_langs();
1988  for (int i = 0; i < num_subs; ++i) {
1990  }
1991  }
1992 }
1993 
1996  if (tesseract_ == nullptr) {
1997  tprintf("Please call Init before attempting to set an image.\n");
1998  return false;
1999  }
2000  if (thresholder_ == nullptr) {
2002  }
2003  ClearResults();
2004  return true;
2005 }
2006 
2013 bool TessBaseAPI::Threshold(Pix **pix) {
2014  ASSERT_HOST(pix != nullptr);
2015  if (*pix != nullptr) {
2016  pixDestroy(pix);
2017  }
2018  // Zero resolution messes up the algorithms, so make sure it is credible.
2019  int user_dpi = 0;
2020  GetIntVariable("user_defined_dpi", &user_dpi);
2021  int y_res = thresholder_->GetScaledYResolution();
2022  if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
2023  tprintf(
2024  "Warning: User defined image dpi is outside of expected range "
2025  "(%d - %d)!\n",
2027  }
2028  // Always use user defined dpi
2029  if (user_dpi) {
2031  } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2032  if (y_res != 0) {
2033  // Show warning only if a resolution was given.
2034  tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
2035  y_res, kMinCredibleResolution);
2036  }
2038  }
2039 
2040  auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
2041 
2042  if (thresholding_method == ThresholdMethod::Otsu) {
2043  Image pix_binary(*pix);
2044  if (!thresholder_->ThresholdToPix(&pix_binary)) {
2045  return false;
2046  }
2047  *pix = pix_binary;
2048 
2049  if (!thresholder_->IsBinary()) {
2052  } else {
2053  tesseract_->set_pix_thresholds(nullptr);
2054  tesseract_->set_pix_grey(nullptr);
2055  }
2056  } else {
2057  auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
2058 
2059  if (!ok) {
2060  return false;
2061  }
2062  *pix = pix_binary;
2063 
2064  tesseract_->set_pix_thresholds(pix_thresholds);
2065  tesseract_->set_pix_grey(pix_grey);
2066  }
2067 
2069  &image_height_);
2070 
2071  // Set the internal resolution that is used for layout parameters from the
2072  // estimated resolution, rather than the image resolution, which may be
2073  // fabricated, but we will use the image resolution, if there is one, to
2074  // report output point sizes.
2075  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2077  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2078  tprintf(
2079  "Estimated internal resolution %d out of range! "
2080  "Corrected to %d.\n",
2081  thresholder_->GetScaledEstimatedResolution(), estimated_res);
2082  }
2083  tesseract_->set_source_resolution(estimated_res);
2084  return true;
2085 }
2086 
2089  if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
2090  tprintf("Please call SetImage before attempting recognition.\n");
2091  return -1;
2092  }
2093  if (recognition_done_) {
2094  ClearResults();
2095  }
2096  if (!block_list_->empty()) {
2097  return 0;
2098  }
2099  if (tesseract_ == nullptr) {
2100  tesseract_ = new Tesseract;
2101 #ifndef DISABLED_LEGACY_ENGINE
2103 #endif
2104  }
2105  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2106  return -1;
2107  }
2108 
2110 
2111 #ifndef DISABLED_LEGACY_ENGINE
2112  if (tesseract_->textord_equation_detect) {
2113  if (equ_detect_ == nullptr && !datapath_.empty()) {
2114  equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
2115  }
2116  if (equ_detect_ == nullptr) {
2117  tprintf("Warning: Could not set equation detector\n");
2118  } else {
2120  }
2121  }
2122 #endif // ndef DISABLED_LEGACY_ENGINE
2123 
2124  Tesseract *osd_tess = osd_tesseract_;
2125  OSResults osr;
2126 #ifndef DISABLED_LEGACY_ENGINE
2127  if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
2128  if (strcmp(language_.c_str(), "osd") == 0) {
2129  osd_tess = tesseract_;
2130  } else {
2131  osd_tesseract_ = new Tesseract;
2132  TessdataManager mgr(reader_);
2133  if (datapath_.empty()) {
2134  tprintf(
2135  "Warning: Auto orientation and script detection requested,"
2136  " but data path is undefined\n");
2137  delete osd_tesseract_;
2138  osd_tesseract_ = nullptr;
2139  } else if (osd_tesseract_->init_tesseract(datapath_.c_str(), "", "osd", OEM_TESSERACT_ONLY,
2140  nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
2141  osd_tess = osd_tesseract_;
2143  } else {
2144  tprintf(
2145  "Warning: Auto orientation and script detection requested,"
2146  " but osd language failed to load\n");
2147  delete osd_tesseract_;
2148  osd_tesseract_ = nullptr;
2149  }
2150  }
2151  }
2152 #endif // ndef DISABLED_LEGACY_ENGINE
2153 
2154  if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
2155  return -1;
2156  }
2157 
2158  // If Devanagari is being recognized, we use different images for page seg
2159  // and for OCR.
2160  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2161  return 0;
2162 }
2163 
2166  if (tesseract_ != nullptr) {
2167  tesseract_->Clear();
2168  }
2169  delete page_res_;
2170  page_res_ = nullptr;
2171  recognition_done_ = false;
2172  if (block_list_ == nullptr) {
2173  block_list_ = new BLOCK_LIST;
2174  } else {
2175  block_list_->clear();
2176  }
2177  if (paragraph_models_ != nullptr) {
2178  for (auto model : *paragraph_models_) {
2179  delete model;
2180  }
2181  delete paragraph_models_;
2182  paragraph_models_ = nullptr;
2183  }
2184 }
2185 
2193 int TessBaseAPI::TextLength(int *blob_count) const {
2194  if (tesseract_ == nullptr || page_res_ == nullptr) {
2195  return 0;
2196  }
2197 
2198  PAGE_RES_IT page_res_it(page_res_);
2199  int total_length = 2;
2200  int total_blobs = 0;
2201  // Iterate over the data structures to extract the recognition result.
2202  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2203  WERD_RES *word = page_res_it.word();
2204  WERD_CHOICE *choice = word->best_choice;
2205  if (choice != nullptr) {
2206  total_blobs += choice->length() + 2;
2207  total_length += choice->unichar_string().length() + 2;
2208  for (int i = 0; i < word->reject_map.length(); ++i) {
2209  if (word->reject_map[i].rejected()) {
2210  ++total_length;
2211  }
2212  }
2213  }
2214  }
2215  if (blob_count != nullptr) {
2216  *blob_count = total_blobs;
2217  }
2218  return total_length;
2219 }
2220 
2221 #ifndef DISABLED_LEGACY_ENGINE
2227  if (tesseract_ == nullptr) {
2228  return false;
2229  }
2230  ClearResults();
2231  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2232  return false;
2233  }
2234 
2235  if (input_file_.empty()) {
2236  input_file_ = kInputFile;
2237  }
2238  return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;
2239 }
2240 #endif // #ifndef DISABLED_LEGACY_ENGINE
2241 
2243  tesseract_->min_orientation_margin.set_value(margin);
2244 }
2245 
2260 void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
2261  delete[] * block_orientation;
2262  *block_orientation = nullptr;
2263  delete[] * vertical_writing;
2264  *vertical_writing = nullptr;
2265  BLOCK_IT block_it(block_list_);
2266 
2267  block_it.move_to_first();
2268  int num_blocks = 0;
2269  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2270  if (!block_it.data()->pdblk.poly_block()->IsText()) {
2271  continue;
2272  }
2273  ++num_blocks;
2274  }
2275  if (!num_blocks) {
2276  tprintf("WARNING: Found no blocks\n");
2277  return;
2278  }
2279  *block_orientation = new int[num_blocks];
2280  *vertical_writing = new bool[num_blocks];
2281  block_it.move_to_first();
2282  int i = 0;
2283  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2284  if (!block_it.data()->pdblk.poly_block()->IsText()) {
2285  continue;
2286  }
2287  FCOORD re_rotation = block_it.data()->re_rotation();
2288  float re_theta = re_rotation.angle();
2289  FCOORD classify_rotation = block_it.data()->classify_rotation();
2290  float classify_theta = classify_rotation.angle();
2291  double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
2292  if (rot_theta < 0) {
2293  rot_theta += 4;
2294  }
2295  int num_rotations = static_cast<int>(rot_theta + 0.5);
2296  (*block_orientation)[i] = num_rotations;
2297  // The classify_rotation is non-zero only if the text has vertical
2298  // writing direction.
2299  (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2300  ++i;
2301  }
2302 }
2303 
2304 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2305  int debug_level = 0;
2306  GetIntVariable("paragraph_debug_level", &debug_level);
2307  if (paragraph_models_ == nullptr) {
2308  paragraph_models_ = new std::vector<ParagraphModel *>;
2309  }
2310  MutableIterator *result_it = GetMutableIterator();
2311  do { // Detect paragraphs for this block
2312  std::vector<ParagraphModel *> models;
2313  ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);
2314  paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
2315  } while (result_it->Next(RIL_BLOCK));
2316  delete result_it;
2317 }
2318 
2320 const char *TessBaseAPI::GetUnichar(int unichar_id) const {
2321  return tesseract_->unicharset.id_to_unichar(unichar_id);
2322 }
2323 
2325 const Dawg *TessBaseAPI::GetDawg(int i) const {
2326  if (tesseract_ == nullptr || i >= NumDawgs()) {
2327  return nullptr;
2328  }
2329  return tesseract_->getDict().GetDawg(i);
2330 }
2331 
2334  return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2335 }
2336 
/**
 * Escapes the characters <, >, &, " and ' in `text` as HTML entities
 * (&lt; &gt; &amp; &quot; &#39;) and copies every other character unchanged.
 *
 * @param text NUL-terminated input; a null pointer yields an empty string.
 * @return The escaped copy of `text`.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr != '\0'; ++ptr) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
        break;
    }
  }
  return ret;
}
2364 
2365 } // namespace tesseract
struct TessResultRenderer TessResultRenderer
Definition: capi.h:61
#define TRUE
Definition: capi.h:40
#define BOOL
Definition: capi.h:39
#define UNICHAR_LEN
Definition: unichar.h:33
#define TESSERACT_VERSION_STR
Definition: version.h:32
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define MAX_PATH
Definition: host.h:41
#define BOOL_VAR(name, val, comment)
Definition: params.h:359
#define STRING_VAR(name, val, comment)
Definition: params.h:362
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:266
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:42
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:40
const char kTesseractReject
Definition: baseapi.cpp:106
const int kMinRectSize
Definition: baseapi.cpp:104
const int kBytesPerBoxFileLine
Definition: baseapi.cpp:1495
TESS_API int OrientationIdToValue(const int &id)
Definition: osdetect.cpp:566
@ CR_NONE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:188
@ PSM_OSD_ONLY
Orientation and script detection only.
Definition: publictypes.h:160
@ PSM_AUTO_ONLY
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:163
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:168
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:175
int(Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, bool) const DictFunc
Definition: baseapi.h:66
void chomp_string(char *str)
Definition: helpers.h:89
const int kBytesPer64BitNumber
Definition: baseapi.cpp:1497
double(Dict::*)(const char *, const char *, int, const char *, int) ProbabilityInContextFunc
Definition: baseapi.h:68
const int kMaxBytesPerLine
Definition: baseapi.cpp:1504
const int kLatinChs[]
Definition: baseapi.cpp:1554
int orientation_and_script_detection(const char *filename, OSResults *, tesseract::Tesseract *)
Definition: osdetect.cpp:188
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
std::string HOcrEscape(const char *text)
Definition: baseapi.cpp:2338
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:110
const int kBytesPerNumber
Definition: baseapi.cpp:1489
const char kUNLVReject
Definition: baseapi.cpp:108
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:36
const int kNumbersPerBlob
Definition: baseapi.cpp:1484
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:63
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
const char kUNLVSuspect
Definition: baseapi.cpp:110
const int kUniChs[]
Definition: baseapi.cpp:1552
EquationDetect * equ_detect_
The equation detector.
Definition: baseapi.h:769
virtual ~TessBaseAPI()
Definition: baseapi.cpp:231
const char * GetInitLanguagesAsString() const
Definition: baseapi.cpp:445
bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1102
const char * GetInputName()
Definition: baseapi.cpp:925
std::string input_file_
Name used by training code.
Definition: baseapi.h:775
virtual bool Threshold(Pix **pix)
Definition: baseapi.cpp:2013
bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1229
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:831
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:774
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:508
void GetBlockTextOrientations(int **block_orientation, bool **vertical_writing)
Definition: baseapi.cpp:2260
bool SetDebugVariable(const char *name, const char *value)
Definition: baseapi.cpp:284
const char * GetDatapath()
Definition: baseapi.cpp:932
bool GetVariableAsString(const char *name, std::string *val) const
Definition: baseapi.cpp:328
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:767
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:291
Boxa * GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:655
void SetRectangle(int left, int top, int width, int height)
Definition: baseapi.cpp:616
int NumDawgs() const
Definition: baseapi.cpp:2333
MutableIterator * GetMutableIterator()
Definition: baseapi.cpp:1330
int IsValidWord(const char *word) const
Definition: baseapi.cpp:1924
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:276
bool IsValidCharacter(const char *utf8_character) const
Definition: baseapi.cpp:1928
void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2304
static const char * Version()
Definition: baseapi.cpp:238
Boxa * GetWords(Pixa **pixa)
Definition: baseapi.cpp:677
std::string language_
Last initialized language.
Definition: baseapi.h:778
void GetAvailableLanguagesAsVector(std::vector< std::string > *langs) const
Definition: baseapi.cpp:468
void SetSourceResolution(int ppi)
Definition: baseapi.cpp:581
void ReadDebugConfigFile(const char *filename)
Definition: baseapi.cpp:499
ResultIterator * GetIterator()
Definition: baseapi.cpp:1313
bool GetTextDirection(int *out_offset, float *out_slope)
Definition: baseapi.cpp:1934
bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1068
int TextLength(int *blob_count) const
Definition: baseapi.cpp:2193
std::string datapath_
Current location of tessdata.
Definition: baseapi.h:777
int GetThresholdedImageScaleFactor() const
Definition: baseapi.cpp:787
bool DetectOS(OSResults *)
Definition: baseapi.cpp:2226
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:516
static void ClearPersistentCache()
Definition: baseapi.cpp:1916
std::vector< ParagraphModel * > * paragraph_models_
Definition: baseapi.h:772
void SetDictFunc(DictFunc f)
Definition: baseapi.cpp:1969
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:780
const Dawg * GetDawg(int i) const
Definition: baseapi.cpp:2325
FileReader reader_
Reads files from any filesystem.
Definition: baseapi.h:770
char * GetTSVText(int page_number)
Definition: baseapi.cpp:1372
void SetInputName(const char *name)
Definition: baseapi.cpp:267
char * GetOsdText(int page_number)
Definition: baseapi.cpp:1704
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:365
OcrEngineMode oem() const
Definition: baseapi.h:717
void PrintVariables(FILE *fp) const
Definition: baseapi.cpp:353
void GetLoadedLanguagesAsVector(std::vector< std::string > *langs) const
Definition: baseapi.cpp:454
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:771
static size_t getOpenCLDevice(void **device)
Definition: baseapi.cpp:249
std::string output_file_
Name used by debug code.
Definition: baseapi.h:776
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:573
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:699
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:809
char * GetBoxText(int page_number)
Definition: baseapi.cpp:1512
const char * GetStringVariable(const char *name) const
Definition: baseapi.cpp:311
void ReadConfigFile(const char *filename)
Definition: baseapi.cpp:494
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
Definition: baseapi.cpp:1795
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:773
void set_min_orientation_margin(double margin)
Definition: baseapi.cpp:2242
Boxa * GetStrips(Pixa **pixa, int **blockids)
Definition: baseapi.cpp:668
bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
Definition: baseapi.cpp:1668
void PrintFontsTable(FILE *fp) const
Definition: baseapi.cpp:335
char * TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height)
Definition: baseapi.cpp:536
void SetProbabilityInContextFunc(ProbabilityInContextFunc f)
Definition: baseapi.cpp:1983
LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1296
Tesseract * osd_tesseract_
For orientation & script detection.
Definition: baseapi.h:768
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:301
void ClearAdaptiveClassifier()
Definition: baseapi.cpp:557
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:317
Pix * GetThresholdedImage()
Definition: baseapi.cpp:628
const char * GetUnichar(int unichar_id) const
Definition: baseapi.cpp:2320
Boxa * GetConnectedComponents(Pixa **cc)
Definition: baseapi.cpp:687
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:917
void SetOutputName(const char *name)
Definition: baseapi.cpp:272
OcrEngineMode last_oem_requested_
Last ocr language mode requested.
Definition: baseapi.h:779
Boxa * GetRegions(Pixa **pixa)
Definition: baseapi.cpp:643
char * GetUTF8Text(PageIteratorLevel level) const
void * cancel_this
this or other data for cancel
Definition: ocrclass.h:115
void set_deadline_msecs(int32_t deadline_msecs)
Definition: ocrclass.h:127
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:111
OSBestResult best_result
Definition: osdetect.h:82
TESS_API int get_best_script(int orientation_id) const
Definition: osdetect.cpp:113
UNICHARSET * unicharset
Definition: osdetect.h:81
virtual bool Next(PageIteratorLevel level)
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
bool AddImage(TessBaseAPI *api)
Definition: renderer.cpp:88
bool BeginDocument(const char *title)
Definition: renderer.cpp:75
bool Next(PageIteratorLevel level) override
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
int first_uni() const
Definition: unichar.cpp:105
Image * mutable_pix_binary()
void SetEquationDetect(EquationDetect *detector)
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:291
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:270
void set_pix_grey(Image grey_pix)
bool TrainLineRecognizer(const char *input_imagename, const std::string &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:41
PAGE_RES * ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:110
int num_sub_langs() const
void TidyUp(PAGE_RES *page_res)
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:48
void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res)
void ReSegmentByClassification(PAGE_RES *page_res)
void set_pix_thresholds(Image thresholds)
Dict & getDict() override
Image pix_original() const
void recog_training_segmented(const char *filename, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
void set_pix_original(Image original_pix)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
void set_source_resolution(int ppi)
void CorrectClassifyWords(PAGE_RES *page_res)
Image pix_binary() const
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:354
FILE * init_recog_training(const char *filename)
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197
bool AnyLSTMLang() const
Tesseract * get_sub_lang(int index) const
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287
int GetScaledEstimatedResolution() const
Definition: thresholder.h:115
virtual Image GetPixRectThresholds()
int GetSourceYResolution() const
Definition: thresholder.h:99
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:58
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:70
int GetScaledYResolution() const
Definition: thresholder.h:102
virtual std::tuple< bool, Image, Image, Image > Threshold(TessBaseAPI *api, ThresholdMethod method)
void SetRectangle(int left, int top, int width, int height)
virtual Image GetPixRectGrey()
virtual bool ThresholdToPix(Image *pix)
Returns false on error.
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:84
void SetSourceYResolution(int ppi)
Definition: thresholder.h:95
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:53
bool is_italic() const
Definition: fontinfo.h:118
bool is_fixed_pitch() const
Definition: fontinfo.h:124
bool is_bold() const
Definition: fontinfo.h:121
bool is_fraktur() const
Definition: fontinfo.h:130
bool is_serif() const
Definition: fontinfo.h:127
Pix * pix_
Definition: image.h:27
Image clone() const
Definition: image.cpp:24
WERD_CHOICE * best_choice
Definition: pageres.h:239
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
void BestChoiceToCorrectText()
Definition: pageres.cpp:956
WERD_RES * restart_page()
Definition: pageres.h:710
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
float angle() const
find angle
Definition: points.h:246
float y() const
Definition: points.h:209
float certainty() const
Definition: ratngs.h:311
const std::string & unichar_lengths() const
Definition: ratngs.h:529
unsigned length() const
Definition: ratngs.h:283
std::string & unichar_string()
Definition: ratngs.h:515
uint16_t length() const
Definition: rejctmap.h:333
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
uint8_t space() const
Definition: werd.h:100
void set_text(const char *new_text)
Definition: werd.h:124
UNICHARSET unicharset
Definition: ccutil.h:61
std::string lang
Definition: ccutil.h:59
ParamsVectors * params()
Definition: ccutil.h:53
std::string datadir
Definition: ccutil.h:57
std::vector< BoolParam * > bool_params
Definition: params.h:47
std::vector< StringParam * > string_params
Definition: params.h:48
std::vector< IntParam * > int_params
Definition: params.h:46
std::vector< DoubleParam * > double_params
Definition: params.h:49
static bool GetParamAsString(const char *name, const ParamsVectors *member_params, std::string *value)
Definition: params.cpp:130
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:81
bool LoadMemBuffer(const char *name, const char *data, int size)
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:887
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324
bool WriteTRFile(const char *filename)
Definition: blobclass.cpp:60
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
void DeleteUnusedDawgs()
Definition: dawg_cache.h:42
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:172
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:385
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:801
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:381
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:354
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387