tesseract  5.0.0
classify.h
Go to the documentation of this file.
1 // File: classify.h
3 // Description: classify class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H_
20 #define TESSERACT_CLASSIFY_CLASSIFY_H_
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 # include "config_auto.h"
25 #endif
26 
27 #ifdef DISABLED_LEGACY_ENGINE
28 
29 # include "ccstruct.h"
30 # include "dict.h"
31 
32 namespace tesseract {
33 
34 class Classify : public CCStruct { // Reduced variant compiled when DISABLED_LEGACY_ENGINE is defined.
35 public:
36  Classify();
37  virtual ~Classify();
38  virtual Dict &getDict() { // Returns a mutable reference to the dict_ member.
39  return dict_;
40  }
41 
42  // Member variables (declared through the *_VAR_H macros).
43 
44  INT_VAR_H(classify_debug_level);
45  BOOL_VAR_H(classify_bln_numeric_mode);
46  double_VAR_H(classify_max_rating_ratio);
47  double_VAR_H(classify_max_certainty_margin);
48 
49 private:
50  Dict dict_; // Held by value; exposed through getDict().
51 };
52 
53 } // namespace tesseract
54 
55 #else // DISABLED_LEGACY_ENGINE not defined
56 
57 # include "adaptive.h"
58 # include "ccstruct.h"
59 # include "dict.h"
60 # include "featdefs.h"
61 # include "fontinfo.h"
62 # include "intfx.h"
63 # include "intmatcher.h"
64 # include "normalis.h"
65 # include "ocrfeatures.h"
66 # include "ratngs.h"
67 # include "unicity_table.h"
68 
69 namespace tesseract {
70 
71 class ScrollView;
72 class WERD_CHOICE;
73 class WERD_RES;
74 struct ADAPT_RESULTS;
75 struct NORM_PROTOS;
76 
77 static const int kUnknownFontinfoId = -1;
78 static const int kBlankFontinfoId = -2;
79 
80 class ShapeClassifier;
81 struct ShapeRating;
82 class ShapeTable;
83 struct UnicharRating;
84 
85 // How segmented is a blob. In this enum, character refers to a classifiable
86 // unit, but that is too long and character is usually easier to understand.
88  CST_FRAGMENT, // A partial character.
89  CST_WHOLE, // A correctly segmented character.
90  CST_IMPROPER, // More than one but less than 2 characters.
91  CST_NGRAM // Multiple characters.
92 };
93 
94 class TESS_API Classify : public CCStruct {
95 public:
96  Classify();
97  ~Classify() override;
98  virtual Dict &getDict() {
99  return dict_;
100  }
101 
102  const ShapeTable *shape_table() const {
103  return shape_table_;
104  }
105 
106  // Takes ownership of the given classifier, and uses it for future calls
107  // to CharNormClassifier.
108  void SetStaticClassifier(ShapeClassifier *static_classifier);
109 
110  // Adds a noise classification result that is a bit worse than the worst
111  // current result, or the worst possible result if no current results.
112  void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);
113 
114  // Returns true if the blob is small enough to be a large speckle.
115  bool LargeSpeckle(const TBLOB &blob);
116 
117  /* adaptive.cpp ************************************************************/
118  int GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId);
119  // Runs the class pruner from int_templates on the given features, returning
120  // the number of classes output in results.
121  // int_templates Class pruner tables
122  // num_features Number of features in blob
123  // features Array of features
124  // normalization_factors (input) Array of int_templates->NumClasses fudge
125  // factors from blob normalization process.
126  // (Indexed by CLASS_INDEX)
127  // expected_num_features (input) Array of int_templates->NumClasses
128  // expected number of features for each class.
129  // (Indexed by CLASS_INDEX)
130  // results (output) Sorted Array of pruned classes.
131  // Array must be sized to take the maximum possible
132  // number of outputs : int_templates->NumClasses.
133  int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this,
134  const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors,
135  const uint16_t *expected_num_features, std::vector<CP_RESULT_STRUCT> *results);
136  void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs);
137  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates);
138  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates);
139  ADAPT_TEMPLATES_STRUCT *ReadAdaptedTemplates(TFile *File);
140  /* normmatch.cpp ************************************************************/
141  float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch);
142  void FreeNormProtos();
143  NORM_PROTOS *ReadNormProtos(TFile *fp);
144  /* protos.cpp ***************************************************************/
145  void ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class);
146  INT_TEMPLATES_STRUCT *CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset);
147  /* adaptmatch.cpp ***********************************************************/
148 
149  // Learns the given word using its chopped_word, seam_array, denorm,
150  // box_word, best_state, and correct_text to learn both correctly and
151  // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
152  // is called and the data will be saved in an internal buffer.
153  // Otherwise AdaptToBlob is called for adaption within a document.
154  void LearnWord(const char *fontname, WERD_RES *word);
155 
156  // Builds a blob of length fragments, from the word, starting at start,
157  // and then learns it, as having the given correct_text.
158  // If fontname is not nullptr, then LearnBlob is called and the data will be
159  // saved in an internal buffer for static training.
160  // Otherwise AdaptToBlob is called for adaption within a document.
161  // threshold is a magic number required by AdaptToChar and generated by
162  // ComputeAdaptionThresholds.
163  // Although it can be partly inferred from the string, segmentation is
164  // provided to explicitly clarify the character segmentation.
165  void LearnPieces(const char *fontname, int start, int length, float threshold,
166  CharSegmentationType segmentation, const char *correct_text, WERD_RES *word);
167  void InitAdaptiveClassifier(TessdataManager *mgr);
168  void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,
169  ADAPT_TEMPLATES_STRUCT *Templates);
170  void AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,
171  const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,
172  INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities,
173  ADAPT_RESULTS *results);
174  void MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,
175  const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,
176  ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box,
177  const std::vector<CP_RESULT_STRUCT> &results, ADAPT_RESULTS *final_results);
178  // Converts configs to fonts, and if the result is not adapted, and a
179  // shape_table_ is present, the shape is expanded to include all
180  // unichar_ids represented, before applying a set of corrections to the
181  // distance rating in int_result, (see ComputeCorrectedRating.)
182  // The results are added to the final_results output.
183  void ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom,
184  int top, float cp_rating, int blob_length,
185  int matcher_multiplier, const uint8_t *cn_factors,
186  UnicharRating *int_result, ADAPT_RESULTS *final_results);
187  // Applies a set of corrections to the distance im_rating,
188  // including the cn_correction, miss penalty and additional penalty
189  // for non-alnums being vertical misfits. Returns the corrected distance.
190  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating,
191  int feature_misses, int bottom, int top, int blob_length,
192  int matcher_multiplier, const uint8_t *cn_factors);
193  void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results,
194  BLOB_CHOICE_LIST *Choices);
195  void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results);
196  int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures);
197 
198 # ifndef GRAPHICS_DISABLED
199  void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results);
200 # endif
201  PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],
202  INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask);
203  int MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,
204  int NumFeatures, INT_FEATURE_ARRAY Features,
205  FEATURE_SET FloatFeatures);
206  void MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob);
207  void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results);
208  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
209  void RemoveBadMatches(ADAPT_RESULTS *Results);
210  void SetAdaptiveThreshold(float Threshold);
211  void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features);
212  // Returns a string for the classifier class_id: either the corresponding
213  // unicharset debug_str or the shape_table_ debug str.
214  std::string ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id,
215  int config_id) const;
216  // Converts a classifier class_id index with a config ID to:
217  // shape_table_ present: a shape_table_ index OR
218  // No shape_table_: a font ID.
219  // Without shape training, each class_id, config pair represents a single
220  // unichar id/font combination, so this function looks up the corresponding
221  // font id.
222  // With shape training, each class_id, config pair represents a single
223  // shape table index, so the fontset_table stores the shape table index,
224  // and the shape_table_ must be consulted to obtain the actual unichar_id/
225  // font combinations that the shape represents.
226  int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const;
227  // Converts a shape_table_ index to a classifier class_id index (not a
228  // unichar-id!). Uses a search, so not fast.
229  int ShapeIDToClassID(int shape_id) const;
230  UNICHAR_ID *BaselineClassifier(TBLOB *Blob, const std::vector<INT_FEATURE_STRUCT> &int_features,
231  const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates,
232  ADAPT_RESULTS *Results);
233  int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results);
234 
235  // As CharNormClassifier, but operates on a TrainingSample and outputs to
236  // a vector of UnicharRating without conversion to classes.
237  int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
238  std::vector<UnicharRating> *results);
239  UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);
240  void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);
241  void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,
242  ADAPT_TEMPLATES_STRUCT *adaptive_templates);
243  void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class);
244  bool AdaptableWord(WERD_RES *word);
245  void EndAdaptiveClassifier();
246  void SettupPass1();
247  void SettupPass2();
248  void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices);
249  void ClassifyAsNoise(ADAPT_RESULTS *Results);
250  void ResetAdaptiveClassifierInternal();
251  void SwitchAdaptiveClassifier();
252  void StartBackupAdaptiveClassifier();
253 
254  int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates,
255  uint8_t *pruner_norm_array, uint8_t *char_norm_array);
256  // Computes the char_norm_array for the unicharset and, if not nullptr, the
257  // pruner_array as appropriate according to the existence of the shape_table.
258  // The norm_feature is deleted as it is almost certainly no longer needed.
259  void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,
260  uint8_t *char_norm_array, uint8_t *pruner_array);
261 
262  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config);
263  void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
264 
266  return NumAdaptationsFailed > 0;
267  }
269  return AdaptedTemplates->NumPermClasses == 0;
270  }
271  bool LooksLikeGarbage(TBLOB *blob);
272 #ifndef GRAPHICS_DISABLED
273  void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox);
274 #endif
275  // intfx.cpp
276  // Computes the DENORMS for bl(baseline) and cn(character) normalization
277  // during feature extraction. The input denorm describes the current state
278  // of the blob, which is usually a baseline-normalized word.
279  // The transforms that are set up are as follows:
280  // Baseline Normalized (bl) Output:
281  // We center the grapheme by aligning the x-coordinate of its centroid with
282  // x=128 and leaving the already-baseline-normalized y as-is.
283  //
284  // Character Normalized (cn) Output:
285  // We align the grapheme's centroid at the origin and scale it
286  // asymmetrically in x and y so that the 2nd moments are a standard value
287  // (51.2), i.e. the result is vaguely square.
288  // If classify_nonlinear_norm is true:
289  // A non-linear normalization is set up that attempts to evenly distribute
290  // edges across x and y.
291  //
292  // Some of the fields of fx_info are also set up:
293  // Length: Total length of outline.
294  // Rx: Rounded y second moment. (Reversed by convention.)
295  // Ry: Rounded x second moment.
296  // Xmean: Rounded x center of mass of the blob.
297  // Ymean: Rounded y center of mass of the blob.
298  static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm,
299  DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info);
300 
301  // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
302  // (x,y) position and angle as measured counterclockwise from the vector
303  // <-1, 0>, from blob using two normalizations defined by bl_denorm and
304  // cn_denorm. See SetupBLCNDenorms for definitions.
305  // If outline_cn_counts is not nullptr, on return it contains the cumulative
306  // number of cn features generated for each outline in the blob (in order).
307  // Thus after the first outline, there were (*outline_cn_counts)[0] features,
308  // after the second outline, there were (*outline_cn_counts)[1] features etc.
309  static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm,
310  std::vector<INT_FEATURE_STRUCT> *bl_features,
311  std::vector<INT_FEATURE_STRUCT> *cn_features,
312  INT_FX_RESULT_STRUCT *results, std::vector<int> *outline_cn_counts);
313  /* float2int.cpp ************************************************************/
314  void ClearCharNormArray(uint8_t *char_norm_array);
315  void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array);
316  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
317  /* intproto.cpp *************************************************************/
318  INT_TEMPLATES_STRUCT *ReadIntTemplates(TFile *fp);
319  void WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset);
320  CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on,
321  int *shape_id);
322  void ShowMatchDisplay();
323  /* font detection ***********************************************************/
325  return fontinfo_table_;
326  }
328  return fontinfo_table_;
329  }
331  return fontset_table_;
332  }
333  /* mfoutline.cpp ***********************************************************/
334  void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale);
335  /* outfeat.cpp ***********************************************************/
336  FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
337  /* picofeat.cpp ***********************************************************/
338  FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
339  FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info);
340  FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info);
341  /* blobclass.cpp ***********************************************************/
342  // Extracts features from the given blob and saves them in the tr_file_data_
343  // member variable.
344  // fontname: Name of font that this blob was printed in.
345  // cn_denorm: Character normalization transformation to apply to the blob.
346  // fx_info: Character normalization parameters computed with cn_denorm.
347  // blob_text: Ground truth text for the blob.
348  void LearnBlob(const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm,
349  const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text);
350  // Writes stored training data to a .tr file based on the given filename.
351  // Returns false on error.
352  bool WriteTRFile(const char *filename);
353 
354  // Member variables.
355 
356  // Parameters.
357  // Set during training (in lang.config) to indicate whether the divisible
358  // blobs chopper should be used (true for Latin script).
359  BOOL_VAR_H(allow_blob_division);
360  // Set during training (in lang.config) to indicate whether the divisible
361  // blobs chopper should be used in preference to chopping. Set to true for
362  // southern Indic scripts.
363  BOOL_VAR_H(prioritize_division);
364  BOOL_VAR_H(classify_enable_learning);
365  INT_VAR_H(classify_debug_level);
366 
367  /* mfoutline.cpp ***********************************************************/
368  /* control knobs used to control normalization of outlines */
369  INT_VAR_H(classify_norm_method);
370  double_VAR_H(classify_char_norm_range);
371  double_VAR_H(classify_max_rating_ratio);
372  double_VAR_H(classify_max_certainty_margin);
373 
374  /* adaptmatch.cpp ***********************************************************/
375  BOOL_VAR_H(tess_cn_matching);
376  BOOL_VAR_H(tess_bn_matching);
378  BOOL_VAR_H(classify_use_pre_adapted_templates);
379  BOOL_VAR_H(classify_save_adapted_templates);
380  BOOL_VAR_H(classify_enable_adaptive_debugger);
381  BOOL_VAR_H(classify_nonlinear_norm);
382  INT_VAR_H(matcher_debug_level);
383  INT_VAR_H(matcher_debug_flags);
384  INT_VAR_H(classify_learning_debug_level);
385  double_VAR_H(matcher_good_threshold);
386  double_VAR_H(matcher_reliable_adaptive_result);
387  double_VAR_H(matcher_perfect_threshold);
388  double_VAR_H(matcher_bad_match_pad);
389  double_VAR_H(matcher_rating_margin);
390  double_VAR_H(matcher_avg_noise_size);
391  INT_VAR_H(matcher_permanent_classes_min);
392  INT_VAR_H(matcher_min_examples_for_prototyping);
393  INT_VAR_H(matcher_sufficient_examples_for_prototyping);
394  double_VAR_H(matcher_clustering_max_angle_delta);
395  double_VAR_H(classify_misfit_junk_penalty);
396  double_VAR_H(rating_scale);
397  double_VAR_H(certainty_scale);
398  double_VAR_H(tessedit_class_miss_scale);
399  double_VAR_H(classify_adapted_pruning_factor);
400  double_VAR_H(classify_adapted_pruning_threshold);
401  INT_VAR_H(classify_adapt_proto_threshold);
402  INT_VAR_H(classify_adapt_feature_threshold);
403  BOOL_VAR_H(disable_character_fragments);
404  double_VAR_H(classify_character_fragments_garbage_certainty_threshold);
405  BOOL_VAR_H(classify_debug_character_fragments);
406  BOOL_VAR_H(matcher_debug_separate_windows);
407  STRING_VAR_H(classify_learn_debug_str);
408 
409  /* intmatcher.cpp **********************************************************/
410  INT_VAR_H(classify_class_pruner_threshold);
411  INT_VAR_H(classify_class_pruner_multiplier);
412  INT_VAR_H(classify_cp_cutoff_strength);
413  INT_VAR_H(classify_integer_matcher_multiplier);
414 
415  BOOL_VAR_H(classify_bln_numeric_mode);
416  double_VAR_H(speckle_large_max_size);
417  double_VAR_H(speckle_rating_penalty);
418 
419  // Use class variables to hold onto built-in templates and adapted templates.
420  INT_TEMPLATES_STRUCT *PreTrainedTemplates = nullptr;
421  ADAPT_TEMPLATES_STRUCT *AdaptedTemplates = nullptr;
422  // The backup adapted templates are created from the previous page (only)
423  // so they are always ready and reasonably well trained if the primary
424  // adapted templates become full.
425  ADAPT_TEMPLATES_STRUCT *BackupAdaptedTemplates = nullptr;
426 
427  // Create dummy proto and config masks for use with the built-in templates.
428  BIT_VECTOR AllProtosOn = nullptr;
429  BIT_VECTOR AllConfigsOn = nullptr;
430  BIT_VECTOR AllConfigsOff = nullptr;
431  BIT_VECTOR TempProtoMask = nullptr;
432  /* normmatch.cpp */
433  NORM_PROTOS *NormProtos = nullptr;
434  /* font detection ***********************************************************/
436  // Without shape training, each class_id, config pair represents a single
437  // unichar id/font combination, so each fontset_table_ entry holds font ids
438  // for each config in the class.
439  // With shape training, each class_id, config pair represents a single
440  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
441  // and the shape_table_ must be consulted to obtain the actual unichar_id/
442  // font combinations that the shape represents.
444 
445 protected:
448  // If a shape_table_ is present, it is used to remap classifier output in
449  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
450  // mean an index to the shape_table_ and the choices returned are *all* the
451  // shape_table_ entries at that index.
452  ShapeTable *shape_table_ = nullptr;
453 
454 private:
455  // The currently active static classifier.
456  ShapeClassifier *static_classifier_ = nullptr;
457 #ifndef GRAPHICS_DISABLED
458  ScrollView *learn_debug_win_ = nullptr;
459  ScrollView *learn_fragmented_word_debug_win_ = nullptr;
460  ScrollView *learn_fragments_debug_win_ = nullptr;
461 #endif
462 
463  // Training data gathered here for all the images in a document.
464  std::string tr_file_data_;
465 
466  Dict dict_;
467 
468  std::vector<uint16_t> shapetable_cutoffs_;
469 
470  /* variables used to hold performance statistics */
471  int NumAdaptationsFailed = 0;
472 
473  // Expected number of features in the class pruner, used to penalize
474  // unknowns that have too few features (like a c being classified as e) so
475  // it doesn't recognize everything as '@' or '#'.
476  // CharNormCutoffs is for the static classifier (with no shapetable).
477  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
478  // value in the adaptive classifier. Both are indexed by unichar_id.
479  // shapetable_cutoffs_ provides a similar value for each shape in the
480  // shape_table_
481  uint16_t CharNormCutoffs[MAX_NUM_CLASSES];
482  uint16_t BaselineCutoffs[MAX_NUM_CLASSES];
483 
484 public:
485  bool EnableLearning = true;
486 };
487 
488 } // namespace tesseract
489 
490 #endif // DISABLED_LEGACY_ENGINE
491 
492 #endif // TESSERACT_CLASSIFY_CLASSIFY_H_
#define classify_enable_adaptive_matcher
Definition: adaptmatch.cpp:78
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
void SetAdaptiveThreshold(float Threshold)
int UNICHAR_ID
Definition: unichar.h:36
int16_t PROTO_ID
Definition: matchdefs.h:40
void ShowMatchDisplay()
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:137
uint8_t FEATURE_ID
Definition: matchdefs.h:46
CharSegmentationType
Definition: classify.h:87
@ CST_IMPROPER
Definition: classify.h:90
@ CST_NGRAM
Definition: classify.h:91
@ CST_WHOLE
Definition: classify.h:89
@ CST_FRAGMENT
Definition: classify.h:88
INT_VAR_H(classify_learning_debug_level)
BOOL_VAR_H(classify_use_pre_adapted_templates)
IntegerMatcher im_
Definition: classify.h:446
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:268
INT_VAR_H(classify_class_pruner_multiplier)
BOOL_VAR_H(disable_character_fragments)
BOOL_VAR_H(prioritize_division)
double_VAR_H(classify_max_certainty_margin)
double_VAR_H(classify_adapted_pruning_factor)
INT_VAR_H(classify_norm_method)
double_VAR_H(classify_character_fragments_garbage_certainty_threshold)
INT_VAR_H(classify_class_pruner_threshold)
double_VAR_H(certainty_scale)
double_VAR_H(matcher_good_threshold)
double_VAR_H(speckle_rating_penalty)
~Classify() override
Definition: classify.cpp:151
double_VAR_H(speckle_large_max_size)
double_VAR_H(matcher_avg_noise_size)
STRING_VAR_H(classify_learn_debug_str)
BOOL_VAR_H(classify_enable_adaptive_matcher)
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324
BOOL_VAR_H(tess_bn_matching)
double_VAR_H(matcher_perfect_threshold)
INT_VAR_H(classify_debug_level)
INT_VAR_H(classify_integer_matcher_multiplier)
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:447
double_VAR_H(classify_char_norm_range)
double_VAR_H(classify_adapted_pruning_threshold)
UnicityTable< FontSet > fontset_table_
Definition: classify.h:443
double_VAR_H(classify_max_rating_ratio)
UnicityTable< FontSet > & get_fontset_table()
Definition: classify.h:330
BOOL_VAR_H(classify_nonlinear_norm)
INT_VAR_H(matcher_min_examples_for_prototyping)
BOOL_VAR_H(classify_bln_numeric_mode)
INT_VAR_H(classify_adapt_feature_threshold)
const UnicityTable< FontInfo > & get_fontinfo_table() const
Definition: classify.h:327
double_VAR_H(rating_scale)
double_VAR_H(matcher_clustering_max_angle_delta)
BOOL_VAR_H(classify_enable_adaptive_debugger)
INT_VAR_H(matcher_debug_level)
BOOL_VAR_H(matcher_debug_separate_windows)
bool AdaptiveClassifierIsFull() const
Definition: classify.h:265
INT_VAR_H(matcher_permanent_classes_min)
INT_VAR_H(classify_adapt_proto_threshold)
BOOL_VAR_H(tess_cn_matching)
INT_VAR_H(classify_cp_cutoff_strength)
double_VAR_H(matcher_reliable_adaptive_result)
double_VAR_H(matcher_rating_margin)
virtual Dict & getDict()
Definition: classify.h:98
INT_VAR_H(matcher_sufficient_examples_for_prototyping)
INT_VAR_H(matcher_debug_flags)
BOOL_VAR_H(classify_debug_character_fragments)
BOOL_VAR_H(classify_enable_learning)
double_VAR_H(tessedit_class_miss_scale)
double_VAR_H(classify_misfit_junk_penalty)
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:435
BOOL_VAR_H(classify_save_adapted_templates)
const ShapeTable * shape_table() const
Definition: classify.h:102
BOOL_VAR_H(allow_blob_division)
double_VAR_H(matcher_bad_match_pad)
#define TESS_API
Definition: export.h:34