tesseract  5.0.0
adaptmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: adaptmatch.cpp
3  ** Purpose: High level adaptive matcher.
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 
18 /*-----------------------------------------------------------------------------
19  Include Files and Type Defines
20 -----------------------------------------------------------------------------*/
21 #ifdef HAVE_CONFIG_H
22 # include "config_auto.h"
23 #endif
24 
25 #include "adaptive.h" // for ADAPT_CLASS
26 #include "ambigs.h" // for UnicharIdVector, UnicharAmbigs
27 #include "bitvec.h" // for FreeBitVector, NewBitVector, BIT_VECTOR
28 #include "blobs.h" // for TBLOB, TWERD
29 #include "classify.h" // for Classify, CST_FRAGMENT, CST_WHOLE
30 #include "dict.h" // for Dict
31 #include "errcode.h" // for ASSERT_HOST
32 #include "featdefs.h" // for CharNormDesc
33 #include "float2int.h" // for BASELINE_Y_SHIFT
34 #include "fontinfo.h" // for ScoredFont, FontSet
35 #include "intfx.h" // for BlobToTrainingSample, INT_FX_RESULT_S...
36 #include "intmatcher.h" // for CP_RESULT_STRUCT, IntegerMatcher
37 #include "intproto.h" // for INT_FEATURE_STRUCT, (anonymous), Clas...
38 #include "matchdefs.h" // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO
39 #include "mfoutline.h" // for baseline, character, MF_SCALE_FACTOR
40 #include "normalis.h" // for DENORM, kBlnBaselineOffset, kBlnXHeight
41 #include "normfeat.h" // for ActualOutlineLength, CharNormLength
42 #include "ocrfeatures.h" // for FEATURE_STRUCT, FEATURE
43 #include "oldlist.h" // for push, delete_d
44 #include "outfeat.h" // for OutlineFeatDir, OutlineFeatLength
45 #include "pageres.h" // for WERD_RES
46 #include "params.h" // for IntParam, BoolParam, DoubleParam, Str...
47 #include "picofeat.h" // for PicoFeatDir, PicoFeatX, PicoFeatY
48 #include "protos.h" // for PROTO_STRUCT, FillABC
49 #include "ratngs.h" // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...
50 #include "rect.h" // for TBOX
51 #include "scrollview.h" // for ScrollView, ScrollView::BROWN, Scroll...
52 #include "seam.h" // for SEAM
53 #include "shapeclassifier.h" // for ShapeClassifier
54 #include "shapetable.h" // for UnicharRating, ShapeTable, Shape, Uni...
55 #include "tessclassifier.h" // for TessClassifier
56 #include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP
57 #include "tprintf.h" // for tprintf
58 #include "trainingsample.h" // for TrainingSample
59 #include "unicharset.h" // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE
60 #include "unicity_table.h" // for UnicityTable
61 
62 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
63 #include "helpers.h" // for IntCastRounded, ClipToRange
64 #include "serialis.h" // for TFile
65 
66 #include <algorithm> // for max, min
67 #include <cassert> // for assert
68 #include <cmath> // for fabs
69 #include <cstdint> // for INT32_MAX, UINT8_MAX
70 #include <cstdio> // for fflush, fclose, fopen, stdout, FILE
71 #include <cstring> // for strstr, memset, strcmp
72 
73 namespace tesseract {
74 
75 // TODO: The parameter classify_enable_adaptive_matcher can cause
76 // a segmentation fault if it is set to false (issue #256),
77 // so override it here.
78 #define classify_enable_adaptive_matcher true
79 
80 #define ADAPT_TEMPLATE_SUFFIX ".a"
81 
82 #define MAX_MATCHES 10
83 #define UNLIKELY_NUM_FEAT 200
84 #define NO_DEBUG 0
85 #define MAX_ADAPTABLE_WERD_SIZE 40
86 
87 #define ADAPTABLE_WERD_ADJUSTMENT (0.05)
88 
89 #define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
90 
91 #define WORST_POSSIBLE_RATING (0.0f)
92 
93 struct ADAPT_RESULTS {
94  int32_t BlobLength;
98  float best_rating;
99  std::vector<UnicharRating> match;
100  std::vector<CP_RESULT_STRUCT> CPResults;
101 
104  inline void Initialize() {
105  BlobLength = INT32_MAX;
106  HasNonfragment = false;
107  ComputeBest();
108  }
109  // Computes best_unichar_id, best_match_index and best_rating.
110  void ComputeBest() {
111  best_unichar_id = INVALID_UNICHAR_ID;
112  best_match_index = -1;
114  for (unsigned i = 0; i < match.size(); ++i) {
115  if (match[i].rating > best_rating) {
116  best_rating = match[i].rating;
117  best_unichar_id = match[i].unichar_id;
118  best_match_index = i;
119  }
120  }
121  }
122 };
123 
124 struct PROTO_KEY {
127  int ConfigId;
128 };
129 
130 // Sort function to sort ratings appropriately by descending rating.
131 static bool SortDescendingRating(const UnicharRating &a, const UnicharRating &b) {
132  if (a.rating != b.rating) {
133  return a.rating > b.rating;
134  } else {
135  return a.unichar_id < b.unichar_id;
136  }
137 }
138 
139 /*-----------------------------------------------------------------------------
140  Private Macros
141 -----------------------------------------------------------------------------*/
// Returns true if the match is only marginal: the rating-equivalent of the
// confidence (1 - confidence) is worse than matcher_great_threshold.
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
  const float equivalent_rating = 1.0f - confidence;
  return equivalent_rating > matcher_great_threshold;
}
145 
146 /*-----------------------------------------------------------------------------
147  Private Function Prototypes
148 -----------------------------------------------------------------------------*/
149 // Returns the index of the given id in results, if present, or the size of the
150 // vector (index it will go at) if not present.
151 static unsigned FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
152  for (unsigned i = 0; i < results.match.size(); i++) {
153  if (results.match[i].unichar_id == id) {
154  return i;
155  }
156  }
157  return results.match.size();
158 }
159 
160 // Returns the current rating for a unichar id if we have rated it, defaulting
161 // to WORST_POSSIBLE_RATING.
162 static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
163  unsigned index = FindScoredUnichar(id, results);
164  if (index >= results.match.size()) {
165  return WORST_POSSIBLE_RATING;
166  }
167  return results.match[index].rating;
168 }
169 
170 void InitMatcherRatings(float *Rating);
171 
172 int MakeTempProtoPerm(void *item1, void *item2);
173 
174 void SetAdaptiveThreshold(float Threshold);
175 
176 /*-----------------------------------------------------------------------------
177  Public Code
178 -----------------------------------------------------------------------------*/
202 void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
203  assert(Choices != nullptr);
204  auto *Results = new ADAPT_RESULTS;
205  Results->Initialize();
206 
207  ASSERT_HOST(AdaptedTemplates != nullptr);
208 
209  DoAdaptiveMatch(Blob, Results);
210 
211  RemoveBadMatches(Results);
212  std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
213  RemoveExtraPuncs(Results);
214  Results->ComputeBest();
215  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices);
216 
217  // TODO(rays) Move to before ConvertMatchesToChoices!
218  if (LargeSpeckle(*Blob) || Choices->empty()) {
219  AddLargeSpeckleTo(Results->BlobLength, Choices);
220  }
221 
222  if (matcher_debug_level >= 1) {
223  tprintf("AD Matches = ");
224  PrintAdaptiveMatchResults(*Results);
225  }
226 
227 #ifndef GRAPHICS_DISABLED
228  if (classify_enable_adaptive_debugger) {
229  DebugAdaptiveClassifier(Blob, Results);
230  }
231 #endif
232 
233  delete Results;
234 } /* AdaptiveClassifier */
235 
236 #ifndef GRAPHICS_DISABLED
237 
238 // If *win is nullptr, sets it to a new ScrollView() object with title msg.
239 // Clears the window and draws baselines.
240 void Classify::RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset,
241  const TBOX &wbox) {
242  const int kSampleSpaceWidth = 500;
243  if (*win == nullptr) {
244  *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200, kSampleSpaceWidth * 2,
245  200, true);
246  }
247  (*win)->Clear();
248  (*win)->Pen(64, 64, 64);
249  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset, kSampleSpaceWidth, kBlnBaselineOffset);
250  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset, kSampleSpaceWidth,
252  (*win)->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
253 }
254 
255 #endif // !GRAPHICS_DISABLED
256 
257 // Learns the given word using its chopped_word, seam_array, denorm,
258 // box_word, best_state, and correct_text to learn both correctly and
259 // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
260 // is called and the data will be saved in an internal buffer.
261 // Otherwise AdaptToBlob is called for adaption within a document.
void Classify::LearnWord(const char *fontname, WERD_RES *word) {
  // One correct_text entry per recognized character; nothing to learn if empty.
  int word_len = word->correct_text.size();
  if (word_len == 0) {
    return;
  }

  // Per-character adaptation thresholds; only allocated in adaption mode
  // (fontname == nullptr). Freed at the end of this function.
  float *thresholds = nullptr;
  if (fontname == nullptr) {
    // Adaption mode.
    if (!EnableLearning || word->best_choice == nullptr) {
      return; // Can't or won't adapt.
    }

    if (classify_learning_debug_level >= 1) {
      tprintf("\n\nAdapting to word = %s\n", word->best_choice->debug_string().c_str());
    }
    thresholds = new float[word_len];
    word->ComputeAdaptionThresholds(certainty_scale, matcher_perfect_threshold,
                                    matcher_good_threshold, matcher_rating_margin, thresholds);
  }
  // Index of the first blob of the current character in chopped_word->blobs.
  int start_blob = 0;

#ifndef GRAPHICS_DISABLED
  if (classify_debug_character_fragments) {
    if (learn_fragmented_word_debug_win_ != nullptr) {
      learn_fragmented_word_debug_win_->Wait();
    }
    RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
                       word->chopped_word->bounding_box());
    RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
                       word->chopped_word->bounding_box());
    word->chopped_word->plot(learn_fragmented_word_debug_win_);
    // NOTE(review): a debug-window update line appears to have been lost here
    // in extraction — confirm against the upstream source.
  }
#endif // !GRAPHICS_DISABLED

  for (int ch = 0; ch < word_len; ++ch) {
    if (classify_debug_character_fragments) {
      tprintf("\nLearning %s\n", word->correct_text[ch].c_str());
    }
    // Empty correct_text marks a character we should not learn from.
    if (word->correct_text[ch].length() > 0) {
      float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;

      // Learn the whole character (best_state[ch] blobs joined into one).
      LearnPieces(fontname, start_blob, word->best_state[ch], threshold, CST_WHOLE,
                  word->correct_text[ch].c_str(), word);

      // Multi-blob characters: optionally learn each blob as a fragment.
      if (word->best_state[ch] > 1 && !disable_character_fragments) {
        // Check that the character breaks into meaningful fragments
        // that each match a whole character with at least
        // classify_character_fragments_garbage_certainty_threshold
        bool garbage = false;
        int frag;
        for (frag = 0; frag < word->best_state[ch]; ++frag) {
          TBLOB *frag_blob = word->chopped_word->blobs[start_blob + frag];
          if (classify_character_fragments_garbage_certainty_threshold < 0) {
            garbage |= LooksLikeGarbage(frag_blob);
          }
        }
        // Learn the fragments.
        if (!garbage) {
          bool pieces_all_natural = word->PiecesAllNatural(start_blob, word->best_state[ch]);
          if (pieces_all_natural || !prioritize_division) {
            for (frag = 0; frag < word->best_state[ch]; ++frag) {
              // Rewrite the first token of the label as a fragment label
              // (e.g. "a" -> fragment frag of best_state[ch]), keeping any
              // remaining space-separated tokens intact.
              std::vector<std::string> tokens = split(word->correct_text[ch], ' ');

              tokens[0] = CHAR_FRAGMENT::to_string(tokens[0].c_str(), frag, word->best_state[ch],
                                                   pieces_all_natural);

              // Re-join the tokens with single spaces.
              std::string full_string;
              for (unsigned i = 0; i < tokens.size(); i++) {
                full_string += tokens[i];
                if (i != tokens.size() - 1) {
                  full_string += ' ';
                }
              }
              LearnPieces(fontname, start_blob + frag, 1, threshold, CST_FRAGMENT,
                          full_string.c_str(), word);
            }
          }
        }
      }

      // TODO(rays): re-enable this part of the code when we switch to the
      // new classifier that needs to see examples of garbage.
      /*
if (word->best_state[ch] > 1) {
 // If the next blob is good, make junk with the rightmost fragment.
 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
 LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
 word->best_state[ch + 1] + 1,
 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
 }
 // If the previous blob is good, make junk with the leftmost fragment.
 if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
 LearnPieces(fontname, start_blob - word->best_state[ch - 1],
 word->best_state[ch - 1] + 1,
 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
 }
}
// If the next blob is good, make a join with it.
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
 std::string joined_text = word->correct_text[ch];
 joined_text += word->correct_text[ch + 1];
 LearnPieces(fontname, start_blob,
 word->best_state[ch] + word->best_state[ch + 1],
 threshold, CST_NGRAM, joined_text.c_str(), word);
}
*/
    }
    // Advance past all blobs belonging to this character.
    start_blob += word->best_state[ch];
  }
  delete[] thresholds;
} // LearnWord.
375 
376 // Builds a blob of length fragments, from the word, starting at start,
377 // and then learns it, as having the given correct_text.
378 // If fontname is not nullptr, then LearnBlob is called and the data will be
379 // saved in an internal buffer for static training.
380 // Otherwise AdaptToBlob is called for adaption within a document.
381 // threshold is a magic number required by AdaptToChar and generated by
382 // ComputeAdaptionThresholds.
383 // Although it can be partly inferred from the string, segmentation is
384 // provided to explicitly clarify the character segmentation.
void Classify::LearnPieces(const char *fontname, int start, int length, float threshold,
                           CharSegmentationType segmentation, const char *correct_text,
                           WERD_RES *word) {
  // TODO(daria) Remove/modify this if/when we want
  // to train and/or adapt to n-grams.
  // Only whole characters and (when enabled) fragments are learned.
  if (segmentation != CST_WHOLE && (segmentation != CST_FRAGMENT || disable_character_fragments)) {
    return;
  }

  // Temporarily merge the [start, start+length) blobs into a single blob;
  // undone by the matching BreakPieces call at the bottom of this function.
  if (length > 1) {
    SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
  }
  TBLOB *blob = word->chopped_word->blobs[start];
  // Rotate the blob if needed for classification.
  // Returns nullptr when no rotation was needed; we then use blob directly.
  TBLOB *rotated_blob = blob->ClassifyNormalizeIfNeeded();
  if (rotated_blob == nullptr) {
    rotated_blob = blob;
  }

#ifndef GRAPHICS_DISABLED
  // Draw debug windows showing the blob that is being learned if needed.
  if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) {
    RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600, word->chopped_word->bounding_box());
    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
    learn_debug_win_->Update();
    learn_debug_win_->Wait();
  }
  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
    ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
    blob->plot(learn_fragments_debug_win_, ScrollView::BLUE, ScrollView::BROWN);
    learn_fragments_debug_win_->Update();
  }
#endif // !GRAPHICS_DISABLED

  if (fontname != nullptr) {
    // Static-training path: extract features and buffer them via LearnBlob.
    classify_norm_method.set_value(character); // force char norm spc 30/11/93
    tess_bn_matching.set_value(false); // turn it off
    tess_cn_matching.set_value(false);
    DENORM bl_denorm, cn_denorm;
    INT_FX_RESULT_STRUCT fx_info;
    SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info);
    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
  } else if (unicharset.contains_unichar(correct_text)) {
    // Adaption path: adapt the templates to this character.
    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
    int font_id = word->fontinfo != nullptr ? fontinfo_table_.get_index(*word->fontinfo) : 0;
    if (classify_learning_debug_level >= 1) {
      tprintf("Adapting to char = %s, thr= %g font_id= %d\n", unicharset.id_to_unichar(class_id),
              threshold, font_id);
    }
    // If filename is not nullptr we are doing recognition
    // (as opposed to training), so we must have already set word fonts.
    AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
    if (BackupAdaptedTemplates != nullptr) {
      // Adapt the backup templates too. They will be used if the primary gets
      // too full.
      AdaptToChar(rotated_blob, class_id, font_id, threshold, BackupAdaptedTemplates);
    }
  } else if (classify_debug_level >= 1) {
    tprintf("Can't adapt to %s not in unicharset\n", correct_text);
  }
  // ClassifyNormalizeIfNeeded returned a freshly allocated blob; free it.
  if (rotated_blob != blob) {
    delete rotated_blob;
  }

  // Undo the JoinPieces above, restoring the original chopped blobs.
  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
} // LearnPieces.
451 
452 /*---------------------------------------------------------------------------*/
465  std::string Filename;
466  FILE *File;
467 
469  classify_save_adapted_templates) {
470  Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
471  File = fopen(Filename.c_str(), "wb");
472  if (File == nullptr) {
473  tprintf("Unable to save adapted templates to %s!\n", Filename.c_str());
474  } else {
475  tprintf("\nSaving adapted templates to %s ...", Filename.c_str());
476  fflush(stdout);
478  tprintf("\n");
479  fclose(File);
480  }
481  }
482 
483  delete AdaptedTemplates;
484  AdaptedTemplates = nullptr;
485  delete BackupAdaptedTemplates;
486  BackupAdaptedTemplates = nullptr;
487 
488  if (PreTrainedTemplates != nullptr) {
489  delete PreTrainedTemplates;
490  PreTrainedTemplates = nullptr;
491  }
493  FreeNormProtos();
494  if (AllProtosOn != nullptr) {
495  FreeBitVector(AllProtosOn);
496  FreeBitVector(AllConfigsOn);
497  FreeBitVector(AllConfigsOff);
498  FreeBitVector(TempProtoMask);
499  AllProtosOn = nullptr;
500  AllConfigsOn = nullptr;
501  AllConfigsOff = nullptr;
502  TempProtoMask = nullptr;
503  }
504  delete shape_table_;
505  shape_table_ = nullptr;
506  delete static_classifier_;
507  static_classifier_ = nullptr;
508 } /* EndAdaptiveClassifier */
509 
510 /*---------------------------------------------------------------------------*/
529  return;
530  }
531  if (AllProtosOn != nullptr) {
532  EndAdaptiveClassifier(); // Don't leak with multiple inits.
533  }
534 
535  // If there is no language_data_path_prefix, the classifier will be
536  // adaptive only.
537  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
538  TFile fp;
541 
542  if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
544  if (!shape_table_->DeSerialize(&fp)) {
545  tprintf("Error loading shape table!\n");
546  delete shape_table_;
547  shape_table_ = nullptr;
548  }
549  }
550 
552  ReadNewCutoffs(&fp, CharNormCutoffs);
553 
555  NormProtos = ReadNormProtos(&fp);
556  static_classifier_ = new TessClassifier(false, this);
557  }
558 
559  InitIntegerFX();
560 
561  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
562  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
563  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
564  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
565  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
566  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
567  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
568 
569  for (uint16_t &BaselineCutoff : BaselineCutoffs) {
570  BaselineCutoff = 0;
571  }
572 
573  if (classify_use_pre_adapted_templates) {
574  TFile fp;
575  std::string Filename = imagefile;
576  Filename += ADAPT_TEMPLATE_SUFFIX;
577  if (!fp.Open(Filename.c_str(), nullptr)) {
579  } else {
580  tprintf("\nReading pre-adapted templates from %s ...\n", Filename.c_str());
581  fflush(stdout);
583  tprintf("\n");
585 
586  for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
587  BaselineCutoffs[i] = CharNormCutoffs[i];
588  }
589  }
590  } else {
591  delete AdaptedTemplates;
593  }
594 } /* InitAdaptiveClassifier */
595 
597  if (classify_learning_debug_level > 0) {
598  tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n", NumAdaptationsFailed);
599  }
600  delete AdaptedTemplates;
602  delete BackupAdaptedTemplates;
603  BackupAdaptedTemplates = nullptr;
604  NumAdaptationsFailed = 0;
605 }
606 
607 // If there are backup adapted templates, switches to those, otherwise resets
608 // the main adaptive classifier (because it is full.)
610  if (BackupAdaptedTemplates == nullptr) {
612  return;
613  }
614  if (classify_learning_debug_level > 0) {
615  tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
616  NumAdaptationsFailed);
617  }
618  delete AdaptedTemplates;
620  BackupAdaptedTemplates = nullptr;
621  NumAdaptationsFailed = 0;
622 }
623 
624 // Resets the backup adaptive classifier to empty.
626  delete BackupAdaptedTemplates;
628 }
629 
630 /*---------------------------------------------------------------------------*/
648  EnableLearning = classify_enable_learning;
649 
651 
652 } /* SettupPass1 */
653 
654 /*---------------------------------------------------------------------------*/
664  EnableLearning = false;
666 
667 } /* SettupPass2 */
668 
669 /*---------------------------------------------------------------------------*/
686 void Classify::InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,
687  ADAPT_TEMPLATES_STRUCT *Templates) {
688  FEATURE_SET Features;
689  int Fid, Pid;
690  FEATURE Feature;
691  int NumFeatures;
692  PROTO_STRUCT *Proto;
693  INT_CLASS_STRUCT *IClass;
695 
696  classify_norm_method.set_value(baseline);
697  Features = ExtractOutlineFeatures(Blob);
698  NumFeatures = Features->NumFeatures;
699  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
700  delete Features;
701  return;
702  }
703 
704  Config = new TEMP_CONFIG_STRUCT(NumFeatures - 1, FontinfoId);
705  TempConfigFor(Class, 0) = Config;
706 
707  /* this is a kludge to construct cutoffs for adapted templates */
708  if (Templates == AdaptedTemplates) {
709  BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
710  }
711 
712  IClass = ClassForClassId(Templates->Templates, ClassId);
713 
714  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
715  Pid = AddIntProto(IClass);
716  assert(Pid != NO_PROTO);
717 
718  Feature = Features->Features[Fid];
719  auto TempProto = new TEMP_PROTO_STRUCT;
720  Proto = &(TempProto->Proto);
721 
722  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
723  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
724  instead of the -0.25 to 0.75 used in baseline normalization */
725  Proto->Angle = Feature->Params[OutlineFeatDir];
726  Proto->X = Feature->Params[OutlineFeatX];
727  Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
728  Proto->Length = Feature->Params[OutlineFeatLength];
729  FillABC(Proto);
730 
731  TempProto->ProtoId = Pid;
732  SET_BIT(Config->Protos, Pid);
733 
734  ConvertProto(Proto, Pid, IClass);
735  AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);
736 
737  Class->TempProtos = push(Class->TempProtos, TempProto);
738  }
739  delete Features;
740 
741  AddIntConfig(IClass);
742  ConvertConfig(AllProtosOn, 0, IClass);
743 
744  if (classify_learning_debug_level >= 1) {
745  tprintf("Added new class '%s' with class id %d and %d protos.\n",
746  unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
747 #ifndef GRAPHICS_DISABLED
748  if (classify_learning_debug_level > 1) {
749  DisplayAdaptedChar(Blob, IClass);
750  }
751 #endif
752  }
753 
754  if (IsEmptyAdaptedClass(Class)) {
755  (Templates->NumNonEmptyClasses)++;
756  }
757 } /* InitAdaptedClass */
758 
759 /*---------------------------------------------------------------------------*/
779  FEATURE_SET *FloatFeatures) {
780  FEATURE_SET Features;
781  int NumFeatures;
782 
783  classify_norm_method.set_value(baseline);
784  Features = ExtractPicoFeatures(Blob);
785 
786  NumFeatures = Features->NumFeatures;
787  if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
788  delete Features;
789  return 0;
790  }
791 
792  ComputeIntFeatures(Features, IntFeatures);
793  *FloatFeatures = Features;
794 
795  return NumFeatures;
796 } /* GetAdaptiveFeatures */
797 
798 /*-----------------------------------------------------------------------------
799  Private Code
800 -----------------------------------------------------------------------------*/
801 /*---------------------------------------------------------------------------*/
812  if (word->best_choice == nullptr) {
813  return false;
814  }
815  auto BestChoiceLength = word->best_choice->length();
816  float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
817  return // rules that apply in general - simplest to compute first
818  BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
819  BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
820  // This basically ensures that the word is at least a dictionary match
821  // (freq word, user word, system dawg word, etc).
822  // Since all the other adjustments will make adjust factor higher
823  // than higher than adaptable_score=1.1+0.05=1.15
824  // Since these are other flags that ensure that the word is dict word,
825  // this check could be at times redundant.
826  word->best_choice->adjust_factor() <= adaptable_score &&
827  // Make sure that alternative choices are not dictionary words.
828  word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
829 }
830 
831 /*---------------------------------------------------------------------------*/
// Adapts the given templates to Blob as an example of class ClassId with
// font FontinfoId. An empty class is initialized from the blob; otherwise
// the blob is matched against the class, either reinforcing the matched
// config (good match) or creating a new temporary config (poor match).
// Threshold is the adaptation threshold produced by
// ComputeAdaptionThresholds; higher rating (1 - rating <= Threshold)
// counts as a good match.
void Classify::AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,
                           ADAPT_TEMPLATES_STRUCT *adaptive_templates) {
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  UnicharRating int_result;
  INT_CLASS_STRUCT *IClass;
  ADAPT_CLASS_STRUCT *Class;
  TEMP_CONFIG_STRUCT *TempConfig;
  FEATURE_SET FloatFeatures;
  int NewTempConfigId;

  if (!LegalClassId(ClassId)) {
    return;
  }

  int_result.unichar_id = ClassId;
  Class = adaptive_templates->Class[ClassId];
  assert(Class != nullptr);
  if (IsEmptyAdaptedClass(Class)) {
    // First example of this class: build it from scratch.
    InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
  } else {
    IClass = ClassForClassId(adaptive_templates->Templates, ClassId);

    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
    if (NumFeatures <= 0) {
      return; // Features already freed by GetAdaptiveFeatures.
    }

    // Only match configs with the matching font.
    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
      if (GetFontinfoId(Class, cfg) == FontinfoId) {
        SET_BIT(MatchingFontConfigs, cfg);
      } else {
        reset_bit(MatchingFontConfigs, cfg);
      }
    }
    im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, &int_result,
              classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
    FreeBitVector(MatchingFontConfigs);

    SetAdaptiveThreshold(Threshold);

    // rating is confidence-like (higher is better), so 1 - rating is the
    // error; within Threshold counts as a good match.
    if (1.0f - int_result.rating <= Threshold) {
      if (ConfigIsPermanent(Class, int_result.config)) {
        // Permanent configs need no reinforcement.
        if (classify_learning_debug_level >= 1) {
          tprintf("Found good match to perm config %d = %4.1f%%.\n", int_result.config,
                  int_result.rating * 100.0);
        }
        delete FloatFeatures;
        return;
      }

      // Reinforce the matched temporary config.
      TempConfig = TempConfigFor(Class, int_result.config);
      IncreaseConfidence(TempConfig);
      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
      }
      if (classify_learning_debug_level >= 1) {
        tprintf("Increasing reliability of temp config %d to %d.\n", int_result.config,
                TempConfig->NumTimesSeen);
      }

      // Promote the config to permanent once it has been seen often enough.
      if (TempConfigReliable(ClassId, TempConfig)) {
        MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }
    } else {
      if (classify_learning_debug_level >= 1) {
        tprintf("Found poor match to temp config %d = %4.1f%%.\n", int_result.config,
                int_result.rating * 100.0);
#ifndef GRAPHICS_DISABLED
        if (classify_learning_debug_level > 2) {
          DisplayAdaptedChar(Blob, IClass);
        }
#endif
      }
      // No acceptable match: start a new temporary config from this blob.
      NewTempConfigId = MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId, NumFeatures,
                                               IntFeatures, FloatFeatures);
      if (NewTempConfigId >= 0 &&
          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
        MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }

#ifndef GRAPHICS_DISABLED
      if (classify_learning_debug_level > 1) {
        DisplayAdaptedChar(Blob, IClass);
      }
#endif
    }
    delete FloatFeatures;
  }
} /* AdaptToChar */
937 
938 #ifndef GRAPHICS_DISABLED
939 
941  INT_FX_RESULT_STRUCT fx_info;
942  std::vector<INT_FEATURE_STRUCT> bl_features;
943  TrainingSample *sample =
944  BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info, &bl_features);
945  if (sample == nullptr) {
946  return;
947  }
948 
949  UnicharRating int_result;
950  im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], &int_result,
951  classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
952  tprintf("Best match to temp config %d = %4.1f%%.\n", int_result.config,
953  int_result.rating * 100.0);
954  if (classify_learning_debug_level >= 2) {
955  uint32_t ConfigMask;
956  ConfigMask = 1 << int_result.config;
958  im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask), bl_features.size(),
959  &bl_features[0], &int_result, classify_adapt_feature_threshold, 6 | 0x19,
960  matcher_debug_separate_windows);
962  }
963 
964  delete sample;
965 }
966 
967 #endif
968 
// Adds (or updates) new_result in results, keeping the best_* bookkeeping
// consistent. The result is dropped if it is much worse than the current
// best (by matcher_bad_match_pad) or no better than an existing entry for
// the same unichar.
void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) {
  // Index of an existing entry for this unichar, or match.size() if none.
  auto old_match = FindScoredUnichar(new_result.unichar_id, *results);

  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
      (old_match < results->match.size() &&
       new_result.rating <= results->match[old_match].rating)) {
    return; // New one not good enough.
  }

  if (!unicharset.get_fragment(new_result.unichar_id)) {
    results->HasNonfragment = true;
  }

  if (old_match < results->match.size()) {
    // Existing entry: keep it but raise its rating.
    results->match[old_match].rating = new_result.rating;
  } else {
    // New entry: appended at index old_match (== previous size).
    results->match.push_back(new_result);
  }

  if (new_result.rating > results->best_rating &&
      // Ensure that fragments do not affect best rating, class and config.
      // This is needed so that at least one non-fragmented character is
      // always present in the results.
      // TODO(daria): verify that this helps accuracy and does not
      // hurt performance.
      !unicharset.get_fragment(new_result.unichar_id)) {
    results->best_match_index = old_match;
    results->best_rating = new_result.rating;
    results->best_unichar_id = new_result.unichar_id;
  }
} /* AddNewResult */
1017 
1018 /*---------------------------------------------------------------------------*/
// Matches the blob against only the classes listed in ambiguities (a list
// terminated by a negative id — presumably -1; confirm against caller),
// accumulating the corrected results into results.
void Classify::AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,
                               const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,
                               INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes,
                               UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) {
  if (int_features.empty()) {
    return;
  }
  // Per-class character normalization factors; freed at the end.
  auto *CharNormArray = new uint8_t[unicharset.size()];
  UnicharRating int_result;

  results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr, CharNormArray);
  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
  if (debug) {
    tprintf("AM Matches = ");
  }

  int top = blob->bounding_box().top();
  int bottom = blob->bounding_box().bottom();
  // Walk the ambiguity list until the negative terminator.
  while (*ambiguities >= 0) {
    CLASS_ID class_id = *ambiguities;

    int_result.unichar_id = class_id;
    // Match against all protos/configs of just this candidate class.
    im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(),
              &int_features[0], &int_result, classify_adapt_feature_threshold, NO_DEBUG,
              matcher_debug_separate_windows);

    ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0, results->BlobLength,
                                    classify_integer_matcher_multiplier, CharNormArray, &int_result,
                                    results);
    ambiguities++;
  }
  delete[] CharNormArray;
} /* AmbigClassifier */
1070 
1071 /*---------------------------------------------------------------------------*/
// Runs the integer matcher on every class-pruner candidate in 'results',
// then applies shape expansion and rating corrections, accumulating the
// outcome into 'final_results'. When 'classes' is non-null the permanent
// proto/config masks of the adapted class are used; otherwise all protos
// and configs are enabled.
1074 void Classify::MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,
1075  const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,
1076  ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier,
1077  const TBOX &blob_box, const std::vector<CP_RESULT_STRUCT> &results,
1078  ADAPT_RESULTS *final_results) {
1079  int top = blob_box.top();
1080  int bottom = blob_box.bottom();
1081  UnicharRating int_result;
1082  for (auto result : results) {
1083  CLASS_ID class_id = result.Class;
 // Adapted classes restrict matching to their permanent protos/configs.
1084  BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos : AllProtosOn;
1085  BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs : AllConfigsOn;
1086 
1087  int_result.unichar_id = class_id;
1088  im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features,
1089  &int_result, classify_adapt_feature_threshold, debug, matcher_debug_separate_windows);
1090  bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1091  ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top, result.Rating,
1092  final_results->BlobLength, matcher_multiplier, norm_factors,
1093  &int_result, final_results);
1094  }
1095 }
1096 
1097 // Converts configs to fonts, and if the result is not adapted, and a
1098 // shape_table_ is present, the shape is expanded to include all
1099 // unichar_ids represented, before applying a set of corrections to the
1100 // distance rating in int_result, (see ComputeCorrectedRating.)
1101 // The results are added to the final_results output.
1102 void Classify::ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id,
1103  int bottom, int top, float cp_rating,
1104  int blob_length, int matcher_multiplier,
1105  const uint8_t *cn_factors, UnicharRating *int_result,
1106  ADAPT_RESULTS *final_results) {
1107  if (classes != nullptr) {
1108  // Adapted result. Convert configs to fontinfo_ids.
1109  int_result->adapted = true;
1110  for (auto &font : int_result->fonts) {
1111  font.fontinfo_id = GetFontinfoId(classes[class_id], font.fontinfo_id);
1112  }
1113  } else {
1114  // Pre-trained result. Map fonts using font_sets_.
1115  int_result->adapted = false;
1116  for (auto &font : int_result->fonts) {
1117  font.fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, font.fontinfo_id);
1118  }
1119  if (shape_table_ != nullptr) {
1120  // Two possible cases:
1121  // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1122  // int_result->fonts are the same. In this case build a new vector of
1123  // mapped fonts and replace the fonts in int_result.
1124  // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1125  // by int_result. In this case, build a vector of UnicharRating to
1126  // gather together different font-ids for each unichar. Also covers case1.
1127  std::vector<UnicharRating> mapped_results;
1128  for (auto &f : int_result->fonts) {
 // After the mapping above, fontinfo_id here holds a shape id.
1129  int shape_id = f.fontinfo_id;
1130  const Shape &shape = shape_table_->GetShape(shape_id);
1131  for (int c = 0; c < shape.size(); ++c) {
1132  int unichar_id = shape[c].unichar_id;
1133  if (!unicharset.get_enabled(unichar_id)) {
1134  continue;
1135  }
1136  // Find the mapped_result for unichar_id.
1137  unsigned r = 0;
1138  for (r = 0; r < mapped_results.size() && mapped_results[r].unichar_id != unichar_id;
1139  ++r) {
1140  }
 // Not seen before: start a new entry cloned from int_result with an
 // empty font list.
1141  if (r == mapped_results.size()) {
1142  mapped_results.push_back(*int_result);
1143  mapped_results[r].unichar_id = unichar_id;
1144  mapped_results[r].fonts.clear();
1145  }
1146  for (int font_id : shape[c].font_ids) {
1147  mapped_results[r].fonts.emplace_back(font_id, f.score);
1148  }
1149  }
1150  }
 // Score and record each per-unichar result, then we are done.
1151  for (auto &m : mapped_results) {
1152  m.rating = ComputeCorrectedRating(
1153  debug, m.unichar_id, cp_rating, int_result->rating,
1154  int_result->feature_misses, bottom, top, blob_length, matcher_multiplier, cn_factors);
1155  AddNewResult(m, final_results);
1156  }
1157  return;
1158  }
1159  }
 // No shape table (or adapted result): record the single class result.
1160  if (unicharset.get_enabled(class_id)) {
1161  int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, int_result->rating,
1162  int_result->feature_misses, bottom, top,
1163  blob_length, matcher_multiplier, cn_factors);
1164  AddNewResult(*int_result, final_results);
1165  }
1166 }
1167 
1168 // Applies a set of corrections to the confidence im_rating,
1169 // including the cn_correction, miss penalty and additional penalty
1170 // for non-alnums being vertical misfits. Returns the corrected confidence.
1171 double Classify::ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
1172  double im_rating, int feature_misses, int bottom, int top,
1173  int blob_length, int matcher_multiplier,
1174  const uint8_t *cn_factors) {
1175  // Compute class feature corrections.
 // Note the matcher works on 1 - rating (a distance), so convert first.
1176  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id],
1177  matcher_multiplier);
1178  double miss_penalty = tessedit_class_miss_scale * feature_misses;
1179  double vertical_penalty = 0.0;
1180  // Penalize non-alnums for being vertical misfits.
1181  if (!unicharset.get_isalpha(unichar_id) && !unicharset.get_isdigit(unichar_id) &&
1182  cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1183  int min_bottom, max_bottom, min_top, max_top;
1184  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
1185  if (debug) {
1186  tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n", top, min_top, max_top, bottom,
1187  min_bottom, max_bottom);
1188  }
1189  if (top < min_top || top > max_top || bottom < min_bottom || bottom > max_bottom) {
1190  vertical_penalty = classify_misfit_junk_penalty;
1191  }
1192  }
 // Convert the summed distance back into a confidence, clipped to the
 // worst possible rating.
1193  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1194  if (result < WORST_POSSIBLE_RATING) {
1195  result = WORST_POSSIBLE_RATING;
1196  }
1197  if (debug) {
1198  tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1199  unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0,
1200  (1.0 - im_rating) * 100.0, (cn_corrected - (1.0 - im_rating)) * 100.0,
1201  cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0);
1202  }
1203  return result;
1204 }
1205 
1206 /*---------------------------------------------------------------------------*/
1225  const std::vector<INT_FEATURE_STRUCT> &int_features,
1226  const INT_FX_RESULT_STRUCT &fx_info,
1227  ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results) {
1228  if (int_features.empty()) {
1229  return nullptr;
1230  }
1231  auto *CharNormArray = new uint8_t[unicharset.size()];
1232  ClearCharNormArray(CharNormArray);
1233 
1235  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0], CharNormArray,
1236  BaselineCutoffs, &Results->CPResults);
1237 
1238  if (matcher_debug_level >= 2 || classify_debug_level > 1) {
1239  tprintf("BL Matches = ");
1240  }
1241 
1242  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray,
1243  Templates->Class, matcher_debug_flags, 0, Blob->bounding_box(), Results->CPResults,
1244  Results);
1245 
1246  delete[] CharNormArray;
1247  CLASS_ID ClassId = Results->best_unichar_id;
1248  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) {
1249  return nullptr;
1250  }
1251 
1252  return Templates->Class[ClassId]
1253  ->Config[Results->match[Results->best_match_index].config]
1254  .Perm->Ambigs;
1255 } /* BaselineClassifier */
1256 
1257 /*---------------------------------------------------------------------------*/
1274  ADAPT_RESULTS *adapt_results) {
1275  // This is the length that is used for scaling ratings vs certainty.
1276  adapt_results->BlobLength = IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1277  std::vector<UnicharRating> unichar_results;
1278  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0, -1, &unichar_results);
1279  // Convert results to the format used internally by AdaptiveClassifier.
1280  for (auto &r : unichar_results) {
1281  AddNewResult(r, adapt_results);
1282  }
1283  return sample.num_features();
1284 } /* CharNormClassifier */
1285 
1286 // As CharNormClassifier, but operates on a TrainingSample and outputs to
1287 // a vector of ShapeRating without conversion to classes.
// Classifies a TrainingSample against the pre-trained templates, returning
// results as UnicharRatings. If pruner_only, only the class pruner is run
// and its candidates are returned; otherwise the full integer matcher runs
// on the pruner candidates. keep_this >= 0 restricts the candidates to that
// single class. Returns the number of features in the sample.
1288 int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
1289  std::vector<UnicharRating> *results) {
1290  results->clear();
1291  std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS());
1292  adapt_results->Initialize();
1293  // Compute the bounding box of the features.
1294  uint32_t num_features = sample.num_features();
1295  // Only the top and bottom of the blob_box are used by MasterMatcher, so
1296  // fabricate right and left using top and bottom.
1297  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1298  sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1299  // Compute the char_norm_array from the saved cn_feature.
1300  FEATURE norm_feature = sample.GetCNFeature();
1301  std::vector<uint8_t> char_norm_array(unicharset.size());
 // The pruner array must be big enough for either unichars or classes.
1302  auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses);
1303  std::vector<uint8_t> pruner_norm_array(num_pruner_classes);
1304  adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5f);
1305  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]);
1306 
1307  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0],
1308  shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1309  &adapt_results->CPResults);
 // Force the pruner output down to just the requested class.
1310  if (keep_this >= 0) {
1311  adapt_results->CPResults[0].Class = keep_this;
1312  adapt_results->CPResults.resize(1);
1313  }
1314  if (pruner_only) {
1315  // Convert pruner results to output format.
1316  for (auto &it : adapt_results->CPResults) {
1317  int class_id = it.Class;
1318  results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
1319  }
1320  } else {
1321  MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr,
1322  matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
1323  adapt_results->CPResults, adapt_results.get());
1324  // Convert master matcher results to output format.
1325  for (auto &i : adapt_results->match) {
1326  results->push_back(i);
1327  }
1328  if (results->size() > 1) {
1329  std::sort(results->begin(), results->end(), SortDescendingRating);
1330  }
1331  }
1332  return num_features;
1333 } /* CharNormTrainingSample */
1334 
1335 /*---------------------------------------------------------------------------*/
1348  float rating = results->BlobLength / matcher_avg_noise_size;
1349  rating *= rating;
1350  rating /= 1 + rating;
1351 
1352  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1353 } /* ClassifyAsNoise */
1354 
// Converts the accumulated UnicharRatings in Results into BLOB_CHOICEs
// appended to Choices, applying rating/certainty scaling, capping the
// number of returned choices, and filtering poor adapted results.
1361 void Classify::ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box,
1362  ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) {
1363  assert(Choices != nullptr);
1364  float Rating;
1365  float Certainty;
1366  BLOB_CHOICE_IT temp_it;
1367  bool contains_nonfrag = false;
1368  temp_it.set_to_list(Choices);
1369  int choices_length = 0;
1370  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1371  // number of returned results, but with a shape_table_ we want to have room
1372  // for at least the biggest shape (which might contain hundreds of Indic
1373  // grapheme fragments) and more, so use double the size of the biggest shape
1374  // if that is more than the default.
1375  int max_matches = MAX_MATCHES;
1376  if (shape_table_ != nullptr) {
1377  max_matches = shape_table_->MaxNumUnichars() * 2;
1378  if (max_matches < MAX_MATCHES) {
1379  max_matches = MAX_MATCHES;
1380  }
1381  }
1382 
1383  float best_certainty = -FLT_MAX;
1384  for (auto &it : Results->match) {
1385  const UnicharRating &result = it;
1386  bool adapted = result.adapted;
1387  bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1388  if (temp_it.length() + 1 == max_matches && !contains_nonfrag && current_is_frag) {
1389  continue; // look for a non-fragmented character to fill the
1390  // last spot in Choices if only fragments are present
1391  }
1392  // BlobLength can never be legally 0, this means recognition failed.
1393  // But we must return a classification result because some invoking
1394  // functions (chopper/permuter) do not anticipate a null blob choice.
1395  // So we need to assign a poor, but not infinitely bad score.
1396  if (Results->BlobLength == 0) {
1397  Certainty = -20;
1398  Rating = 100; // should be -certainty * real_blob_length
1399  } else {
1400  Rating = Certainty = (1.0f - result.rating);
1401  Rating *= rating_scale * Results->BlobLength;
1402  Certainty *= -(getDict().certainty_scale);
1403  }
1404  // Adapted results, by their very nature, should have good certainty.
1405  // Those that don't are at best misleading, and often lead to errors,
1406  // so don't accept adapted results that are too far behind the best result,
1407  // whether adapted or static.
1408  // TODO(rays) find some way of automatically tuning these constants.
1409  if (Certainty > best_certainty) {
1410  best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1411  } else if (adapted && Certainty / classify_adapted_pruning_factor < best_certainty) {
1412  continue; // Don't accept bad adapted results.
1413  }
1414 
1415  float min_xheight, max_xheight, yshift;
1416  denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift);
1417  auto *choice = new BLOB_CHOICE(
1418  result.unichar_id, Rating, Certainty, unicharset.get_script(result.unichar_id), min_xheight,
1419  max_xheight, yshift, adapted ? BCC_ADAPTED_CLASSIFIER : BCC_STATIC_CLASSIFIER);
1420  choice->set_fonts(result.fonts);
1421  temp_it.add_to_end(choice);
1422  contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1423  choices_length++;
1424  if (choices_length >= max_matches) {
1425  break;
1426  }
1427  }
 // Trim the match list to the number of choices actually emitted.
1428  Results->match.resize(choices_length);
1429 } // ConvertMatchesToChoices
1430 
1431 /*---------------------------------------------------------------------------*/
1432 #ifndef GRAPHICS_DISABLED
1441  if (static_classifier_ == nullptr) {
1442  return;
1443  }
1444  INT_FX_RESULT_STRUCT fx_info;
1445  std::vector<INT_FEATURE_STRUCT> bl_features;
1446  TrainingSample *sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1447  if (sample == nullptr) {
1448  return;
1449  }
1450  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), Results->best_unichar_id);
1451 } /* DebugAdaptiveClassifier */
1452 #endif
1453 
1454 /*---------------------------------------------------------------------------*/
1475  UNICHAR_ID *Ambiguities;
1476 
1477  INT_FX_RESULT_STRUCT fx_info;
1478  std::vector<INT_FEATURE_STRUCT> bl_features;
1479  TrainingSample *sample =
1480  BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1481  if (sample == nullptr) {
1482  return;
1483  }
1484 
1485  // TODO: With LSTM, static_classifier_ is nullptr.
1486  // Return to avoid crash in CharNormClassifier.
1487  if (static_classifier_ == nullptr) {
1488  delete sample;
1489  return;
1490  }
1491 
1492  if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) {
1493  CharNormClassifier(Blob, *sample, Results);
1494  } else {
1495  Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results);
1496  if ((!Results->match.empty() &&
1497  MarginalMatch(Results->best_rating, matcher_reliable_adaptive_result) &&
1498  !tess_bn_matching) ||
1499  Results->match.empty()) {
1500  CharNormClassifier(Blob, *sample, Results);
1501  } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1502  AmbigClassifier(bl_features, fx_info, Blob, PreTrainedTemplates, AdaptedTemplates->Class,
1503  Ambiguities, Results);
1504  }
1505  }
1506 
1507  // Force the blob to be classified as noise
1508  // if the results contain only fragments.
1509  // TODO(daria): verify that this is better than
1510  // just adding a nullptr classification.
1511  if (!Results->HasNonfragment || Results->match.empty()) {
1512  ClassifyAsNoise(Results);
1513  }
1514  delete sample;
1515 } /* DoAdaptiveMatch */
1516 
1517 /*---------------------------------------------------------------------------*/
1533  auto *Results = new ADAPT_RESULTS();
1534  UNICHAR_ID *Ambiguities;
1535 
1536  Results->Initialize();
1537  INT_FX_RESULT_STRUCT fx_info;
1538  std::vector<INT_FEATURE_STRUCT> bl_features;
1539  TrainingSample *sample =
1540  BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1541  if (sample == nullptr) {
1542  delete Results;
1543  return nullptr;
1544  }
1545 
1546  CharNormClassifier(Blob, *sample, Results);
1547  delete sample;
1548  RemoveBadMatches(Results);
1549  std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
1550 
1551  /* copy the class id's into an string of ambiguities - don't copy if
1552  the correct class is the only class id matched */
1553  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1554  if (Results->match.size() > 1 ||
1555  (Results->match.size() == 1 && Results->match[0].unichar_id != CorrectClass)) {
1556  unsigned i;
1557  for (i = 0; i < Results->match.size(); i++) {
1558  Ambiguities[i] = Results->match[i].unichar_id;
1559  }
1560  Ambiguities[i] = -1;
1561  } else {
1562  Ambiguities[0] = -1;
1563  }
1564 
1565  delete Results;
1566  return Ambiguities;
1567 } /* GetAmbiguities */
1568 
1569 // Returns true if the given blob looks too dissimilar to any character
1570 // present in the classifier templates.
1572  auto *ratings = new BLOB_CHOICE_LIST();
1573  AdaptiveClassifier(blob, ratings);
1574  BLOB_CHOICE_IT ratings_it(ratings);
1576  if (classify_debug_character_fragments) {
1577  print_ratings_list("======================\nLooksLikeGarbage() got ", ratings, unicharset);
1578  }
1579  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list(); ratings_it.forward()) {
1580  if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1581  continue;
1582  }
1583  float certainty = ratings_it.data()->certainty();
1584  delete ratings;
1585  return certainty < classify_character_fragments_garbage_certainty_threshold;
1586  }
1587  delete ratings;
1588  return true; // no whole characters in ratings
1589 }
1590 
1591 /*---------------------------------------------------------------------------*/
1614  uint8_t *pruner_norm_array, uint8_t *char_norm_array) {
1615  auto norm_feature = new FEATURE_STRUCT(&CharNormDesc);
1616  float baseline = kBlnBaselineOffset;
1617  float scale = MF_SCALE_FACTOR;
1618  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1619  norm_feature->Params[CharNormLength] = fx_info.Length * scale / LENGTH_COMPRESSION;
1620  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1621  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1622  // Deletes norm_feature.
1623  ComputeCharNormArrays(norm_feature, templates, char_norm_array, pruner_norm_array);
1624  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1625 } /* GetCharNormFeature */
1626 
1627 // Computes the char_norm_array for the unicharset and, if not nullptr, the
1628 // pruner_array as appropriate according to the existence of the shape_table.
1630  uint8_t *char_norm_array, uint8_t *pruner_array) {
1631  ComputeIntCharNormArray(*norm_feature, char_norm_array);
1632  //if (pruner_array != nullptr) {
1633  if (shape_table_ == nullptr) {
1634  ComputeIntCharNormArray(*norm_feature, pruner_array);
1635  } else {
1636  memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));
1637  // Each entry in the pruner norm array is the MIN of all the entries of
1638  // the corresponding unichars in the CharNormArray.
1639  for (unsigned id = 0; id < templates->NumClasses; ++id) {
1640  int font_set_id = templates->Class[id]->font_set_id;
1641  const FontSet &fs = fontset_table_.at(font_set_id);
1642  for (auto f : fs) {
1643  const Shape &shape = shape_table_->GetShape(f);
1644  for (int c = 0; c < shape.size(); ++c) {
1645  if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {
1646  pruner_array[id] = char_norm_array[shape[c].unichar_id];
1647  }
1648  }
1649  }
1650  }
1651  }
1652  //}
1653  delete norm_feature;
1654 }
1655 
1656 /*---------------------------------------------------------------------------*/
// Creates a new temporary config for ClassId from the given features:
// reuses protos that already match well, builds new temp protos for the
// features that match badly, and returns the new config id, or -1 if the
// class has run out of configs or protos.
1669 int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,
1670  int NumFeatures, INT_FEATURE_ARRAY Features,
1671  FEATURE_SET FloatFeatures) {
1672  INT_CLASS_STRUCT *IClass;
1673  ADAPT_CLASS_STRUCT *Class;
1674  PROTO_ID OldProtos[MAX_NUM_PROTOS];
1675  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1676  int NumOldProtos;
1677  int NumBadFeatures;
1678  int MaxProtoId, OldMaxProtoId;
1679  int MaskSize;
1680  int ConfigId;
1681  int i;
1682  int debug_level = NO_DEBUG;
1683 
 // NOTE(review): this listing omits original line 1685 (the body of the
 // debug-level branch below) — confirm against the full source.
1684  if (classify_learning_debug_level >= 3) {
1686  }
1687 
1688  IClass = ClassForClassId(Templates->Templates, ClassId);
1689  Class = Templates->Class[ClassId];
1690 
1691  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1692  ++NumAdaptationsFailed;
1693  if (classify_learning_debug_level >= 1) {
1694  tprintf("Cannot make new temporary config: maximum number exceeded.\n");
1695  }
1696  return -1;
1697  }
1698 
1699  OldMaxProtoId = IClass->NumProtos - 1;
1700 
 // Find existing protos that already match the features well.
1701  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff, NumFeatures, Features,
1702  OldProtos, classify_adapt_proto_threshold, debug_level);
1703 
1704  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1705  zero_all_bits(TempProtoMask, MaskSize);
1706  for (i = 0; i < NumOldProtos; i++) {
1707  SET_BIT(TempProtoMask, OldProtos[i]);
1708  }
1709 
 // Features not covered by the good protos need new temp protos.
1710  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn, NumFeatures, Features,
1711  BadFeatures, classify_adapt_feature_threshold, debug_level);
1712 
1713  MaxProtoId =
1714  MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask);
1715  if (MaxProtoId == NO_PROTO) {
1716  ++NumAdaptationsFailed;
1717  if (classify_learning_debug_level >= 1) {
1718  tprintf("Cannot make new temp protos: maximum number exceeded.\n");
1719  }
1720  return -1;
1721  }
1722 
 // Register the new config and record its proto mask.
1723  ConfigId = AddIntConfig(IClass);
1724  ConvertConfig(TempProtoMask, ConfigId, IClass);
1725  auto Config = new TEMP_CONFIG_STRUCT(MaxProtoId, FontinfoId);
1726  TempConfigFor(Class, ConfigId) = Config;
1727  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1728 
1729  if (classify_learning_debug_level >= 1) {
1730  tprintf(
1731  "Making new temp config %d fontinfo id %d"
1732  " using %d old and %d new protos.\n",
1733  ConfigId, Config->FontinfoId, NumOldProtos, MaxProtoId - OldMaxProtoId);
1734  }
1735 
1736  return ConfigId;
1737 } /* MakeNewTemporaryConfig */
1738 
1739 /*---------------------------------------------------------------------------*/
// Builds new temporary protos for the bad features by clustering runs of
// consecutive pico-features that are roughly collinear (similar direction,
// close in x/y) into line-segment protos. Each new proto is added to
// IClass, to the proto pruner, to TempProtoMask, and to Class->TempProtos.
// Returns the highest proto id in use, or NO_PROTO if the class is full.
1758 PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],
1759  INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class,
1760  BIT_VECTOR TempProtoMask) {
1761  FEATURE_ID *ProtoStart;
1762  FEATURE_ID *ProtoEnd;
1763  FEATURE_ID *LastBad;
1764  PROTO_STRUCT *Proto;
1765  FEATURE F1, F2;
1766  float X1, X2, Y1, Y2;
1767  float A1, A2, AngleDelta;
1768  float SegmentLength;
1769  PROTO_ID Pid;
1770 
 // Each outer iteration consumes one run [ProtoStart, ProtoEnd) of bad
 // features that will become a single proto.
1771  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat; ProtoStart < LastBad;
1772  ProtoStart = ProtoEnd) {
1773  F1 = Features->Features[*ProtoStart];
1774  X1 = F1->Params[PicoFeatX];
1775  Y1 = F1->Params[PicoFeatY];
1776  A1 = F1->Params[PicoFeatDir];
1777 
1778  for (ProtoEnd = ProtoStart + 1, SegmentLength = GetPicoFeatureLength(); ProtoEnd < LastBad;
1779  ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1780  F2 = Features->Features[*ProtoEnd];
1781  X2 = F2->Params[PicoFeatX];
1782  Y2 = F2->Params[PicoFeatY];
1783  A2 = F2->Params[PicoFeatDir];
1784 
 // Directions are circular in [0,1); take the shorter way round.
1785  AngleDelta = std::fabs(A1 - A2);
1786  if (AngleDelta > 0.5f) {
1787  AngleDelta = 1 - AngleDelta;
1788  }
1789 
 // Stop extending the run once direction or position diverges.
1790  if (AngleDelta > matcher_clustering_max_angle_delta || std::fabs(X1 - X2) > SegmentLength ||
1791  std::fabs(Y1 - Y2) > SegmentLength) {
1792  break;
1793  }
1794  }
1795 
1796  F2 = Features->Features[*(ProtoEnd - 1)];
1797  X2 = F2->Params[PicoFeatX];
1798  Y2 = F2->Params[PicoFeatY];
1799  A2 = F2->Params[PicoFeatDir];
1800 
1801  Pid = AddIntProto(IClass);
1802  if (Pid == NO_PROTO) {
1803  return (NO_PROTO);
1804  }
1805 
1806  auto TempProto = new TEMP_PROTO_STRUCT;
1807  Proto = &(TempProto->Proto);
1808 
1809  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1810  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1811  instead of the -0.25 to 0.75 used in baseline normalization */
1812  Proto->Length = SegmentLength;
1813  Proto->Angle = A1;
1814  Proto->X = (X1 + X2) / 2;
1815  Proto->Y = (Y1 + Y2) / 2 - Y_DIM_OFFSET;
1816  FillABC(Proto);
1817 
1818  TempProto->ProtoId = Pid;
1819  SET_BIT(TempProtoMask, Pid);
1820 
1821  ConvertProto(Proto, Pid, IClass);
1822  AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);
1823 
1824  Class->TempProtos = push(Class->TempProtos, TempProto);
1825  }
1826  return IClass->NumProtos - 1;
1827 } /* MakeNewTempProtos */
1828 
1829 /*---------------------------------------------------------------------------*/
// Promotes the temporary config ConfigId of ClassId to a permanent config:
// computes its ambiguity set from Blob, converts the temp protos it uses
// into permanent protos, and replaces the temp config with a PERM_CONFIG.
1839 void Classify::MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId,
1840  TBLOB *Blob) {
1841  UNICHAR_ID *Ambigs;
1842  PROTO_KEY ProtoKey;
1843 
1844  auto Class = Templates->Class[ClassId];
1845  auto Config = TempConfigFor(Class, ConfigId);
1846 
1847  MakeConfigPermanent(Class, ConfigId);
 // First permanent config for this class => one more permanent class.
1848  if (Class->NumPermConfigs == 0) {
1849  Templates->NumPermClasses++;
1850  }
1851  Class->NumPermConfigs++;
1852 
1853  // Initialize permanent config.
1854  Ambigs = GetAmbiguities(Blob, ClassId);
1855  auto Perm = new PERM_CONFIG_STRUCT;
1856  Perm->Ambigs = Ambigs;
1857  Perm->FontinfoId = Config->FontinfoId;
1858 
1859  // Free memory associated with temporary config (since ADAPTED_CONFIG
1860  // is a union we need to clean up before we record permanent config).
1861  ProtoKey.Templates = Templates;
1862  ProtoKey.ClassId = ClassId;
1863  ProtoKey.ConfigId = ConfigId;
 // MakeTempProtoPerm promotes (and deletes) each temp proto this config
 // uses; delete_d removes them from the list.
1864  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1865  delete Config;
1866 
1867  // Record permanent config.
1868  PermConfigFor(Class, ConfigId) = Perm;
1869 
1870  if (classify_learning_debug_level >= 1) {
1871  tprintf(
1872  "Making config %d for %s (ClassId %d) permanent:"
1873  " fontinfo id %d, ambiguities '",
1874  ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(), ClassId,
1875  PermConfigFor(Class, ConfigId)->FontinfoId);
1876  for (UNICHAR_ID *AmbigsPointer = Ambigs; *AmbigsPointer >= 0; ++AmbigsPointer) {
1877  tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1878  }
1879  tprintf("'.\n");
1880  }
1881 } /* MakePermanent */
1882 
1883 /*---------------------------------------------------------------------------*/
// Callback for delete_d (see MakePermanent): item1 is a TEMP_PROTO_STRUCT,
// item2 a PROTO_KEY identifying the config being made permanent. Returns
// true (promote and delete the temp proto) if the proto belongs to that
// config, false to leave it alone.
1896 int MakeTempProtoPerm(void *item1, void *item2) {
1897  auto TempProto = static_cast<TEMP_PROTO_STRUCT *>(item1);
1898  auto ProtoKey = static_cast<PROTO_KEY *>(item2);
1899 
1900  auto Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
1901  auto Config = TempConfigFor(Class, ProtoKey->ConfigId);
1902 
 // Skip protos outside the config's proto range or not in its mask.
1903  if (TempProto->ProtoId > Config->MaxProtoId || !test_bit(Config->Protos, TempProto->ProtoId)) {
1904  return false;
1905  }
1906 
1907  MakeProtoPermanent(Class, TempProto->ProtoId);
1908  AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId, ProtoKey->Templates->Templates);
1909  delete TempProto;
1910 
1911  return true;
1912 } /* MakeTempProtoPerm */
1913 
1914 /*---------------------------------------------------------------------------*/
1923  for (auto &it : results.match) {
1924  tprintf("%s ", unicharset.debug_str(it.unichar_id).c_str());
1925  it.Print();
1926  }
1927 } /* PrintAdaptiveMatchResults */
1928 
1929 /*---------------------------------------------------------------------------*/
1943  unsigned Next, NextGood;
1944  float BadMatchThreshold;
1945  static const char *romans = "i v x I V X";
1946  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
1947 
1948  if (classify_bln_numeric_mode) {
1949  UNICHAR_ID unichar_id_one =
1951  UNICHAR_ID unichar_id_zero =
1953  float scored_one = ScoredUnichar(unichar_id_one, *Results);
1954  float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
1955 
1956  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1957  const UnicharRating &match = Results->match[Next];
1958  if (match.rating >= BadMatchThreshold) {
1959  if (!unicharset.get_isalpha(match.unichar_id) ||
1960  strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
1961  } else if (unicharset.eq(match.unichar_id, "l") && scored_one < BadMatchThreshold) {
1962  Results->match[Next].unichar_id = unichar_id_one;
1963  } else if (unicharset.eq(match.unichar_id, "O") && scored_zero < BadMatchThreshold) {
1964  Results->match[Next].unichar_id = unichar_id_zero;
1965  } else {
1966  Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
1967  }
1968  if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
1969  if (NextGood == Next) {
1970  ++NextGood;
1971  } else {
1972  Results->match[NextGood++] = Results->match[Next];
1973  }
1974  }
1975  }
1976  }
1977  } else {
1978  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1979  if (Results->match[Next].rating >= BadMatchThreshold) {
1980  if (NextGood == Next) {
1981  ++NextGood;
1982  } else {
1983  Results->match[NextGood++] = Results->match[Next];
1984  }
1985  }
1986  }
1987  }
1988  Results->match.resize(NextGood);
1989 } /* RemoveBadMatches */
1990 
/*----------------------------------------------------------------------------*/
  // NOTE(review): the signature line lies outside this view; this is the body
  // of Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results). It compacts
  // Results->match in place, keeping at most 2 punctuation answers and at
  // most 1 digit answer.
  unsigned Next, NextGood;
  int punc_count; /*no of garbage characters */
  int digit_count;
  /*garbage characters */
  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";

  punc_count = 0;
  digit_count = 0;
  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
    const UnicharRating &match = Results->match[Next];
    bool keep = true;
    // strstr against the space-separated list acts as set membership for the
    // single-character unichars listed above.
    if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
      if (punc_count >= 2) {
        keep = false; // Already have two punctuation candidates.
      }
      punc_count++;
    } else {
      if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
        if (digit_count >= 1) {
          keep = false; // Already have one digit candidate.
        }
        digit_count++;
      }
    }
    // Compact survivors towards the front of the vector.
    if (keep) {
      if (NextGood == Next) {
        ++NextGood;
      } else {
        Results->match[NextGood++] = match;
      }
    }
  }
  // Drop the tail left over after compaction.
  Results->match.resize(NextGood);
} /* RemoveExtraPuncs */
2035 
2036 /*---------------------------------------------------------------------------*/
2047 void Classify::SetAdaptiveThreshold(float Threshold) {
2048  Threshold = (Threshold == matcher_good_threshold) ? 0.9f : (1 - Threshold);
2049  classify_adapt_proto_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2050  classify_adapt_feature_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2051 } /* SetAdaptiveThreshold */
2052 
2053 #ifndef GRAPHICS_DISABLED
2054 
2055 /*---------------------------------------------------------------------------*/
void Classify::ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features,
                                int num_features) {
  uint32_t config_mask;
  // Guard: the shape must exist in the built-in (static) templates.
  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
    tprintf("No built-in templates for class/shape %d\n", shape_id);
    return;
  }
  if (num_features <= 0) {
    tprintf("Illegal blob (char norm features)!\n");
    return;
  }
  UnicharRating cn_result;
  classify_norm_method.set_value(character);
  // NOTE(review): the opening line of the first im_.Match(...) call is
  // missing from this view; the two lines below are its trailing arguments.
      features, &cn_result, classify_adapt_feature_threshold, NO_DEBUG,
      matcher_debug_separate_windows);
  tprintf("\n");
  // Enable only the best-scoring config for the debug re-match below.
  config_mask = 1 << cn_result.config;

  tprintf("Static Shape ID: %d\n", shape_id);
  ShowMatchDisplay();
  // Re-match with debugging enabled so the display shows the winning config.
  im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, &config_mask, num_features,
            features, &cn_result, classify_adapt_feature_threshold, matcher_debug_flags,
            matcher_debug_separate_windows);
} /* ShowBestMatchFor */
2091 
2092 #endif // !GRAPHICS_DISABLED
2093 
2094 // Returns a string for the classifier class_id: either the corresponding
2095 // unicharset debug_str or the shape_table_ debug str.
2096 std::string Classify::ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id,
2097  int config_id) const {
2098  std::string class_string;
2099  if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2100  int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2101  class_string = shape_table_->DebugStr(shape_id);
2102  } else {
2103  class_string = unicharset.debug_str(class_id);
2104  }
2105  return class_string;
2106 }
2107 
2108 // Converts a classifier class_id index to a shape_table_ index
2109 int Classify::ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const {
2110  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2111  // Older inttemps have no font_ids.
2112  if (font_set_id < 0) {
2113  return kBlankFontinfoId;
2114  }
2115  const FontSet &fs = fontset_table_.at(font_set_id);
2116  return fs.at(int_result_config);
2117 }
2118 
2119 // Converts a shape_table_ index to a classifier class_id index (not a
2120 // unichar-id!). Uses a search, so not fast.
2121 int Classify::ShapeIDToClassID(int shape_id) const {
2122  for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2123  int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2124  ASSERT_HOST(font_set_id >= 0);
2125  const FontSet &fs = fontset_table_.at(font_set_id);
2126  for (auto f : fs) {
2127  if (f == shape_id) {
2128  return id;
2129  }
2130  }
2131  }
2132  tprintf("Shape %d not found\n", shape_id);
2133  return -1;
2134 }
2135 
// Returns true if the given TEMP_CONFIG_STRUCT is good enough to make it
// a permanent config.
  // NOTE(review): the signature line lies outside this view; this is the body
  // of Classify::TempConfigReliable(CLASS_ID class_id,
  // const TEMP_CONFIG_STRUCT *config).
  if (classify_learning_debug_level >= 1) {
    tprintf("NumTimesSeen for config of %s is %d\n",
            getDict().getUnicharset().debug_str(class_id).c_str(), config->NumTimesSeen);
  }
  // Seen plenty of examples: reliable regardless of ambigs.
  if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
    return true;
  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
    // Below the bare minimum: never reliable.
    return false;
  } else if (use_ambigs_for_adaption) {
    // Go through the ambigs vector and see whether we have already seen
    // enough times all the characters represented by the ambigs vector.
    const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
    int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
    for (int ambig = 0; ambig < ambigs_size; ++ambig) {
      ADAPT_CLASS_STRUCT *ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
      assert(ambig_class != nullptr);
      // An ambig class with no permanent configs that has itself not been
      // seen enough blocks this config from becoming permanent.
      if (ambig_class->NumPermConfigs == 0 &&
          ambig_class->MaxNumTimesSeen < matcher_min_examples_for_prototyping) {
        if (classify_learning_debug_level >= 1) {
          tprintf(
              "Ambig %s has not been seen enough times,"
              " not making config for %s permanent\n",
              getDict().getUnicharset().debug_str((*ambigs)[ambig]).c_str(),
              getDict().getUnicharset().debug_str(class_id).c_str());
        }
        return false;
      }
    }
  }
  return true;
}
2170 
  // NOTE(review): the signature line lies outside this view; this is the body
  // of Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob). It
  // promotes to permanent any now-reliable temporary configs of the classes
  // that are reverse ambigs of class_id.
  const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
  if (classify_learning_debug_level >= 1) {
    tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
            getDict().getUnicharset().debug_str(class_id).c_str(), class_id);
  }
  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
    CLASS_ID ambig_class_id = (*ambigs)[ambig];
    const ADAPT_CLASS_STRUCT *ambigs_class = AdaptedTemplates->Class[ambig_class_id];
    for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
      // Already-permanent configs need no further work.
      if (ConfigIsPermanent(ambigs_class, cfg)) {
        continue;
      }
      const TEMP_CONFIG_STRUCT *config = TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
      // Promote a temporary config that has now seen enough examples.
      if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
        if (classify_learning_debug_level >= 1) {
          tprintf("Making config %d of %s permanent\n", cfg,
                  getDict().getUnicharset().debug_str(ambig_class_id).c_str());
        }
        MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
      }
    }
  }
}
2196 
2197 } // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:59
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:83
#define MakeProtoPermanent(Class, ProtoId)
Definition: adaptive.h:89
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:87
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:85
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:95
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:93
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:85
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:83
#define NO_DEBUG
Definition: adaptmatch.cpp:84
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:87
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:80
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:91
#define classify_enable_adaptive_matcher
Definition: adaptmatch.cpp:78
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:89
#define MAX_MATCHES
Definition: adaptmatch.cpp:82
#define UnusedClassIdIn(T, c)
Definition: intproto.h:155
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:116
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
#define ClassForClassId(T, c)
Definition: intproto.h:156
#define LegalClassId(c)
Definition: intproto.h:154
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:165
#define PRINT_PROTO_MATCHES
Definition: intproto.h:169
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:168
#define LENGTH_COMPRESSION
Definition: normfeat.h:26
#define GetPicoFeatureLength()
Definition: picofeat.h:56
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define reset_bit(array, bit)
Definition: bitvec.h:57
#define test_bit(array, bit)
Definition: bitvec.h:59
#define SET_BIT(array, bit)
Definition: bitvec.h:55
#define NO_PROTO
Definition: matchdefs.h:41
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:41
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:142
const double kStandardFeatureLength
Definition: intfx.h:44
@ OutlineFeatLength
Definition: outfeat.h:30
@ OutlineFeatY
Definition: outfeat.h:29
@ OutlineFeatX
Definition: outfeat.h:28
@ OutlineFeatDir
Definition: outfeat.h:31
void AddProtoToProtoPruner(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class, bool debug)
Definition: intproto.cpp:344
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:430
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
@ PicoFeatDir
Definition: picofeat.h:43
@ PicoFeatX
Definition: picofeat.h:43
@ PicoFeatY
Definition: picofeat.h:43
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, std::vector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
int IntCastRounded(double x)
Definition: helpers.h:175
@ character
Definition: mfoutline.h:53
@ baseline
Definition: mfoutline.h:53
const int kBlnXHeight
Definition: normalis.h:33
@ TESSDATA_SHAPE_TABLE
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:88
void InitIntegerFX()
Definition: intfx.cpp:54
std::vector< int > FontSet
Definition: fontinfo.h:154
CLUSTERCONFIG Config
const float MF_SCALE_FACTOR
Definition: mfoutline.h:61
void SetAdaptiveThreshold(float Threshold)
@ GeoTop
Definition: picofeat.h:37
@ GeoBottom
Definition: picofeat.h:36
@ CharNormLength
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRx
Definition: normfeat.h:30
int MakeTempProtoPerm(void *item1, void *item2)
int UNICHAR_ID
Definition: unichar.h:36
int16_t PROTO_ID
Definition: matchdefs.h:40
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ BCC_STATIC_CLASSIFIER
Definition: ratngs.h:49
@ BCC_ADAPTED_CLASSIFIER
Definition: ratngs.h:50
void AddProtoToClassPruner(PROTO_STRUCT *Proto, CLASS_ID ClassId, INT_TEMPLATES_STRUCT *Templates)
Definition: intproto.cpp:306
const FEATURE_DESC_STRUCT CharNormDesc
void UpdateMatchDisplay()
Definition: intproto.cpp:413
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:27
std::vector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:38
LIST push(LIST list, void *element)
Definition: oldlist.cpp:178
int AddIntConfig(INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:250
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:137
const int kBlnBaselineOffset
Definition: normalis.h:34
void FillABC(PROTO_STRUCT *Proto)
Definition: protos.cpp:103
uint8_t FEATURE_ID
Definition: matchdefs.h:46
int AddIntProto(INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:270
CharSegmentationType
Definition: classify.h:87
@ CST_WHOLE
Definition: classify.h:89
@ CST_FRAGMENT
Definition: classify.h:88
void InitMatcherRatings(float *Rating)
const DENORM & denorm() const
Definition: blobs.h:368
TBOX bounding_box() const
Definition: blobs.cpp:466
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:509
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:353
TBOX bounding_box() const
Definition: blobs.cpp:863
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
void plot(ScrollView *window)
Definition: blobs.cpp:907
const T & at(int id) const
Return the object from an id.
Definition: unicity_table.h:56
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:437
Image pix() const
Definition: normalis.h:237
WERD_CHOICE * best_choice
Definition: pageres.h:239
std::vector< std::string > correct_text
Definition: pageres.h:287
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:570
TWERD * chopped_word
Definition: pageres.h:210
const FontInfo * fontinfo
Definition: pageres.h:307
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:441
std::vector< int > best_state
Definition: pageres.h:283
TWERD * rebuild_word
Definition: pageres.h:264
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1111
std::vector< SEAM * > seam_array
Definition: pageres.h:212
std::string debug_string() const
Definition: ratngs.h:475
unsigned length() const
Definition: ratngs.h:283
float adjust_factor() const
Definition: ratngs.h:286
TDimension left() const
Definition: rect.h:82
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
static void JoinPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:204
static void BreakPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:181
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:208
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:198
std::string language_data_path_prefix
Definition: ccutil.h:60
UNICHARSET unicharset
Definition: ccutil.h:61
std::string imagefile
Definition: ccutil.h:65
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
bool GetComponent(TessdataType type, TFile *fp)
std::string to_string() const
Definition: unicharset.h:91
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:912
size_t size() const
Definition: unicharset.h:355
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:713
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:769
PERM_CONFIG_STRUCT * Perm
Definition: adaptive.h:52
ADAPTED_CONFIG Config[MAX_NUM_CONFIGS]
Definition: adaptive.h:64
ADAPT_CLASS_STRUCT * Class[MAX_NUM_CLASSES]
Definition: adaptive.h:75
INT_TEMPLATES_STRUCT * Templates
Definition: adaptive.h:72
std::vector< UnicharRating > match
Definition: adaptmatch.cpp:99
std::vector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:100
ADAPT_TEMPLATES_STRUCT * Templates
Definition: adaptmatch.cpp:125
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
BIT_VECTOR AllProtosOn
Definition: classify.h:428
IntegerMatcher im_
Definition: classify.h:446
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, std::vector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:427
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:811
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:940
void RemoveBadMatches(ADAPT_RESULTS *Results)
bool LooksLikeGarbage(TBLOB *blob)
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:778
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
BIT_VECTOR TempProtoMask
Definition: classify.h:431
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:986
void LearnBlob(const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:35
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
Definition: adaptive.cpp:345
ADAPT_TEMPLATES_STRUCT * AdaptedTemplates
Definition: classify.h:421
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:596
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:625
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results)
ShapeTable * shape_table_
Definition: classify.h:452
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:129
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:41
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:191
void ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:452
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:240
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask)
UnicityTable< FontSet > fontset_table_
Definition: classify.h:443
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
INT_TEMPLATES_STRUCT * ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:627
ADAPT_TEMPLATES_STRUCT * ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:235
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
void ClassifyAsNoise(ADAPT_RESULTS *Results)
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
void ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:173
BIT_VECTOR AllConfigsOff
Definition: classify.h:430
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
ADAPT_TEMPLATES_STRUCT * BackupAdaptedTemplates
Definition: classify.h:425
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, ADAPT_TEMPLATES_STRUCT *Templates)
Definition: adaptmatch.cpp:686
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:609
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config)
void AmbigClassifier(const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
INT_TEMPLATES_STRUCT * PreTrainedTemplates
Definition: classify.h:420
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:202
int MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:58
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:385
virtual Dict & getDict()
Definition: classify.h:98
NORM_PROTOS * NormProtos
Definition: classify.h:433
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:60
void MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, const std::vector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
BIT_VECTOR AllConfigsOn
Definition: classify.h:429
void SetAdaptiveThreshold(float Threshold)
int GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId)
Definition: adaptive.cpp:118
void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs)
Definition: cutoffs.cpp:41
std::string ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, std::vector< UnicharRating > *results)
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:85
int ShapeIDToClassID(int shape_id) const
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:464
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:435
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES_STRUCT *adaptive_templates)
Definition: adaptmatch.cpp:843
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
Definition: adaptive.cpp:153
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:40
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:170
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
void Match(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:482
int FindBadFeatures(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:619
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)
int FindGoodProtos(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:555
INT_CLASS_STRUCT * Class[MAX_NUM_CLASSES]
Definition: intproto.h:111
std::vector< float > Params
Definition: ocrfeatures.h:66
std::vector< FEATURE_STRUCT * > Features
Definition: ocrfeatures.h:85
virtual int UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug, UNICHAR_ID keep_this, std::vector< UnicharRating > *results)
void DebugDisplay(const TrainingSample &sample, Image page_pix, UNICHAR_ID unichar_id)
std::vector< ScoredFont > fonts
Definition: shapetable.h:71
int size() const
Definition: shapetable.h:169
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:255
std::string DebugStr(unsigned shape_id) const
Definition: shapetable.cpp:292
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292
int MaxNumUnichars() const
Definition: shapetable.cpp:472
uint32_t num_features() const
FEATURE_STRUCT * GetCNFeature() const
const INT_FEATURE_STRUCT * features() const
int geo_feature(int index) const
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:366
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111
void EndDangerousAmbigs()
Definition: stopper.cpp:358
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:362
static void Update()
Definition: scrollview.cpp:713