tesseract  5.0.0
shapetable.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.h
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 //
9 // (C) Copyright 2010, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
23 #define TESSERACT_CLASSIFY_SHAPETABLE_H_
24 
25 #include "bitvector.h"
26 #include "fontinfo.h"
27 #include "genericheap.h"
28 #include "intmatcher.h"
29 
30 namespace tesseract {
31 
32 class UNICHARSET;
33 class ShapeTable;
34 
35 // Simple struct to hold a single classifier unichar selection, a corresponding
36 // rating, and a list of appropriate fonts.
37 struct UnicharRating {
38  UnicharRating() : unichar_id(0), rating(0.0f), adapted(false), config(0), feature_misses(0) {}
39  UnicharRating(int u, float r)
40  : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
41 
42  // Print debug info.
43  void Print() const {
44  tprintf(
45  "Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%u,"
46  " %zu fonts\n",
48  }
49 
50  // Helper function to get the index of the first result with the required
51  // unichar_id. If the results are sorted by rating, this will also be the
52  // best result with the required unichar_id.
53  // Returns -1 if the unichar_id is not found
54  static int FirstResultWithUnichar(const std::vector<UnicharRating> &results,
56 
57  // Index into some UNICHARSET table indicates the class of the answer.
59  // Rating from classifier with 1.0 perfect and 0.0 impossible.
60  // Call it a probability if you must.
61  float rating;
62  // True if this result is from the adaptive classifier.
63  bool adapted;
64  // Index of best matching font configuration of result.
65  uint8_t config;
66  // Number of features that were total misses - were liked by no classes.
67  uint16_t feature_misses;
68  // Unsorted collection of fontinfo ids and scores. Note that a raw result
69  // from the IntegerMatch will contain config ids, that require transforming
70  // to fontinfo ids via fontsets and (possibly) shapetable.
71  std::vector<ScoredFont> fonts;
72 };
73 
74 // Classifier result from a low-level classification is an index into some
75 // ShapeTable and a rating.
76 struct ShapeRating {
77  ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), joined(false), broken(false) {}
78  ShapeRating(int s, float r)
79  : shape_id(s), rating(r), raw(1.0f), font(0.0f), joined(false), broken(false) {}
80 
81  // Helper function to get the index of the first result with the required
82  // unichar_id. If the results are sorted by rating, this will also be the
83  // best result with the required unichar_id.
84  // Returns -1 if the unichar_id is not found
85  static int FirstResultWithUnichar(const std::vector<ShapeRating> &results,
86  const ShapeTable &shape_table, UNICHAR_ID unichar_id);
87 
88  // Index into some shape table indicates the class of the answer.
89  int shape_id;
90  // Rating from classifier with 1.0 perfect and 0.0 impossible.
91  // Call it a probability if you must.
92  float rating;
93  // Subsidiary rating that a classifier may use internally.
94  float raw;
95  // Subsidiary rating that a classifier may use internally.
96  float font;
97  // Flag indicating that the input may be joined.
98  bool joined;
99  // Flag indicating that the input may be broken (a fragment).
100  bool broken;
101 };
102 
103 // Simple struct to hold an entry for a heap-based priority queue of
104 // ShapeRating.
107  ShapeQueueEntry(const ShapeRating &rating, int level0) : result(rating), level(level0) {}
108 
109  // Sort by decreasing rating and decreasing level for equal rating.
110  bool operator<(const ShapeQueueEntry &other) const {
111  if (result.rating > other.result.rating) {
112  return true;
113  }
114  if (result.rating == other.result.rating) {
115  return level > other.level;
116  }
117  return false;
118  }
119 
120  // Output from classifier.
122  // Which level in the tree did this come from?
123  int level;
124 };
126 
127 // Simple struct to hold a set of fonts associated with a single unichar-id.
128 // A vector of UnicharAndFonts makes a shape.
131  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
132  font_ids.push_back(font_id);
133  }
134 
135  // Writes to the given file. Returns false in case of error.
136  bool Serialize(FILE *fp) const;
137  // Reads from the given file. Returns false in case of error.
138  bool DeSerialize(TFile *fp);
139 
140  // Sort function to sort a pair of UnicharAndFonts by unichar_id.
141  static int SortByUnicharId(const void *v1, const void *v2);
142  static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2);
143 
144  std::vector<int32_t> font_ids;
145  int32_t unichar_id;
146 };
147 
148 // A Shape is a collection of unichar-ids and a list of fonts associated with
149 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
150 // a classifiable unit, and represents a group of characters or parts of
151 // characters that have a similar or identical shape. Shapes/ShapeTables may
152 // be organized hierarchically from identical shapes at the leaves to vaguely
153 // similar shapes near the root.
155 public:
156  Shape() : destination_index_(-1) {}
157 
158  // Writes to the given file. Returns false in case of error.
159  bool Serialize(FILE *fp) const;
160  // Reads from the given file. Returns false in case of error.
161  bool DeSerialize(TFile *fp);
162 
163  int destination_index() const {
164  return destination_index_;
165  }
166  void set_destination_index(int index) {
167  destination_index_ = index;
168  }
169  int size() const {
170  return unichars_.size();
171  }
172  // Returns a UnicharAndFonts entry for the given index, which must be
173  // in the range [0, size()).
174  const UnicharAndFonts &operator[](int index) const {
175  return unichars_[index];
176  }
177  // Sets the unichar_id of the given index to the new unichar_id.
178  void SetUnicharId(int index, int unichar_id) {
179  unichars_[index].unichar_id = unichar_id;
180  }
181  // Adds a font_id for the given unichar_id. If the unichar_id is not
182  // in the shape, it is added.
183  void AddToShape(int unichar_id, int font_id);
184  // Adds everything in other to this.
185  void AddShape(const Shape &other);
186  // Returns true if the shape contains the given unichar_id, font_id pair.
187  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
188  // Returns true if the shape contains the given unichar_id, ignoring font.
189  bool ContainsUnichar(int unichar_id) const;
190  // Returns true if the shape contains the given font, ignoring unichar_id.
191  bool ContainsFont(int font_id) const;
192  // Returns true if the shape contains the given font properties, ignoring
193  // unichar_id.
194  bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const;
195  // Returns true if the shape contains multiple different font properties,
196  // ignoring unichar_id.
197  bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const;
198  // Returns true if this shape is equal to other (ignoring order of unichars
199  // and fonts).
200  bool operator==(const Shape &other) const;
201  // Returns true if this is a subset (including equal) of other.
202  bool IsSubsetOf(const Shape &other) const;
203  // Returns true if the lists of unichar ids are the same in this and other,
204  // ignoring fonts.
205  // NOT const, as it will sort the unichars on demand.
206  bool IsEqualUnichars(Shape *other);
207 
208 private:
209  // Sorts the unichars_ vector by unichar.
210  void SortUnichars();
211 
212  // Flag indicates that the unichars are sorted, allowing faster set
213  // operations with another shape.
214  bool unichars_sorted_ = false;
215  // If this Shape is part of a ShapeTable the destination_index_ is the index
216  // of some other shape in the ShapeTable with which this shape is merged.
217  int destination_index_ = 0;
218  // Array of unichars, each with a set of fonts. Each unichar has at most
219  // one entry in the vector.
220  std::vector<UnicharAndFonts> unichars_;
221 };
222 
223 // ShapeTable is a class to encapsulate the triple indirection that is
224 // used here.
225 // ShapeTable is a vector of shapes.
226 // Each shape is a vector of UnicharAndFonts representing the set of unichars
227 // that the shape represents.
228 // Each UnicharAndFonts also lists the fonts of the unichar_id that were
229 // mapped to the shape during training.
231 public:
232  ShapeTable();
233  // The UNICHARSET reference supplied here, or in set_unicharset below must
234  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
235  explicit ShapeTable(const UNICHARSET &unicharset);
237  for (auto data : shape_table_) {
238  delete data;
239  }
240  }
241 
242  // Writes to the given file. Returns false in case of error.
243  bool Serialize(FILE *fp) const;
244  // Reads from the given file. Returns false in case of error.
245  bool DeSerialize(TFile *fp);
246 
247  // Accessors.
248  unsigned NumShapes() const {
249  return shape_table_.size();
250  }
251  const UNICHARSET &unicharset() const {
252  return *unicharset_;
253  }
254  // Returns the number of fonts used in this ShapeTable, computing it if
255  // necessary.
256  int NumFonts() const;
257  // ShapeTable stores a pointer to the UNICHARSET, so it must persist for the
258  // entire life of the ShapeTable.
259  void set_unicharset(const UNICHARSET &unicharset) {
260  unicharset_ = &unicharset;
261  }
262  // Re-indexes the class_ids in the shapetable according to the given map.
263  // Useful in conjunction with set_unicharset.
264  void ReMapClassIds(const std::vector<int> &unicharset_map);
265  // Returns a string listing the classes/fonts in a shape.
266  std::string DebugStr(unsigned shape_id) const;
267  // Returns a debug string summarizing the table.
268  std::string SummaryStr() const;
269 
270  // Adds a new shape starting with the given unichar_id and font_id.
271  // Returns the assigned index.
272  unsigned AddShape(int unichar_id, int font_id);
273  // Adds a copy of the given shape unless it is already present.
274  // Returns the assigned index or index of existing shape if already present.
275  unsigned AddShape(const Shape &other);
276  // Removes the shape given by the shape index. All indices above are changed!
277  void DeleteShape(unsigned shape_id);
278  // Adds a font_id to the given existing shape index for the given
279  // unichar_id. If the unichar_id is not in the shape, it is added.
280  void AddToShape(unsigned shape_id, int unichar_id, int font_id);
281  // Adds the given shape to the existing shape with the given index.
282  void AddShapeToShape(unsigned shape_id, const Shape &other);
283  // Returns the id of the shape that contains the given unichar and font.
284  // If not found, returns -1.
285  // If font_id < 0, the font_id is ignored and the first shape that matches
286  // the unichar_id is returned.
287  int FindShape(int unichar_id, int font_id) const;
288  // Returns the first unichar_id and font_id in the given shape.
289  void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const;
290 
291  // Accessors for the Shape with the given shape_id.
292  const Shape &GetShape(unsigned shape_id) const {
293  return *shape_table_[shape_id];
294  }
295  Shape *MutableShape(unsigned shape_id) {
296  return shape_table_[shape_id];
297  }
298 
299  // Expands all the classes/fonts in the shape individually to build
300  // a ShapeTable.
301  int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes);
302 
303  // Returns true if the shapes are already merged.
304  bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const;
305  // Returns true if any shape contains multiple unichars.
306  bool AnyMultipleUnichars() const;
307  // Returns the maximum number of unichars over all shapes.
308  int MaxNumUnichars() const;
309  // Merges shapes with a common unichar over the [start, end) interval.
310  // Assumes single unichar per shape.
311  void ForceFontMerges(unsigned start, unsigned end);
312  // Returns the number of unichars in the master shape.
313  unsigned MasterUnicharCount(unsigned shape_id) const;
314  // Returns the sum of the font counts in the master shape.
315  int MasterFontCount(unsigned shape_id) const;
316  // Returns the number of unichars that would result from merging the shapes.
317  int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const;
318  // Merges two shape_ids, leaving shape_id2 marked as merged.
319  void MergeShapes(unsigned shape_id1, unsigned shape_id2);
320  // Swaps two shape_ids.
321  void SwapShapes(unsigned shape_id1, unsigned shape_id2);
322  // Appends the master shapes from other to this.
323  // Used to create a clean ShapeTable from a merged one, or to create a
324  // copy of a ShapeTable.
325  // If not nullptr, shape_map is set to map other shape_ids to this's
326  // shape_ids.
327  void AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map);
328  // Returns the number of master shapes remaining after merging.
329  int NumMasterShapes() const;
330  // Returns the destination of this shape, (if merged), taking into account
331  // the fact that the destination may itself have been merged.
332  // For a non-merged shape, returns the input shape_id.
333  unsigned MasterDestinationIndex(unsigned shape_id) const;
334 
335  // Returns false if the unichars in neither shape are a subset of the other.
336  bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const;
337  // Returns false if the unichars in neither shape are a subset of the other.
338  bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const;
339  // Returns true if the unichar sets are equal between the shapes.
340  bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const;
341  bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const;
342  // Returns true if there is a common unichar between the shapes.
343  bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const;
344  // Returns true if there is a common font id between the shapes.
345  bool CommonFont(unsigned shape_id1, unsigned shape_id2) const;
346 
347  // Adds the unichars of the given shape_id to the vector of results. Any
348  // unichar_id that is already present just has the fonts added to the
349  // font set for that result without adding a new entry in the vector.
350  // NOTE: it is assumed that the results are given to this function in order
351  // of decreasing rating.
352  // The unichar_map vector indicates the index of the results entry containing
353  // each unichar, or -1 if the unichar is not yet included in results.
354  void AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,
355  std::vector<UnicharRating> *results) const;
356 
357 private:
358  // Adds the given unichar_id to the results if needed, updating unichar_map
359  // and returning the index of unichar in results.
360  int AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,
361  std::vector<UnicharRating> *results) const;
362 
363  // Pointer to a provided unicharset used only by the DebugStr member.
364  const UNICHARSET *unicharset_;
365  // Vector of pointers to the Shapes in this ShapeTable.
366  std::vector<Shape *> shape_table_;
367 
368  // Cached data calculated on demand.
369  mutable int num_fonts_;
370 };
371 
372 } // namespace tesseract.
373 
374 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool DeSerialize(bool swap, FILE *fp, std::vector< T > &data)
Definition: helpers.h:220
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:251
int UNICHAR_ID
Definition: unichar.h:36
std::vector< ScoredFont > fonts
Definition: shapetable.h:71
UnicharRating(int u, float r)
Definition: shapetable.h:39
static int FirstResultWithUnichar(const std::vector< UnicharRating > &results, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:54
ShapeRating(int s, float r)
Definition: shapetable.h:78
static int FirstResultWithUnichar(const std::vector< ShapeRating > &results, const ShapeTable &shape_table, UNICHAR_ID unichar_id)
Definition: shapetable.cpp:38
ShapeQueueEntry(const ShapeRating &rating, int level0)
Definition: shapetable.h:107
bool operator<(const ShapeQueueEntry &other) const
Definition: shapetable.h:110
static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2)
Definition: shapetable.cpp:81
std::vector< int32_t > font_ids
Definition: shapetable.h:144
static int SortByUnicharId(const void *v1, const void *v2)
Definition: shapetable.cpp:75
UnicharAndFonts(int uni_id, int font_id)
Definition: shapetable.h:131
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:70
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:65
int destination_index() const
Definition: shapetable.h:163
void set_destination_index(int index)
Definition: shapetable.h:166
const UnicharAndFonts & operator[](int index) const
Definition: shapetable.h:174
void SetUnicharId(int index, int unichar_id)
Definition: shapetable.h:178
int size() const
Definition: shapetable.h:169
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292
unsigned NumShapes() const
Definition: shapetable.h:248
void set_unicharset(const UNICHARSET &unicharset)
Definition: shapetable.h:259
Shape * MutableShape(unsigned shape_id)
Definition: shapetable.h:295
const UNICHARSET & unicharset() const
Definition: shapetable.h:251
#define TESS_API
Definition: export.h:34