tesseract  5.0.0
detlinefit.cpp
Go to the documentation of this file.
1 // File: detlinefit.cpp
3 // Description: Deterministic least median squares line fitting.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "detlinefit.h"
20 #include "helpers.h" // for IntCastRounded
21 #include "statistc.h"
22 #include "tprintf.h"
23 
24 #include <algorithm>
25 #include <cfloat> // for FLT_MAX
26 
27 namespace tesseract {
28 
29 // The number of points to consider at each end.
30 const int kNumEndPoints = 3;
31 // The minimum number of points at which to switch to number of points
32 // for badly fitted lines.
33 // To ensure a sensible error metric, kMinPointsForErrorCount should be at
34 // least kMaxRealDistance / (1 - %ile) where %ile is the fractile used in
35 // ComputeUpperQuartileError.
36 const int kMinPointsForErrorCount = 16;
37 // The maximum real distance to use before switching to number of
38 // mis-fitted points, which will get square-rooted for true distance.
39 const int kMaxRealDistance = 2.0;
40 
41 DetLineFit::DetLineFit() : square_length_(0.0) {}
42 
43 // Delete all Added points.
45  pts_.clear();
46  distances_.clear();
47 }
48 
49 // Add a new point. Takes a copy - the pt doesn't need to stay in scope.
50 void DetLineFit::Add(const ICOORD &pt) {
51  pts_.emplace_back(pt, 0);
52 }
53 // Associates a half-width with the given point if a point overlaps the
54 // previous point by more than half the width, and its distance is further
55 // than the previous point, then the more distant point is ignored in the
56 // distance calculation. Useful for ignoring i dots and other diacritics.
57 void DetLineFit::Add(const ICOORD &pt, int halfwidth) {
58  pts_.emplace_back(pt, halfwidth);
59 }
60 
61 // Fits a line to the points, ignoring the skip_first initial points and the
62 // skip_last final points, returning the fitted line as a pair of points,
63 // and the upper quartile error.
64 double DetLineFit::Fit(int skip_first, int skip_last, ICOORD *pt1, ICOORD *pt2) {
65  // Do something sensible with no points.
66  if (pts_.empty()) {
67  pt1->set_x(0);
68  pt1->set_y(0);
69  *pt2 = *pt1;
70  return 0.0;
71  }
72  // Count the points and find the first and last kNumEndPoints.
73  int pt_count = pts_.size();
74  ICOORD *starts[kNumEndPoints];
75  if (skip_first >= pt_count) {
76  skip_first = pt_count - 1;
77  }
78  int start_count = 0;
79  int end_i = std::min(skip_first + kNumEndPoints, pt_count);
80  for (int i = skip_first; i < end_i; ++i) {
81  starts[start_count++] = &pts_[i].pt;
82  }
83  ICOORD *ends[kNumEndPoints];
84  if (skip_last >= pt_count) {
85  skip_last = pt_count - 1;
86  }
87  int end_count = 0;
88  end_i = std::max(0, pt_count - kNumEndPoints - skip_last);
89  for (int i = pt_count - 1 - skip_last; i >= end_i; --i) {
90  ends[end_count++] = &pts_[i].pt;
91  }
92  // 1 or 2 points need special treatment.
93  if (pt_count <= 2) {
94  *pt1 = *starts[0];
95  if (pt_count > 1) {
96  *pt2 = *ends[0];
97  } else {
98  *pt2 = *pt1;
99  }
100  return 0.0;
101  }
102  // Although with between 2 and 2*kNumEndPoints-1 points, there will be
103  // overlap in the starts, ends sets, this is OK and taken care of by the
104  // if (*start != *end) test below, which also tests for equal input points.
105  double best_uq = -1.0;
106  // Iterate each pair of points and find the best fitting line.
107  for (int i = 0; i < start_count; ++i) {
108  ICOORD *start = starts[i];
109  for (int j = 0; j < end_count; ++j) {
110  ICOORD *end = ends[j];
111  if (*start != *end) {
112  ComputeDistances(*start, *end);
113  // Compute the upper quartile error from the line.
114  double dist = EvaluateLineFit();
115  if (dist < best_uq || best_uq < 0.0) {
116  best_uq = dist;
117  *pt1 = *start;
118  *pt2 = *end;
119  }
120  }
121  }
122  }
123  // Finally compute the square root to return the true distance.
124  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
125 }
126 
127 // Constrained fit with a supplied direction vector. Finds the best line_pt,
128 // that is one of the supplied points having the median cross product with
129 // direction, ignoring points that have a cross product outside of the range
130 // [min_dist, max_dist]. Returns the resulting error metric using the same
131 // reduced set of points.
132 // *Makes use of floating point arithmetic*
133 double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist,
134  bool debug, ICOORD *line_pt) {
135  ComputeConstrainedDistances(direction, min_dist, max_dist);
136  // Do something sensible with no points or computed distances.
137  if (pts_.empty() || distances_.empty()) {
138  line_pt->set_x(0);
139  line_pt->set_y(0);
140  return 0.0;
141  }
142  auto median_index = distances_.size() / 2;
143  std::nth_element(distances_.begin(), distances_.begin() + median_index, distances_.end());
144  *line_pt = distances_[median_index].data();
145  if (debug) {
146  tprintf("Constrained fit to dir %g, %g = %d, %d :%zu distances:\n", direction.x(), direction.y(),
147  line_pt->x(), line_pt->y(), distances_.size());
148  for (unsigned i = 0; i < distances_.size(); ++i) {
149  tprintf("%d: %d, %d -> %g\n", i, distances_[i].data().x(), distances_[i].data().y(),
150  distances_[i].key());
151  }
152  tprintf("Result = %zu\n", median_index);
153  }
154  // Center distances on the fitted point.
155  double dist_origin = direction * *line_pt;
156  for (auto &distance : distances_) {
157  distance.key() -= dist_origin;
158  }
159  return sqrt(EvaluateLineFit());
160 }
161 
162 // Returns true if there were enough points at the last call to Fit or
163 // ConstrainedFit for the fitted points to be used on a badly fitted line.
165  return distances_.size() >= kMinPointsForErrorCount;
166 }
167 
168 // Backwards compatible fit returning a gradient and constant.
169 // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
170 // function in preference to the LMS class.
171 double DetLineFit::Fit(float *m, float *c) {
172  ICOORD start, end;
173  double error = Fit(&start, &end);
174  if (end.x() != start.x()) {
175  *m = static_cast<float>(end.y() - start.y()) / (end.x() - start.x());
176  *c = start.y() - *m * start.x();
177  } else {
178  *m = 0.0f;
179  *c = 0.0f;
180  }
181  return error;
182 }
183 
184 // Backwards compatible constrained fit with a supplied gradient.
185 // Deprecated. Use ConstrainedFit(const FCOORD& direction) where possible
186 // to avoid potential difficulties with infinite gradients.
187 double DetLineFit::ConstrainedFit(double m, float *c) {
188  // Do something sensible with no points.
189  if (pts_.empty()) {
190  *c = 0.0f;
191  return 0.0;
192  }
193  double cos = 1.0 / sqrt(1.0 + m * m);
194  FCOORD direction(cos, m * cos);
195  ICOORD line_pt;
196  double error = ConstrainedFit(direction, -FLT_MAX, FLT_MAX, false, &line_pt);
197  *c = line_pt.y() - line_pt.x() * m;
198  return error;
199 }
200 
201 // Computes and returns the squared evaluation metric for a line fit.
202 double DetLineFit::EvaluateLineFit() {
203  // Compute the upper quartile error from the line.
204  double dist = ComputeUpperQuartileError();
205  if (distances_.size() >= kMinPointsForErrorCount && dist > kMaxRealDistance * kMaxRealDistance) {
206  // Use the number of mis-fitted points as the error metric, as this
207  // gives a better measure of fit for badly fitted lines where more
208  // than a quarter are badly fitted.
209  double threshold = kMaxRealDistance * sqrt(square_length_);
210  dist = NumberOfMisfittedPoints(threshold);
211  }
212  return dist;
213 }
214 
215 // Computes the absolute error distances of the points from the line,
216 // and returns the squared upper-quartile error distance.
217 double DetLineFit::ComputeUpperQuartileError() {
218  int num_errors = distances_.size();
219  if (num_errors == 0) {
220  return 0.0;
221  }
222  // Get the absolute values of the errors.
223  for (int i = 0; i < num_errors; ++i) {
224  if (distances_[i].key() < 0) {
225  distances_[i].key() = -distances_[i].key();
226  }
227  }
228  // Now get the upper quartile distance.
229  auto index = 3 * num_errors / 4;
230  std::nth_element(distances_.begin(), distances_.begin() + index, distances_.end());
231  double dist = distances_[index].key();
232  // The true distance is the square root of the dist squared / square_length.
233  // Don't bother with the square root. Just return the square distance.
234  return square_length_ > 0.0 ? dist * dist / square_length_ : 0.0;
235 }
236 
237 // Returns the number of sample points that have an error more than threshold.
238 int DetLineFit::NumberOfMisfittedPoints(double threshold) const {
239  int num_misfits = 0;
240  int num_dists = distances_.size();
241  // Get the absolute values of the errors.
242  for (int i = 0; i < num_dists; ++i) {
243  if (distances_[i].key() > threshold) {
244  ++num_misfits;
245  }
246  }
247  return num_misfits;
248 }
249 
250 // Computes all the cross product distances of the points from the line,
251 // storing the actual (signed) cross products in distances.
252 // Ignores distances of points that are further away than the previous point,
253 // and overlaps the previous point by at least half.
254 void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {
255  distances_.clear();
256  ICOORD line_vector = end;
257  line_vector -= start;
258  square_length_ = line_vector.sqlength();
259  int line_length = IntCastRounded(sqrt(square_length_));
260  // Compute the distance of each point from the line.
261  int prev_abs_dist = 0;
262  int prev_dot = 0;
263  for (unsigned i = 0; i < pts_.size(); ++i) {
264  ICOORD pt_vector = pts_[i].pt;
265  pt_vector -= start;
266  int dot = line_vector % pt_vector;
267  // Compute |line_vector||pt_vector|sin(angle between)
268  int dist = line_vector * pt_vector;
269  int abs_dist = dist < 0 ? -dist : dist;
270  if (abs_dist > prev_abs_dist && i > 0) {
271  // Ignore this point if it overlaps the previous one.
272  int separation = abs(dot - prev_dot);
273  if (separation < line_length * pts_[i].halfwidth ||
274  separation < line_length * pts_[i - 1].halfwidth) {
275  continue;
276  }
277  }
278  distances_.emplace_back(dist, pts_[i].pt);
279  prev_abs_dist = abs_dist;
280  prev_dot = dot;
281  }
282 }
283 
284 // Computes all the cross product distances of the points perpendicular to
285 // the given direction, ignoring distances outside of the give distance range,
286 // storing the actual (signed) cross products in distances_.
287 void DetLineFit::ComputeConstrainedDistances(const FCOORD &direction, double min_dist,
288  double max_dist) {
289  distances_.clear();
290  square_length_ = direction.sqlength();
291  // Compute the distance of each point from the line.
292  for (auto &pt : pts_) {
293  FCOORD pt_vector = pt.pt;
294  // Compute |line_vector||pt_vector|sin(angle between)
295  double dist = direction * pt_vector;
296  if (min_dist <= dist && dist <= max_dist) {
297  distances_.emplace_back(dist, pt.pt);
298  }
299  }
300 }
301 
302 } // namespace tesseract.
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
const int kMaxRealDistance
Definition: detlinefit.cpp:39
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:175
const int kMinPointsForErrorCount
Definition: detlinefit.cpp:36
const int kNumEndPoints
Definition: detlinefit.cpp:30
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:50
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, ICOORD *line_pt)
Definition: detlinefit.cpp:133
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:73
bool SufficientPointsForIndependentFit() const
Definition: detlinefit.cpp:164
integer coordinate
Definition: points.h:36
void set_x(TDimension xin)
rewrite function
Definition: points.h:67
TDimension y() const
access_function
Definition: points.h:62
void set_y(TDimension yin)
rewrite function
Definition: points.h:71
TDimension x() const
access function
Definition: points.h:58
float y() const
Definition: points.h:209
float x() const
Definition: points.h:206