tesseract  5.0.0
ctc.cpp
Go to the documentation of this file.
1 // File: ctc.cpp
3 // Description: Slightly improved standard CTC to compute the targets.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2016, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
17 
18 #include "ctc.h"
19 
20 #include "matrix.h"
21 #include "network.h"
22 #include "networkio.h"
23 #include "scrollview.h"
24 
25 #include <algorithm>
26 #include <cfloat> // for FLT_MAX
27 #include <cmath>
28 #include <memory>
29 
30 namespace tesseract {
31 
32 // Magic constants that keep CTC stable.
33 // Minimum probability limit for softmax input to ctc_loss.
34 const float CTC::kMinProb_ = 1e-12;
35 // Maximum absolute argument to exp().
36 const double CTC::kMaxExpArg_ = 80.0;
37 // Minimum probability for total prob in time normalization.
38 const double CTC::kMinTotalTimeProb_ = 1e-8;
39 // Minimum probability for total prob in final normalization.
40 const double CTC::kMinTotalFinalProb_ = 1e-6;
41 
42 // Builds a target using CTC. Slightly improved as follows:
43 // Includes normalizations and clipping for stability.
44 // labels should be pre-padded with nulls everywhere.
45 // labels can be longer than the time sequence, but the total number of
46 // essential labels (non-null plus nulls between equal labels) must not exceed
47 // the number of timesteps in outputs.
48 // outputs is the output of the network, and should have already been
49 // normalized with NormalizeProbs.
50 // On return targets is filled with the computed targets.
51 // Returns false if there is insufficient time for the labels.
52 /* static */
53 bool CTC::ComputeCTCTargets(const std::vector<int> &labels, int null_char,
54  const GENERIC_2D_ARRAY<float> &outputs, NetworkIO *targets) {
55  std::unique_ptr<CTC> ctc(new CTC(labels, null_char, outputs));
56  if (!ctc->ComputeLabelLimits()) {
57  return false; // Not enough time.
58  }
59  // Generate simple targets purely from the truth labels by spreading them
60  // evenly over time.
61  GENERIC_2D_ARRAY<float> simple_targets;
62  ctc->ComputeSimpleTargets(&simple_targets);
63  // Add the simple targets as a starter bias to the network outputs.
64  float bias_fraction = ctc->CalculateBiasFraction();
65  simple_targets *= bias_fraction;
66  ctc->outputs_ += simple_targets;
67  NormalizeProbs(&ctc->outputs_);
68  // Run regular CTC on the biased outputs.
69  // Run forward and backward
70  GENERIC_2D_ARRAY<double> log_alphas, log_betas;
71  ctc->Forward(&log_alphas);
72  ctc->Backward(&log_betas);
73  // Normalize and come out of log space with a clipped softmax over time.
74  log_alphas += log_betas;
75  ctc->NormalizeSequence(&log_alphas);
76  ctc->LabelsToClasses(log_alphas, targets);
77  NormalizeProbs(targets);
78  return true;
79 }
80 
81 CTC::CTC(const std::vector<int> &labels, int null_char, const GENERIC_2D_ARRAY<float> &outputs)
82  : labels_(labels), outputs_(outputs), null_char_(null_char) {
83  num_timesteps_ = outputs.dim1();
84  num_classes_ = outputs.dim2();
85  num_labels_ = labels_.size();
86 }
87 
88 // Computes vectors of min and max label index for each timestep, based on
89 // whether skippability of nulls makes it possible to complete a valid path.
90 bool CTC::ComputeLabelLimits() {
91  min_labels_.clear();
92  min_labels_.resize(num_timesteps_, 0);
93  max_labels_.clear();
94  max_labels_.resize(num_timesteps_, 0);
95  int min_u = num_labels_ - 1;
96  if (labels_[min_u] == null_char_) {
97  --min_u;
98  }
99  for (int t = num_timesteps_ - 1; t >= 0; --t) {
100  min_labels_[t] = min_u;
101  if (min_u > 0) {
102  --min_u;
103  if (labels_[min_u] == null_char_ && min_u > 0 && labels_[min_u + 1] != labels_[min_u - 1]) {
104  --min_u;
105  }
106  }
107  }
108  int max_u = labels_[0] == null_char_;
109  for (int t = 0; t < num_timesteps_; ++t) {
110  max_labels_[t] = max_u;
111  if (max_labels_[t] < min_labels_[t]) {
112  return false; // Not enough room.
113  }
114  if (max_u + 1 < num_labels_) {
115  ++max_u;
116  if (labels_[max_u] == null_char_ && max_u + 1 < num_labels_ &&
117  labels_[max_u + 1] != labels_[max_u - 1]) {
118  ++max_u;
119  }
120  }
121  }
122  return true;
123 }
124 
125 // Computes targets based purely on the labels by spreading the labels evenly
126 // over the available timesteps.
127 void CTC::ComputeSimpleTargets(GENERIC_2D_ARRAY<float> *targets) const {
128  // Initialize all targets to zero.
129  targets->Resize(num_timesteps_, num_classes_, 0.0f);
130  std::vector<float> half_widths;
131  std::vector<int> means;
132  ComputeWidthsAndMeans(&half_widths, &means);
133  for (int l = 0; l < num_labels_; ++l) {
134  int label = labels_[l];
135  float left_half_width = half_widths[l];
136  float right_half_width = left_half_width;
137  int mean = means[l];
138  if (label == null_char_) {
139  if (!NeededNull(l)) {
140  if ((l > 0 && mean == means[l - 1]) || (l + 1 < num_labels_ && mean == means[l + 1])) {
141  continue; // Drop overlapping null.
142  }
143  }
144  // Make sure that no space is left unoccupied and that non-nulls always
145  // peak at 1 by stretching nulls to meet their neighbors.
146  if (l > 0) {
147  left_half_width = mean - means[l - 1];
148  }
149  if (l + 1 < num_labels_) {
150  right_half_width = means[l + 1] - mean;
151  }
152  }
153  if (mean >= 0 && mean < num_timesteps_) {
154  targets->put(mean, label, 1.0f);
155  }
156  for (int offset = 1; offset < left_half_width && mean >= offset; ++offset) {
157  float prob = 1.0f - offset / left_half_width;
158  if (mean - offset < num_timesteps_ && prob > targets->get(mean - offset, label)) {
159  targets->put(mean - offset, label, prob);
160  }
161  }
162  for (int offset = 1; offset < right_half_width && mean + offset < num_timesteps_; ++offset) {
163  float prob = 1.0f - offset / right_half_width;
164  if (mean + offset >= 0 && prob > targets->get(mean + offset, label)) {
165  targets->put(mean + offset, label, prob);
166  }
167  }
168  }
169 }
170 
171 // Computes mean positions and half widths of the simple targets by spreading
172 // the labels evenly over the available timesteps.
173 void CTC::ComputeWidthsAndMeans(std::vector<float> *half_widths, std::vector<int> *means) const {
174  // Count the number of labels of each type, in regexp terms, counts plus
175  // (non-null or necessary null, which must occur at least once) and star
176  // (optional null).
177  int num_plus = 0, num_star = 0;
178  for (int i = 0; i < num_labels_; ++i) {
179  if (labels_[i] != null_char_ || NeededNull(i)) {
180  ++num_plus;
181  } else {
182  ++num_star;
183  }
184  }
185  // Compute the size for each type. If there is enough space for everything
186  // to have size>=1, then all are equal, otherwise plus_size=1 and star gets
187  // whatever is left-over.
188  float plus_size = 1.0f, star_size = 0.0f;
189  float total_floating = num_plus + num_star;
190  if (total_floating <= num_timesteps_) {
191  plus_size = star_size = num_timesteps_ / total_floating;
192  } else if (num_star > 0) {
193  star_size = static_cast<float>(num_timesteps_ - num_plus) / num_star;
194  }
195  // Set the width and compute the mean of each.
196  float mean_pos = 0.0f;
197  for (int i = 0; i < num_labels_; ++i) {
198  float half_width;
199  if (labels_[i] != null_char_ || NeededNull(i)) {
200  half_width = plus_size / 2.0f;
201  } else {
202  half_width = star_size / 2.0f;
203  }
204  mean_pos += half_width;
205  means->push_back(static_cast<int>(mean_pos));
206  mean_pos += half_width;
207  half_widths->push_back(half_width);
208  }
209 }
210 
211 // Helper returns the index of the highest probability label at timestep t.
212 static int BestLabel(const GENERIC_2D_ARRAY<float> &outputs, int t) {
213  int result = 0;
214  int num_classes = outputs.dim2();
215  const float *outputs_t = outputs[t];
216  for (int c = 1; c < num_classes; ++c) {
217  if (outputs_t[c] > outputs_t[result]) {
218  result = c;
219  }
220  }
221  return result;
222 }
223 
224 // Calculates and returns a suitable fraction of the simple targets to add
225 // to the network outputs.
226 float CTC::CalculateBiasFraction() {
227  // Compute output labels via basic decoding.
228  std::vector<int> output_labels;
229  for (int t = 0; t < num_timesteps_; ++t) {
230  int label = BestLabel(outputs_, t);
231  while (t + 1 < num_timesteps_ && BestLabel(outputs_, t + 1) == label) {
232  ++t;
233  }
234  if (label != null_char_) {
235  output_labels.push_back(label);
236  }
237  }
238  // Simple bag of labels error calculation.
239  std::vector<int> truth_counts(num_classes_);
240  std::vector<int> output_counts(num_classes_);
241  for (int l = 0; l < num_labels_; ++l) {
242  ++truth_counts[labels_[l]];
243  }
244  for (auto l : output_labels) {
245  ++output_counts[l];
246  }
247  // Count the number of true and false positive non-nulls and truth labels.
248  int true_pos = 0, false_pos = 0, total_labels = 0;
249  for (int c = 0; c < num_classes_; ++c) {
250  if (c == null_char_) {
251  continue;
252  }
253  int truth_count = truth_counts[c];
254  int ocr_count = output_counts[c];
255  if (truth_count > 0) {
256  total_labels += truth_count;
257  if (ocr_count > truth_count) {
258  true_pos += truth_count;
259  false_pos += ocr_count - truth_count;
260  } else {
261  true_pos += ocr_count;
262  }
263  }
264  // We don't need to count classes that don't exist in the truth as
265  // false positives, because they don't affect CTC at all.
266  }
267  if (total_labels == 0) {
268  return 0.0f;
269  }
270  return exp(std::max(true_pos - false_pos, 1) * std::log(kMinProb_) / total_labels);
271 }
272 
273 // Given ln(x) and ln(y), returns ln(x + y), using:
274 // ln(x + y) = ln(y) + ln(1 + exp(ln(y) - ln(x)), ensuring that ln(x) is the
275 // bigger number to maximize precision.
276 static double LogSumExp(double ln_x, double ln_y) {
277  if (ln_x >= ln_y) {
278  return ln_x + log1p(exp(ln_y - ln_x));
279  } else {
280  return ln_y + log1p(exp(ln_x - ln_y));
281  }
282 }
283 
284 // Runs the forward CTC pass, filling in log_probs.
285 void CTC::Forward(GENERIC_2D_ARRAY<double> *log_probs) const {
286  log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);
287  log_probs->put(0, 0, log(outputs_(0, labels_[0])));
288  if (labels_[0] == null_char_) {
289  log_probs->put(0, 1, log(outputs_(0, labels_[1])));
290  }
291  for (int t = 1; t < num_timesteps_; ++t) {
292  const float *outputs_t = outputs_[t];
293  for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
294  // Continuing the same label.
295  double log_sum = log_probs->get(t - 1, u);
296  // Change from previous label.
297  if (u > 0) {
298  log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 1));
299  }
300  // Skip the null if allowed.
301  if (u >= 2 && labels_[u - 1] == null_char_ && labels_[u] != labels_[u - 2]) {
302  log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 2));
303  }
304  // Add in the log prob of the current label.
305  double label_prob = outputs_t[labels_[u]];
306  log_sum += log(label_prob);
307  log_probs->put(t, u, log_sum);
308  }
309  }
310 }
311 
312 // Runs the backward CTC pass, filling in log_probs.
313 void CTC::Backward(GENERIC_2D_ARRAY<double> *log_probs) const {
314  log_probs->Resize(num_timesteps_, num_labels_, -FLT_MAX);
315  log_probs->put(num_timesteps_ - 1, num_labels_ - 1, 0.0);
316  if (labels_[num_labels_ - 1] == null_char_) {
317  log_probs->put(num_timesteps_ - 1, num_labels_ - 2, 0.0);
318  }
319  for (int t = num_timesteps_ - 2; t >= 0; --t) {
320  const float *outputs_tp1 = outputs_[t + 1];
321  for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
322  // Continuing the same label.
323  double log_sum = log_probs->get(t + 1, u) + std::log(outputs_tp1[labels_[u]]);
324  // Change from previous label.
325  if (u + 1 < num_labels_) {
326  double prev_prob = outputs_tp1[labels_[u + 1]];
327  log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 1) + log(prev_prob));
328  }
329  // Skip the null if allowed.
330  if (u + 2 < num_labels_ && labels_[u + 1] == null_char_ && labels_[u] != labels_[u + 2]) {
331  double skip_prob = outputs_tp1[labels_[u + 2]];
332  log_sum = LogSumExp(log_sum, log_probs->get(t + 1, u + 2) + log(skip_prob));
333  }
334  log_probs->put(t, u, log_sum);
335  }
336  }
337 }
338 
339 // Normalizes and brings probs out of log space with a softmax over time.
340 void CTC::NormalizeSequence(GENERIC_2D_ARRAY<double> *probs) const {
341  double max_logprob = probs->Max();
342  for (int u = 0; u < num_labels_; ++u) {
343  double total = 0.0;
344  for (int t = 0; t < num_timesteps_; ++t) {
345  // Separate impossible path from unlikely probs.
346  double prob = probs->get(t, u);
347  if (prob > -FLT_MAX) {
348  prob = ClippedExp(prob - max_logprob);
349  } else {
350  prob = 0.0;
351  }
352  total += prob;
353  probs->put(t, u, prob);
354  }
355  // Note that although this is a probability distribution over time and
356  // therefore should sum to 1, it is important to allow some labels to be
357  // all zero, (or at least tiny) as it is necessary to skip some blanks.
358  if (total < kMinTotalTimeProb_) {
359  total = kMinTotalTimeProb_;
360  }
361  for (int t = 0; t < num_timesteps_; ++t) {
362  probs->put(t, u, probs->get(t, u) / total);
363  }
364  }
365 }
366 
367 // For each timestep computes the max prob for each class over all
368 // instances of the class in the labels_, and sets the targets to
369 // the max observed prob.
370 void CTC::LabelsToClasses(const GENERIC_2D_ARRAY<double> &probs, NetworkIO *targets) const {
371  // For each timestep compute the max prob for each class over all
372  // instances of the class in the labels_.
373  for (int t = 0; t < num_timesteps_; ++t) {
374  float *targets_t = targets->f(t);
375  std::vector<double> class_probs(num_classes_);
376  for (int u = 0; u < num_labels_; ++u) {
377  double prob = probs(t, u);
378  // Note that although Graves specifies sum over all labels of the same
379  // class, we need to allow skipped blanks to go to zero, so they don't
380  // interfere with the non-blanks, so max is better than sum.
381  if (prob > class_probs[labels_[u]]) {
382  class_probs[labels_[u]] = prob;
383  }
384  // class_probs[labels_[u]] += prob;
385  }
386  int best_class = 0;
387  for (int c = 0; c < num_classes_; ++c) {
388  targets_t[c] = class_probs[c];
389  if (class_probs[c] > class_probs[best_class]) {
390  best_class = c;
391  }
392  }
393  }
394 }
395 
396 // Normalizes the probabilities such that no target has a prob below min_prob,
397 // and, provided that the initial total is at least min_total_prob, then all
398 // probs will sum to 1, otherwise to sum/min_total_prob. The maximum output
399 // probability is thus 1 - (num_classes-1)*min_prob.
400 /* static */
401 void CTC::NormalizeProbs(GENERIC_2D_ARRAY<float> *probs) {
402  int num_timesteps = probs->dim1();
403  int num_classes = probs->dim2();
404  for (int t = 0; t < num_timesteps; ++t) {
405  float *probs_t = (*probs)[t];
406  // Compute the total and clip that to prevent amplification of noise.
407  double total = 0.0;
408  for (int c = 0; c < num_classes; ++c) {
409  total += probs_t[c];
410  }
411  if (total < kMinTotalFinalProb_) {
412  total = kMinTotalFinalProb_;
413  }
414  // Compute the increased total as a result of clipping.
415  double increment = 0.0;
416  for (int c = 0; c < num_classes; ++c) {
417  double prob = probs_t[c] / total;
418  if (prob < kMinProb_) {
419  increment += kMinProb_ - prob;
420  }
421  }
422  // Now normalize with clipping. Any additional clipping is negligible.
423  total += increment;
424  for (int c = 0; c < num_classes; ++c) {
425  float prob = probs_t[c] / total;
426  probs_t[c] = std::max(prob, kMinProb_);
427  }
428  }
429 }
430 
431 // Returns true if the label at index is a needed null.
432 bool CTC::NeededNull(int index) const {
433  return labels_[index] == null_char_ && index > 0 && index + 1 < num_labels_ &&
434  labels_[index + 1] == labels_[index - 1];
435 }
436 
437 } // namespace tesseract
T get(ICOORD pos) const
Definition: matrix.h:268
static bool ComputeCTCTargets(const std::vector< int > &truth_labels, int null_char, const GENERIC_2D_ARRAY< float > &outputs, NetworkIO *targets)
Definition: ctc.cpp:53
static void NormalizeProbs(NetworkIO *probs)
Definition: ctc.h:36