tesseract  5.0.0
weightmatrix.cpp
// File:        weightmatrix.cpp
// Description: Hides distinction between float/int implementations.
// Author:      Ray Smith
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "weightmatrix.h"

#include <cassert> // for assert
#include "intsimdmatrix.h"
#include "simddetect.h" // for DotProduct
#include "statistc.h"
#include "tprintf.h" // for TFloat

namespace tesseract {

#if defined(ANDROID)
static inline TFloat log2(TFloat n) {
  return log(n) / log(2.0);
}
#endif // ANDROID

// Number of iterations after which the correction effectively becomes unity.
const int kAdamCorrectionIterations = 200000;
// Epsilon in Adam to prevent division by zero.
const TFloat kAdamEpsilon = 1e-8;
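
// The bias correction applied in WeightMatrix::Update() below is the standard
// Adam warm-up factor sqrt(1 - adam_beta^t) / (1 - momentum^t), with
// t = num_samples; once t reaches kAdamCorrectionIterations the factor is
// close enough to 1 that it is skipped entirely.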

// Utility functions convert between double and float arrays.
#ifdef FAST_FLOAT
static void DoubleToFloat(const GENERIC_2D_ARRAY<double> &src, GENERIC_2D_ARRAY<float> &dst) {
  const auto dim1 = src.dim1();
  const auto dim2 = src.dim2();
  dst.ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const auto *src_i = src[i];
    auto *dst_i = dst[i];
    for (int j = 0; j < dim2; ++j) {
      dst_i[j] = static_cast<float>(src_i[j]);
    }
  }
}
#endif

static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<double> &dst) {
  const auto dim1 = src.dim1();
  const auto dim2 = src.dim2();
  dst.ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const auto *src_i = src[i];
    auto *dst_i = dst[i];
    for (int j = 0; j < dim2; ++j) {
      dst_i[j] = static_cast<double>(src_i[j]);
    }
  }
}

static bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
#ifdef FAST_FLOAT
  GENERIC_2D_ARRAY<double> double_array;
  if (!double_array.DeSerialize(fp)) {
    return false;
  }
  DoubleToFloat(double_array, tfloat_array);
  return true;
#else
  return tfloat_array.DeSerialize(fp);
#endif
}

static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
#ifdef FAST_FLOAT
  GENERIC_2D_ARRAY<double> double_array;
  FloatToDouble(tfloat_array, double_array);
  return double_array.Serialize(fp);
#else
  return tfloat_array.Serialize(fp);
#endif
}
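
// Float weights and scales are always written to disk as doubles (see
// kDoubleFlag below), so FAST_FLOAT builds, where TFloat is float, convert
// through the helpers above on every save and load.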

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - add_bias_fwd and the output v is of size
// W.dim1() - skip_bias_back.
// If add_bias_fwd, u is imagined to have an extra element at the end with value
// 1, to implement the bias weight.
// If skip_bias_back, we are actually performing the backwards product on a
// transposed matrix, so we need to drop the v output corresponding to the last
// element in dim1.
static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<TFloat> &w, bool add_bias_fwd,
                                           bool skip_bias_back, const TFloat *u, TFloat *v) {
  int num_results = w.dim1() - skip_bias_back;
  int extent = w.dim2() - add_bias_fwd;
  for (int i = 0; i < num_results; ++i) {
    const TFloat *wi = w[i];
    TFloat total = DotProduct(wi, u, extent);
    if (add_bias_fwd) {
      total += wi[extent]; // The bias value.
    }
    v[i] = total;
  }
}
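
// For example, with a 4 x 10 weight matrix and add_bias_fwd = true, u holds
// 9 inputs, column 9 of each row is the bias, and v receives 4 results; with
// skip_bias_back = true (the transposed backward case) the product for the
// last row is dropped instead.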

// Copies the whole input transposed, converted to TFloat, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<TFloat> &input) {
  int width = input.dim1();
  int num_features = input.dim2();
  ResizeNoInit(num_features, width);
  for (int t = 0; t < width; ++t) {
    WriteStrided(t, input[t]);
  }
}

// Destructor.
// It is defined here, so the compiler can create a single vtable
// instead of weak vtables in every compilation unit.
TransposedArray::~TransposedArray() = default;

// Sets up the network for training. Initializes weights to random values of
// scale `weight_range` picked according to the random number generator
// `randomizer`.
int WeightMatrix::InitWeightsFloat(int no, int ni, bool use_adam, float weight_range,
                                   TRand *randomizer) {
  int_mode_ = false;
  wf_.Resize(no, ni, 0.0);
  if (randomizer != nullptr) {
    for (int i = 0; i < no; ++i) {
      for (int j = 0; j < ni; ++j) {
        wf_[i][j] = randomizer->SignedRand(weight_range);
      }
    }
  }
  use_adam_ = use_adam;
  InitBackward();
  return ni * no;
}
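
// For example, InitWeightsFloat(4, 9, true, 0.1f, &randomizer) allocates a
// 4 x 9 matrix with entries randomly picked in [-0.1, 0.1], enables Adam, and
// returns 36, the number of weights created.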

// Changes the number of outputs to the size of the given code_map, copying
// the old weight matrix entries for each output from code_map[output] where
// non-negative, and using the mean (over all outputs) of the existing weights
// for all outputs with negative code_map entries. Returns the new number of
// weights.
int WeightMatrix::RemapOutputs(const std::vector<int> &code_map) {
  GENERIC_2D_ARRAY<TFloat> old_wf(wf_);
  int old_no = wf_.dim1();
  int new_no = code_map.size();
  int ni = wf_.dim2();
  std::vector<TFloat> means(ni, 0.0);
  for (int c = 0; c < old_no; ++c) {
    const TFloat *weights = wf_[c];
    for (int i = 0; i < ni; ++i) {
      means[i] += weights[i];
    }
  }
  for (auto &mean : means) {
    mean /= old_no;
  }
  wf_.Resize(new_no, ni, 0.0);
  InitBackward();
  for (int dest = 0; dest < new_no; ++dest) {
    int src = code_map[dest];
    const TFloat *src_data = src >= 0 ? old_wf[src] : means.data();
    memcpy(wf_[dest], src_data, ni * sizeof(*src_data));
  }
  return ni * new_no;
}
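
// For example, with code_map = {0, -1, 1}: new output 0 copies old row 0,
// new output 1 is filled with the per-input mean of all old rows, and new
// output 2 copies old row 1.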

// Converts a float network to an int network. Each set of input weights that
// corresponds to a single output weight is converted independently:
// Compute the max absolute value of the weight set.
// Scale so the max absolute value becomes INT8_MAX.
// Round to integer.
// Store a multiplicative scale factor (as a TFloat) that will reproduce
// the original value, subject to rounding errors.
void WeightMatrix::ConvertToInt() {
  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
  scales_.reserve(wi_.dim1());
  int dim2 = wi_.dim2();
  for (int t = 0; t < wi_.dim1(); ++t) {
    TFloat *f_line = wf_[t];
    int8_t *i_line = wi_[t];
    TFloat max_abs = 0;
    for (int f = 0; f < dim2; ++f) {
      TFloat abs_val = fabs(f_line[f]);
      if (abs_val > max_abs) {
        max_abs = abs_val;
      }
    }
    TFloat scale = max_abs / INT8_MAX;
    scales_.push_back(scale / INT8_MAX);
    if (scale == 0.0) {
      scale = 1.0;
    }
    for (int f = 0; f < dim2; ++f) {
      i_line[f] = IntCastRounded(f_line[f] / scale);
    }
  }
  wf_.Resize(1, 1, 0.0);
  int_mode_ = true;
  if (IntSimdMatrix::intSimdMatrix) {
    int32_t rounded_num_out;
    IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);
    scales_.resize(rounded_num_out);
  }
}
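
// Worked example: if the largest absolute weight in a row is 0.5, then
// scale = 0.5 / 127, a weight of 0.25 is stored as round(0.25 / scale) = 64,
// and the in-memory scales_ entry is scale / 127; Serialize() multiplies the
// extra 1/127 factor back out before writing to disk.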

// Allocates any needed memory for running Backward, and zeroes the deltas,
// thus eliminating any existing momentum.
void WeightMatrix::InitBackward() {
  int no = int_mode_ ? wi_.dim1() : wf_.dim1();
  int ni = int_mode_ ? wi_.dim2() : wf_.dim2();
  dw_.Resize(no, ni, 0.0);
  updates_.Resize(no, ni, 0.0);
  wf_t_.Transpose(wf_);
  if (use_adam_) {
    dw_sq_sum_.Resize(no, ni, 0.0);
  }
}

// Flag on mode to indicate that this weightmatrix uses int8_t.
const int kInt8Flag = 1;
// Flag on mode to indicate that this weightmatrix uses adam.
const int kAdamFlag = 4;
// Flag on mode to indicate that this weightmatrix uses double. Set
// independently of kInt8Flag as even in int mode the scales can
// be float or double.
const int kDoubleFlag = 128;
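
// The serialized mode byte is the bitwise OR of these flags: for example an
// int8_t matrix trained with Adam, written in the current doubles format,
// stores kInt8Flag | kAdamFlag | kDoubleFlag = 133.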

// Writes to the given file. Returns false in case of error.
bool WeightMatrix::Serialize(bool training, TFile *fp) const {
  // For backward compatibility, add kDoubleFlag to mode to indicate the doubles
  // format, without errs, so we can detect and read old format weight matrices.
  uint8_t mode = (int_mode_ ? kInt8Flag : 0) | (use_adam_ ? kAdamFlag : 0) | kDoubleFlag;
  if (!fp->Serialize(&mode)) {
    return false;
  }
  if (int_mode_) {
    if (!wi_.Serialize(fp)) {
      return false;
    }
    uint32_t size = scales_.size();
    if (!fp->Serialize(&size)) {
      return false;
    }
    for (auto scale : scales_) {
      // The scales stored in memory have an extra factor applied to them
      // to allow faster operation. We have to remove that factor here
      // before writing to disc.
      double value = scale * INT8_MAX;
      if (!fp->Serialize(&value)) {
        return false;
      }
    }
  } else {
    if (!tesseract::Serialize(fp, wf_)) {
      return false;
    }
    if (training) {
      if (!tesseract::Serialize(fp, updates_)) {
        return false;
      }
      if (use_adam_ && !tesseract::Serialize(fp, dw_sq_sum_)) {
        return false;
      }
    }
  }
  return true;
}

// Reads from the given file. Returns false in case of error.

bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
  uint8_t mode;
  if (!fp->DeSerialize(&mode)) {
    return false;
  }
  int_mode_ = (mode & kInt8Flag) != 0;
  use_adam_ = (mode & kAdamFlag) != 0;
  if ((mode & kDoubleFlag) == 0) {
    return DeSerializeOld(training, fp);
  }
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) {
      return false;
    }
    uint32_t size;
    if (!fp->DeSerialize(&size)) {
      return false;
    }
#ifdef FAST_FLOAT
    scales_.reserve(size);
    for (auto n = size; n > 0; n--) {
      double val;
      if (!fp->DeSerialize(&val)) {
        return false;
      }
      scales_.push_back(val / INT8_MAX);
    }
#else
    scales_.resize(size);
    if (!fp->DeSerialize(&scales_[0], size)) {
      return false;
    }
    for (auto &scale : scales_) {
      scale /= INT8_MAX;
    }
#endif
    if (IntSimdMatrix::intSimdMatrix) {
      int32_t rounded_num_out;
      IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);
      scales_.resize(rounded_num_out);
    }
  } else {
    if (!tesseract::DeSerialize(fp, wf_)) {
      return false;
    }
    if (training) {
      InitBackward();
      if (!tesseract::DeSerialize(fp, updates_)) {
        return false;
      }
      if (use_adam_) {
        if (!tesseract::DeSerialize(fp, dw_sq_sum_)) {
          return false;
        }
      }
    }
  }
  return true;
}

// As DeSerialize, but reads an old (float) format WeightMatrix for
// backward compatibility.
bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
#ifdef FAST_FLOAT
  // Not implemented.
  ASSERT_HOST(!"not implemented");
  return false;
#else
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) {
      return false;
    }
    std::vector<float> old_scales;
    if (!fp->DeSerialize(old_scales)) {
      return false;
    }
    scales_.reserve(old_scales.size());
    for (float old_scale : old_scales) {
      scales_.push_back(old_scale);
    }
  } else {
    GENERIC_2D_ARRAY<float> float_array;
    if (!float_array.DeSerialize(fp)) {
      return false;
    }
    FloatToDouble(float_array, wf_);
  }
  if (training) {
    InitBackward();
    GENERIC_2D_ARRAY<float> float_array;
    if (!float_array.DeSerialize(fp)) {
      return false;
    }
    FloatToDouble(float_array, updates_);
    // Errs was only used in int training, which is now dead.
    if (!float_array.DeSerialize(fp)) {
      return false;
    }
  }
  return true;
#endif
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const TFloat *u, TFloat *v) const {
  assert(!int_mode_);
  MatrixDotVectorInternal(wf_, true, false, u, v);
}

void WeightMatrix::MatrixDotVector(const int8_t *u, TFloat *v) const {
  assert(int_mode_);
  if (IntSimdMatrix::intSimdMatrix) {
    IntSimdMatrix::intSimdMatrix->matrixDotVectorFunction(wi_.dim1(), wi_.dim2(), &shaped_w_[0],
                                                          &scales_[0], u, v);
  } else {
    IntSimdMatrix::MatrixDotVector(wi_, scales_, u, v);
  }
}

// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void WeightMatrix::MultiplyAccumulate(const TFloat *v, TFloat *inout) {
  assert(!int_mode_);
  assert(wf_.dim1() == 1);
  int n = wf_.dim2();
  const TFloat *u = wf_[0];
  for (int i = 0; i < n; ++i) {
    inout[i] += u[i] * v[i];
  }
}

// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const TFloat *u, TFloat *v) const {
  assert(!int_mode_);
  MatrixDotVectorInternal(wf_t_, false, true, u, v);
}

// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from
// u and v. In terms of the neural network, u is the gradients and v is the
// inputs.
// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
// Runs parallel if requested. Note that u and v must be transposed.
void WeightMatrix::SumOuterTransposed(const TransposedArray &u, const TransposedArray &v,
                                      bool in_parallel) {
  assert(!int_mode_);
  int num_outputs = dw_.dim1();
  assert(u.dim1() == num_outputs);
  assert(u.dim2() == v.dim2());
  int num_inputs = dw_.dim2() - 1;
  int num_samples = u.dim2();
  // v is missing the last element in dim1.
  assert(v.dim1() == num_inputs);
#ifdef _OPENMP
# pragma omp parallel for num_threads(4) if (in_parallel)
#endif
  for (int i = 0; i < num_outputs; ++i) {
    TFloat *dwi = dw_[i];
    const TFloat *ui = u[i];
    for (int j = 0; j < num_inputs; ++j) {
      dwi[j] = DotProduct(ui, v[j], num_samples);
    }
    // The last element of v is missing, presumed 1.0f.
    TFloat total = 0;
    for (int k = 0; k < num_samples; ++k) {
      total += ui[k];
    }
    dwi[num_inputs] = total;
  }
}
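
// Equivalently, dw_[i][j] = sum over samples t of u[i][t] * v[j][t], while the
// bias column dw_[i][num_inputs] just sums u[i][t] over t, since the missing
// last input is taken to be 1.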

// Updates the weights using the given learning rate and momentum.
// num_samples is the quotient to be used in the adam computation iff
// use_adam_ is true.
void WeightMatrix::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {
  assert(!int_mode_);
  if (use_adam_ && momentum > 0.0f && num_samples > 0 && num_samples < kAdamCorrectionIterations) {
    learning_rate *= sqrt(1.0f - pow(adam_beta, num_samples));
    learning_rate /= 1.0f - pow(momentum, num_samples);
  }
  if (use_adam_ && num_samples > 0 && momentum > 0.0f) {
    dw_sq_sum_.SumSquares(dw_, adam_beta);
    dw_ *= learning_rate * (1.0f - momentum);
    updates_ *= momentum;
    updates_ += dw_;
    wf_.AdamUpdate(updates_, dw_sq_sum_, learning_rate * kAdamEpsilon);
  } else {
    dw_ *= learning_rate;
    updates_ += dw_;
    if (momentum > 0.0f) {
      wf_ += updates_;
    }
    if (momentum >= 0.0f) {
      updates_ *= momentum;
    }
  }
  wf_t_.Transpose(wf_);
}
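
// Without Adam this reduces to SGD with momentum: updates_ accumulates
// learning_rate * dw_, the weights are stepped by updates_ when momentum is
// positive, and updates_ is then scaled by momentum for the next iteration.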

// Adds the dw_ in other to the dw_ in *this.
void WeightMatrix::AddDeltas(const WeightMatrix &other) {
  assert(dw_.dim1() == other.dw_.dim1());
  assert(dw_.dim2() == other.dw_.dim2());
  dw_ += other.dw_;
}

// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void WeightMatrix::CountAlternators(const WeightMatrix &other, TFloat *same,
                                    TFloat *changed) const {
  int num_outputs = updates_.dim1();
  int num_inputs = updates_.dim2();
  assert(num_outputs == other.updates_.dim1());
  assert(num_inputs == other.updates_.dim2());
  for (int i = 0; i < num_outputs; ++i) {
    const TFloat *this_i = updates_[i];
    const TFloat *other_i = other.updates_[i];
    for (int j = 0; j < num_inputs; ++j) {
      TFloat product = this_i[j] * other_i[j];
      if (product < 0.0) {
        *changed -= product;
      } else {
        *same += product;
      }
    }
  }
}

// Helper computes an integer histogram bucket for a weight and adds it
// to the histogram.
const int kHistogramBuckets = 16;
static void HistogramWeight(TFloat weight, STATS *histogram) {
  int bucket = kHistogramBuckets - 1;
  if (weight != 0.0) {
    TFloat logval = -log2(fabs(weight));
    bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
  }
  histogram->add(bucket, 1);
}
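
// The bucket index is roughly -log2(|weight|), i.e. how many powers of two the
// magnitude lies below 1: a weight of 0.25 lands in bucket 2, 1.0 in bucket 0,
// and exact zeros go to the last bucket (15).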

void WeightMatrix::Debug2D(const char *msg) {
  STATS histogram(0, kHistogramBuckets);
  if (int_mode_) {
    for (int i = 0; i < wi_.dim1(); ++i) {
      for (int j = 0; j < wi_.dim2(); ++j) {
        HistogramWeight(wi_[i][j] * scales_[i], &histogram);
      }
    }
  } else {
    for (int i = 0; i < wf_.dim1(); ++i) {
      for (int j = 0; j < wf_.dim2(); ++j) {
        HistogramWeight(wf_[i][j], &histogram);
      }
    }
  }
  tprintf("%s\n", msg);
  histogram.print();
}

} // namespace tesseract.