tesseract  5.0.0
dotproductsse.cpp
Go to the documentation of this file.
1 // File: dotproductsse.cpp
3 // Description: Architecture-specific dot-product function.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2015, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
17 
18 #if !defined(__SSE4_1__)
19 # if defined(__i686__) || defined(__x86_64__)
20 # error Implementation only for SSE 4.1 capable architectures
21 # endif
22 #else
23 
24 # include <emmintrin.h>
25 # include <smmintrin.h>
26 # include <cstdint>
27 # include "dotproduct.h"
28 
29 namespace tesseract {
30 
31 // Computes and returns the dot product of the n-vectors u and v.
32 // Uses Intel SSE intrinsics to access the SIMD instruction set.
33 #if defined(FAST_FLOAT)
34 float DotProductSSE(const float *u, const float *v, int n) {
35  int max_offset = n - 4;
36  int offset = 0;
37  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
38  // v, and multiplying them together in parallel.
39  __m128 sum = _mm_setzero_ps();
40  if (offset <= max_offset) {
41  offset = 4;
42  // Aligned load is reputedly faster but requires 16 byte aligned input.
43  if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
44  (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
45  // Use aligned load.
46  sum = _mm_load_ps(u);
47  __m128 floats2 = _mm_load_ps(v);
48  // Multiply.
49  sum = _mm_mul_ps(sum, floats2);
50  while (offset <= max_offset) {
51  __m128 floats1 = _mm_load_ps(u + offset);
52  floats2 = _mm_load_ps(v + offset);
53  floats1 = _mm_mul_ps(floats1, floats2);
54  sum = _mm_add_ps(sum, floats1);
55  offset += 4;
56  }
57  } else {
58  // Use unaligned load.
59  sum = _mm_loadu_ps(u);
60  __m128 floats2 = _mm_loadu_ps(v);
61  // Multiply.
62  sum = _mm_mul_ps(sum, floats2);
63  while (offset <= max_offset) {
64  __m128 floats1 = _mm_loadu_ps(u + offset);
65  floats2 = _mm_loadu_ps(v + offset);
66  floats1 = _mm_mul_ps(floats1, floats2);
67  sum = _mm_add_ps(sum, floats1);
68  offset += 4;
69  }
70  }
71  }
72  // Add the 4 sums in sum horizontally.
73 #if 0
74  alignas(32) float tmp[4];
75  _mm_store_ps(tmp, sum);
76  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
77 #else
78  __m128 zero = _mm_setzero_ps();
79  // https://www.felixcloutier.com/x86/haddps
80  sum = _mm_hadd_ps(sum, zero);
81  sum = _mm_hadd_ps(sum, zero);
82  // Extract the low result.
83  float result = _mm_cvtss_f32(sum);
84 #endif
85  // Add on any left-over products.
86  while (offset < n) {
87  result += u[offset] * v[offset];
88  ++offset;
89  }
90  return result;
91 }
92 #else
93 double DotProductSSE(const double *u, const double *v, int n) {
94  int max_offset = n - 2;
95  int offset = 0;
96  // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
97  // v, and multiplying them together in parallel.
98  __m128d sum = _mm_setzero_pd();
99  if (offset <= max_offset) {
100  offset = 2;
101  // Aligned load is reputedly faster but requires 16 byte aligned input.
102  if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
103  (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
104  // Use aligned load.
105  sum = _mm_load_pd(u);
106  __m128d floats2 = _mm_load_pd(v);
107  // Multiply.
108  sum = _mm_mul_pd(sum, floats2);
109  while (offset <= max_offset) {
110  __m128d floats1 = _mm_load_pd(u + offset);
111  floats2 = _mm_load_pd(v + offset);
112  offset += 2;
113  floats1 = _mm_mul_pd(floats1, floats2);
114  sum = _mm_add_pd(sum, floats1);
115  }
116  } else {
117  // Use unaligned load.
118  sum = _mm_loadu_pd(u);
119  __m128d floats2 = _mm_loadu_pd(v);
120  // Multiply.
121  sum = _mm_mul_pd(sum, floats2);
122  while (offset <= max_offset) {
123  __m128d floats1 = _mm_loadu_pd(u + offset);
124  floats2 = _mm_loadu_pd(v + offset);
125  offset += 2;
126  floats1 = _mm_mul_pd(floats1, floats2);
127  sum = _mm_add_pd(sum, floats1);
128  }
129  }
130  }
131  // Add the 2 sums in sum horizontally.
132  sum = _mm_hadd_pd(sum, sum);
133  // Extract the low result.
134  double result = _mm_cvtsd_f64(sum);
135  // Add on any left-over products.
136  while (offset < n) {
137  result += u[offset] * v[offset];
138  ++offset;
139  }
140  return result;
141 }
142 #endif
143 
144 } // namespace tesseract.
145 
146 #endif
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)