28 #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
31 #undef HAVE_FRAMEWORK_ACCELERATE
34 #if defined(HAVE_FRAMEWORK_ACCELERATE)
39 #include <Accelerate/Accelerate.h>
43 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
47 #if defined(HAS_CPUID)
48 # if defined(__GNUC__)
50 # elif defined(_WIN32)
55 #if defined(HAVE_NEON) && !defined(__aarch64__)
57 # include <cpu-features.h>
61 # include <asm/hwcap.h>
62 # include <sys/auxv.h>
// Config variable `dotproduct`, created with the value "auto". Later code in
// this file compares it against method names ("generic", "native", ...) and
// can overwrite it from the DOTPRODUCT environment variable.
81 static STRING_VAR(dotproduct,
"auto",
"Function used for calculation of dot product");
// Storage for SIMDDetect's static CPU feature flags.
// When compiling for aarch64 the NEON flag is defined as true outright.
85 #if defined(__aarch64__)
87 bool SIMDDetect::neon_available_ =
true;
88 #elif defined(HAVE_NEON)
// Other NEON-enabled builds define the flag without an initializer; it is
// assigned from a runtime query in the SIMDDetect constructor.
90 bool SIMDDetect::neon_available_;
// x86 feature flags. None has an initializer, so as objects with static
// storage duration they start out false until the constructor's CPUID code
// assigns them.
93 bool SIMDDetect::avx_available_;
94 bool SIMDDetect::avx2_available_;
95 bool SIMDDetect::avx512F_available_;
96 bool SIMDDetect::avx512BW_available_;
98 bool SIMDDetect::fma_available_;
100 bool SIMDDetect::sse_available_;
// Dot product through the Accelerate framework (compiled only when
// HAVE_FRAMEWORK_ACCELERATE is defined). Both inputs are read with stride 1
// and the result is written to `total`.
103 #if defined(HAVE_FRAMEWORK_ACCELERATE)
106 const int stride = 1;
107 #if defined(FAST_FLOAT)
// NOTE(review): per Apple's vDSP documentation vDSP_dotpr is the
// single-precision routine and vDSP_dotprD the double-precision one, so
// FAST_FLOAT presumably selects a float TFloat -- confirm against the TFloat
// definition.
108 vDSP_dotpr(u, stride, v, stride, &total, n);
110 vDSP_dotprD(u, stride, v, stride, &total, n);
// Plain scalar loop: adds u[k] * v[k] to `total` for every k in [0, n).
119 for (
int k = 0; k < n; ++k) {
120 total += u[k] * v[k];
// Standard-library variant: std::inner_product over the first n elements of
// u and v, starting from a zero of type TFloat.
127 return std::inner_product(u, u + n, v,
static_cast<TFloat>(0));
// Constructor. Installs the generic dot product first, then probes the CPU
// (CPUID on x86, Android / auxv queries for non-aarch64 NEON builds) and
// records the results in the static *_available_ flags. Afterwards a chain of
// availability checks runs, and finally the DOTPRODUCT environment variable,
// if set, is copied into the `dotproduct` config variable.
140 SIMDDetect::SIMDDetect() {
// Start from the generic implementation.
142 SetDotProduct(DotProductGeneric);
144 #if defined(HAS_CPUID)
145 # if defined(__GNUC__)
// GCC-style path: query CPUID leaf 1 and continue only if the call reports
// success (non-zero return).
146 unsigned int eax, ebx, ecx, edx;
147 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
150 # if defined(HAVE_SSE4_1)
// Leaf 1, ECX bit 19 is used as the SSE4.1 indicator.
151 sse_available_ = (ecx & 0x00080000) != 0;
153 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
// Inline XGETBV: ECX is loaded with 0 as input, the result is taken from EAX
// into xcr0, and EDX is declared as clobbered.
156 __asm__(
"xgetbv" :
"=a"(xcr0) :
"c"(0) :
"%edx");
// Gate for the FMA/AVX checks: ECX bit 27 must be set and bits 1 and 2 of the
// XGETBV result must both be set.
// NOTE(review): per the Intel SDM these are OSXSAVE and the XCR0 SSE/AVX
// state-enable bits -- confirm.
159 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
161 # if defined(HAVE_FMA)
// Leaf 1, ECX bit 12 -> FMA.
162 fma_available_ = (ecx & 0x00001000) != 0;
164 # if defined(HAVE_AVX)
// Leaf 1, ECX bit 28 -> AVX.
165 avx_available_ = (ecx & 0x10000000) != 0;
166 if (avx_available_) {
// CPUID leaf 7, sub-leaf 0: EBX bit 5 -> AVX2, bit 16 -> AVX512F,
// bit 30 -> AVX512BW.
170 __cpuid_count(7, 0, eax, ebx, ecx, edx);
171 avx2_available_ = (ebx & 0x00000020) != 0;
172 avx512F_available_ = (ebx & 0x00010000) != 0;
173 avx512BW_available_ = (ebx & 0x40000000) != 0;
179 # elif defined(_WIN32)
// Windows path: same bit masks as above, applied to cpuInfo[2] where the GCC
// path uses ecx and to cpuInfo[1] where it uses ebx. cpuInfo[0] is read as
// the highest supported function id and checked before each group of tests.
183 max_function_id = cpuInfo[0];
184 if (max_function_id >= 1) {
186 # if defined(HAVE_SSE4_1)
187 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
189 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
190 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
192 # if defined(HAVE_FMA)
193 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
195 # if defined(HAVE_AVX)
196 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
198 # if defined(HAVE_AVX2)
// The AVX2 / AVX-512 bits are only read when function id 7 is supported.
199 if (max_function_id >= 7) {
201 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
202 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
203 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
// Any other compiler with HAS_CPUID defined is rejected at compile time.
210 # error "I don't know how to test for SIMD with this compiler"
// Runtime NEON detection for NEON-enabled builds that are not aarch64.
214 #if defined(HAVE_NEON) && !defined(__aarch64__)
// Android: NEON is taken from android_getCpuFeatures(), but only when the
// reported CPU family is ARM.
217 AndroidCpuFamily family = android_getCpuFamily();
218 if (family == ANDROID_CPU_FAMILY_ARM)
219 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
// Alternative query: the HWCAP_NEON bit of getauxval(AT_HWCAP).
224 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
// Chain of availability checks, tested in the order AVX2, AVX, SSE, NEON.
// Each branch exists only when its HAVE_* macro (or __aarch64__ for NEON) is
// defined at compile time.
234 #if defined(HAVE_AVX2)
235 }
else if (avx2_available_) {
239 #if defined(HAVE_AVX)
240 }
else if (avx_available_) {
244 #if defined(HAVE_SSE4_1)
245 }
else if (sse_available_) {
249 #if defined(HAVE_NEON) || defined(__aarch64__)
250 }
else if (neon_available_) {
// If the DOTPRODUCT environment variable is set, its value replaces the
// current value of the `dotproduct` config variable.
256 const char *dotproduct_env = getenv(
"DOTPRODUCT");
257 if (dotproduct_env !=
nullptr) {
259 dotproduct = dotproduct_env;
// Maps the `dotproduct` config variable to an implementation.
// `dotproduct_method` starts as "generic" and is reassigned in the matching
// branch; its final value is written back into `dotproduct` at the end.
267 const char *dotproduct_method =
"generic";
// "auto" is matched first, ahead of all explicit method names.
268 if (dotproduct ==
"auto") {
270 }
else if (dotproduct ==
"generic") {
// Explicit request for the generic implementation.
272 SetDotProduct(DotProductGeneric);
273 dotproduct_method =
"generic";
274 }
else if (dotproduct ==
"native") {
277 dotproduct_method =
"native";
// The SIMD-specific names below are recognised only when the corresponding
// HAVE_* macro is defined at compile time.
278 #if defined(HAVE_AVX2)
279 }
else if (dotproduct ==
"avx2") {
282 dotproduct_method =
"avx2";
284 #if defined(HAVE_AVX)
285 }
else if (dotproduct ==
"avx") {
288 dotproduct_method =
"avx";
290 #if defined(HAVE_FMA)
291 }
else if (dotproduct ==
"fma") {
294 dotproduct_method =
"fma";
296 #if defined(HAVE_SSE4_1)
297 }
else if (dotproduct ==
"sse") {
300 dotproduct_method =
"sse";
302 #if defined(HAVE_FRAMEWORK_ACCELERATE)
303 }
else if (dotproduct ==
"accelerate") {
// "neon" is accepted only if NEON was also detected (neon_available_).
306 #if defined(HAVE_NEON) || defined(__aarch64__)
307 }
else if (dotproduct ==
"neon" && neon_available_) {
310 dotproduct_method =
"neon";
312 }
else if (dotproduct ==
"std::inner_product") {
315 dotproduct_method =
"std::inner_product";
// Warning output: reports the rejected value and lists the supported names;
// the list is assembled from string literals under the same HAVE_* conditions.
318 tprintf(
"Warning, ignoring unsupported config variable value: dotproduct=%s\n",
321 "Supported values for dotproduct: auto generic native"
322 #
if defined(HAVE_AVX2)
325 #
if defined(HAVE_AVX)
328 #
if defined(HAVE_FMA)
331 #
if defined(HAVE_SSE4_1)
334 #
if defined(HAVE_FRAMEWORK_ACCELERATE)
337 " std::inner_product.\n");
// Store the selected method name back into the config variable.
340 dotproduct.set_value(dotproduct_method);
#define STRING_VAR(name, val, comment)
TFloat(*)(const TFloat *, const TFloat *, int) DotProductFunction
void tprintf(const char *format,...)
TFloat DotProductNEON(const TFloat *u, const TFloat *v, int n)
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n)
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n)
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n)
DotProductFunction DotProduct
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n)
static const IntSimdMatrix intSimdMatrixAVX2
static const IntSimdMatrix * intSimdMatrix
static const IntSimdMatrix intSimdMatrixSSE
static const IntSimdMatrix intSimdMatrixNEON
static TESS_API void Update()