// Project declarations for this translation unit (presumably simd_dot and
// matrix_procuct -- confirm against the header). The __has_include guard only
// matters when the header is not on the include path, e.g. when this file is
// compiled on its own; older compilers without __has_include include it
// unconditionally, exactly as before.
#if defined(__has_include)
  #if __has_include("math_functions.h")
    #include "math_functions.h"
  #endif
#else
  #include "math_functions.h"
#endif
#include <cstdint>

// Select the intrinsics header for the target and record, in one place, which
// SIMD flavour simd_dot() is allowed to use. Deciding this once keeps the
// header that was included and the code path that is compiled from ever
// disagreeing (MSVC does not define __GNUC__, so a guard of the form
// "__GNUC__ && (... || _MSC_VER)" silently sends MSVC down the wrong branch).
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
  #include <intrin.h>
  #if defined(_M_IX86) || defined(_M_X64)
    #define MATH_FUNCTIONS_USE_SSE 1
  #endif
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
  #include <x86intrin.h>
  #if defined(__SSE__)
    #define MATH_FUNCTIONS_USE_SSE 1
  #endif
#elif defined(__GNUC__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
/* GCC-compatible compiler, targeting ARM with NEON.
   __ARM_NEON is the ACLE spelling; __ARM_NEON__ is the legacy one. */
  #include <arm_neon.h>
  #define MATH_FUNCTIONS_USE_NEON 1
#endif

/**
 * Dot product of two float arrays.
 *
 * Processes four elements per iteration with SSE or NEON when the target
 * supports it, then finishes any remaining elements with a scalar loop. On
 * targets with neither instruction set the scalar loop handles everything.
 *
 * @param x    first operand; must point to at least len floats
 * @param y    second operand; must point to at least len floats
 * @param len  number of elements to multiply; values <= 0 yield 0.0f
 * @return     sum of x[i] * y[i] for i in [0, len)
 *
 * Neither pointer needs to be 16-byte aligned: the vector loads used here are
 * the unaligned variants.
 */
float simd_dot(const float* x, const float* y, const long& len) {
  float inner_prod = 0.0f;
  long i = 0;

#if defined(MATH_FUNCTIONS_USE_SSE)
  #pragma message("USE SSE")
  __m128 acc = _mm_setzero_ps();  // four running partial sums, all zero
  // "<=" so that a final, complete group of four is still vectorised.
  for (; i + 4 <= len; i += 4) {
    const __m128 X = _mm_loadu_ps(x + i);  // load chunk of 4 floats
    const __m128 Y = _mm_loadu_ps(y + i);
    const __m128 Z = _mm_mul_ps(X, Y);
    acc = _mm_add_ps(acc, Z);
  }
  float temp[4];
  _mm_storeu_ps(temp, acc);  // spill the partial sums and fold them
  inner_prod = temp[0] + temp[1] + temp[2] + temp[3];
#elif defined(MATH_FUNCTIONS_USE_NEON)
  #pragma message("USE NEON")
  float32x4_t acc = vdupq_n_f32(0.0f);  // four running partial sums, all zero
  // "<=" so that a final, complete group of four is still vectorised.
  for (; i + 4 <= len; i += 4) {
    const float32x4_t X = vld1q_f32(x + i);  // load chunk of 4 floats
    const float32x4_t Y = vld1q_f32(y + i);
    const float32x4_t Z = vmulq_f32(X, Y);
    acc = vaddq_f32(acc, Z);
  }
  inner_prod = vgetq_lane_f32(acc, 0) + vgetq_lane_f32(acc, 1) +
               vgetq_lane_f32(acc, 2) + vgetq_lane_f32(acc, 3);
#endif

  // Add the remaining values (everything, when no SIMD path was compiled).
  for (; i < len; ++i) {
    inner_prod += x[i] * y[i];
  }
  return inner_prod;
}

// Only the signature of matrix_procuct is in view here; its definition
// follows simd_dot in this file.
void matrix_procuct(const float* A, const float* B, float* C, const int n,
    const int m, const int k, bool ta, bool tb);