diff --git a/FaceIdentification/src/math_functions.cpp b/FaceIdentification/src/math_functions.cpp
index ec1396258..72fe29c15 100644
--- a/FaceIdentification/src/math_functions.cpp
+++ b/FaceIdentification/src/math_functions.cpp
@@ -30,18 +30,26 @@
  */
 
 #include "math_functions.h"
-#include <xmmintrin.h>
 #include <cstdint>
 
-#ifdef _WIN32
-#include <intrin.h>
-#else
-#include <x86intrin.h>
+#if defined(_MSC_VER)
+/* Microsoft C/C++-compatible compiler */
+     #include <intrin.h>
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+/* GCC-compatible compiler, targeting x86/x86-64 */
+     #include <x86intrin.h>
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+/* GCC-compatible compiler, targeting ARM with NEON */
+     #include <arm_neon.h>
 #endif
 
+
+
+#if defined(_MSC_VER) || (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) /* SSE path: same compilers that got <intrin.h>/<x86intrin.h> above; MSVC never defines __GNUC__ */
 float simd_dot(const float* x, const float* y, const long& len) {
+ #pragma message("USE SSE")
   float inner_prod = 0.0f;
-  __m128 X, Y; // 128-bit values
+  __m128 X,Y,Z; // 128-bit values
   __m128 acc = _mm_setzero_ps(); // set to (0, 0, 0, 0)
   float temp[4];
 
@@ -49,7 +57,8 @@ float simd_dot(const float* x, const float* y, const long& len) {
   for (i = 0; i + 4 < len; i += 4) {
       X = _mm_loadu_ps(x + i); // load chunk of 4 floats
       Y = _mm_loadu_ps(y + i);
-      acc = _mm_add_ps(acc, _mm_mul_ps(X, Y));
+      Z = _mm_mul_ps(X, Y);
+      acc = _mm_add_ps(acc, Z);
   }
   _mm_storeu_ps(&temp[0], acc); // store acc into an array
   inner_prod = temp[0] + temp[1] + temp[2] + temp[3];
@@ -60,6 +69,27 @@ float simd_dot(const float* x, const float* y, const long& len) {
   }
   return inner_prod;
 }
+#else
+float simd_dot(const float* x, const float* y, const long& len) {
+#pragma message("USE NEON")
+    // Dot product of x and y over len floats: four NEON lanes at a time, then a scalar tail.
+    float32x4_t lane_sums = vdupq_n_f32(0.0f);  // per-lane running sums, start at zero
+    long pos = 0;
+    // The bound is strict, so for len > 0 the scalar tail below handles the last 1 to 4 elements.
+    while (pos + 4 < len) {
+        const float32x4_t prod = vmulq_f32(vld1q_f32(x + pos), vld1q_f32(y + pos));
+        lane_sums = vaddq_f32(lane_sums, prod);
+        pos += 4;
+    }
+    // Fold the four lanes left to right (lane 0 first) before adding the tail products.
+    float dot = vgetq_lane_f32(lane_sums, 0) + vgetq_lane_f32(lane_sums, 1);
+    dot += vgetq_lane_f32(lane_sums, 2);
+    dot += vgetq_lane_f32(lane_sums, 3);
+    for (; pos < len; ++pos) dot += x[pos] * y[pos];
+    return dot;
+}
+#endif
+
 
 void matrix_procuct(const float* A, const float* B, float* C, const int n,
     const int m, const int k, bool ta, bool tb) {