Add AVX2_24 kernels.

nsomatilda · nsomatilda · commit d8ae3e30b83d · 2024-05-23T15:45:23.000-06:00
diff --git a/include/matilda.h b/include/matilda.h
@@ -39,7 +39,7 @@ namespace Matilda
   enum class ThreadTask : int { Initialize, ComputeMVM, SpinWait, StayBusy, Sleep, Terminate, Dead };
   enum class MvmKernel : int {
 #ifdef HAS_AVX2
-    AVX2_16=25616, AVX2_16_A=256161, AVX2_16_B=256162, AVX2_32=25632, AVX2_40=25640, AVX2_48=25648, AVX2_56=25656, AVX2_64=25664,
+    AVX2_16=25616, AVX2_16_A=256161, AVX2_16_B=256162, AVX2_24=25624, AVX2_32=25632, AVX2_40=25640, AVX2_48=25648, AVX2_56=25656, AVX2_64=25664,
 #endif
 #ifdef HAS_AVX512F
   AVX512_16=51216, AVX512_32=51232, AVX512_64=51264,
diff --git a/src/matilda.cc b/src/matilda.cc
@@ -127,6 +127,10 @@ mvm_plan::mvm_plan( const mvm_param & p ) :
         m_simd_rows = 16;
         mvm_kernel_func = m_f16c ? &mvm_kernel_avx2_16_b_f16c : &mvm_kernel_avx2_16_b;
         break;
+    case MvmKernel::AVX2_24:
+        m_simd_rows = 24;
+        mvm_kernel_func = m_f16c ? &mvm_kernel_avx2_24_f16c : &mvm_kernel_avx2_24;
+        break;
     case MvmKernel::AVX2_32:
         m_simd_rows = 32;
         mvm_kernel_func = m_f16c ? &mvm_kernel_avx2_32_f16c : &mvm_kernel_avx2_32;
diff --git a/src/mvm_impl_avx2.h b/src/mvm_impl_avx2.h
@@ -369,6 +369,57 @@ void mvm_kernel_avx2_16_f16c( float const * mat, float const * vec, size_t width
   _mm256_store_ps( rdi + 8, acc1 );
 }
 
+void mvm_kernel_avx2_24( float const * mat, float const * vec, size_t width, float * rdi ) // 24-output kernel: for k in [0, width) accumulates vec[k] * (the 24 floats at mat + 24*k) and stores the 24 sums to rdi[0..23]
+{
+  __m256 acc0 = _mm256_setzero_ps(); // running sums for outputs 0..7
+  __m256 acc1 = _mm256_setzero_ps(); // running sums for outputs 8..15
+  __m256 acc2 = _mm256_setzero_ps(); // running sums for outputs 16..23
+
+  float const * const vecEnd = vec + width; // one past the last vector element to consume
+  while( vec < vecEnd )
+  {
+    __m256 const v = _mm256_broadcast_ss( vec ); // replicate the current vector element into all 8 lanes
+    vec++;
+    acc0 = _mm256_fmadd_ps( v, _mm256_load_ps( mat ), acc0 ); // aligned load: mat must be 32-byte aligned on entry; the 24-float (96-byte) stride preserves that
+    acc1 = _mm256_fmadd_ps( v, _mm256_load_ps( mat + 8 ), acc1 );
+    acc2 = _mm256_fmadd_ps( v, _mm256_load_ps( mat + 16 ), acc2 );
+    int const distance = 32*4; // prefetch distance in floats (128 floats ahead of mat); 4 was fastest for 2048x2048 with 64 threads, but this value has not been tuned for the 24-wide kernel
+    _mm_prefetch(  mat + distance, _MM_HINT_T0 );      // prefetch the cache line at mat + distance (16 floats, assuming 64-byte lines)
+    _mm_prefetch(  mat + distance + 16, _MM_HINT_T0 ); // prefetch the cache line 16 floats further on
+    mat += 24; // advance to the next group of 24 matrix values
+  }
+
+  _mm256_store_ps( rdi, acc0 ); // aligned stores: rdi must be 32-byte aligned
+  _mm256_store_ps( rdi + 8, acc1 );
+  _mm256_store_ps( rdi + 16, acc2 );
+}
+
+void mvm_kernel_avx2_24_f16c( float const * mat, float const * vec, size_t width, float * rdi ) // half-precision variant of the 24-output kernel: mat is typed float const * but is read as packed 16-bit floats, 24 per vector element; the 24 sums are stored as floats to rdi[0..23]
+{
+  __m256 acc0 = _mm256_setzero_ps(); // running sums for outputs 0..7
+  __m256 acc1 = _mm256_setzero_ps(); // running sums for outputs 8..15
+  __m256 acc2 = _mm256_setzero_ps(); // running sums for outputs 16..23
+
+  float const * const vecEnd = vec + width; // one past the last vector element to consume
+  while( vec < vecEnd )
+  {
+    int const distance = 1*32; // prefetch distance in float-sized units (128 bytes ahead of mat); the exact distance did not seem to matter for a 3456x3456 matrix, the speed-up appears as soon as prefetching is enabled
+    _mm_prefetch( mat + distance, _MM_HINT_T0 );  // prefetch one cache line (32 half-precision elements, assuming 64-byte lines)
+
+    __m256 const v = _mm256_broadcast_ss( vec ); // replicate the current vector element into all 8 lanes
+    vec++;
+
+    acc0 = _mm256_fmadd_ps( v, _mm256_cvtph_ps( _mm_load_si128( reinterpret_cast<__m128i const *>( mat ) ) ), acc0 ); // aligned 128-bit load = 8 halves, widened to 8 floats; mat must be 16-byte aligned
+    acc1 = _mm256_fmadd_ps( v, _mm256_cvtph_ps( _mm_load_si128( reinterpret_cast<__m128i const *>( mat + 4 ) ) ), acc1 ); // + 4 floats = 16 bytes = the next 8 halves
+    acc2 = _mm256_fmadd_ps( v, _mm256_cvtph_ps( _mm_load_si128( reinterpret_cast<__m128i const *>( mat + 8 ) ) ), acc2 );
+    mat += 12; // 12 float-sized units = 48 bytes = 24 halves, i.e. the next group of 24 matrix values
+  }
+
+  _mm256_store_ps( rdi, acc0 ); // aligned stores: rdi must be 32-byte aligned
+  _mm256_store_ps( rdi + 8, acc1 );
+  _mm256_store_ps( rdi + 16, acc2 );
+}
+
 void mvm_kernel_avx2_32( float const * mat, float const * vec, size_t width, float * rdi )
 {
   __m256 acc0 = _mm256_setzero_ps();