diff options
author | Jean-Marc Valin <jmvalin@amazon.com> | 2023-11-03 09:46:38 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@amazon.com> | 2023-11-03 09:48:38 +0300 |
commit | 1ada7d4d6f838dc0842fc89159747755c516ce24 (patch) | |
tree | f3c55be79cd56c8c8ea6b19bff30c88d8bfb0367 /dnn | |
parent | 166a6c8e49fe1335feae6ffc450325f7f5f628c6 (diff) |
Vectorizing sgemv for multiples of 4 with SSE
Diffstat (limited to 'dnn')
-rw-r--r-- | dnn/vec_avx.h | 17 |
1 file changed, 17 insertions, 0 deletions
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h index 767d7e19..a1d6cad2 100644 --- a/dnn/vec_avx.h +++ b/dnn/vec_avx.h @@ -709,6 +709,23 @@ static inline void sgemv(float *out, const float *weights, int rows, int cols, i } _mm256_storeu_ps (&y[0], vy0); } + for (;i<rows-3;i+=4) + { + float *y; + __m128 vy0; + y = &out[i]; + vy0 = _mm_setzero_ps(); + for (j=0;j<cols;j++) + { + __m128 vxj; + __m128 vw; + vxj = _mm_broadcast_ss(&x[j]); + + vw = _mm_loadu_ps(&weights[j*col_stride + i]); + vy0 = _mm_fmadd_ps(vw, vxj, vy0); + } + _mm_storeu_ps (&y[0], vy0); + } for (;i<rows;i++) { out[i] = 0; |