diff options
author | Jean-Marc Valin <jmvalin@amazon.com> | 2023-10-20 10:28:17 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@amazon.com> | 2023-10-20 10:28:17 +0300 |
commit | 88c8b3078518b649933616fb7c9a78e4d086233a (patch) | |
tree | ad7e24925f16154c15c1011f07652ae6a001d43d | |
parent | f512c9206beef41367dff3e0c81fffc374b35efc (diff) |
Doing some unrolling on ARM/Neon
-rw-r--r-- | dnn/vec_neon.h | 23 |
1 file changed, 22 insertions, 1 deletion
diff --git a/dnn/vec_neon.h b/dnn/vec_neon.h
index 48e3eaa1..db5beef4 100644
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -383,9 +383,30 @@ static inline void cgemv8x4(float *_out, const opus_int8 *w, const float *scale,
    for (i=0;i<rows;i+=8)
    {
       int32x4_t acc0, acc1;
+      int32x4_t acc2, acc3;
       acc0 = vdupq_n_s32(0);
       acc1 = vdupq_n_s32(0);
-      for (j=0;j<cols;j+=4)
+      acc2 = vdupq_n_s32(0);
+      acc3 = vdupq_n_s32(0);
+      j=0;
+      for (;j<cols-4;j+=8)
+      {
+         int8x16_t vw0, vw1, vw2, vw3, vx0, vx1;
+         vx0 = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j]);
+         vw0 = vld1q_s8(w);
+         vw1 = vld1q_s8(&w[16]);
+         acc0 = vdotprod(acc0, vw0, vx0);
+         acc1 = vdotprod(acc1, vw1, vx0);
+         vx1 = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j+4]);
+         vw2 = vld1q_s8(&w[32]);
+         vw3 = vld1q_s8(&w[48]);
+         acc2 = vdotprod(acc2, vw2, vx1);
+         acc3 = vdotprod(acc3, vw3, vx1);
+         w += 64;
+      }
+      acc0 = vaddq_s32(acc0, acc2);
+      acc1 = vaddq_s32(acc1, acc3);
+      for (;j<cols;j+=4)
       {
          int8x16_t vw0, vw1, vx;
          vx = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j]);