diff options
author | Jean-Marc Valin <jmvalin@amazon.com> | 2023-11-28 22:16:57 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@amazon.com> | 2023-11-28 22:16:57 +0300 |
commit | db26e381a45aadcd82851075b85e2466e7de77d2 (patch) | |
tree | b03957232c082640e5c53136ecbf4f5c30be888f | |
parent | 72cc88dfddce319aeac075bd28eff791cd2b14d8 (diff) |
Trying to use fma instructions when possible
Compilers sometimes replace vmlaq*() with fmul+fadd instead of fmla.
Trying to use vfmaq*() instead when possible.
-rw-r--r-- | celt/arm/celt_neon_intr.c | 8 |
-rw-r--r-- | celt/arm/pitch_neon_intr.c | 7 |
-rw-r--r-- | dnn/vec_neon.h | 6 |
3 files changed, 21 insertions, 0 deletions
```diff
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index effda769..c8301101 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -97,6 +97,14 @@ void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_va
 }
 
 #else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ * vmlaq_f32() into fmul/fadd. */
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
 /*
  * Function: xcorr_kernel_neon_float
  * ---------------------------------
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index 35cc46e2..43885f52 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 
 /* ========================================================================== */
 
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+   vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
 #ifdef OPUS_CHECK_ASM
 
 /* This part of code simulates floating-point NEON operations. */
diff --git a/dnn/vec_neon.h b/dnn/vec_neon.h
index 785fc056..acf49f47 100644
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -49,6 +49,12 @@ static OPUS_INLINE int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) {
 }
 #endif
 
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+   vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
 #ifndef LPCNET_TEST
 static inline float32x4_t exp4_approx(float32x4_t x) {
   int32x4_t i;
```