diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-06-17 11:58:16 +0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-06-17 11:58:16 +0400 |
commit | 0d95b3b48cb0f852b22693bcf49ed4f0e604def3 (patch) | |
tree | a13414622a90707dfcb997654698538287865a13 /celt/x86 | |
parent | 71766ef7a451a48536255c62c273b037ebcb2c42 (diff) |
SSE optimization of comb_filter()
Should make it easy to adapt to other architectures.
Diffstat (limited to 'celt/x86')
-rw-r--r-- | celt/x86/pitch_sse.h | 49 |
1 files changed, 49 insertions, 0 deletions
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index d2413980..64c96af2 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -100,5 +100,54 @@ static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 * return xy; } +#define OVERRIDE_COMB_FILTER_CONST +static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + int i; + __m128 x0v; + __m128 g10v, g11v, g12v; + g10v = _mm_load1_ps(&g10); + g11v = _mm_load1_ps(&g11); + g12v = _mm_load1_ps(&g12); + x0v = _mm_loadu_ps(&x[-T-2]); + for (i=0;i<N-3;i+=4) + { + __m128 yi, yi2, x1v, x2v, x3v, x4v; + const opus_val32 *xp = &x[i-T-2]; + yi = _mm_loadu_ps(x+i); + x4v = _mm_loadu_ps(xp+4); +#if 0 + /* Slower version with all loads */ + x1v = _mm_loadu_ps(xp+1); + x2v = _mm_loadu_ps(xp+2); + x3v = _mm_loadu_ps(xp+3); +#else + x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); + x1v = _mm_shuffle_ps(x0v, x2v, 0x99); + x3v = _mm_shuffle_ps(x2v, x4v, 0x99); +#endif + + yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ + yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); + yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#else + /* Use partial sums */ + yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), + _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); + yi = _mm_add_ps(yi, yi2); +#endif + x0v=x4v; + _mm_storeu_ps(y+i, yi); + } + for (;i<N;i++) + { + y[i] = x[i] + + MULT16_32_Q15(g10,x[i-T]) + + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) + + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); + } +} #endif |