Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/celt/x86
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@jmvalin.ca>2013-06-17 11:58:16 +0400
committerJean-Marc Valin <jmvalin@jmvalin.ca>2013-06-17 11:58:16 +0400
commit0d95b3b48cb0f852b22693bcf49ed4f0e604def3 (patch)
treea13414622a90707dfcb997654698538287865a13 /celt/x86
parent71766ef7a451a48536255c62c273b037ebcb2c42 (diff)
SSE optimization of comb_filter()
Should make it easy to adapt to other architectures.
Diffstat (limited to 'celt/x86')
-rw-r--r--celt/x86/pitch_sse.h49
1 files changed, 49 insertions, 0 deletions
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index d2413980..64c96af2 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -100,5 +100,54 @@ static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *
return xy;
}
+#define OVERRIDE_COMB_FILTER_CONST
+static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+ opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+ int i;
+ __m128 x0v;
+ __m128 g10v, g11v, g12v;
+ g10v = _mm_load1_ps(&g10);
+ g11v = _mm_load1_ps(&g11);
+ g12v = _mm_load1_ps(&g12);
+ x0v = _mm_loadu_ps(&x[-T-2]);
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 yi, yi2, x1v, x2v, x3v, x4v;
+ const opus_val32 *xp = &x[i-T-2];
+ yi = _mm_loadu_ps(x+i);
+ x4v = _mm_loadu_ps(xp+4);
+#if 0
+ /* Slower version with all loads */
+ x1v = _mm_loadu_ps(xp+1);
+ x2v = _mm_loadu_ps(xp+2);
+ x3v = _mm_loadu_ps(xp+3);
+#else
+ x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+ x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+ x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+ yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+ yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+ yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+ /* Use partial sums */
+ yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+ _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+ yi = _mm_add_ps(yi, yi2);
+#endif
+ x0v=x4v;
+ _mm_storeu_ps(y+i, yi);
+ }
+ for (;i<N;i++)
+ {
+ y[i] = x[i]
+ + MULT16_32_Q15(g10,x[i-T])
+ + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+ + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+ }
+}
#endif