Defining celt_inner_prod() and using it instead of explicit loops.

Also adds an SSE-optimized celt_inner_prod().
author: Jean-Marc Valin <jmvalin@jmvalin.ca> 2013-12-09 11:33:42 +0400
committer: Jean-Marc Valin <jmvalin@jmvalin.ca> 2013-12-09 11:34:55 +0400
commit: e72833098c86970e0dd922ecd1baede7960346bf (patch)
tree: fc841e8f4dbf9f330e63b86fc0b75bf207f8c322
parent: ab61492195242cd117e6052be41c81280b766e0a (diff)
5 files changed, 59 insertions, 36 deletions
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index db183430..592ae250 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -576,7 +576,7 @@ static int tf_analysis(const CELTMode *m, int len, int isTransient,
    *tf_sum = 0;
    for (i=0;i<len;i++)
    {
-      int j, k, N;
+      int k, N;
       int narrow;
       opus_val32 L1, best_L1;
       int best_level=0;
@@ -768,10 +768,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       /* Compute inter-channel correlation for low frequencies */
       for (i=0;i<8;i++)
       {
-         int j;
-         opus_val32 partial = 0;
-         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
-            partial = MAC16_16(partial, X[j], X[N0+j]);
+         opus_val32 partial;
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
          sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
       }
       sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
@@ -779,10 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
       minXC = sum;
       for (i=8;i<intensity;i++)
       {
-         int j;
-         opus_val32 partial = 0;
-         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
-            partial = MAC16_16(partial, X[j], X[N0+j]);
+         opus_val32 partial;
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
          minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
       }
       minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
diff --git a/celt/pitch.c b/celt/pitch.c
index d2b30544..93db58d7 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -252,7 +252,7 @@ void
 #endif
 celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
 {
-   int i,j;
+   int i;
    /*The EDSP version requires that max_pitch is at least 1, and that _x is
       32-bit aligned.
      Since it's hard to put asserts in assembly, put them here.*/
@@ -279,9 +279,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
    /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (;i<max_pitch;i++)
    {
-      opus_val32 sum = 0;
-      for (j=0;j<len;j++)
-         sum = MAC16_16(sum, _x[j],_y[i+j]);
+      opus_val32 sum;
+      sum = celt_inner_prod(_x, _y+i, len);
       xcorr[i] = sum;
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
@@ -361,12 +360,17 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
 #endif
    for (i=0;i<max_pitch>>1;i++)
    {
-      opus_val32 sum=0;
+      opus_val32 sum;
       xcorr[i] = 0;
       if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
          continue;
+#ifdef FIXED_POINT
+      sum = 0;
       for (j=0;j<len>>1;j++)
          sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
+#else
+      sum = celt_inner_prod(x_lp, y+i, len>>1);
+#endif
       xcorr[i] = MAX32(-1, sum);
 #ifdef FIXED_POINT
       maxcorr = MAX32(maxcorr, sum);
@@ -513,13 +517,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       pg = SHR32(frac_div32(best_xy,best_yy+1),16);
 
    for (k=0;k<3;k++)
-   {
-      int T1 = T+k-1;
-      xy = 0;
-      for (i=0;i<N;i++)
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-      xcorr[k] = xy;
-   }
+      xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
    if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
       offset = 1;
    else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
diff --git a/celt/pitch.h b/celt/pitch.h
index df317ecc..ec55acae 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -141,6 +141,18 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
 }
 #endif
 
+#ifndef OVERRIDE_CELT_INNER_PROD
+static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+   int i;
+   opus_val32 xy=0;
+   for (i=0;i<N;i++)
+      xy = MAC16_16(xy, x[i], y[i]);
+   return xy;
+}
+#endif
+
 #ifdef FIXED_POINT
 opus_val32
 #else
diff --git a/celt/vq.c b/celt/vq.c
index 98a0f36c..41868134 100644
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -37,6 +37,7 @@
 #include "os_support.h"
 #include "bands.h"
 #include "rate.h"
+#include "pitch.h"
 
 static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
 {
@@ -350,15 +351,11 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
 #ifdef FIXED_POINT
    int k;
 #endif
-   opus_val32 E = EPSILON;
+   opus_val32 E;
    opus_val16 g;
    opus_val32 t;
-   celt_norm *xptr = X;
-   for (i=0;i<N;i++)
-   {
-      E = MAC16_16(E, *xptr, *xptr);
-      xptr++;
-   }
+   celt_norm *xptr;
+   E = EPSILON + celt_inner_prod(X, X, N);
 #ifdef FIXED_POINT
    k = celt_ilog2(E)>>1;
 #endif
@@ -393,14 +390,8 @@ int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
          Eside = MAC16_16(Eside, s, s);
       }
    } else {
-      for (i=0;i<N;i++)
-      {
-         celt_norm m, s;
-         m = X[i];
-         s = Y[i];
-         Emid = MAC16_16(Emid, m, m);
-         Eside = MAC16_16(Eside, s, s);
-      }
+      Emid += celt_inner_prod(X, X, N);
+      Eside += celt_inner_prod(Y, Y, N);
    }
    mid = celt_sqrt(Emid);
    side = celt_sqrt(Eside);
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 695122a5..58f83246 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -101,6 +101,32 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
    }
 }
 
+#define OVERRIDE_CELT_INNER_PROD
+static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{
+   int i;
+   float xy;
+   __m128 sum;
+   sum = _mm_setzero_ps();
+   /* FIXME: We should probably go 8-way and use 2 sums. */
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 yi = _mm_loadu_ps(y+i);
+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
+   }
+   /* Horizontal sum */
+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+   _mm_store_ss(&xy, sum);
+   for (;i<N;i++)
+   {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
 #define OVERRIDE_COMB_FILTER_CONST
 static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12)
author	Jean-Marc Valin <jmvalin@jmvalin.ca>	2013-12-09 11:33:42 +0400
committer	Jean-Marc Valin <jmvalin@jmvalin.ca>	2013-12-09 11:34:55 +0400
commit	e72833098c86970e0dd922ecd1baede7960346bf (patch)
tree	fc841e8f4dbf9f330e63b86fc0b75bf207f8c322
parent	ab61492195242cd117e6052be41c81280b766e0a (diff)