Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@jmvalin.ca>2016-08-09 08:13:11 +0300
committerJean-Marc Valin <jmvalin@jmvalin.ca>2016-08-09 08:13:11 +0300
commit8c6c5c570f3a656fba1b77dc359a7f10ea26b962 (patch)
treee10976a39165414dbea3c506c65b3d74d7de4f84
parent2f2b2df148769c05a963f77e647565569546e7e3 (diff)
Converting more code to SSE
-rw-r--r--celt/vq.c47
1 files changed, 28 insertions, 19 deletions
diff --git a/celt/vq.c b/celt/vq.c
index 05cec48a..ec9ae310 100644
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -169,7 +169,8 @@ static float compute_search_vec(float *_X, int *iy, int K, int N)
float xy, yy;
VARDECL(celt_norm, y);
VARDECL(celt_norm, X);
- VARDECL(int, signx);
+ VARDECL(float, signy);
+ __m128 signmask;
#ifdef PVQ_SEARCH_INT
__m128i fours;
#else
@@ -177,6 +178,8 @@ static float compute_search_vec(float *_X, int *iy, int K, int N)
#endif
SAVE_STACK;
+ /* All bits set to zero, except for the sign bit. */
+ signmask = _mm_set_ps1(-0.f);
#ifdef PVQ_SEARCH_INT
fours = _mm_set_epi32(4, 4, 4, 4);
#else
@@ -184,19 +187,21 @@ static float compute_search_vec(float *_X, int *iy, int K, int N)
#endif
ALLOC(y, N+3, celt_norm);
ALLOC(X, N+3, celt_norm);
- ALLOC(signx, N, int);
+ ALLOC(signy, N+3, float);
- /* Get rid of the sign */
+ for (j=0;j<N;j+=4)
+ {
+ __m128 x4, s4;
+ x4 = _mm_loadu_ps(&_X[j]);
+ s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
+ /* Get rid of the sign */
+ x4 = _mm_andnot_ps(signmask, x4);
+ _mm_storeu_ps(&X[j], x4);
+ _mm_storeu_ps(&signy[j], s4);
+ _mm_storeu_ps(&y[j], _mm_setzero_ps());
+ _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
+ }
sum = 0;
- j=0; do {
- signx[j] = _X[j]<0;
- /* OPT: Make sure the compiler doesn't use a branch on ABS16(). */
- X[j] = ABS16(_X[j]);
- iy[j] = 0;
- y[j] = 0;
- } while (++j<N);
- X[N] = X[N+1] = X[N+2] = -100;
- y[N] = y[N+1] = y[N+2] = 100;
xy = yy = 0;
@@ -231,6 +236,8 @@ static float compute_search_vec(float *_X, int *iy, int K, int N)
pulsesLeft -= iy[j];
} while (++j<N);
}
+ X[N] = X[N+1] = X[N+2] = -100;
+ y[N] = y[N+1] = y[N+2] = 100;
celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass");
/* This should never happen, but just in case it does (e.g. on silence)
@@ -323,13 +330,15 @@ static float compute_search_vec(float *_X, int *iy, int K, int N)
}
/* Put the original sign back */
- j=0;
- do {
- /*iy[j] = signx[j] ? -iy[j] : iy[j];*/
- /* OPT: The is more likely to be compiled without a branch than the code above
- but has the same performance otherwise. */
- iy[j] = (iy[j]^-signx[j]) + signx[j];
- } while (++j<N);
+ for (j=0;j<N;j+=4)
+ {
+ __m128i y4;
+ __m128i s4;
+ y4 = _mm_loadu_si128((__m128i*)&iy[j]);
+ s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
+ y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
+ _mm_storeu_si128((__m128i*)&iy[j], y4);
+ }
RESTORE_STACK;
return yy;
}