diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-08-10 00:43:35 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-08-10 00:43:35 +0300 |
commit | 85cda644ed25335a7c519702d49c9221683a16a8 (patch) | |
tree | 6ca249f780d8f7c0f8a3056826971395e4cf5656 | |
parent | 54e6a3b30e44f9de06e1d888ff6d49e43770d583 (diff) |
wip autodetection
-rw-r--r-- | celt/tests/test_unit_mathops.c | 1 | ||||
-rw-r--r-- | celt/tests/test_unit_rotation.c | 1 | ||||
-rw-r--r-- | celt/vq.c | 180 | ||||
-rw-r--r-- | celt/vq.h | 8 | ||||
-rw-r--r-- | celt/x86/x86_celt_map.c | 12 | ||||
-rw-r--r-- | celt_headers.mk | 1 | ||||
-rw-r--r-- | celt_sources.mk | 2 |
7 files changed, 26 insertions, 179 deletions
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index fd3319da..d49a2bd2 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -57,6 +57,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_SSE2) # include "x86/pitch_sse2.c" +# include "x86/vq_sse2.c" # endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) # include "x86/pitch_sse4_1.c" diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 1080c208..571ed12d 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -55,6 +55,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_SSE2) # include "x86/pitch_sse2.c" +# include "x86/vq_sse2.c" # endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) # include "x86/pitch_sse4_1.c" @@ -159,179 +159,7 @@ static unsigned extract_collapse_mask(int *iy, int N, int B) return collapse_mask; } -static float op_pvq_search_sse(celt_norm *_X, int *iy, int K, int N) -{ - int i, j; - int pulsesLeft; - float xy, yy; - VARDECL(celt_norm, y); - VARDECL(celt_norm, X); - VARDECL(float, signy); - __m128 signmask; - __m128 sums; - __m128i fours; - SAVE_STACK; - - /* All bits set to zero, except for the sign bit. */ - signmask = _mm_set_ps1(-0.f); - fours = _mm_set_epi32(4, 4, 4, 4); - ALLOC(y, N+3, celt_norm); - ALLOC(X, N+3, celt_norm); - ALLOC(signy, N+3, float); - - OPUS_COPY(X, _X, N); - X[N] = X[N+1] = X[N+2] = 0; - sums = _mm_setzero_ps(); - for (j=0;j<N;j+=4) - { - __m128 x4, s4; - x4 = _mm_loadu_ps(&X[j]); - s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); - /* Get rid of the sign */ - x4 = _mm_andnot_ps(signmask, x4); - sums = _mm_add_ps(sums, x4); - /* Clear y and iy in case we don't do the projection. */ - _mm_storeu_ps(&y[j], _mm_setzero_ps()); - _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128()); - _mm_storeu_ps(&X[j], x4); - _mm_storeu_ps(&signy[j], s4); - } - sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2))); - sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1))); - - xy = yy = 0; - - pulsesLeft = K; - - /* Do a pre-search by projecting on the pyramid */ - if (K > (N>>1)) - { - __m128i pulses_sum; - __m128 yy4, xy4; - __m128 rcp4; - opus_val32 sum = _mm_cvtss_f32(sums); - /* If X is too small, just replace it with a pulse at 0 */ - /* Prevents infinities and NaNs from causing too many pulses - to be allocated. 64 is an approximation of infinity here. */ - if (!(sum > EPSILON && sum < 64)) - { - X[0] = QCONST16(1.f,14); - j=1; do - X[j]=0; - while (++j<N); - sums = _mm_set_ps1(1.f); - } - rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K-1)), _mm_rcp_ps(sums)); - xy4 = yy4 = _mm_setzero_ps(); - pulses_sum = _mm_setzero_si128(); - for (j=0;j<N;j+=4) - { - __m128 rx4, x4, y4; - __m128i iy4; - x4 = _mm_loadu_ps(&X[j]); - rx4 = _mm_mul_ps(x4, rcp4); - iy4 = _mm_cvttps_epi32(rx4); - pulses_sum = _mm_add_epi32(pulses_sum, iy4); - _mm_storeu_si128((__m128i*)&iy[j], iy4); - y4 = _mm_cvtepi32_ps(iy4); - xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); - yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); - /* double the y[] vector so we don't have to do it in the search loop. */ - _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); - } - pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2))); - pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1))); - pulsesLeft -= _mm_cvtsi128_si32(pulses_sum); - xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2))); - xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1))); - xy = _mm_cvtss_f32(xy4); - yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2))); - yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1))); - yy = _mm_cvtss_f32(yy4); - } - X[N] = X[N+1] = X[N+2] = -100; - y[N] = y[N+1] = y[N+2] = 100; - celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass"); - - /* This should never happen, but just in case it does (e.g. on silence) - we fill the first bin with pulses. */ - if (pulsesLeft > N+3) - { - opus_val16 tmp = (opus_val16)pulsesLeft; - yy = MAC16_16(yy, tmp, tmp); - yy = MAC16_16(yy, tmp, y[0]); - iy[0] += pulsesLeft; - pulsesLeft=0; - } - - for (i=0;i<pulsesLeft;i++) - { - int best_id; - __m128 xy4, yy4; - __m128 max, max2; - __m128i count; - __m128i pos; - best_id = 0; - /* The squared magnitude term gets added anyway, so we might as well - add it outside the loop */ - yy = ADD16(yy, 1); - xy4 = _mm_load1_ps(&xy); - yy4 = _mm_load1_ps(&yy); - max = _mm_setzero_ps(); - pos = _mm_setzero_si128(); - count = _mm_set_epi32(3, 2, 1, 0); - for (j=0;j<N;j+=4) - { - __m128 x4, y4, r4; - x4 = _mm_loadu_ps(&X[j]); - y4 = _mm_loadu_ps(&y[j]); - x4 = _mm_add_ps(x4, xy4); - y4 = _mm_add_ps(y4, yy4); - y4 = _mm_rsqrt_ps(y4); - r4 = _mm_mul_ps(x4, y4); - /* Update the index of the max. */ - pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); - /* Update the max. */ - max = _mm_max_ps(max, r4); - /* Update the indices (+4) */ - count = _mm_add_epi32(count, fours); - } - /* Horizontal max */ - max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); - max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); - /* Now that max2 contains the max at all positions, look at which value(s) of the - partial max is equal to the global max. */ - pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); - pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); - pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); - best_id = _mm_cvtsi128_si32(pos); - - /* Updating the sums of the new pulse(s) */ - xy = ADD32(xy, EXTEND32(X[best_id])); - /* We're multiplying y[j] by two so we don't have to do it here */ - yy = ADD16(yy, y[best_id]); - - /* Only now that we've made the final choice, update y/iy */ - /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ - y[best_id] += 2; - iy[best_id]++; - } - - /* Put the original sign back */ - for (j=0;j<N;j+=4) - { - __m128i y4; - __m128i s4; - y4 = _mm_loadu_si128((__m128i*)&iy[j]); - s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j])); - y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); - _mm_storeu_si128((__m128i*)&iy[j], y4); - } - RESTORE_STACK; - return yy; -} - -static float op_pvq_search_c(celt_norm *X, int *iy, int K, int N) +float op_pvq_search_c(celt_norm *X, int *iy, int K, int N) { VARDECL(celt_norm, y); VARDECL(int, signx); @@ -506,11 +334,7 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, exp_rotation(X, N, 1, B, K, spread); -#if 1 - yy = op_pvq_search_sse(X, iy, K, N); -#else - yy = op_pvq_search_c(X, iy, K, N); -#endif + yy = op_pvq_search(X, iy, K, N); encode_pulses(iy, N, K, enc); @@ -37,10 +37,18 @@ #include "entdec.h" #include "modes.h" +#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)) +#include "x86/vq_sse.h" +#endif + #if defined(MIPSr1_ASM) #include "mips/vq_mipsr1.h" #endif +#if !defined(OVERRIDE_OP_PVQ_SEARCH) +#define op_pvq_search(x, iy, K, N) \ + (op_pvq_search_c(x, iy, K, N)) +#endif /** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of * the pitch and a combination of pulses such that its norm is still equal diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 47ba41b9..51f9f315 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -151,5 +151,17 @@ void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( #endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2) +void (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( + celt_norm *_X, int *iy, int K, int N +) = { + op_pvq_search_c, /* non-sse */ + op_pvq_search_c, + MAY_HAVE_SSE2(op_pvq_search), + MAY_HAVE_SSE2(op_pvq_search), + MAY_HAVE_SSE2(op_pvq_search) +}; +#endif + #endif #endif diff --git a/celt_headers.mk b/celt_headers.mk index c9df94b3..706185da 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -49,4 +49,5 @@ celt/mips/mdct_mipsr1.h \ celt/mips/pitch_mipsr1.h \ celt/mips/vq_mipsr1.h \ celt/x86/pitch_sse.h \ +celt/x86/vq_sse.h \ celt/x86/x86cpu.h diff --git a/celt_sources.mk b/celt_sources.mk index 2ffe99a3..cabc48ff 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -21,7 +21,7 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \ celt/x86/x86_celt_map.c \ celt/x86/pitch_sse.c -CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c +CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c celt/x86/vq_sse2.c CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \ celt/x86/pitch_sse4_1.c |