gitlab.xiph.org/xiph/opus.git

author     Jean-Marc Valin <jmvalin@jmvalin.ca>  2013-06-06 02:53:48 +0400
committer  Jean-Marc Valin <jmvalin@jmvalin.ca>  2013-06-06 02:56:07 +0400
commit     a092aa8f80be25f62b5b62032d996f26fa188ed8 (patch)
tree       f431d6d57c04636beb05305ce4b77349f552059a
parent     cd4c8249bc0e091789495a09b8942d28b687273c (diff)
Adds SSE support (only xcorr_kernel() for now)
There's no CPU detection for it; it only gets enabled by __SSE__, which gcc (other compilers?) defines automatically when supported by -march=, which means at least all x86-64. For ia32, the user needs to enable it in the CFLAGS.
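
For a quick way to confirm whether the SSE path will be compiled in with a given toolchain and flag combination, checking the predefined macro is enough. The snippet below is a sketch, not part of the patch; build it with the same CFLAGS used for the library (e.g. adding -msse or a suitable -march= on ia32):

/* sketch: report whether __SSE__ is predefined with the current CFLAGS */
#include <stdio.h>

int main(void)
{
#ifdef __SSE__
   printf("__SSE__ defined: celt/x86/pitch_sse.h would be used\n");
#else
   printf("__SSE__ not defined: the generic xcorr_kernel() would be used\n");
#endif
   return 0;
}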
-rw-r--r--  celt/pitch.h          |   6
-rw-r--r--  celt/x86/pitch_sse.h  |  97
2 files changed, 103 insertions, 0 deletions
diff --git a/celt/pitch.h b/celt/pitch.h
index efc1175c..580ea819 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -36,6 +36,10 @@
#include "modes.h"
+#ifdef __SSE__
+#include "x86/pitch_sse.h"
+#endif
+
void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
int len, int C);
@@ -47,6 +51,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
/* OPT: This is the kernel you really want to optimize. It gets used a lot
by the prefilter and by the PLC. */
+#ifndef OVERRIDE_XCORR_KERNEL
static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
@@ -111,6 +116,7 @@ static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus
sum[3] = MAC16_16(sum[3],tmp,y_1);
}
}
+#endif /* OVERRIDE_XCORR_KERNEL */
#ifdef FIXED_POINT
opus_val32
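
The hunk above relies on a simple preprocessor override convention: the arch-specific header is included first and defines OVERRIDE_XCORR_KERNEL, so the generic kernel that follows is compiled out. A minimal sketch of the pattern, with illustrative names that are not from the patch:

/* arch-specific header: provides the fast variant and claims the override */
#define OVERRIDE_MY_KERNEL
static inline int my_kernel(int x) { return x << 1; /* optimized variant */ }

/* generic header: the portable fallback is skipped when overridden */
#ifndef OVERRIDE_MY_KERNEL
static inline int my_kernel(int x) { return x + x; /* portable C version */ }
#endif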
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
new file mode 100644
index 00000000..6f5a2d1b
--- /dev/null
+++ b/celt/x86/pitch_sse.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2013 Xiph.Org Foundation
+ Written by Jean-Marc Valin */
+/**
+ @file pitch_sse.h
+ @brief Pitch analysis
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+#define OVERRIDE_XCORR_KERNEL
+
+static inline void xcorr_kernel(const opus_val16 * _x, const opus_val16 * _y, opus_val32 _sum[4], int len)
+{
+ int j;
+ __m128 sum;
+ __m128 x;
+ __m128 y;
+ __m128 y2;
+ __m128 y1;
+ __m128 y3;
+ __m128 tmp;
+ sum = _mm_loadu_ps(_sum);
+
+ x = _mm_loadu_ps(_x);
+ y = _mm_loadu_ps(_y);
+ y1 = _mm_loadu_ps(_y+1);
+ for (j=0;j<len-3;j+=4)
+ {
+ _x+=4;
+ _y+=4;
+ y2 = _mm_loadu_ps(_y);
+ y3 = _mm_loadu_ps(_y+1);
+ tmp = _mm_shuffle_ps(x, x, 0x00);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+ tmp = _mm_shuffle_ps(x, x, 0x55);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y1));
+ tmp = _mm_shuffle_ps(x, x, 0xaa);
+ y = _mm_shuffle_ps(y, y2, 0x4e);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+ tmp = _mm_shuffle_ps(x, x, 0xff);
+ y = _mm_shuffle_ps(y1, y3, 0x4e);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+ x = _mm_loadu_ps(_x);
+ y = y2;
+ y1 = y3;
+ }
+ _y++;
+ if (j++<len)
+ {
+ tmp = _mm_shuffle_ps(x, x, 0x00);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+ }
+ if (j++<len)
+ {
+ tmp = _mm_shuffle_ps(x, x, 0x55);
+ y = _mm_loadu_ps(_y++);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+ }
+ if (j++<len)
+ {
+ tmp = _mm_shuffle_ps(x, x, 0xaa);
+ y = _mm_loadu_ps(_y++);
+ sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
+ }
+ _mm_storeu_ps(_sum, sum);
+}
+
+#endif
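
For reference, the four sums produced by the vectorized kernel above are an ordinary cross-correlation of x against y at lags 0 through 3: each x[j] is broadcast across a 128-bit register (the 0x00/0x55/0xaa/0xff shuffles) and multiplied against a window of y shifted by the corresponding lag. The scalar model below is a sketch of the same computation, not part of the patch; like the SSE loads, it assumes y[] has at least len+3 readable elements:

/* Scalar reference for xcorr_kernel() (sketch, not part of the patch):
   accumulate the cross-correlation of x against y at lags 0..3. */
static void xcorr_kernel_ref(const float *x, const float *y,
                             float sum[4], int len)
{
   int j, k;
   for (j = 0; j < len; j++)
      for (k = 0; k < 4; k++)
         sum[k] += x[j] * y[j + k];
}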