Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jean-Marc Valin <jmvalin@amazon.com> 2023-07-22 04:32:28 +0300
committer: Jean-Marc Valin <jmvalin@amazon.com> 2023-07-22 21:56:05 +0300
commit: 4710bdf7122d1dbc6f8df41137d5a1d4cddaa603 (patch)
tree: 96c2dd25880cc2e842144f07e0156056bcd489c8
parent: 9261eb5c3786468e2041bcc6384d9292aaf4d5a4 (diff)
Add SSE2 support
Not so much for old machines, as for getting decent performance when not setting -march= (SSE2 is part of the amd64 ABI).
-rw-r--r-- dnn/vec.h 2
-rw-r--r-- dnn/vec_avx.h 39
2 files changed, 30 insertions, 11 deletions
diff --git a/dnn/vec.h b/dnn/vec.h
index 06d15718..f6085cee 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -35,7 +35,7 @@
#include "arch.h"
-#if defined(__AVX__) || defined(__SSSE3__)
+#if defined(__AVX__) || defined(__SSE2__)
#include "vec_avx.h"
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON)
#include "vec_neon.h"
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index e4b8f043..d20a2620 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -40,18 +40,9 @@
/* Use 8-bit dot products unless disabled or if stuck with SSE2. */
-#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD)
+#ifndef DISABLE_DOT_PROD
#define DOT_PROD
#define USE_SU_BIAS
-
-#else
-
-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif
-
#endif
@@ -652,6 +643,34 @@ static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256
#define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b)
#elif defined(__SSE2__)
+
+static inline __m128i mm_dpbusds_epi32(__m128i src, __m128i a, __m128i b) { /* SSE2 emulation of the 8-bit dot-product accumulate: bytes of a are treated as unsigned, bytes of b as signed; returns src plus the per-lane dot products. */
+ __m128i ah, al, bh, bl, tmp;
+ ah = _mm_srli_epi16(a, 8); /* high byte of each 16-bit lane of a, zero-extended (logical shift) */
+ bh = _mm_srai_epi16(b, 8); /* high byte of each 16-bit lane of b, sign-extended (arithmetic shift) */
+ al = _mm_srli_epi16(_mm_slli_epi16(a, 8), 8); /* low byte of each 16-bit lane of a, zero-extended */
+ bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); /* low byte of each 16-bit lane of b, sign-extended */
+ tmp = _mm_add_epi32(_mm_madd_epi16(ah, bh), _mm_madd_epi16(al, bl)); /* each 32-bit lane now holds the sum of its four byte products a[i]*b[i] */
+ return _mm_add_epi32(src, tmp); /* accumulate with a plain 32-bit add; no saturation is applied in this emulation */
+}
+
+static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256i_emu b) { /* 256-bit emulated variant: runs the 128-bit SSE2 helper on each half independently. */
+ mm256i_emu res;
+ res.hi = mm_dpbusds_epi32(src.hi, a.hi, b.hi); /* upper 128-bit half */
+ res.lo = mm_dpbusds_epi32(src.lo, a.lo, b.lo); /* lower 128-bit half */
+ return res;
+}
+#define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b) /* expose the emulation under the intrinsic-style name */
+
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+
+#else
+
+#error "No optimizations in vec_avx.h. This should never happen. "
#endif
static inline void sgemv16x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)