Add SSE2 support

This is not so much for old machines as for getting decent performance when -march= is not set (SSE2 is part of the amd64 ABI).
author: Jean-Marc Valin <jmvalin@amazon.com> 2023-07-22 04:32:28 +0300
committer: Jean-Marc Valin <jmvalin@amazon.com> 2023-07-22 21:56:05 +0300
commit: 4710bdf7122d1dbc6f8df41137d5a1d4cddaa603 (patch)
tree: 96c2dd25880cc2e842144f07e0156056bcd489c8
parent: 9261eb5c3786468e2041bcc6384d9292aaf4d5a4 (diff)
2 files changed, 30 insertions, 11 deletions
diff --git a/dnn/vec.h b/dnn/vec.h
index 06d15718..f6085cee 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -35,7 +35,7 @@
 #include "arch.h"
 
 
-#if defined(__AVX__) || defined(__SSSE3__)
+#if defined(__AVX__) || defined(__SSE2__)
 #include "vec_avx.h"
 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON)
 #include "vec_neon.h"
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index e4b8f043..d20a2620 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -40,18 +40,9 @@
 
 
 /* Use 8-bit dot products unless disabled or if stuck with SSE2. */
-#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD)
+#ifndef DISABLE_DOT_PROD
 #define DOT_PROD
 #define USE_SU_BIAS
-
-#else
-
-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif
-
 #endif
 
 
@@ -652,6 +643,34 @@ static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256
 #define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b)
 
 #elif defined(__SSE2__)
+
+static inline __m128i mm_dpbusds_epi32(__m128i src, __m128i a, __m128i b) {
+  __m128i ah, al, bh, bl, tmp;
+  ah = _mm_srli_epi16(a, 8);
+  bh = _mm_srai_epi16(b, 8);
+  al = _mm_srli_epi16(_mm_slli_epi16(a, 8), 8);
+  bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8);
+  tmp = _mm_add_epi32(_mm_madd_epi16(ah, bh), _mm_madd_epi16(al, bl));
+  return _mm_add_epi32(src, tmp);
+}
+
+static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256i_emu b) {
+  mm256i_emu res;
+  res.hi = mm_dpbusds_epi32(src.hi, a.hi, b.hi);
+  res.lo = mm_dpbusds_epi32(src.lo, a.lo, b.lo);
+  return res;
+}
+#define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b)
+
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+
+#else
+
+#error "No optimizations in vec_avx.h. This should never happen. "
 #endif
 
 static inline void sgemv16x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
author	Jean-Marc Valin <jmvalin@amazon.com>	2023-07-22 04:32:28 +0300
committer	Jean-Marc Valin <jmvalin@amazon.com>	2023-07-22 21:56:05 +0300
commit	4710bdf7122d1dbc6f8df41137d5a1d4cddaa603 (patch)
tree	96c2dd25880cc2e842144f07e0156056bcd489c8
parent	9261eb5c3786468e2041bcc6384d9292aaf4d5a4 (diff)