From edabfc96e5576479e7f88b4c6bfee75c7dfda9bd Mon Sep 17 00:00:00 2001
From: Mateusz Chudyk <mateuszchudyk@gmail.com>
Date: Mon, 22 Jul 2019 16:20:25 +0100
Subject: Add multiply (elemwise) kernel

---
 kernels/implementations.inl | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'kernels/implementations.inl')
diff --git a/kernels/implementations.inl b/kernels/implementations.inl
index fd46390..e2565b3 100644
--- a/kernels/implementations.inl
+++ b/kernels/implementations.inl
@@ -141,6 +141,47 @@ CPU_ATTR inline vd relu<double>(vd input) {
   return max_pd(input, vconst_zero);
 }
 
+/*
+ * Multiply (elemwise): multiply<Type>(a, b) is only declared here; the explicit specializations below (int8_t, int16_t, int, float, double) provide the bodies.
+ */
+template <typename Type>
+CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply(vector_t<CPUType::CPU_NAME, Type> a, vector_t<CPUType::CPU_NAME, Type> b);
+
+template <>
+CPU_ATTR inline vi multiply<int8_t>(vi a, vi b) {  // 8-bit multiply built from two 16-bit multiplies (presumably because no 8-bit one exists)
+  auto high_bytes = slli_epi16(mullo_epi16(srli_epi16(a, 8), srli_epi16(b, 8)), 8);  // odd bytes: shift down, multiply, shift the result back up
+  auto low_bytes = srli_epi16(slli_epi16(mullo_epi16(a, b), 8), 8);  // even bytes: multiply in place, then clear the upper byte of each 16-bit lane
+  return or_si(high_bytes, low_bytes);  // the two halves occupy disjoint bytes, so OR merges them
+}
+
+template <>
+CPU_ATTR inline vi multiply<int16_t>(vi a, vi b) {  // int16_t specialization: a single call per vector
+  return mullo_epi16(a, b);  // NOTE(review): presumably keeps the low 16 bits of each lane product; wrapper is defined elsewhere, confirm
+}
+
+template <>
+CPU_ATTR inline vi multiply<int>(vi a, vi b) {  // 32-bit lanes; implementation chosen by the THIS_IS_* macro in effect
+#if defined(THIS_IS_SSE2)
+  auto lanes02 = shuffle_epi32(mul_epu32(a, b), 0x8 /* picks 32-bit elements 0 and 2 */);  // low halves of the products of lanes 0 and 2
+  auto lanes13 = shuffle_epi32(mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 0x8 /* picks 32-bit elements 0 and 2 */);  // 4-byte shift moves lanes 1 and 3 into the even slots
+  return unpacklo_epi32(lanes02, lanes13);  // interleave back into lane order 0 1 2 3
+#elif defined(THIS_IS_AVX2)
+  return _mm256_mullo_epi32(a, b);  // native low-32-bit multiply on 256-bit vectors
+#else
+  return _mm512_mullo_epi32(a, b);  // fallthrough assumes a 512-bit target whenever neither macro above is defined
+#endif
+}
+
+template <>
+CPU_ATTR inline vf multiply<float>(vf a, vf b) {  // float specialization: a single call per vector
+  return mul_ps(a, b);  // NOTE(review): presumably the packed single-precision multiply for the current CPU_NAME; wrapper is defined elsewhere, confirm
+}
+
+template <>
+CPU_ATTR inline vd multiply<double>(vd a, vd b) {  // double specialization: a single call per vector
+  return mul_pd(a, b);  // NOTE(review): presumably the packed double-precision multiply for the current CPU_NAME; wrapper is defined elsewhere, confirm
+}
+
 /*
  * Floor
  */
-- 
cgit v1.2.3