From edabfc96e5576479e7f88b4c6bfee75c7dfda9bd Mon Sep 17 00:00:00 2001 From: Mateusz Chudyk Date: Mon, 22 Jul 2019 16:20:25 +0100 Subject: Add multiply (elemwise) kernel --- kernels/implementations.inl | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'kernels/implementations.inl') diff --git a/kernels/implementations.inl b/kernels/implementations.inl index fd46390..e2565b3 100644 --- a/kernels/implementations.inl +++ b/kernels/implementations.inl @@ -141,6 +141,47 @@ CPU_ATTR inline vd relu(vd input) { return max_pd(input, vconst_zero); } +/* + * Multiply (elemwise) + */ +template +CPU_ATTR static inline vector_t multiply(vector_t a, vector_t b); + +template <> +CPU_ATTR inline vi multiply(vi a, vi b) { + auto even = mullo_epi16(a, b); + auto odd = mullo_epi16(srli_epi16(a, 8), srli_epi16(b, 8)); + return or_si(slli_epi16(odd, 8), srli_epi16(slli_epi16(even, 8), 8)); +} + +template <> +CPU_ATTR inline vi multiply(vi a, vi b) { + return mullo_epi16(a, b); +} + +template <> +CPU_ATTR inline vi multiply(vi a, vi b) { +#if defined(THIS_IS_SSE2) + auto even = mul_epu32(a, b); + auto odd = mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); + return unpacklo_epi32(shuffle_epi32(even, 0x8 /* = 0 0 2 0 */), shuffle_epi32(odd, 0x8 /* = 0 0 2 0 */)); +#elif defined(THIS_IS_AVX2) + return _mm256_mullo_epi32(a, b); +#else + return _mm512_mullo_epi32(a, b); +#endif +} + +template <> +CPU_ATTR inline vf multiply(vf a, vf b) { + return mul_ps(a, b); +} + +template <> +CPU_ATTR inline vd multiply(vd a, vd b) { + return mul_pd(a, b); +} + /* * Floor */ -- cgit v1.2.3