diff options
author | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2019-07-03 16:29:45 +0300 |
---|---|---|
committer | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2019-07-03 16:29:45 +0300 |
commit | 0446d7d5dd0c9b7b908ee82ebc703ef70cbc1b15 (patch) | |
tree | 475a3abf8135ab2dd38b2a743a1dfef4688fdb9e | |
parent | 372ac436374614a3218aa5ad8605289a2ac3d177 (diff) |
Fix: 'scale' parameter of i32gather_ps has to be compile-time constant
-rw-r--r-- | intrinsics.h | 10 | ||||
-rw-r--r-- | vec_utils.h | 2 |
2 files changed, 7 insertions, 5 deletions
diff --git a/intrinsics.h b/intrinsics.h
index 9337e35..c27ca97 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -132,8 +132,9 @@ INTGEMM_AVX2 static inline __m256i cvttps_epi32(__m256 a) {
 INTGEMM_AVX2 static inline __m256 div_ps(__m256 a, __m256 b) {
   return _mm256_div_ps(a, b);
 }
-INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i vindex, const int scale) {
-  return _mm256_i32gather_ps(base_addr, vindex, scale);
+template <unsigned Scale>
+INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i vindex) {
+  return _mm256_i32gather_ps(base_addr, vindex, Scale);
 }
 template <> INTGEMM_AVX2 inline __m256 loadu_ps(const float* mem_addr) {
   return _mm256_loadu_ps(mem_addr);
@@ -212,8 +213,9 @@ INTGEMM_AVX512BW static inline __m512i cvttps_epi32(__m512 a) {
 INTGEMM_AVX512BW static inline __m512 div_ps(__m512 a, __m512 b) {
   return _mm512_div_ps(a, b);
 }
-INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m512i vindex, const int scale) {
-  return _mm512_i32gather_ps(vindex, base_addr, scale);
+template <unsigned Scale>
+INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m512i vindex) {
+  return _mm512_i32gather_ps(vindex, base_addr, Scale);
 }
 template <> INTGEMM_AVX512BW inline __m512 loadu_ps(const float* mem_addr) {
   return _mm512_loadu_ps(mem_addr);
diff --git a/vec_utils.h b/vec_utils.h
index e2049fb..acb7d6e 100644
--- a/vec_utils.h
+++ b/vec_utils.h
@@ -119,7 +119,7 @@ Register exp_approx_taylor(Register x) {
   result = add_ps(result, const_one);
-  auto ea = i32gather_ps(EXP_LOOKUP + EXP_MAX, cvtps_epi32(a), 4);
+  auto ea = i32gather_ps<4>(EXP_LOOKUP + EXP_MAX, cvtps_epi32(a));
   return mul_ps(ea, result);
 }