Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Mateusz Chudyk <mateuszchudyk@gmail.com>, 2019-07-03 16:29:45 +0300
committer: Mateusz Chudyk <mateuszchudyk@gmail.com>, 2019-07-03 16:29:45 +0300
commit: 0446d7d5dd0c9b7b908ee82ebc703ef70cbc1b15 (patch)
tree: 475a3abf8135ab2dd38b2a743a1dfef4688fdb9e
parent: 372ac436374614a3218aa5ad8605289a2ac3d177 (diff)
Fix: 'scale' parameter of i32gather_ps has to be compile-time constant
-rw-r--r-- intrinsics.h 10
-rw-r--r-- vec_utils.h 2
2 files changed, 7 insertions, 5 deletions
diff --git a/intrinsics.h b/intrinsics.h
index 9337e35..c27ca97 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -132,8 +132,9 @@ INTGEMM_AVX2 static inline __m256i cvttps_epi32(__m256 a) {
INTGEMM_AVX2 static inline __m256 div_ps(__m256 a, __m256 b) {
return _mm256_div_ps(a, b);
}
-INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i vindex, const int scale) {
- return _mm256_i32gather_ps(base_addr, vindex, scale);
+template <unsigned Scale>
+INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i vindex) {
+ return _mm256_i32gather_ps(base_addr, vindex, Scale);
}
template <> INTGEMM_AVX2 inline __m256 loadu_ps(const float* mem_addr) {
return _mm256_loadu_ps(mem_addr);
@@ -212,8 +213,9 @@ INTGEMM_AVX512BW static inline __m512i cvttps_epi32(__m512 a) {
INTGEMM_AVX512BW static inline __m512 div_ps(__m512 a, __m512 b) {
return _mm512_div_ps(a, b);
}
-INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m512i vindex, const int scale) {
- return _mm512_i32gather_ps(vindex, base_addr, scale);
+template <unsigned Scale>
+INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m512i vindex) {
+ return _mm512_i32gather_ps(vindex, base_addr, Scale);
}
template <> INTGEMM_AVX512BW inline __m512 loadu_ps(const float* mem_addr) {
return _mm512_loadu_ps(mem_addr);
diff --git a/vec_utils.h b/vec_utils.h
index e2049fb..acb7d6e 100644
--- a/vec_utils.h
+++ b/vec_utils.h
@@ -119,7 +119,7 @@ Register exp_approx_taylor(Register x) {
result = add_ps(result, const_one);
- auto ea = i32gather_ps(EXP_LOOKUP + EXP_MAX, cvtps_epi32(a), 4);
+ auto ea = i32gather_ps<4>(EXP_LOOKUP + EXP_MAX, cvtps_epi32(a));
return mul_ps(ea, result);
}