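Fix avx512 compilation

Retag functions in avx512_gemm.h with the instruction-set macro matching what
they use: Concat goes from AVX512F to AVX512DQ; ForReshape in QuantizeTile16
and QuantizeTile8, and AVX512_8bit::Multiply, go from AVX512F to AVX512BW; the
QuantizeTile16 and QuantizeTile8 constructors, previously untagged, gain
AVX512F.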

author: Nikolay Bogoychev <nheart@gmail.com> 2019-04-17 20:23:19 +0300
committer: Nikolay Bogoychev <nheart@gmail.com> 2019-04-17 20:23:19 +0300
commit: 6e3b06d56a35dc29144e25f68ccdc8ea9cacfc14 (patch)
tree: c36ee46dfca6fff2e82102034d79b4ac348c87e1 /avx512_gemm.h
parent: 845af69633fddf31ac600ba1ecbb53932132b531 (diff)
1 files changed, 6 insertions, 6 deletions
diff --git a/avx512_gemm.h b/avx512_gemm.h
index d8e9e63..93a5997 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -49,7 +49,7 @@ SELECT_COL_B_DEF(AVX512F, __m512i)
 // but then the memory written to won't be contiguous anyway so we'd be doing a
 // scatter anyway.  Easier to just read the 8 columns we wanted as 256 bits
 // concatenate.
-AVX512F inline __m512 Concat(const __m256 first, const __m256 second) {
+AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
   // AVX512DQ but that goes with AVX512BW anyway.
   return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
 }
@@ -68,9 +68,9 @@ class QuantizeTile16 {
   public:
     typedef __m512i Integer;
 
-    explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+    AVX512F explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+    AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
       __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
       __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
       __m512i packed = _mm512_packs_epi32(g0, g1);
@@ -86,9 +86,9 @@ class QuantizeTile8 {
   public:
     typedef __m512i Integer;
 
-    explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+    AVX512F explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+    AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
       // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
       const __m512i neg127 = _mm512_set1_epi8(-127);
       // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
@@ -206,7 +206,7 @@ struct AVX512_8bit {
 
   // Special AVX512 implementation due to having 32 registers (so I don't have to
   // allocate registers manually) and no sign instruction.
-  AVX512F static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+  AVX512BW static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
   typedef __m512i Integer;
   typedef __m256 Float; // For quantization we only do 8 at a time.
   // This is copy-paste from Multiply8_SSE2OrAVX2.
author	Nikolay Bogoychev <nheart@gmail.com>	2019-04-17 20:23:19 +0300
committer	Nikolay Bogoychev <nheart@gmail.com>	2019-04-17 20:23:19 +0300
commit	6e3b06d56a35dc29144e25f68ccdc8ea9cacfc14 (patch)
tree	c36ee46dfca6fff2e82102034d79b4ac348c87e1 /avx512_gemm.h
parent	845af69633fddf31ac600ba1ecbb53932132b531 (diff)