Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nikolay Bogoychev <nheart@gmail.com> 2019-04-17 20:23:19 +0300
committer Nikolay Bogoychev <nheart@gmail.com> 2019-04-17 20:23:19 +0300
commit 6e3b06d56a35dc29144e25f68ccdc8ea9cacfc14 (patch)
tree c36ee46dfca6fff2e82102034d79b4ac348c87e1 /avx512_gemm.h
parent 845af69633fddf31ac600ba1ecbb53932132b531 (diff)
Fix avx512 compilation
Diffstat (limited to 'avx512_gemm.h')
-rw-r--r-- avx512_gemm.h 12
1 files changed, 6 insertions, 6 deletions
diff --git a/avx512_gemm.h b/avx512_gemm.h
index d8e9e63..93a5997 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -49,7 +49,7 @@ SELECT_COL_B_DEF(AVX512F, __m512i)
// but then the memory written to won't be contiguous anyway so we'd be doing a
// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits
// concatenate.
-AVX512F inline __m512 Concat(const __m256 first, const __m256 second) {
+AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
// AVX512DQ but that goes with AVX512BW anyway.
return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
}
@@ -68,9 +68,9 @@ class QuantizeTile16 {
public:
typedef __m512i Integer;
- explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+ AVX512F explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
- AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+ AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
__m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
__m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
__m512i packed = _mm512_packs_epi32(g0, g1);
@@ -86,9 +86,9 @@ class QuantizeTile8 {
public:
typedef __m512i Integer;
- explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+ AVX512F explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
- AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+ AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
@@ -206,7 +206,7 @@ struct AVX512_8bit {
// Special AVX512 implementation due to having 32 registers (so I don't have to
// allocate registers manually) and no sign instruction.
- AVX512F static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ AVX512BW static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
typedef __m512i Integer;
typedef __m256 Float; // For quantization we only do 8 at a time.
// This is copy-paste from Multiply8_SSE2OrAVX2.