-rw-r--r--   avx2_gemm.h             52
-rw-r--r--   avx512_gemm.h           80
-rw-r--r--   cops.h                  16
-rw-r--r--   interleave.h            44
-rw-r--r--   intgemm.h                8
-rw-r--r--   intrinsics.h            64
-rw-r--r--   multiply.h              48
-rw-r--r--   sse2_gemm.h             26
-rw-r--r--   ssse3_gemm.h            32
-rw-r--r--   test/multiply_test.cc    4
-rw-r--r--   types.h                 20
11 files changed, 197 insertions(+), 197 deletions(-)
diff --git a/avx2_gemm.h b/avx2_gemm.h
index bdc60e4..8958920 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -11,29 +11,29 @@
 namespace intgemm {
 namespace avx2 {
 
 // Read a vector of floats, multiply them, and cast to 32-bit integer.
 // EVIL EVIL CODE DUPLICATION, FIX
-AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
   return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
 }
 
-SELECT_COL_B_DEFINE(AVX2, __m256i)
+INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
   public:
     typedef __m256i Integer;
 
-    AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
+    INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
 
-    AVX2 Integer Consecutive(const float *input) {
+    INTGEMM_AVX2 Integer Consecutive(const float *input) {
       return Tile(input, input + 8);
     }
 
-    AVX2 Integer ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) {
       // 8 rows in the first 128-bit register, 8 in the second register.
       return Tile(input, input + 8 * cols);
     }
 
   private:
-    AVX2 __m256i Tile(const float *input0, const float *input1) {
+    INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) {
       __m256i g0 = QuantizerGrab(input0, mult_);
       __m256i g1 = QuantizerGrab(input1, mult_);
       __m256i packed = _mm256_packs_epi32(g0, g1);
@@ -52,12 +52,12 @@ struct AVX2_16bit {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
   // Just quantize everything in order.
-  AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+  INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
     avx2::QuantizeTile16 q(quant_mult);
@@ -71,18 +71,18 @@ struct AVX2_16bit {
   static const Index kBTileRow = 16;
   static const Index kBTileCol = 8;
 /*
-  AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
   }*/
-  PREPARE_B_16_DEFINE(AVX2, avx2::QuantizeTile16)
+  INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
 
-  AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
   }
 
-  MULTIPLY16_DEFINE(__m256i, AVX2, OnAVX2)
+  INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, OnAVX2)
 
-  constexpr static const char *const kName = "16-bit AVX2";
+  constexpr static const char *const kName = "16-bit INTGEMM_AVX2";
 
   static const CPUType kUses = CPU_AVX2;
 };
@@ -96,20 +96,20 @@ class QuantizeTile8 {
   public:
     typedef __m256i Integer;
 
-    AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
+    INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
 
-    AVX2 inline __m256i Consecutive(const float *input) {
+    INTGEMM_AVX2 inline __m256i Consecutive(const float *input) {
      return Tile(input, input + 8, input + 16, input + 24);
     }
 
-    AVX2 inline __m256i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) {
       // Put higher rows in the second half of the register.  These will jumble
       // around in the same way then conveniently land in the right place.
       return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
     }
 
   private:
-    AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+    INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
       // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
@@ -136,7 +136,7 @@ class QuantizeTile8 {
 };
 
 // Technically only requires AVX
-MAXABSOLUTE_DEFINE(__m256, AVX2)
+INTGEMM_MAXABSOLUTE(__m256, INTGEMM_AVX2)
 
 } // namespace
 
@@ -144,12 +144,12 @@ struct AVX2_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
   // Just quantize everything in order.
-  AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+  INTGEMM_AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(size % 32 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
     avx2::QuantizeTile8 q(quant_mult);
@@ -164,23 +164,23 @@ struct AVX2_8bit {
   static const Index kBTileCol = 8;
 /*
-  AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor8(input, output, avx2::QuantizeTile8(quant_mult), rows, cols);
   }*/
-  PREPARE_B_8_DEFINE(AVX2, avx2::QuantizeTile8)
+  INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
 
-  AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
   }
 /*
-  AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+  INTGEMM_AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
     //Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
     Multiply8_SSE2OrAVX2__m256i<JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
   }*/
-  MULTIPLY8_DEFINE(__m256i, AVX2, OnAVX2)
+  INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, OnAVX2)
 
-  constexpr static const char *const kName = "8-bit AVX2";
+  constexpr static const char *const kName = "8-bit INTGEMM_AVX2";
 
   static const CPUType kUses = CPU_AVX2;
 };
diff --git a/avx512_gemm.h b/avx512_gemm.h
index b92ba0d..e20eb24 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -13,7 +13,7 @@
 #include "types.h"
 
 /* AVX512 implementation.
- * This uses AVX512BW, AVX512DQ, and might use AVX512VL
+ * This uses INTGEMM_AVX512BW, INTGEMM_AVX512DQ, and might use AVX512VL
  * That means it supports mainstream CPUs with AVX512, starting with Skylake
  * Xeons.
  * It does not support any Knights / Xeon Phi processors.
@@ -33,16 +33,16 @@
 namespace intgemm {
 namespace avx512f {
 
 // Load from memory, multiply, and convert to int32_t.
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
   // Multiply each by the quantization factor.
   __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
   // Cast to 32-bit int
   return _mm512_cvtps_epi32(val);
 }
 
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-SELECT_COL_B_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
 
 // For PrepareB we want to read 8 columns at a time.  When converting 32-bit
 // floats to 8-bit values, that's 32 bytes of floats.  But AVX512 is 64 bytes
@@ -50,14 +50,14 @@ SELECT_COL_B_DEFINE(AVX512BW, __m512i)
 // but then the memory written to won't be contiguous anyway so we'd be doing a
 // scatter anyway.  Easier to just read the 8 columns we wanted as 256 bits
 // concatenate.
-AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
-  // AVX512DQ but that goes with AVX512BW anyway.
+INTGEMM_AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
+  // INTGEMM_AVX512DQ but that goes with INTGEMM_AVX512BW anyway.
   return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
 }
 
 // Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
   __m512 appended = avx512f::Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
   appended = _mm512_mul_ps(appended, quant_mult_reg);
   return _mm512_cvtps_epi32(appended);
@@ -70,14 +70,14 @@ class QuantizeTile16 {
   public:
     typedef __m512i Integer;
 
-    /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-    AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+    INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
       __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
       __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
       __m512i packed = _mm512_packs_epi32(g0, g1);
-      // Permute within 256-bit lanes, so same as AVX2
+      // Permute within 256-bit lanes, so same as INTGEMM_AVX2
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
 
@@ -89,10 +89,10 @@ class QuantizeTile8 {
   public:
     typedef __m512i Integer;
 
-    /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-    AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+    INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
       // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
       const __m512i neg127 = _mm512_set1_epi8(-127);
       // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
@@ -118,8 +118,8 @@ class QuantizeTile8 {
     const __m512 mult_reg_;
 };
 
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-MAXABSOLUTE_DEFINE(__m512, AVX512BW)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_MAXABSOLUTE(__m512, INTGEMM_AVX512BW)
 
 } // namespace
 
@@ -128,8 +128,8 @@ struct AVX512_16bit {
   // Currently A is prepared by quantization but this could theoretically change.
   // rows * cols must be a multiple of 16.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
@@ -137,8 +137,8 @@ struct AVX512_16bit {
   // But then it will need to be aligned for Multiply.
   // size must be a multiple of 16.
   // Convert to 16-bit signed integers.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
     // Fill with the quantization multiplier.
@@ -155,20 +155,20 @@ struct AVX512_16bit {
   static const Index kBTileRow = 32;
   static const Index kBTileCol = 8;
 /*
-  AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols);
   } */
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  PREPARE_B_16_DEFINE(AVX512BW, avx512f::QuantizeTile16)
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16)
 
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
   }
 
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  MULTIPLY16_DEFINE(__m512i, AVX512BW, OnAVX2)
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, OnAVX2)
 
   constexpr static const char *const kName = "16-bit AVX512";
 
@@ -179,16 +179,16 @@ struct AVX512_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
   // Technically output can be unaligned in Quantize.
   // But then it will need to be aligned for Multiply.
   // Convert to 8-bit signed integers.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
     const __m512i neg127 = _mm512_set1_epi32(-127);
@@ -206,21 +206,21 @@ struct AVX512_8bit {
   static const Index kBTileRow = 64;
   static const Index kBTileCol = 8;
 /*
-  AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols);
   }*/
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  PREPARE_B_8_DEFINE(AVX512BW, avx512f::QuantizeTile8)
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8)
 
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
   }
 
   // Special AVX512 implementation due to having 32 registers (so I don't have to
   // allocate registers manually) and no sign instruction.
   template <class WriteC>
-  AVX512BW static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) {
+  INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) {
     typedef __m512i Integer;
     //typedef __m256 Float; // For quantization we only do 8 at a time.
     // This is copy-paste from Multiply8_SSE2OrAVX2.
@@ -228,7 +228,7 @@ struct AVX512_8bit {
     assert(B_cols % 8 == 0);
     assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
     assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
-    // There's 8 results for AVX2 to handle.
+    // There's 8 results for INTGEMM_AVX2 to handle.
     typename WriteC::OnAVX2 write_C(C);
     const int simd_width = width / sizeof(Integer);
     const Integer *B0_col = reinterpret_cast<const Integer*>(B);
diff --git a/cops.h b/cops.h
--- a/cops.h
+++ b/cops.h
@@ -11,12 +11,12 @@ class JustUnquantizeC {
     class OnSSE2 {
       public:
-        SSE2 explicit OnSSE2(const JustUnquantizeC &from)
+        INTGEMM_SSE2 explicit OnSSE2(const JustUnquantizeC &from)
           : C_(from.C_), unquant_mult_(_mm_set1_ps(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
         }
 
-        SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+        INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
           *reinterpret_cast<__m128*>(C_ + rowIDX*cols + colIDX) = mul_ps(cvtepi32_ps(result.pack0123), unquant_mult_);
           *reinterpret_cast<__m128*>(C_ + rowIDX*cols + colIDX + 4) = mul_ps(cvtepi32_ps(result.pack4567), unquant_mult_);
         }
@@ -27,12 +27,12 @@ class JustUnquantizeC {
     class OnAVX2 {
       public:
-        AVX2 explicit OnAVX2(const JustUnquantizeC &from)
+        INTGEMM_AVX2 explicit OnAVX2(const JustUnquantizeC &from)
           : C_(from.C_), unquant_mult_(_mm256_set1_ps(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
         }
 
-        AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+        INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
           *reinterpret_cast<__m256*>(C_ + rowIDX*cols + colIDX) = mul_ps(cvtepi32_ps(result), unquant_mult_);
         }
@@ -52,12 +52,12 @@ class Identity {
     class OnSSE2 {
       public:
-        SSE2 explicit OnSSE2(const Identity &from)
+        INTGEMM_SSE2 explicit OnSSE2(const Identity &from)
           : C_(from.C_) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
         }
 
-        SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+        INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
           _mm_storeu_si128(reinterpret_cast<__m128i*>(C_ + rowIDX*cols + colIDX), result.pack0123);
           _mm_storeu_si128(reinterpret_cast<__m128i*>(C_ + rowIDX*cols + colIDX + 4), result.pack4567);
         }
@@ -67,12 +67,12 @@ class Identity {
     class OnAVX2 {
       public:
-        AVX2 explicit OnAVX2(const Identity &from)
+        INTGEMM_AVX2 explicit OnAVX2(const Identity &from)
           : C_(from.C_) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
         }
 
-        AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+        INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
           _mm256_storeu_si256(reinterpret_cast<__m256i*>(C_ + rowIDX*cols + colIDX), result);
         }
diff --git a/interleave.h b/interleave.h
index e7a62f5..8b7484e 100644
--- a/interleave.h
+++ b/interleave.h
@@ -49,41 +49,41 @@ target static inline void Interleave64(type &first, type &second) { \
 template <class Register> static inline Register setzero_si() __attribute__((always_inline));;
 
-INTGEMM_INTERLEAVE(SSE2, __m128i, )
-template <> SSE2 inline __m128i setzero_si<__m128i>() {
+INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i, )
+template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() {
   return _mm_setzero_si128();
 }
 
-INTGEMM_INTERLEAVE(AVX2, __m256i, 256)
-template <> AVX2 inline __m256i setzero_si<__m256i>() {
+INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i, 256)
+template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() {
   return _mm256_setzero_si256();
 }
 
 #ifndef INTGEMM_NO_AVX512
-INTGEMM_INTERLEAVE(AVX512BW, __m512i, 512)
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-template <> AVX512BW inline __m512i setzero_si<__m512i>() {
+INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i, 512)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+template <> INTGEMM_AVX512BW inline __m512i setzero_si<__m512i>() {
   return _mm512_setzero_si512();
 }
 #endif
 
-#define SWAP_DEFINE(target, Register) \
+#define INTGEMM_SWAP(target, Register) \
 target static inline void Swap(Register &a, Register &b) { \
   Register tmp = a; \
   a = b; \
   b = tmp; \
 } \
 
-SWAP_DEFINE(SSE2, __m128i)
-SWAP_DEFINE(AVX2, __m256i)
+INTGEMM_SWAP(INTGEMM_SSE2, __m128i)
+INTGEMM_SWAP(INTGEMM_AVX2, __m256i)
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-SWAP_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i)
 #endif
 
 /* Transpose registers containing 8 packed 16-bit integers.
  * Each 128-bit lane is handled independently.
  */
-#define TRANSPOSE16_DEFINE(target, Register) \
+#define INTGEMM_TRANSPOSE16(target, Register) \
 target static inline void Transpose16InLane(Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7) { \
   /* r0: columns 0 1 2 3 4 5 6 7 from row 0
      r1: columns 0 1 2 3 4 5 6 7 from row 1*/ \
@@ -126,11 +126,11 @@ target static inline void Transpose16InLane(Register &r0, Register &r1, Register
   Swap(r3, r6); \
 } \
 
-TRANSPOSE16_DEFINE(SSE2, __m128i)
-TRANSPOSE16_DEFINE(AVX2, __m256i)
+INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i)
+INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i)
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-TRANSPOSE16_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i)
 #endif
 
 /* Tranpose registers containing 16 packed 8-bit integers.
@@ -180,7 +180,7 @@ template <class Register> static inline void Transpose8InLane(
 //
 // We presume B starts in row-major order.
 //
-// In AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
+// In INTGEMM_AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
 // that many values from the same column in the register.
 //
 // The multiplier reads 8 rows at a time and we want these reads to be
@@ -189,7 +189,7 @@ template <class Register> static inline void Transpose8InLane(
 // Each 8x32 (for 8-bit) or 8x16 (for 16-bit) tile of B is transposed.
 // The tiles are stored in column major order.
 //
-// For AVX2, this matrix shows what index each value of B will be stored at:
+// For INTGEMM_AVX2, this matrix shows what index each value of B will be stored at:
 //   0  16 ... 240
 //   1  17 ... 241
 //   2  18 ... 242
@@ -209,7 +209,7 @@ template <class Register> static inline void Transpose8InLane(
 //   256 272
 //   257 273
 //   ... ...
-#define PREPARE_B_8_DEFINE(target, QuantClass) \
+#define INTGEMM_PREPARE_B_8(target, QuantClass) \
 target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
   typedef typename QuantClass Quantizer; \
   typedef typename Quantizer::Integer Register; \
@@ -244,7 +244,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
   } \
 } \
 
-#define PREPARE_B_16_DEFINE(target, QuantClass) \
+#define INTGEMM_PREPARE_B_16(target, QuantClass) \
 target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
   typedef typename QuantClass Quantizer; \
   typedef typename Quantizer::Integer Register; \
@@ -267,7 +267,7 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
 
 /* Select columns of B from PrepareB format to PrepareB format.
 */
-#define SELECT_COL_B_DEFINE(target, Register) \
+#define INTGEMM_SELECT_COL_B(target, Register) \
 target static inline void SelectColumnsOfB(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
   assert(rows_bytes % sizeof(Register) == 0); \
   assert((cols_end - cols_begin) % 8 == 0); \
diff --git a/intgemm.h b/intgemm.h
--- a/intgemm.h
+++ b/intgemm.h
@@ -107,14 +107,14 @@ float MaxAbsolute(const float *begin, const float *end) {
 #endif
 
 /* Returns:
- * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
+ * avx512 if the CPU supports INTGEMM_AVX512F (though really it should be INTGEMM_AVX512BW, but
  * cloud providers lie).  TODO: don't catch Knights processors with this.
 *
- * avx2 if the CPU supports AVX2
+ * avx2 if the CPU supports INTGEMM_AVX2
 *
- * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+ * ssse3 if the CPU supports INTGEMM_SSSE3 (this distinction from INTGEMM_SSE2 matters for 8-bit)
 *
- * sse2 if the CPU supports SSE2
+ * sse2 if the CPU supports INTGEMM_SSE2
 *
 * unsupported otherwise
 */
diff --git a/intrinsics.h b/intrinsics.h
index fa3a3c2..9548a81 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -18,105 +18,105 @@ template <class Register> static inline Register set1_ps(float to);
 
 struct MultiplyResult128 {
   __m128i pack0123, pack4567;
 };
 
-SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
   return _mm_add_epi32(first, second);
 }
-SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
   return _mm_adds_epi16(first, second);
 }
-template <> SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
+template <> INTGEMM_SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
   return _mm_set1_epi16(to);
 }
-template <> SSE2 inline __m128 set1_ps<__m128>(float to) {
+template <> INTGEMM_SSE2 inline __m128 set1_ps<__m128>(float to) {
   return _mm_set1_ps(to);
 }
-SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
   return _mm_madd_epi16(first, second);
 }
-SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
+INTGEMM_SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
   return _mm_maddubs_epi16(first, second);
 }
-SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
+INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
   return _mm_sign_epi8(first, second);
 }
-SSSE3 static inline __m128i abs_epi8(__m128i arg) {
+INTGEMM_SSSE3 static inline __m128i abs_epi8(__m128i arg) {
   return _mm_abs_epi8(arg);
 }
-SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
+INTGEMM_SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
   return _mm_max_ps(first, second);
 }
-SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
+INTGEMM_SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
   return _mm_and_ps(first, second);
 }
-SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
+INTGEMM_SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
   return _mm_cvtepi32_ps(arg);
 }
-SSE2 static inline __m128 mul_ps (__m128 a, __m128 b) {
+INTGEMM_SSE2 static inline __m128 mul_ps (__m128 a, __m128 b) {
   return _mm_mul_ps(a, b);
 }
 
-AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
   return _mm256_add_epi32(first, second);
 }
-AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
   return _mm256_adds_epi16(first, second);
 }
-template <> AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
+template <> INTGEMM_AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
   return _mm256_set1_epi16(to);
 }
-template <> AVX2 inline __m256 set1_ps<__m256>(float to) {
+template <> INTGEMM_AVX2 inline __m256 set1_ps<__m256>(float to) {
   return _mm256_set1_ps(to);
 }
-AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
   return _mm256_madd_epi16(first, second);
 }
-AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
   return _mm256_maddubs_epi16(first, second);
 }
-AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
   return _mm256_sign_epi8(first, second);
 }
-AVX2 static inline __m256i abs_epi8(__m256i arg) {
+INTGEMM_AVX2 static inline __m256i abs_epi8(__m256i arg) {
   return _mm256_abs_epi8(arg);
 }
-AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
+INTGEMM_AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
   return _mm256_max_ps(first, second);
 }
-AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
+INTGEMM_AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
   return _mm256_and_ps(first, second);
 }
-AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
+INTGEMM_AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
   return _mm256_cvtepi32_ps(arg);
 }
-AVX2 static inline __m256 mul_ps (__m256 a, __m256 b) {
+INTGEMM_AVX2 static inline __m256 mul_ps (__m256 a, __m256 b) {
   return _mm256_mul_ps(a, b);
 }
 
 #ifndef INTGEMM_NO_AVX512
-AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
   return _mm512_add_epi32(first, second);
 }
-template <> inline AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
+template <> inline INTGEMM_AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
   return _mm512_set1_epi16(to);
 }
-template <> inline AVX512BW __m512 set1_ps<__m512>(float to) {
+template <> inline INTGEMM_AVX512BW __m512 set1_ps<__m512>(float to) {
   return _mm512_set1_ps(to);
 }
-AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
   return _mm512_madd_epi16(first, second);
 }
-AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
   return _mm512_maddubs_epi16(first, second);
 }
-AVX512BW static inline __m512i abs_epi8(__m512i arg) {
+INTGEMM_AVX512BW static inline __m512i abs_epi8(__m512i arg) {
   return _mm512_abs_epi8(arg);
 }
-AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
+INTGEMM_AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
   return _mm512_max_ps(first, second);
 }
 // Technically __AVX512DQ__
-AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
+INTGEMM_AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
   return _mm512_and_ps(first, second);
 }
 #endif
diff --git a/multiply.h b/multiply.h
--- a/multiply.h
+++ b/multiply.h
@@ -4,7 +4,7 @@
 namespace intgemm {
 
-SSE2 static inline float MaxFloat32(__m128 a) {
+INTGEMM_SSE2 static inline float MaxFloat32(__m128 a) {
   // Fold to just using the first 64 bits.
   __m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
   a = _mm_max_ps(a, second_half);
@@ -15,7 +15,7 @@ SSE2 static inline float MaxFloat32(__m128 a) {
   return *reinterpret_cast<float*>(&a);
 }
 
-SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
+INTGEMM_SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
   // No op for 128 bits: already reduced fully.
   MultiplyResult128 ret;
   ret.pack0123 = pack0123;
@@ -23,11 +23,11 @@ SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pac
   return ret;
 }
 
-AVX2 static inline float MaxFloat32(__m256 a) {
+INTGEMM_AVX2 static inline float MaxFloat32(__m256 a) {
   return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
 }
 
-AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
+INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
   // This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
   __m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
   // This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
@@ -36,8 +36,8 @@ AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
 }
 
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
   // Form [0th 128-bit register of pack0123, 0st 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
   __m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
   // Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
@@ -49,7 +49,7 @@ AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567)
 }
 
 // Find the maximum float.
-static inline AVX512DQ float MaxFloat32(__m512 a) {
+static inline INTGEMM_AVX512DQ float MaxFloat32(__m512 a) {
   return MaxFloat32(max_ps(_mm512_castps512_ps256(a), _mm512_extractf32x8_ps(a, 1)));
 }
 
@@ -70,7 +70,7 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1,
   return add_epi32(pack01, pack23);
 } */
-#define PACK0123_DEFINE(target, Register) \
+#define INTGEMM_PACK0123(target, Register) \
 target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \
   Interleave32(sum0, sum1); \
   Register pack01 = add_epi32(sum0, sum1); \
@@ -80,14 +80,14 @@ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Reg
   return add_epi32(pack01, pack23); \
 } \
 
-PACK0123_DEFINE(SSE2, __m128i)
-PACK0123_DEFINE(AVX2, __m256i)
+INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
+INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-PACK0123_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
 #endif
 
-// 16-bit multiplier for SSE2, AVX2, and AVX512.
+// 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
 // C = A * B * unquant_mult
 //
 // This has been substantially revised from Jacob Devlin's SSE code which is:
@@ -116,15 +116,15 @@ PACK0123_DEFINE(AVX512BW, __m512i)
 // C is output in row-major form.
 //
 // All of A, B, and C must be in aligned to a multiple of the register size:
-// SSE2: 16 bytes
-// AVX2: 32 bytes
+// INTGEMM_SSE2: 16 bytes
+// INTGEMM_AVX2: 32 bytes
 // AVX512: 64 bytes.
 //
 // A_rows can be anything non-negative.
 // width must be a multiple of the register size.
 // B_cols must be a multiple of 8.
 // Multiply16
-#define MULTIPLY16_DEFINE(Integer, target, WriteCSubType) \
+#define INTGEMM_MULTIPLY16(Integer, target, WriteCSubType) \
 template <class WriteC> target static void Multiply(const int16_t *A, const int16_t *B, WriteC C, Index A_rows, Index width, Index B_cols) { \
   assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0); \
   assert(B_cols % 8 == 0); \
@@ -180,7 +180,7 @@ PACK0123_DEFINE(AVX512BW, __m512i)
   } \
 } \
 
-/* 8-bit matrix multiply used by AVX and AVX2.
+/* 8-bit matrix multiply used by AVX and INTGEMM_AVX2.
 * These have two peculiar properties:
 * 1. The sign instructions don't exist in AVX512.
 * 2. 16 registers means gcc's register allocation failed so I wrote it in my
@@ -189,11 +189,11 @@ PACK0123_DEFINE(AVX512BW, __m512i)
 *
 * Fun fact: AVX introduced the three-argument vpsignb and vpmaddubsw but only
 * for 128-bit, despite the primary change in AVX being the addition of
- * 256-bit.  We had to wait for AVX2 to get 256-bit versions of vpsignb and
+ * 256-bit.  We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
 * vpmaddubsw.  That's why this code is generic over 128-bit or 256-bit.
 */
-AVX2 inline static void InnerAVX2(
+INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
     __m256i a, const __m256i *b,
     __m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
     __m256i &sum4, __m256i &sum5, __m256i &sum6, __m256i &sum7) {
@@ -312,8 +312,8 @@ AVX2 inline static void InnerAVX2(
 }
 
-// For SSSE3 without AVX
-SSSE3 inline static void InnerSSSE3(
+// For INTGEMM_SSSE3 without AVX
+INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
     __m128i a, const __m128i *b,
     __m128i &sum0, __m128i &sum1, __m128i &sum2, __m128i &sum3,
     __m128i &sum4, __m128i &sum5, __m128i &sum6, __m128i &sum7) {
@@ -327,8 +327,8 @@ SSSE3 inline static void InnerSSSE3(
   sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
   sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
 }
-//AVX2 or SSSE3 multiply
-#define MULTIPLY8_DEFINE(Integer, target, WriteCSubType) \
+//INTGEMM_AVX2 or INTGEMM_SSSE3 multiply
+#define INTGEMM_MULTIPLY8(Integer, target, WriteCSubType) \
 template <class WriteC> target static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) { \
   assert(width % sizeof(Integer) == 0); \
   assert(B_cols % 8 == 0); \
@@ -417,7 +417,7 @@ template <class Register> inline static float MaxAbsoluteBackend(const float *be
   return MaxFloat32(highest);
 }*/
 
-#define MAXABSOLUTE_DEFINE(Register, target) \
+#define INTGEMM_MAXABSOLUTE(Register, target) \
 target static float MaxAbsolute(const float *begin_float, const float *end_float) { \
   assert(end_float > begin_float); \
   assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); \
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 882e006..9605a68 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -10,26 +10,26 @@
 namespace intgemm {
 
 namespace sse2 {
 
 // Same implementation as AVX512, just width.  Grabs 4 32-bit values.
 // TODO duplicated function requires the removal of the annonymous namespace
-SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
   return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
 }
 
-SELECT_COL_B_DEFINE(SSE2, __m128i)
+INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
 
 class QuantizeTile16 {
   public:
     typedef __m128i Integer;
 
-    SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+    INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
 
     // Quantize 8xfloat into 8xint16_t
-    SSE2 inline __m128i Consecutive(const float *input) {
+    INTGEMM_SSE2 inline __m128i Consecutive(const float *input) {
       __m128i g0 = QuantizerGrab(input, mult_reg_);
       __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
       return _mm_packs_epi32(g0, g1);
     }
 
-    SSE2 inline __m128i ForReshape(const float *input, int) {
+    INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) {
       return Consecutive(input);
     }
 
@@ -39,19 +39,19 @@ class QuantizeTile16 {
 
 // Technically only requires SSE
-MAXABSOLUTE_DEFINE(__m128, SSE2)
+INTGEMM_MAXABSOLUTE(__m128, INTGEMM_SSE2)
 
 } //namespace
 
-// This should be pure SSE2 (and below).
+// This should be pure INTGEMM_SSE2 (and below).
 struct SSE2_16bit {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
-  SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+  INTGEMM_SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 8 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
@@ -66,15 +66,15 @@ struct SSE2_16bit {
   static const Index kBTileRow = 8;
   static const Index kBTileCol = 8;
 
-  PREPARE_B_16_DEFINE(SSE2, sse2::QuantizeTile16)
+  INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16)
 
-  SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     //TODO #DEFINE
     sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
   }
 
-  MULTIPLY16_DEFINE(__m128i, SSE2, OnSSE2)
+  INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, OnSSE2)
 
-  constexpr static const char *const kName = "16-bit SSE2";
+  constexpr static const char *const kName = "16-bit INTGEMM_SSE2";
 
   static const CPUType kUses = CPU_SSE2;
 };
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index dccc730..cf5d2c1 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -13,30 +13,30 @@
 namespace intgemm {
 
 namespace ssse3 {
 
 // Same implementation as AVX512, just width.  Grabs 4 32-bit values.
 //TODO duplicated function requires the removal of the annonymous namespace
-SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
   return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
 }
 
-SELECT_COL_B_DEFINE(SSSE3, __m128i)
+INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
 
 class QuantizeTile8 {
   public:
     typedef __m128i Integer;
 
-    SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+    INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
 
-    SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+    INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
      // Skip a row.
       return Tile(input, input + 2 * cols);
     }
 
-    SSSE3 inline __m128i Consecutive(const float *input) {
+    INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) {
       return Tile(input, input + 8);
     }
 
   private:
     // Quantize 16xfloat into 16xint8_t
-    SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+    INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
      const __m128i neg128 = _mm_set1_epi8(-128);
       __m128i g0 = QuantizerGrab(input0, mult_reg_);
       __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
@@ -47,7 +47,7 @@ class QuantizeTile8 {
       __m128i packed = _mm_packs_epi16(packed0, packed1);
       /* Ban -128.
       * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127).  Instead,
-       * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
+       * use INTGEMM_SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
       * The first generates 0xff for fields -128.
       * The second subtracts 0xff from -128 which has the effect of converting
       * to -127.
@@ -65,16 +65,16 @@ class QuantizeTile8 {
 
 } // namespace
 
-// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
+// pmaddubsw (the 8-bit multiply) is INTGEMM_SSSE3, so pedantically that's the version we need.
 struct SSSE3_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
-  SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+  INTGEMM_SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
@@ -89,22 +89,22 @@ struct SSSE3_8bit {
   static const Index kBTileRow = 16;
   static const Index kBTileCol = 8;
 /*
-  SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor8(input, output, ssse3::QuantizeTile8(quant_mult), rows, cols);
   }*/
-  PREPARE_B_8_DEFINE(SSSE3, ssse3::QuantizeTile8)
+  INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
 
-  SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
   }
 /*
-  SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+  INTGEMM_SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
     //Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
     Multiply8_SSE2OrAVX2__m128i<JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
   }*/
-  MULTIPLY8_DEFINE(__m128i, SSSE3, OnSSE2)
+  INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, OnSSE2)
 
-  constexpr static const char *const kName = "8-bit SSSE3";
+  constexpr static const char *const kName = "8-bit INTGEMM_SSSE3";
 
   static const CPUType kUses = CPU_SSSE3;
 };
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 6c16dec..6deb259 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -51,7 +51,7 @@ template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index co
 }
 
-TEST_CASE("Transpose 16", "[transpose]") {
+INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
   if (kCPU < CPU_SSE2) return;
   AlignedVector<int16_t> input(8 * 8);
   for (int16_t i = 0; i < 64; ++i) {
@@ -69,7 +69,7 @@ TEST_CASE("Transpose 16", "[transpose]") {
   }
 }
 
-SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
+INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
   if (kCPU < CPU_SSSE3) return;
   AlignedVector<int8_t> input(16 * 16);
   for (int i = 0; i < 16 * 16; ++i) {
diff --git a/types.h b/types.h
--- a/types.h
+++ b/types.h
@@ -2,23 +2,23 @@
 #include <exception>
 
 #define DEFAULT __attribute__ ((target ("default")))
-#define SSE2 __attribute__ ((target ("sse2")))
+#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
 //#define SSE2_3 __attribute__ ((target ("ssse3"), target("sse2"))) //Not supported by clang
-#define SSSE3 __attribute__ ((target ("ssse3")))
-#define AVX2 __attribute__ ((target ("avx2")))
+#define INTGEMM_SSSE3 __attribute__ ((target ("ssse3")))
+#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
 //#define AVX2_512F __attribute__ ((target ("avx2"), target("avx512f"))) //Not supported by clang
 #if defined __INTEL_COMPILER
-#define AVX512F __attribute__ ((target ("avx512f")))
-#define AVX512BW __attribute__ ((target ("avx512f")))
-#define AVX512DQ __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512f")))
 #else
-#define AVX512F __attribute__ ((target ("avx512f")))
-#define AVX512BW __attribute__ ((target ("avx512bw")))
-#define AVX512DQ __attribute__ ((target ("avx512dq")))
+#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512BW __attribute__ ((target ("avx512bw")))
+#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512dq")))
 #endif
 namespace intgemm {
 
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
+// This will be thrown if a CPU isn't supported by the routines (16-bit without INTGEMM_SSE2 or 8-bit without INTGEMM_SSSE3).
 class UnsupportedCPU : public std::exception {
   public:
     UnsupportedCPU() {}
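A note on the mechanics (not part of the commit): the macros renamed in types.h above are function-level target attributes. GCC and Clang compile each annotated function with the named ISA enabled, independent of the -m flags for the rest of the translation unit, which is how one header-only library carries SSE2 through AVX512 code paths in a single binary. A minimal standalone sketch using the post-commit macro names; the function names AddFour and AddEight are illustrative only:

#include <immintrin.h>

// Same pattern as types.h after this commit: library-prefixed macro names
// avoid colliding with user code that may already define AVX2 or SSE2.
#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))

// Compiled with SSE2 enabled for this function only.
INTGEMM_SSE2 static inline __m128i AddFour(__m128i a, __m128i b) {
  return _mm_add_epi32(a, b);
}

// Compiled with AVX2 enabled for this function only.
INTGEMM_AVX2 static inline __m256i AddEight(__m256i a, __m256i b) {
  return _mm256_add_epi32(a, b);
}

The caller remains responsible for runtime dispatch (the tests above guard with checks like if (kCPU < CPU_SSE2) return;), since executing an AVX2-attributed function on a CPU without AVX2 raises an illegal-instruction fault.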
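Several of the hunks touch the quantizers, whose core step is the QuantizerGrab pattern visible in the diff: scale floats by quant_mult, convert to int32, then pack with signed saturation. A hedged SSE2-only sketch of the same idea (Quantize8 is an illustrative name, not a library function):

#include <immintrin.h>
#include <cstdint>

#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))

// Quantize 8 floats to 8 saturated int16_t values, mirroring
// sse2::QuantizeTile16::Consecutive from sse2_gemm.h above.
INTGEMM_SSE2 static inline __m128i Quantize8(const float *input, __m128 quant_mult_reg) {
  // Scale, then round-convert each float to a 32-bit integer.
  __m128i g0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), quant_mult_reg));
  __m128i g1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input + 4), quant_mult_reg));
  // Pack the two int32 vectors into one int16 vector with signed saturation.
  return _mm_packs_epi32(g0, g1);
}

The library itself loads with an aligned dereference (*reinterpret_cast<const __m128*>(input)) and asserts 16-byte alignment; _mm_loadu_ps is used here only to keep the sketch self-contained.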