github.com/marian-nmt/intgemm.git
 avx2_gemm.h           | 52
 avx512_gemm.h         | 80
 cops.h                | 16
 interleave.h          | 44
 intgemm.h             |  8
 intrinsics.h          | 64
 multiply.h            | 48
 sse2_gemm.h           | 26
 ssse3_gemm.h          | 32
 test/multiply_test.cc |  4
 types.h               | 20
 11 files changed, 197 insertions(+), 197 deletions(-)
diff --git a/avx2_gemm.h b/avx2_gemm.h
index bdc60e4..8958920 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -11,29 +11,29 @@ namespace intgemm {
namespace avx2 {
// Read a vector of floats, multiply them, and cast to 32-bit integer.
// EVIL EVIL CODE DUPLICATION, FIX
-AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
}
-SELECT_COL_B_DEFINE(AVX2, __m256i)
+INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
class QuantizeTile16 {
public:
typedef __m256i Integer;
- AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
+ INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
- AVX2 Integer Consecutive(const float *input) {
+ INTGEMM_AVX2 Integer Consecutive(const float *input) {
return Tile(input, input + 8);
}
- AVX2 Integer ForReshape(const float *input, Index cols) {
+ INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) {
// 8 rows in the first 128-bit register, 8 in the second register.
return Tile(input, input + 8 * cols);
}
private:
- AVX2 __m256i Tile(const float *input0, const float *input1) {
+ INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) {
__m256i g0 = QuantizerGrab(input0, mult_);
__m256i g1 = QuantizerGrab(input1, mult_);
__m256i packed = _mm256_packs_epi32(g0, g1);
@@ -52,12 +52,12 @@ struct AVX2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Just quantize everything in order.
- AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
avx2::QuantizeTile16 q(quant_mult);
@@ -71,18 +71,18 @@ struct AVX2_16bit {
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
/*
- AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
}*/
- PREPARE_B_16_DEFINE(AVX2, avx2::QuantizeTile16)
+ INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
- AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
- MULTIPLY16_DEFINE(__m256i, AVX2, OnAVX2)
+ INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, OnAVX2)
- constexpr static const char *const kName = "16-bit AVX2";
+ constexpr static const char *const kName = "16-bit INTGEMM_AVX2";
static const CPUType kUses = CPU_AVX2;
};
@@ -96,20 +96,20 @@ class QuantizeTile8 {
public:
typedef __m256i Integer;
- AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
+ INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
- AVX2 inline __m256i Consecutive(const float *input) {
+ INTGEMM_AVX2 inline __m256i Consecutive(const float *input) {
return Tile(input, input + 8, input + 16, input + 24);
}
- AVX2 inline __m256i ForReshape(const float *input, Index cols) {
+ INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) {
// Put higher rows in the second half of the register. These will jumble
// around in the same way then conveniently land in the right place.
return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
}
private:
- AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+ INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
@@ -136,7 +136,7 @@ class QuantizeTile8 {
};
// Technically only requires AVX
-MAXABSOLUTE_DEFINE(__m256, AVX2)
+INTGEMM_MAXABSOLUTE(__m256, INTGEMM_AVX2)
} // namespace
@@ -144,12 +144,12 @@ struct AVX2_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Just quantize everything in order.
- AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ INTGEMM_AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(size % 32 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
avx2::QuantizeTile8 q(quant_mult);
@@ -164,23 +164,23 @@ struct AVX2_8bit {
static const Index kBTileCol = 8;
/*
- AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor8(input, output, avx2::QuantizeTile8(quant_mult), rows, cols);
}*/
- PREPARE_B_8_DEFINE(AVX2, avx2::QuantizeTile8)
+ INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
- AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
}
/*
- AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ INTGEMM_AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
//Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
Multiply8_SSE2OrAVX2__m256i<JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
}*/
- MULTIPLY8_DEFINE(__m256i, AVX2, OnAVX2)
+ INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, OnAVX2)
- constexpr static const char *const kName = "8-bit AVX2";
+ constexpr static const char *const kName = "8-bit INTGEMM_AVX2";
static const CPUType kUses = CPU_AVX2;
};
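For reference, every quantizer touched by this rename implements the same per-element mapping: multiply by quant_mult, round to the nearest integer, and saturate to the target range (the 8-bit paths additionally keep values out of -128). A minimal scalar sketch of that mapping, not the SIMD code above:

#include <algorithm>
#include <cmath>
#include <cstdint>

// 16-bit path: _mm256_cvtps_epi32 rounds to nearest, _mm256_packs_epi32 saturates.
inline int16_t QuantizeOne16(float value, float quant_mult) {
  float scaled = std::nearbyint(value * quant_mult);
  return static_cast<int16_t>(std::min(std::max(scaled, -32768.0f), 32767.0f));
}

// 8-bit path: same idea, clamped to [-127, 127] since -128 is banned downstream.
inline int8_t QuantizeOne8(float value, float quant_mult) {
  float scaled = std::nearbyint(value * quant_mult);
  return static_cast<int8_t>(std::min(std::max(scaled, -127.0f), 127.0f));
}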
diff --git a/avx512_gemm.h b/avx512_gemm.h
index b92ba0d..e20eb24 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -13,7 +13,7 @@
#include "types.h"
/* AVX512 implementation.
- * This uses AVX512BW, AVX512DQ, and might use AVX512VL
+ * This uses INTGEMM_AVX512BW, INTGEMM_AVX512DQ, and might use AVX512VL
* That means it supports mainstream CPUs with AVX512, starting with Skylake
* Xeons.
* It does not support any Knights / Xeon Phi processors.
@@ -33,16 +33,16 @@ namespace intgemm {
namespace avx512f {
// Load from memory, multiply, and convert to int32_t.
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
// Multiply each by the quantization factor.
__m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
// Cast to 32-bit int
return _mm512_cvtps_epi32(val);
}
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-SELECT_COL_B_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
// For PrepareB we want to read 8 columns at a time. When converting 32-bit
// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
@@ -50,14 +50,14 @@ SELECT_COL_B_DEFINE(AVX512BW, __m512i)
// but then the memory written to won't be contiguous anyway so we'd be doing a
// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits
// concatenate.
-AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
- // AVX512DQ but that goes with AVX512BW anyway.
+INTGEMM_AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
+ // INTGEMM_AVX512DQ but that goes with INTGEMM_AVX512BW anyway.
return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
}
// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
__m512 appended = avx512f::Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
appended = _mm512_mul_ps(appended, quant_mult_reg);
return _mm512_cvtps_epi32(appended);
@@ -70,14 +70,14 @@ class QuantizeTile16 {
public:
typedef __m512i Integer;
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
- AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+ INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
__m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
__m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
__m512i packed = _mm512_packs_epi32(g0, g1);
- // Permute within 256-bit lanes, so same as AVX2
+ // Permute within 256-bit lanes, so same as INTGEMM_AVX2
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
@@ -89,10 +89,10 @@ class QuantizeTile8 {
public:
typedef __m512i Integer;
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
- AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+ INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
@@ -118,8 +118,8 @@ class QuantizeTile8 {
const __m512 mult_reg_;
};
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-MAXABSOLUTE_DEFINE(__m512, AVX512BW)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_MAXABSOLUTE(__m512, INTGEMM_AVX512BW)
} // namespace
@@ -128,8 +128,8 @@ struct AVX512_16bit {
// Currently A is prepared by quantization but this could theoretically change.
// rows * cols must be a multiple of 16.
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
@@ -137,8 +137,8 @@ struct AVX512_16bit {
// But then it will need to be aligned for Multiply.
// size must be a multiple of 16.
// Convert to 16-bit signed integers.
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
// Fill with the quantization multiplier.
@@ -155,20 +155,20 @@ struct AVX512_16bit {
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
/*
- AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols);
}
*/
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- PREPARE_B_16_DEFINE(AVX512BW, avx512f::QuantizeTile16)
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16)
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
}
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- MULTIPLY16_DEFINE(__m512i, AVX512BW, OnAVX2)
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, OnAVX2)
constexpr static const char *const kName = "16-bit AVX512";
@@ -179,16 +179,16 @@ struct AVX512_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
const __m512i neg127 = _mm512_set1_epi32(-127);
@@ -206,21 +206,21 @@ struct AVX512_8bit {
static const Index kBTileRow = 64;
static const Index kBTileCol = 8;
/*
- AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols);
}*/
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- PREPARE_B_8_DEFINE(AVX512BW, avx512f::QuantizeTile8)
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8)
- /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
- AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+ INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
}
// Special AVX512 implementation due to having 32 registers (so I don't have to
// allocate registers manually) and no sign instruction.
template <class WriteC>
- AVX512BW static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) {
+ INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) {
typedef __m512i Integer;
//typedef __m256 Float; // For quantization we only do 8 at a time.
// This is copy-paste from Multiply8_SSE2OrAVX2.
@@ -228,7 +228,7 @@ struct AVX512_8bit {
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
- // There's 8 results for AVX2 to handle.
+ // There's 8 results for INTGEMM_AVX2 to handle.
typename WriteC::OnAVX2 write_C(C);
const int simd_width = width / sizeof(Integer);
const Integer *B0_col = reinterpret_cast<const Integer*>(B);
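The one AVX512DQ-dependent helper in this file is Concat, which widens two 256-bit float vectors into a single 512-bit register so eight columns can be read with two 256-bit loads. A self-contained sketch of that composition, guarded by a runtime check so it exits cleanly on older CPUs (GCC/Clang on x86-64 assumed; the extra avx2 in the target string is only there so the 256-bit set1 compiles):

#include <immintrin.h>
#include <cstdio>

// Same _mm512_castps256_ps512 / _mm512_insertf32x8 pair as avx512f::Concat.
__attribute__((target("avx512dq,avx2")))
static void ConcatDemo() {
  alignas(64) float out[16];
  __m256 low  = _mm256_set1_ps(1.0f);
  __m256 high = _mm256_set1_ps(2.0f);
  __m512 both = _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1);
  _mm512_store_ps(out, both);
  std::printf("out[0]=%g out[15]=%g\n", out[0], out[15]);  // 1 then 2
}

int main() {
  if (__builtin_cpu_supports("avx512dq")) ConcatDemo();
  else std::puts("AVX512DQ not available on this CPU; nothing to run.");
  return 0;
}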
diff --git a/cops.h b/cops.h
index e685a3c..2771aeb 100644
--- a/cops.h
+++ b/cops.h
@@ -11,12 +11,12 @@ class JustUnquantizeC {
class OnSSE2 {
public:
- SSE2 explicit OnSSE2(const JustUnquantizeC &from)
+ INTGEMM_SSE2 explicit OnSSE2(const JustUnquantizeC &from)
: C_(from.C_), unquant_mult_(_mm_set1_ps(from.unquant_mult_)) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
}
- SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+ INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
*reinterpret_cast<__m128*>(C_ + rowIDX*cols + colIDX) = mul_ps(cvtepi32_ps(result.pack0123), unquant_mult_);
*reinterpret_cast<__m128*>(C_ + rowIDX*cols + colIDX + 4) = mul_ps(cvtepi32_ps(result.pack4567), unquant_mult_);
}
@@ -27,12 +27,12 @@ class JustUnquantizeC {
class OnAVX2 {
public:
- AVX2 explicit OnAVX2(const JustUnquantizeC &from)
+ INTGEMM_AVX2 explicit OnAVX2(const JustUnquantizeC &from)
: C_(from.C_), unquant_mult_(_mm256_set1_ps(from.unquant_mult_)) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
}
- AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+ INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
*reinterpret_cast<__m256*>(C_ + rowIDX*cols + colIDX) = mul_ps(cvtepi32_ps(result), unquant_mult_);
}
@@ -52,12 +52,12 @@ class Identity {
class OnSSE2 {
public:
- SSE2 explicit OnSSE2(const Identity &from)
+ INTGEMM_SSE2 explicit OnSSE2(const Identity &from)
: C_(from.C_) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
}
- SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+ INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(C_ + rowIDX*cols + colIDX), result.pack0123);
_mm_storeu_si128(reinterpret_cast<__m128i*>(C_ + rowIDX*cols + colIDX + 4), result.pack4567);
}
@@ -67,12 +67,12 @@ class Identity {
class OnAVX2 {
public:
- AVX2 explicit OnAVX2(const Identity &from)
+ INTGEMM_AVX2 explicit OnAVX2(const Identity &from)
: C_(from.C_) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
}
- AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+ INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(C_ + rowIDX*cols + colIDX), result);
}
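The classes in cops.h are the WriteC callbacks that the Multiply routines hand their finished 32-bit dot products to, eight output columns at a time: JustUnquantizeC scales them into float C, Identity stores the raw integers. A scalar sketch of that contract (the names below are illustrative, not the library API):

#include <cstddef>
#include <cstdint>

// JustUnquantizeC behaviour: C[row*cols + col + i] = result[i] * unquant_mult.
struct UnquantizeWriterSketch {
  float *C;
  float unquant_mult;
  void operator()(std::size_t row, std::size_t cols, std::size_t col,
                  const int32_t *result8) const {
    for (int i = 0; i < 8; ++i)
      C[row * cols + col + i] = static_cast<float>(result8[i]) * unquant_mult;
  }
};

// Identity behaviour: store the 32-bit sums unchanged.
struct IdentityWriterSketch {
  int32_t *C;
  void operator()(std::size_t row, std::size_t cols, std::size_t col,
                  const int32_t *result8) const {
    for (int i = 0; i < 8; ++i) C[row * cols + col + i] = result8[i];
  }
};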
diff --git a/interleave.h b/interleave.h
index e7a62f5..8b7484e 100644
--- a/interleave.h
+++ b/interleave.h
@@ -49,41 +49,41 @@ target static inline void Interleave64(type &first, type &second) { \
template <class Register> static inline Register setzero_si() __attribute__((always_inline));;
-INTGEMM_INTERLEAVE(SSE2, __m128i, )
-template <> SSE2 inline __m128i setzero_si<__m128i>() {
+INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i, )
+template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() {
return _mm_setzero_si128();
}
-INTGEMM_INTERLEAVE(AVX2, __m256i, 256)
-template <> AVX2 inline __m256i setzero_si<__m256i>() {
+INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i, 256)
+template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() {
return _mm256_setzero_si256();
}
#ifndef INTGEMM_NO_AVX512
-INTGEMM_INTERLEAVE(AVX512BW, __m512i, 512)
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-template <> AVX512BW inline __m512i setzero_si<__m512i>() {
+INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i, 512)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+template <> INTGEMM_AVX512BW inline __m512i setzero_si<__m512i>() {
return _mm512_setzero_si512();
}
#endif
-#define SWAP_DEFINE(target, Register) \
+#define INTGEMM_SWAP(target, Register) \
target static inline void Swap(Register &a, Register &b) { \
Register tmp = a; \
a = b; \
b = tmp; \
} \
-SWAP_DEFINE(SSE2, __m128i)
-SWAP_DEFINE(AVX2, __m256i)
+INTGEMM_SWAP(INTGEMM_SSE2, __m128i)
+INTGEMM_SWAP(INTGEMM_AVX2, __m256i)
#ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-SWAP_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i)
#endif
/* Transpose registers containing 8 packed 16-bit integers.
* Each 128-bit lane is handled independently.
*/
-#define TRANSPOSE16_DEFINE(target, Register) \
+#define INTGEMM_TRANSPOSE16(target, Register) \
target static inline void Transpose16InLane(Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7) { \
/* r0: columns 0 1 2 3 4 5 6 7 from row 0
r1: columns 0 1 2 3 4 5 6 7 from row 1*/ \
@@ -126,11 +126,11 @@ target static inline void Transpose16InLane(Register &r0, Register &r1, Register
Swap(r3, r6); \
} \
-TRANSPOSE16_DEFINE(SSE2, __m128i)
-TRANSPOSE16_DEFINE(AVX2, __m256i)
+INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i)
+INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i)
#ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-TRANSPOSE16_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i)
#endif
/* Transpose registers containing 16 packed 8-bit integers.
@@ -180,7 +180,7 @@ template <class Register> static inline void Transpose8InLane(
//
// We presume B starts in row-major order.
//
-// In AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
+// In INTGEMM_AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
// that many values from the same column in the register.
//
// The multiplier reads 8 rows at a time and we want these reads to be
@@ -189,7 +189,7 @@ template <class Register> static inline void Transpose8InLane(
// Each 8x32 (for 8-bit) or 8x16 (for 16-bit) tile of B is transposed.
// The tiles are stored in column major order.
//
-// For AVX2, this matrix shows what index each value of B will be stored at:
+// For INTGEMM_AVX2, this matrix shows what index each value of B will be stored at:
// 0 16 ... 240
// 1 17 ... 241
// 2 18 ... 242
@@ -209,7 +209,7 @@ template <class Register> static inline void Transpose8InLane(
// 256 272
// 257 273
// ... ...
-#define PREPARE_B_8_DEFINE(target, QuantClass) \
+#define INTGEMM_PREPARE_B_8(target, QuantClass) \
target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
typedef typename QuantClass Quantizer; \
typedef typename Quantizer::Integer Register; \
@@ -244,7 +244,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
} \
} \
-#define PREPARE_B_16_DEFINE(target, QuantClass) \
+#define INTGEMM_PREPARE_B_16(target, QuantClass) \
target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
typedef typename QuantClass Quantizer; \
typedef typename Quantizer::Integer Register; \
@@ -267,7 +267,7 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
/* Select columns of B from PrepareB format to PrepareB format.
*/
-#define SELECT_COL_B_DEFINE(target, Register) \
+#define INTGEMM_SELECT_COL_B(target, Register) \
target static inline void SelectColumnsOfB(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
assert(rows_bytes % sizeof(Register) == 0); \
assert((cols_end - cols_begin) % 8 == 0); \
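Most of the names changed in this file are code-stamping macros: each takes a target attribute and a register type and emits one function per SIMD width. Written out by hand, INTGEMM_SWAP(INTGEMM_SSE2, __m128i) amounts to roughly the following (a sketch of the expansion with the attribute from types.h substituted in):

#include <immintrin.h>

// One Swap overload for 128-bit registers, compiled for SSE2 regardless of the
// translation unit's global -m flags; the AVX2 and AVX512 variants differ only
// in the register type and target string.
__attribute__((target("sse2")))
static inline void Swap(__m128i &a, __m128i &b) {
  __m128i tmp = a;
  a = b;
  b = tmp;
}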
diff --git a/intgemm.h b/intgemm.h
index b739e6e..76ef5fb 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -107,14 +107,14 @@ float MaxAbsolute(const float *begin, const float *end) {
#endif
/* Returns:
- * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
+ * avx512 if the CPU supports INTGEMM_AVX512F (though really it should be INTGEMM_AVX512BW, but
* cloud providers lie). TODO: don't catch Knights processors with this.
*
- * avx2 if the CPU supports AVX2
+ * avx2 if the CPU supports INTGEMM_AVX2
*
- * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+ * ssse3 if the CPU supports INTGEMM_SSSE3 (this distinction from INTGEMM_SSE2 matters for 8-bit)
*
- * sse2 if the CPU supports SSE2
+ * sse2 if the CPU supports INTGEMM_SSE2
*
* unsupported otherwise
*/
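The detection order this comment describes (AVX512F, then AVX2, SSSE3, SSE2) can be written with GCC/Clang's __builtin_cpu_supports; the sketch below mirrors that ordering, with an illustrative enum rather than the library's own CPUType:

#include <cstdio>

enum BackendSketch { kUnsupported, kSSE2, kSSSE3, kAVX2, kAVX512 };

// Pick the best backend in the order described above. AVX512F rather than
// AVX512BW is tested, matching the note about cloud providers lying.
static BackendSketch DetectBackend() {
  if (__builtin_cpu_supports("avx512f")) return kAVX512;
  if (__builtin_cpu_supports("avx2"))    return kAVX2;
  if (__builtin_cpu_supports("ssse3"))   return kSSSE3;
  if (__builtin_cpu_supports("sse2"))    return kSSE2;
  return kUnsupported;
}

int main() {
  static const char *names[] = {"unsupported", "sse2", "ssse3", "avx2", "avx512"};
  std::printf("backend: %s\n", names[DetectBackend()]);
  return 0;
}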
diff --git a/intrinsics.h b/intrinsics.h
index fa3a3c2..9548a81 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -18,105 +18,105 @@ template <class Register> static inline Register set1_ps(float to);
struct MultiplyResult128 {
__m128i pack0123, pack4567;
};
-SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
return _mm_add_epi32(first, second);
}
-SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
return _mm_adds_epi16(first, second);
}
-template <> SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
+template <> INTGEMM_SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
return _mm_set1_epi16(to);
}
-template <> SSE2 inline __m128 set1_ps<__m128>(float to) {
+template <> INTGEMM_SSE2 inline __m128 set1_ps<__m128>(float to) {
return _mm_set1_ps(to);
}
-SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
return _mm_madd_epi16(first, second);
}
-SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
+INTGEMM_SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
return _mm_maddubs_epi16(first, second);
}
-SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
+INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
return _mm_sign_epi8(first, second);
}
-SSSE3 static inline __m128i abs_epi8(__m128i arg) {
+INTGEMM_SSSE3 static inline __m128i abs_epi8(__m128i arg) {
return _mm_abs_epi8(arg);
}
-SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
+INTGEMM_SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
return _mm_max_ps(first, second);
}
-SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
+INTGEMM_SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
return _mm_and_ps(first, second);
}
-SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
+INTGEMM_SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
return _mm_cvtepi32_ps(arg);
}
-SSE2 static inline __m128 mul_ps (__m128 a, __m128 b) {
+INTGEMM_SSE2 static inline __m128 mul_ps (__m128 a, __m128 b) {
return _mm_mul_ps(a, b);
}
-AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
return _mm256_add_epi32(first, second);
}
-AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
return _mm256_adds_epi16(first, second);
}
-template <> AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
+template <> INTGEMM_AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
return _mm256_set1_epi16(to);
}
-template <> AVX2 inline __m256 set1_ps<__m256>(float to) {
+template <> INTGEMM_AVX2 inline __m256 set1_ps<__m256>(float to) {
return _mm256_set1_ps(to);
}
-AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
return _mm256_madd_epi16(first, second);
}
-AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
return _mm256_maddubs_epi16(first, second);
}
-AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
return _mm256_sign_epi8(first, second);
}
-AVX2 static inline __m256i abs_epi8(__m256i arg) {
+INTGEMM_AVX2 static inline __m256i abs_epi8(__m256i arg) {
return _mm256_abs_epi8(arg);
}
-AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
+INTGEMM_AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
return _mm256_max_ps(first, second);
}
-AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
+INTGEMM_AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
return _mm256_and_ps(first, second);
}
-AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
+INTGEMM_AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
return _mm256_cvtepi32_ps(arg);
}
-AVX2 static inline __m256 mul_ps (__m256 a, __m256 b) {
+INTGEMM_AVX2 static inline __m256 mul_ps (__m256 a, __m256 b) {
return _mm256_mul_ps(a, b);
}
#ifndef INTGEMM_NO_AVX512
-AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
return _mm512_add_epi32(first, second);
}
-template <> inline AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
+template <> inline INTGEMM_AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
return _mm512_set1_epi16(to);
}
-template <> inline AVX512BW __m512 set1_ps<__m512>(float to) {
+template <> inline INTGEMM_AVX512BW __m512 set1_ps<__m512>(float to) {
return _mm512_set1_ps(to);
}
-AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
return _mm512_madd_epi16(first, second);
}
-AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
return _mm512_maddubs_epi16(first, second);
}
-AVX512BW static inline __m512i abs_epi8(__m512i arg) {
+INTGEMM_AVX512BW static inline __m512i abs_epi8(__m512i arg) {
return _mm512_abs_epi8(arg);
}
-AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
+INTGEMM_AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
return _mm512_max_ps(first, second);
}
// Technically __AVX512DQ__
-AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
+INTGEMM_AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
return _mm512_and_ps(first, second);
}
#endif
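These thin wrappers exist for overload-based width genericity: because add_epi32, madd_epi16, and friends are defined for every register type, a single template body can serve SSE2, AVX2, and AVX512. A reduced sketch of the pattern with two widths (in the library the enclosing Multiply template additionally carries its own target macro):

#include <immintrin.h>

// Same-name wrappers, one per register width, as in intrinsics.h.
__attribute__((target("sse2")))
static inline __m128i add_epi32(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

__attribute__((target("avx2")))
static inline __m256i add_epi32(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }

// One template body compiles for either width; overload resolution picks the
// right intrinsic from the Register type.
template <class Register>
static inline Register AddThree(Register a, Register b, Register c) {
  return add_epi32(add_epi32(a, b), c);
}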
diff --git a/multiply.h b/multiply.h
index 6c81468..36b2da8 100644
--- a/multiply.h
+++ b/multiply.h
@@ -4,7 +4,7 @@
namespace intgemm {
-SSE2 static inline float MaxFloat32(__m128 a) {
+INTGEMM_SSE2 static inline float MaxFloat32(__m128 a) {
// Fold to just using the first 64 bits.
__m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
a = _mm_max_ps(a, second_half);
@@ -15,7 +15,7 @@ SSE2 static inline float MaxFloat32(__m128 a) {
return *reinterpret_cast<float*>(&a);
}
-SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
+INTGEMM_SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
// No op for 128 bits: already reduced fully.
MultiplyResult128 ret;
ret.pack0123 = pack0123;
@@ -23,11 +23,11 @@ SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pac
return ret;
}
-AVX2 static inline float MaxFloat32(__m256 a) {
+INTGEMM_AVX2 static inline float MaxFloat32(__m256 a) {
return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
-AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
+INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
// This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
__m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
// This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
@@ -36,8 +36,8 @@ AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
}
#ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
// Form [0th 128-bit register of pack0123, 0st 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
__m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
// Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
@@ -49,7 +49,7 @@ AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567)
}
// Find the maximum float.
-static inline AVX512DQ float MaxFloat32(__m512 a) {
+static inline INTGEMM_AVX512DQ float MaxFloat32(__m512 a) {
return MaxFloat32(max_ps(_mm512_castps512_ps256(a), _mm512_extractf32x8_ps(a, 1)));
}
@@ -70,7 +70,7 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1,
return add_epi32(pack01, pack23);
}
*/
-#define PACK0123_DEFINE(target, Register) \
+#define INTGEMM_PACK0123(target, Register) \
target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \
Interleave32(sum0, sum1); \
Register pack01 = add_epi32(sum0, sum1); \
@@ -80,14 +80,14 @@ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Reg
return add_epi32(pack01, pack23); \
} \
-PACK0123_DEFINE(SSE2, __m128i)
-PACK0123_DEFINE(AVX2, __m256i)
+INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
+INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
#ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-PACK0123_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
#endif
-// 16-bit multiplier for SSE2, AVX2, and AVX512.
+// 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
// C = A * B * unquant_mult
//
// This has been substantially revised from Jacob Devlin's SSE code which is:
@@ -116,15 +116,15 @@ PACK0123_DEFINE(AVX512BW, __m512i)
// C is output in row-major form.
//
// All of A, B, and C must be in aligned to a multiple of the register size:
-// SSE2: 16 bytes
-// AVX2: 32 bytes
+// INTGEMM_SSE2: 16 bytes
+// INTGEMM_AVX2: 32 bytes
// AVX512: 64 bytes.
//
// A_rows can be anything non-negative.
// width must be a multiple of the register size.
// B_cols must be a multiple of 8.
// Multiply16
-#define MULTIPLY16_DEFINE(Integer, target, WriteCSubType) \
+#define INTGEMM_MULTIPLY16(Integer, target, WriteCSubType) \
template <class WriteC> target static void Multiply(const int16_t *A, const int16_t *B, WriteC C, Index A_rows, Index width, Index B_cols) { \
assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0); \
assert(B_cols % 8 == 0); \
@@ -180,7 +180,7 @@ PACK0123_DEFINE(AVX512BW, __m512i)
} \
} \
-/* 8-bit matrix multiply used by AVX and AVX2.
+/* 8-bit matrix multiply used by AVX and INTGEMM_AVX2.
* These have two peculiar properties:
* 1. The sign instructions don't exist in AVX512.
* 2. 16 registers means gcc's register allocation failed so I wrote it in my
@@ -189,11 +189,11 @@ PACK0123_DEFINE(AVX512BW, __m512i)
*
* Fun fact: AVX introduced the three-argument vpsignb and vpmaddubsw but only
* for 128-bit, despite the primary change in AVX being the addition of
- * 256-bit. We had to wait for AVX2 to get 256-bit versions of vpsignb and
+ * 256-bit. We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
* vpmaddubsw. That's why this code is generic over 128-bit or 256-bit.
*/
-AVX2 inline static void InnerAVX2(
+INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
__m256i a, const __m256i *b,
__m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
__m256i &sum4, __m256i &sum5, __m256i &sum6, __m256i &sum7) {
@@ -312,8 +312,8 @@ AVX2 inline static void InnerAVX2(
}
-// For SSSE3 without AVX
-SSSE3 inline static void InnerSSSE3(
+// For INTGEMM_SSSE3 without AVX
+INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
__m128i a, const __m128i *b,
__m128i &sum0, __m128i &sum1, __m128i &sum2, __m128i &sum3,
__m128i &sum4, __m128i &sum5, __m128i &sum6, __m128i &sum7) {
@@ -327,8 +327,8 @@ SSSE3 inline static void InnerSSSE3(
sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
}
-//AVX2 or SSSE3 multiply
-#define MULTIPLY8_DEFINE(Integer, target, WriteCSubType) \
+//INTGEMM_AVX2 or INTGEMM_SSSE3 multiply
+#define INTGEMM_MULTIPLY8(Integer, target, WriteCSubType) \
template <class WriteC> target static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) { \
assert(width % sizeof(Integer) == 0); \
assert(B_cols % 8 == 0); \
@@ -417,7 +417,7 @@ template <class Register> inline static float MaxAbsoluteBackend(const float *be
return MaxFloat32(highest);
}*/
-#define MAXABSOLUTE_DEFINE(Register, target) \
+#define INTGEMM_MAXABSOLUTE(Register, target) \
target static float MaxAbsolute(const float *begin_float, const float *end_float) { \
assert(end_float > begin_float); \
assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); \
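Among the macros renamed in multiply.h, INTGEMM_PACK0123 is the reduction step: each of the four inputs holds partial sums for one output column, and the interleave-and-add sequence leaves the horizontal total of sum0 in lane 0, sum1 in lane 1, and so on. A scalar sketch of that result for 128-bit (four-lane) registers; wider registers repeat it per 128-bit lane:

#include <cstdint>

// out[c] = horizontal sum of sums[c], which is what Pack0123 leaves in lane c.
static void Pack0123Sketch(const int32_t sum0[4], const int32_t sum1[4],
                           const int32_t sum2[4], const int32_t sum3[4],
                           int32_t out[4]) {
  const int32_t *sums[4] = {sum0, sum1, sum2, sum3};
  for (int col = 0; col < 4; ++col) {
    int32_t total = 0;
    for (int lane = 0; lane < 4; ++lane) total += sums[col][lane];
    out[col] = total;
  }
}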
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 882e006..9605a68 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -10,26 +10,26 @@ namespace intgemm {
namespace sse2 {
// Same implementation as AVX512, just width. Grabs 4 32-bit values.
// TODO duplicated function requires the removal of the anonymous namespace
-SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
}
-SELECT_COL_B_DEFINE(SSE2, __m128i)
+INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
typedef __m128i Integer;
- SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+ INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
// Quantize 8xfloat into 8xint16_t
- SSE2 inline __m128i Consecutive(const float *input) {
+ INTGEMM_SSE2 inline __m128i Consecutive(const float *input) {
__m128i g0 = QuantizerGrab(input, mult_reg_);
__m128i g1 = QuantizerGrab(input + 4, mult_reg_);
return _mm_packs_epi32(g0, g1);
}
- SSE2 inline __m128i ForReshape(const float *input, int) {
+ INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) {
return Consecutive(input);
}
@@ -39,19 +39,19 @@ class QuantizeTile16 {
// Technically only requires SSE
-MAXABSOLUTE_DEFINE(__m128, SSE2)
+INTGEMM_MAXABSOLUTE(__m128, INTGEMM_SSE2)
} //namespace
-// This should be pure SSE2 (and below).
+// This should be pure INTGEMM_SSE2 (and below).
struct SSE2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ INTGEMM_SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 8 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
@@ -66,15 +66,15 @@ struct SSE2_16bit {
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
- PREPARE_B_16_DEFINE(SSE2, sse2::QuantizeTile16)
+ INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16)
- SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
//TODO #DEFINE
sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
}
- MULTIPLY16_DEFINE(__m128i, SSE2, OnSSE2)
+ INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, OnSSE2)
- constexpr static const char *const kName = "16-bit SSE2";
+ constexpr static const char *const kName = "16-bit INTGEMM_SSE2";
static const CPUType kUses = CPU_SSE2;
};
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index dccc730..cf5d2c1 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -13,30 +13,30 @@ namespace intgemm {
namespace ssse3 {
// Same implementation as AVX512, just width. Grabs 4 32-bit values.
//TODO duplicated function requires the removal of the anonymous namespace
-SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
}
-SELECT_COL_B_DEFINE(SSSE3, __m128i)
+INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
class QuantizeTile8 {
public:
typedef __m128i Integer;
- SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+ INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
- SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+ INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
// Skip a row.
return Tile(input, input + 2 * cols);
}
- SSSE3 inline __m128i Consecutive(const float *input) {
+ INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) {
return Tile(input, input + 8);
}
private:
// Quantize 16xfloat into 16xint8_t
- SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+ INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
const __m128i neg128 = _mm_set1_epi8(-128);
__m128i g0 = QuantizerGrab(input0, mult_reg_);
__m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
@@ -47,7 +47,7 @@ class QuantizeTile8 {
__m128i packed = _mm_packs_epi16(packed0, packed1);
/* Ban -128.
* Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
- * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
+ * use INTGEMM_SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
* The first generates 0xff for fields -128.
* The second subtracts 0xff from -128 which has the effect of converting
* to -127.
@@ -65,16 +65,16 @@ class QuantizeTile8 {
} // namespace
-// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
+// pmaddubsw (the 8-bit multiply) is INTGEMM_SSSE3, so pedantically that's the version we need.
struct SSSE3_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ INTGEMM_SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
@@ -89,22 +89,22 @@ struct SSSE3_8bit {
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
/*
- SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ INTGEMM_SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor8(input, output, ssse3::QuantizeTile8(quant_mult), rows, cols);
}*/
- PREPARE_B_8_DEFINE(SSSE3, ssse3::QuantizeTile8)
+ INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
- SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
}
/*
- SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ INTGEMM_SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
//Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
Multiply8_SSE2OrAVX2__m128i<JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
}*/
- MULTIPLY8_DEFINE(__m128i, SSSE3, OnSSE2)
+ INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, OnSSE2)
- constexpr static const char *const kName = "8-bit SSSE3";
+ constexpr static const char *const kName = "8-bit INTGEMM_SSSE3";
static const CPUType kUses = CPU_SSSE3;
};
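The "ban -128" comment in QuantizeTile8::Tile is worth seeing in isolation: _mm_cmpeq_epi8 produces 0xff exactly where a byte equals -128, and subtracting that 0xff (i.e. -1) bumps only those bytes up to -127. A runnable SSE2-only demonstration (x86-64 and GCC/Clang assumed):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

__attribute__((target("sse2")))
static void Ban128(const int8_t *in, int8_t *out) {
  const __m128i neg128 = _mm_set1_epi8(-128);
  __m128i packed = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
  __m128i evils  = _mm_cmpeq_epi8(packed, neg128);   // 0xff exactly where a byte is -128
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out),
                   _mm_sub_epi8(packed, evils));     // x - (-1) bumps -128 to -127
}

int main() {
  int8_t in[16] = {-128, -127, 0, 5, 127, -128, 42, -1, 0, 0, 0, 0, 0, 0, 0, 0};
  int8_t out[16];
  Ban128(in, out);
  std::printf("%d %d %d %d %d %d\n", out[0], out[1], out[2], out[3], out[4], out[5]);
  // prints: -127 -127 0 5 127 -127
  return 0;
}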
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 6c16dec..6deb259 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -51,7 +51,7 @@ template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index co
}
-TEST_CASE("Transpose 16", "[transpose]") {
+INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
if (kCPU < CPU_SSE2) return;
AlignedVector<int16_t> input(8 * 8);
for (int16_t i = 0; i < 64; ++i) {
@@ -69,7 +69,7 @@ TEST_CASE("Transpose 16", "[transpose]") {
}
}
-SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
+INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
if (kCPU < CPU_SSSE3) return;
AlignedVector<int8_t> input(16 * 16);
for (int i = 0; i < 16 * 16; ++i) {
diff --git a/types.h b/types.h
index 9bd6f03..568c444 100644
--- a/types.h
+++ b/types.h
@@ -2,23 +2,23 @@
#include <exception>
#define DEFAULT __attribute__ ((target ("default")))
-#define SSE2 __attribute__ ((target ("sse2")))
+#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
//#define SSE2_3 __attribute__ ((target ("ssse3"), target("sse2"))) //Not supported by clang
-#define SSSE3 __attribute__ ((target ("ssse3")))
-#define AVX2 __attribute__ ((target ("avx2")))
+#define INTGEMM_SSSE3 __attribute__ ((target ("ssse3")))
+#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
//#define AVX2_512F __attribute__ ((target ("avx2"), target("avx512f"))) //Not supported by clang
#if defined __INTEL_COMPILER
-#define AVX512F __attribute__ ((target ("avx512f")))
-#define AVX512BW __attribute__ ((target ("avx512f")))
-#define AVX512DQ __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512f")))
#else
-#define AVX512F __attribute__ ((target ("avx512f")))
-#define AVX512BW __attribute__ ((target ("avx512bw")))
-#define AVX512DQ __attribute__ ((target ("avx512dq")))
+#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512BW __attribute__ ((target ("avx512bw")))
+#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512dq")))
#endif
namespace intgemm {
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
+// This will be thrown if a CPU isn't supported by the routines (16-bit without INTGEMM_SSE2 or 8-bit without INTGEMM_SSSE3).
class UnsupportedCPU : public std::exception {
public:
UnsupportedCPU() {}
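Per the comment above it, UnsupportedCPU is the library's "no suitable backend" signal: 16-bit routines need at least SSE2 and 8-bit routines at least SSSE3. A hedged sketch of how such a guard is typically wired up; the class name suffix, the what() text, and the guard function are illustrative, not the library's own:

#include <exception>
#include <cstdio>

// Illustrative stand-in for intgemm::UnsupportedCPU as declared in types.h.
class UnsupportedCPUSketch : public std::exception {
 public:
  const char *what() const noexcept override {
    return "no SIMD backend for this CPU";  // message text is illustrative
  }
};

// Hypothetical guard: compare the detected CPU level against what a routine needs.
static void RequireLevel(int detected, int required) {
  if (detected < required) throw UnsupportedCPUSketch();
}

int main() {
  try {
    RequireLevel(/*detected=*/0, /*required=*/1);
  } catch (const std::exception &e) {
    std::printf("caught: %s\n", e.what());
  }
  return 0;
}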