github.com/marian-nmt/intgemm
author     Mateusz Chudyk <mateuszchudyk@gmail.com>   2020-02-06 19:24:46 +0300
committer  GitHub <noreply@github.com>                2020-02-06 19:24:46 +0300
commit     54c84ff67061ccf07480bdb384f8b64211b083bd (patch)
tree       1f9f1646a9760da89d5f9b606dedbd61172bcbed
parent     faa096b372df5c3bf8e060effb6437fdf26598cc (diff)
parent     12becc0f51e085f9f16177ce4f01d7e6fc136188 (diff)
Merge pull request #56 from kpu/prepare-b-transposed
Add PrepareBTransposed
-rw-r--r--  CMakeLists.txt                            1
-rw-r--r--  avx2_gemm.h                              28
-rw-r--r--  avx512_gemm.h                            42
-rw-r--r--  interleave.h                             41
-rw-r--r--  intgemm.cc                                4
-rw-r--r--  intgemm.h                                18
-rw-r--r--  sse2_gemm.h                              18
-rw-r--r--  ssse3_gemm.h                             38
-rw-r--r--  test/prepare_b_quantized_transposed.cc   34
-rw-r--r--  test/prepare_b_transposed.cc             95
10 files changed, 278 insertions, 41 deletions
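
The new PrepareBTransposed entry points take a float B that is already stored transposed, quantize it, and rearrange it into the CPU-dependent layout Multiply expects, all in one pass (PrepareBQuantizedTransposed did the same for input quantized beforehand). A minimal usage sketch, not part of the commit, assuming the AlignedVector helper from aligned.h and dimensions that satisfy the asserts added in interleave.h below:

#include "intgemm.h"
#include "aligned.h"
#include <cstddef>

void ExamplePrepareBTransposed() {
  const intgemm::Index inner = 64;  // rows of B (= width of A); multiple of the register width in floats
  const intgemm::Index B_cols = 8;  // columns of B; multiple of 8
  const float quant_mult = 64.0f;   // illustrative scale; choose per model statistics

  // B stored transposed: B_cols x inner floats.
  intgemm::AlignedVector<float> B_transposed(inner * B_cols);
  for (std::size_t i = 0; i < B_transposed.size(); ++i)
    B_transposed[i] = 0.01f * static_cast<float>(i % 127);

  // Quantize and rearrange into the layout Multiply expects.
  intgemm::AlignedVector<int8_t> B_prepared(inner * B_cols);
  intgemm::Int8::PrepareBTransposed(B_transposed.begin(), B_prepared.begin(),
                                    quant_mult, inner, B_cols);
}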
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4efe533..5b24410 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,7 @@ add_executable(tests
test/add127_test.cc
test/multiply_test.cc
test/prepare_b_quantized_transposed.cc
+ test/prepare_b_transposed.cc
test/quantize_test.cc
test/utils_test.cc
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 4b430ea..85319a8 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -28,6 +28,12 @@ class QuantizeTile16 {
return Tile(input, input + 8);
}
+ INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+ return Tile(
+ input,
+ input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
+ }
+
INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) {
// 8 rows in the first 128-bit register, 8 in the second register.
return Tile(input, input + 8 * cols);
@@ -77,11 +83,12 @@ struct AVX2_16bit {
}*/
INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t)
INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
-
+
INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
constexpr static const char *const kName = "16-bit AVX2";
@@ -108,6 +115,20 @@ class QuantizeTile8 {
return TileU(input, input + 8, input + 16, input + 24);
}
+ INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+ const float* inputs[4];
+ for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
+ while (cols_left < sizeof(Integer) / sizeof(float)) {
+ input += cols * (row_step - 1);
+ cols_left += cols;
+ }
+ inputs[i] = input;
+ input += sizeof(Integer) / sizeof(float);
+ cols_left -= sizeof(Integer) / sizeof(float);
+ }
+ return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+ }
+
INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) {
// Put higher rows in the second half of the register. These will jumble
// around in the same way then conveniently land in the right place.
@@ -163,7 +184,7 @@ class QuantizeTile8 {
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
-
+
const __m256 mult_;
};
@@ -213,6 +234,7 @@ struct AVX2_8bit {
INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
@@ -223,7 +245,7 @@ struct AVX2_8bit {
INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2)
INTGEMM_PREPAREBIASFOR8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
-
+
constexpr static const char *const kName = "8-bit AVX2";
static const CPUType kUses = CPUType::AVX2;
diff --git a/avx512_gemm.h b/avx512_gemm.h
index ded9366..efddd7a 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -75,6 +75,15 @@ class QuantizeTile16 {
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+ INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+ auto input0 = input;
+ auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
+ auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_);
+ auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_);
+ auto packed = _mm512_packs_epi32(g0, g1);
+ return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
+ }
+
INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
__m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
__m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
@@ -94,6 +103,33 @@ class QuantizeTile8 {
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+ INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+ static const __m512i neg127 = _mm512_set1_epi8(-127);
+ static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+ const float* inputs[4];
+ for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
+ while (cols_left < sizeof(Integer) / sizeof(float)) {
+ input += cols * (row_step - 1);
+ cols_left += cols;
+ }
+ inputs[i] = input;
+ input += sizeof(Integer) / sizeof(float);
+ cols_left -= sizeof(Integer) / sizeof(float);
+ }
+
+ auto g0 = QuantizerGrab(inputs[0], mult_reg_);
+ auto g1 = QuantizerGrab(inputs[1], mult_reg_);
+ auto g2 = QuantizerGrab(inputs[2], mult_reg_);
+ auto g3 = QuantizerGrab(inputs[3], mult_reg_);
+
+ auto packed0 = _mm512_packs_epi32(g0, g1);
+ auto packed1 = _mm512_packs_epi32(g2, g3);
+ auto packed = _mm512_packs_epi16(packed0, packed1);
+ packed = _mm512_max_epi8(packed, neg127);
+ return _mm512_permutexvar_epi32(shuffle_param, packed);
+ }
+
INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
@@ -151,7 +187,7 @@ struct AVX512_16bit {
_mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg));
}
}
-
+
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
@@ -164,12 +200,13 @@ struct AVX512_16bit {
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile16, int16_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
}
-
+
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
@@ -244,6 +281,7 @@ struct AVX512_8bit {
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile8, int8_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
diff --git a/interleave.h b/interleave.h
index 43cbab0..41ac8b7 100644
--- a/interleave.h
+++ b/interleave.h
@@ -234,6 +234,13 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
} \
}
+/*
+ * Prepare B matrix.
+ * B matrix has to be transposed and quantized.
+ * Cols has to be a multiple of sizeof(Register) / sizeof(Integer).
+ *
+ * cols and rows describe size of transposed B.
+ */
#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, cpu_type, Integer) \
target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \
using Register = vector_t<cpu_type, Integer>; \
@@ -252,6 +259,40 @@ target static inline void PrepareBQuantizedTransposed(const Integer* input, Inte
*output_it++ = *reinterpret_cast<const Register*>(input + (r + ri) * cols + c); \
}
+/*
+ * Prepare B matrix.
+ * The B matrix has to be transposed (not quantized; quantization happens here).
+ * cols has to be a multiple of sizeof(Register) / sizeof(float).
+ *
+ * cols and rows describe the size of the transposed B.
+ */
+#define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, integer) \
+target static inline void PrepareBTransposed(const float* input, integer* output, float quant_mult, Index cols, Index rows) { \
+ using Register = typename Quantizer::Integer; \
+ const Index RegisterElemsInt = sizeof(Register) / sizeof(integer); \
+ const Index RegisterElemsFloat = sizeof(Register) / sizeof(float); \
+ const Index kColStride = 8; \
+ \
+ assert(cols % RegisterElemsFloat == 0); \
+ assert(rows % kColStride == 0); \
+ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
+ \
+ Quantizer quantizer(quant_mult); \
+ Register* output_it = reinterpret_cast<Register*>(output); \
+ Index r = 0; \
+ Index c = 0; \
+ while (r < rows) { \
+ for (Index ri = 0; ri < 8; ++ri) \
+ *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \
+ c += RegisterElemsInt; \
+ while (c >= cols) { \
+ r += kColStride; \
+ c -= cols; \
+ } \
+ } \
+}
+
/* Select columns of B from PrepareB format to PrepareB format.
*/
#define INTGEMM_SELECT_COL_B(target, Register) \
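
Each ConsecutiveWithWrapping call in the macro above grabs one register's worth of consecutive floats from the transposed B and, when the current row runs out of columns, wraps ahead to the next block of row_step rows. A scalar model of that pointer bookkeeping, a sketch rather than code from the commit (elems is the number of floats one register holds, e.g. 8 for AVX2):

// Returns the start of the next elems-float chunk, advancing the cursor and
// wrapping from the end of one row block to the start of the next, exactly
// as the 8-bit ConsecutiveWithWrapping loops do.
const float* NextChunk(const float*& input, intgemm::Index& cols_left,
                       intgemm::Index cols, intgemm::Index row_step,
                       intgemm::Index elems) {
  while (cols_left < elems) {        // row exhausted: wrap to the next row block
    input += cols * (row_step - 1);  // skip rows handled by the other ri iterations
    cols_left += cols;               // a fresh row contributes cols more floats
  }
  const float* chunk = input;        // this register reads [chunk, chunk + elems)
  input += elems;
  cols_left -= elems;
  return chunk;
}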
diff --git a/intgemm.cc b/intgemm.cc
index 850bb4e..43b8ca6 100644
--- a/intgemm.cc
+++ b/intgemm.cc
@@ -12,6 +12,8 @@ void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, I
void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBQuantizedTransposed, AVX512_16bit::PrepareBQuantizedTransposed, AVX2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
+void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBTransposed, AVX512_16bit::PrepareBTransposed, AVX2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
+
void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
@@ -24,6 +26,8 @@ void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Ind
void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBQuantizedTransposed, AVX512_8bit::PrepareBQuantizedTransposed, AVX2_8bit::PrepareBQuantizedTransposed, SSSE3_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
+void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBTransposed, AVX512_8bit::PrepareBTransposed, AVX2_8bit::PrepareBTransposed, SSSE3_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
+
void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI_8bit::SelectColumnsB, AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
const char *const Int8::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
diff --git a/intgemm.h b/intgemm.h
index b67ceda..8a63fb8 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -66,11 +66,12 @@ struct Unsupported_16bit {
static void PrepareB(const float *, int16_t *, float, Index, Index) {
throw UnsupportedCPU();
}
-
static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) {
throw UnsupportedCPU();
}
-
+ static void PrepareBTransposed(const float *, int16_t *, float, Index, Index) {
+ throw UnsupportedCPU();
+ }
static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
throw UnsupportedCPU();
}
@@ -94,6 +95,9 @@ struct Unsupported_8bit {
static void PrepareBQuantizedTransposed(const int8_t *, int8_t *, Index, Index) {
throw UnsupportedCPU();
}
+ static void PrepareBTransposed(const float *, int8_t *, float, Index, Index) {
+ throw UnsupportedCPU();
+ }
static void PrepareB(const float *, int8_t *, float, Index, Index) {
throw UnsupportedCPU();
}
@@ -238,6 +242,11 @@ struct Int16 {
// CPU-independent fashion.
static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
+ // Convert from a B that was already transposed (routine not provided) to
+ // the CPU-dependent format used for Multiply. This is useful for loading
+ // a model stored on disk as floats in a CPU-independent fashion.
+ static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols);
+
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
@@ -299,6 +308,11 @@ struct Int8 {
// CPU-independent fashion.
static void (*PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols);
+ // Convert from a B that was already transposed (routine not provided) to
+ // the CPU-dependent format used for Multiply. This is useful for loading
+ // a model stored on disk as floats in a CPU-independent fashion.
+ static void (*PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols);
+
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void (*SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
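
quant_mult plays the same role here as in PrepareB: every float lane is scaled, rounded to nearest, and saturated into the integer range, and the 8-bit kernels additionally nudge -128 up to -127 (the neg127/neg128 handling in the quantizers above). A scalar sketch of one 8-bit lane under those assumptions:

#include <cmath>
#include <cstdint>

// Scalar equivalent of one 8-bit lane (illustrative only). QuantizerGrab uses
// cvtps round-to-nearest-even; std::nearbyintf matches that under the default
// rounding mode.
int8_t QuantizeLane(float x, float quant_mult) {
  float scaled = std::nearbyintf(x * quant_mult);
  if (scaled > 127.f) scaled = 127.f;    // packs saturation, upper bound
  if (scaled < -127.f) scaled = -127.f;  // -128 becomes -127 (see neg127 above)
  return static_cast<int8_t>(scaled);
}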
diff --git a/sse2_gemm.h b/sse2_gemm.h
index c59c4a6..a27b358 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -25,11 +25,14 @@ class QuantizeTile16 {
INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
- // Quantize 8xfloat into 8xint16_t
INTGEMM_SSE2 inline __m128i Consecutive(const float *input) {
- __m128i g0 = QuantizerGrab(input, mult_reg_);
- __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
- return _mm_packs_epi32(g0, g1);
+ return Tile(input, input + 4);
+ }
+
+ INTGEMM_SSE2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+ return Tile(
+ input,
+ input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
}
INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) {
@@ -37,6 +40,12 @@ class QuantizeTile16 {
}
private:
+ INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) {
+ __m128i g0 = QuantizerGrab(input0, mult_reg_);
+ __m128i g1 = QuantizerGrab(input1, mult_reg_);
+ return _mm_packs_epi32(g0, g1);
+ }
+
const __m128 mult_reg_;
};
@@ -71,6 +80,7 @@ struct SSE2_16bit {
INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, CPUType::SSE2, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, sse2::QuantizeTile16, int16_t)
INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
//TODO #DEFINE
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index dc0b6c7..18bf14b 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -28,26 +28,39 @@ class QuantizeTile8 {
INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
// Skip a row.
- return Tile(input, input + 2 * cols);
+ return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4);
}
INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) {
- return Tile(input, input + 8);
+ return Tile(input, input + 4, input + 8, input + 12);
}
INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) {
- return TileU(input, input + 8);
+ return TileU(input, input + 4, input + 8, input + 12);
}
+ INTGEMM_SSSE3 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+ const float* inputs[4];
+ for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
+ while (cols_left < sizeof(Integer) / sizeof(float)) {
+ input += cols * (row_step - 1);
+ cols_left += cols;
+ }
+ inputs[i] = input;
+ input += sizeof(Integer) / sizeof(float);
+ cols_left -= sizeof(Integer) / sizeof(float);
+ }
+ return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+ }
private:
// Quantize 16xfloat into 16xint8_t
- INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+ INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
__m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
- __m128i g2 = QuantizerGrab(input1, mult_reg_);
- __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
+ __m128i g1 = QuantizerGrab(input1, mult_reg_);
+ __m128i g2 = QuantizerGrab(input2, mult_reg_);
+ __m128i g3 = QuantizerGrab(input3, mult_reg_);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -64,13 +77,13 @@ class QuantizeTile8 {
// No permute needed. packs is in order for SSE.
}
- INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1) {
+ INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
const __m128i pos127 = _mm_set1_epi8(127);
__m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
- __m128i g2 = QuantizerGrab(input1, mult_reg_);
- __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
+ __m128i g1 = QuantizerGrab(input1, mult_reg_);
+ __m128i g2 = QuantizerGrab(input2, mult_reg_);
+ __m128i g3 = QuantizerGrab(input3, mult_reg_);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -137,6 +150,7 @@ struct SSSE3_8bit {
INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, CPUType::SSE2, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
@@ -145,7 +159,7 @@ struct SSSE3_8bit {
INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
INTGEMM_MULTIPLY8SHIFT(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
-
+
INTGEMM_PREPAREBIASFOR8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
constexpr static const char *const kName = "8-bit SSSE3";
diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc
index b7b2257..a096c21 100644
--- a/test/prepare_b_quantized_transposed.cc
+++ b/test/prepare_b_quantized_transposed.cc
@@ -13,27 +13,27 @@ namespace intgemm {
namespace {
template <typename Backend>
-void PrepareBQuantizedTransposedRef(const typename Backend::Integer* input, typename Backend::Integer* output, Index rows, Index cols) {
+void PrepareBQuantizedTransposedRef(const typename Backend::Integer* input, typename Backend::Integer* output, Index B_transposed_cols, Index B_transposed_rows) {
using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>;
constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer);
auto output_it = output;
- for (Index r = 0; r < rows; r += 8)
- for (Index c = 0; c < cols; c += vec_len)
+ for (Index r = 0; r < B_transposed_rows; r += 8)
+ for (Index c = 0; c < B_transposed_cols; c += vec_len)
for (Index ri = 0; ri < 8; ++ri)
for (Index ci = 0; ci < vec_len; ++ci)
- *output_it++ = input[(r + ri) * cols + c + ci];
+ *output_it++ = input[(r + ri) * B_transposed_cols + c + ci];
}
template <typename Backend>
-bool Test(const AlignedVector<typename Backend::Integer>& input, Index rows, Index cols) {
+bool Test(const AlignedVector<typename Backend::Integer>& input, Index B_rows, Index B_cols) {
bool success = true;
AlignedVector<typename Backend::Integer> output(input.size());
- Backend::PrepareBQuantizedTransposed(input.begin(), output.begin(), cols, rows);
+ Backend::PrepareBQuantizedTransposed(input.begin(), output.begin(), B_rows, B_cols);
AlignedVector<typename Backend::Integer> reference(input.size());
- PrepareBQuantizedTransposedRef<Backend>(input.begin(), reference.begin(), rows, cols);
+ PrepareBQuantizedTransposedRef<Backend>(input.begin(), reference.begin(), B_rows, B_cols);
for (std::size_t i = 0; i < output.size(); ++i) {
if (output[i] != reference[i]) {
@@ -46,10 +46,8 @@ bool Test(const AlignedVector<typename Backend::Integer>& input, Index rows, Ind
}
template <typename Backend>
-bool TestMany() {
- const static Index rows = 128;
- const static Index cols = 128;
- AlignedVector<typename Backend::Integer> input(rows * cols);
+bool TestMany(Index B_rows, Index B_cols) {
+ AlignedVector<typename Backend::Integer> input(B_rows * B_cols);
std::generate(input.begin(), input.end(), []() {
static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer);
@@ -57,29 +55,29 @@ bool TestMany() {
return (value++) % divider;
});
- return Test<Backend>(input, rows, cols);
+ return Test<Backend>(input, B_rows, B_cols);
}
TEST_CASE("PrepareBQuantizedTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
- CHECK(TestMany<SSE2_16bit>());
+ CHECK(TestMany<SSE2_16bit>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
- CHECK(TestMany<SSSE3_8bit>());
+ CHECK(TestMany<SSSE3_8bit>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
- CHECK(TestMany<AVX2_8bit>());
- CHECK(TestMany<AVX2_16bit>());
+ CHECK(TestMany<AVX2_8bit>(32, 128));
+ CHECK(TestMany<AVX2_16bit>(32, 128));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512
@@ -87,8 +85,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX512BW)
return;
- CHECK(TestMany<AVX512_8bit>());
- CHECK(TestMany<AVX512_16bit>());
+ CHECK(TestMany<AVX512_8bit>(32, 128));
+ CHECK(TestMany<AVX512_16bit>(32, 128));
}
#endif
diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc
new file mode 100644
index 0000000..219e56a
--- /dev/null
+++ b/test/prepare_b_transposed.cc
@@ -0,0 +1,95 @@
+#include "test.h"
+#include "../aligned.h"
+#include "../avx2_gemm.h"
+#include "../avx512_gemm.h"
+#include "../sse2_gemm.h"
+#include "../ssse3_gemm.h"
+
+#include <cstring>
+#include <iostream>
+#include <math.h>
+
+namespace intgemm {
+namespace {
+
+template <typename Backend>
+void PrepareBTransposedRef(const float* input, typename Backend::Integer* output, float quant_mult, Index B_transposed_cols, Index B_transposed_rows) {
+ using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>;
+ constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer);
+
+ for (Index i = 0; i < B_transposed_rows * B_transposed_cols / 8; i += vec_len)
+ for (Index j = 0; j < 8; ++j)
+ for (Index k = 0; k < vec_len; ++k) {
+ Index col = (i + k) % B_transposed_cols;
+ Index row = 8 * ((i + k) / B_transposed_cols) + j;
+ *output++ = input[row * B_transposed_cols + col] * quant_mult;
+ }
+}
+
+template <typename Backend>
+bool Test(const AlignedVector<float>& input, Index B_rows, Index B_cols, float quant_mult) {
+ bool success = true;
+
+ AlignedVector<typename Backend::Integer> output(input.size());
+ Backend::PrepareBTransposed(input.begin(), output.begin(), quant_mult, B_rows, B_cols);
+
+ AlignedVector<typename Backend::Integer> reference(input.size());
+ PrepareBTransposedRef<Backend>(input.begin(), reference.begin(), quant_mult, B_rows, B_cols);
+
+ for (std::size_t i = 0; i < output.size(); ++i) {
+ if (output[i] != reference[i]) {
+ UNSCOPED_INFO("Error at " << i << ", output = " << int(output[i]) << ", reference = " << int(reference[i]));
+ success = false;
+ break;
+ }
+ }
+ return success;
+}
+
+template <typename Backend>
+bool TestMany(Index B_rows, Index B_cols, float quant_mult) {
+ AlignedVector<float> input(B_rows * B_cols);
+
+ std::generate(input.begin(), input.end(), []() {
+ static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer);
+ static int value = 0;
+ return (value++) % divider;
+ });
+
+ return Test<Backend>(input, B_rows, B_cols, quant_mult);
+}
+
+TEST_CASE("PrepareBTransposed SSE2", "") {
+ if (kCPU < CPUType::SSE2)
+ return;
+
+ CHECK(TestMany<SSE2_16bit>(4, 128, 2.0f));
+}
+
+TEST_CASE("PrepareBTransposed SSSE3", "") {
+ if (kCPU < CPUType::SSSE3)
+ return;
+
+ CHECK(TestMany<SSSE3_8bit>(4, 128, 2.0f));
+}
+
+TEST_CASE("PrepareBTransposed AVX2", "") {
+ if (kCPU < CPUType::AVX2)
+ return;
+
+ CHECK(TestMany<AVX2_8bit>(8, 128, 2.0f));
+ CHECK(TestMany<AVX2_16bit>(8, 128, 2.0f));
+}
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512
+ TEST_CASE("PrepareBTransposed AVX512", "") {
+ if (kCPU < CPUType::AVX512BW)
+ return;
+
+ CHECK(TestMany<AVX512_8bit>(16, 128, 2.0f));
+ CHECK(TestMany<AVX512_16bit>(16, 128, 2.0f));
+ }
+#endif
+
+}
+}