diff options
author | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2020-02-06 19:24:46 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-06 19:24:46 +0300 |
commit | 54c84ff67061ccf07480bdb384f8b64211b083bd (patch) | |
tree | 1f9f1646a9760da89d5f9b606dedbd61172bcbed | |
parent | faa096b372df5c3bf8e060effb6437fdf26598cc (diff) | |
parent | 12becc0f51e085f9f16177ce4f01d7e6fc136188 (diff) |
Merge pull request #56 from kpu/prepare-b-transposed
Add PrepareBTransposed
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | avx2_gemm.h | 28 | ||||
-rw-r--r-- | avx512_gemm.h | 42 | ||||
-rw-r--r-- | interleave.h | 41 | ||||
-rw-r--r-- | intgemm.cc | 4 | ||||
-rw-r--r-- | intgemm.h | 18 | ||||
-rw-r--r-- | sse2_gemm.h | 18 | ||||
-rw-r--r-- | ssse3_gemm.h | 38 | ||||
-rw-r--r-- | test/prepare_b_quantized_transposed.cc | 34 | ||||
-rw-r--r-- | test/prepare_b_transposed.cc | 95 |
10 files changed, 278 insertions, 41 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 4efe533..5b24410 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,7 @@ add_executable(tests test/add127_test.cc test/multiply_test.cc test/prepare_b_quantized_transposed.cc + test/prepare_b_transposed.cc test/quantize_test.cc test/utils_test.cc diff --git a/avx2_gemm.h b/avx2_gemm.h index 4b430ea..85319a8 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -28,6 +28,12 @@ class QuantizeTile16 { return Tile(input, input + 8); } + INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + return Tile( + input, + input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0)); + } + INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) { // 8 rows in the first 128-bit register, 8 in the second register. return Tile(input, input + 8 * cols); @@ -77,11 +83,12 @@ struct AVX2_16bit { }*/ INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t) INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); } - + INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2) constexpr static const char *const kName = "16-bit AVX2"; @@ -108,6 +115,20 @@ class QuantizeTile8 { return TileU(input, input + 8, input + 16, input + 24); } + INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + const float* inputs[4]; + for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { + while (cols_left < sizeof(Integer) / sizeof(float)) { + input += cols * (row_step - 1); + cols_left += cols; + } + inputs[i] = input; + input += sizeof(Integer) / sizeof(float); + cols_left -= sizeof(Integer) / sizeof(float); + } + return Tile(inputs[0], inputs[1], inputs[2], inputs[3]); + } + INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) { // Put higher rows in the second half of the register. These will jumble // around in the same way then conveniently land in the right place. @@ -163,7 +184,7 @@ class QuantizeTile8 { // and the values are only used for GEMM. return _mm256_permutevar8x32_epi32(packed, shuffle_param); } - + const __m256 mult_; }; @@ -213,6 +234,7 @@ struct AVX2_8bit { INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t) INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); @@ -223,7 +245,7 @@ struct AVX2_8bit { INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2) INTGEMM_PREPAREBIASFOR8(__m256i, INTGEMM_AVX2, CPUType::AVX2) - + constexpr static const char *const kName = "8-bit AVX2"; static const CPUType kUses = CPUType::AVX2; diff --git a/avx512_gemm.h b/avx512_gemm.h index ded9366..efddd7a 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -75,6 +75,15 @@ class QuantizeTile16 { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} + INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + auto input0 = input; + auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0); + auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_); + auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_); + auto packed = _mm512_packs_epi32(g0, g1); + return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); + } + INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) { __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_); __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_); @@ -94,6 +103,33 @@ class QuantizeTile8 { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} + INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + static const __m512i neg127 = _mm512_set1_epi8(-127); + static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + const float* inputs[4]; + for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { + while (cols_left < sizeof(Integer) / sizeof(float)) { + input += cols * (row_step - 1); + cols_left += cols; + } + inputs[i] = input; + input += sizeof(Integer) / sizeof(float); + cols_left -= sizeof(Integer) / sizeof(float); + } + + auto g0 = QuantizerGrab(inputs[0], mult_reg_); + auto g1 = QuantizerGrab(inputs[1], mult_reg_); + auto g2 = QuantizerGrab(inputs[2], mult_reg_); + auto g3 = QuantizerGrab(inputs[3], mult_reg_); + + auto packed0 = _mm512_packs_epi32(g0, g1); + auto packed1 = _mm512_packs_epi32(g2, g3); + auto packed = _mm512_packs_epi16(packed0, packed1); + packed = _mm512_max_epi8(packed, neg127); + return _mm512_permutexvar_epi32(shuffle_param, packed); + } + INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) { // TODO: try alternative: _mm512_cvtsepi32_epi8 ? const __m512i neg127 = _mm512_set1_epi8(-127); @@ -151,7 +187,7 @@ struct AVX512_16bit { _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg)); } } - + // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 32; @@ -164,12 +200,13 @@ struct AVX512_16bit { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile16, int16_t) /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); } - + /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2) @@ -244,6 +281,7 @@ struct AVX512_8bit { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile8, int8_t) /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { diff --git a/interleave.h b/interleave.h index 43cbab0..41ac8b7 100644 --- a/interleave.h +++ b/interleave.h @@ -234,6 +234,13 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f } \ } +/* + * Prepare B matrix. + * B matrix has to be transposed and quantized. + * Cols has to be a multiple of sizeof(Register) / sizeof(Integer). + * + * cols and rows describe size of transposed B. + */ #define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, cpu_type, Integer) \ target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \ using Register = vector_t<cpu_type, Integer>; \ @@ -252,6 +259,40 @@ target static inline void PrepareBQuantizedTransposed(const Integer* input, Inte *output_it++ = *reinterpret_cast<const Register*>(input + (r + ri) * cols + c); \ } +/* + * Prepare B matrix. + * B matrix has to be transposed. + * Cols has to be a multiple of sizeof(Register) / sizeof(float). + * + * cols and rows describe size of transposed B. + */ +#define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, integer) \ +target static inline void PrepareBTransposed(const float* input, integer* output, float quant_mult, Index cols, Index rows) { \ + using Register = typename Quantizer::Integer; \ + const Index RegisterElemsInt = sizeof(Register) / sizeof(integer); \ + const Index RegisterElemsFloat = sizeof(Register) / sizeof(float); \ + const Index kColStride = 8; \ + \ + assert(cols % RegisterElemsFloat == 0); \ + assert(rows % kColStride == 0); \ + assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ + \ + Quantizer quantizer(quant_mult); \ + Register* output_it = reinterpret_cast<Register*>(output); \ + Index r = 0; \ + Index c = 0; \ + while (r < rows) { \ + for (Index ri = 0; ri < 8; ++ri) \ + *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \ + c += RegisterElemsInt; \ + while (c >= cols) { \ + r += kColStride; \ + c -= cols; \ + } \ + } \ +} + /* Select columns of B from PrepareB format to PrepareB format. */ #define INTGEMM_SELECT_COL_B(target, Register) \ @@ -12,6 +12,8 @@ void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, I void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBQuantizedTransposed, AVX512_16bit::PrepareBQuantizedTransposed, AVX2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); +void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBTransposed, AVX512_16bit::PrepareBTransposed, AVX2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); + void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB); const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName); @@ -24,6 +26,8 @@ void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Ind void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBQuantizedTransposed, AVX512_8bit::PrepareBQuantizedTransposed, AVX2_8bit::PrepareBQuantizedTransposed, SSSE3_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); +void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBTransposed, AVX512_8bit::PrepareBTransposed, AVX2_8bit::PrepareBTransposed, SSSE3_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); + void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI_8bit::SelectColumnsB, AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); const char *const Int8::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); @@ -66,11 +66,12 @@ struct Unsupported_16bit { static void PrepareB(const float *, int16_t *, float, Index, Index) { throw UnsupportedCPU(); } - static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) { throw UnsupportedCPU(); } - + static void PrepareBTransposed(const float *, int16_t *, float, Index, Index) { + throw UnsupportedCPU(); + } static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) { throw UnsupportedCPU(); } @@ -94,6 +95,9 @@ struct Unsupported_8bit { static void PrepareBQuantizedTransposed(const int8_t *, int8_t *, Index, Index) { throw UnsupportedCPU(); } + static void PrepareBTransposed(const float *, int8_t *, float, Index, Index) { + throw UnsupportedCPU(); + } static void PrepareB(const float *, int8_t *, float, Index, Index) { throw UnsupportedCPU(); } @@ -238,6 +242,11 @@ struct Int16 { // CPU-independent fashion. static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols); + // Convert from a B that was already transposed (routine not provided) to + // the CPU-dependent format used for Multiply. This is useful for storing + // a quantized model on disk then in a CPU-independent fashion. + static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols); + // Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8. static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end); @@ -299,6 +308,11 @@ struct Int8 { // CPU-independent fashion. static void (*PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols); + // Convert from a B that was already transposed (routine not provided) to + // the CPU-dependent format used for Multiply. This is useful for storing + // a quantized model on disk then in a CPU-independent fashion. + static void (*PrepareBTransposed)(const float *input, int8_t *output, float quant_mul, Index inner, Index B_untransposed_cols); + // Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8. static void (*SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end); diff --git a/sse2_gemm.h b/sse2_gemm.h index c59c4a6..a27b358 100644 --- a/sse2_gemm.h +++ b/sse2_gemm.h @@ -25,11 +25,14 @@ class QuantizeTile16 { INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - // Quantize 8xfloat into 8xint16_t INTGEMM_SSE2 inline __m128i Consecutive(const float *input) { - __m128i g0 = QuantizerGrab(input, mult_reg_); - __m128i g1 = QuantizerGrab(input + 4, mult_reg_); - return _mm_packs_epi32(g0, g1); + return Tile(input, input + 4); + } + + INTGEMM_SSE2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + return Tile( + input, + input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0)); } INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) { @@ -37,6 +40,12 @@ class QuantizeTile16 { } private: + INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) { + __m128i g0 = QuantizerGrab(input0, mult_reg_); + __m128i g1 = QuantizerGrab(input1, mult_reg_); + return _mm_packs_epi32(g0, g1); + } + const __m128 mult_reg_; }; @@ -71,6 +80,7 @@ struct SSE2_16bit { INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, CPUType::SSE2, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, sse2::QuantizeTile16, int16_t) INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { //TODO #DEFINE diff --git a/ssse3_gemm.h b/ssse3_gemm.h index dc0b6c7..18bf14b 100644 --- a/ssse3_gemm.h +++ b/ssse3_gemm.h @@ -28,26 +28,39 @@ class QuantizeTile8 { INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) { // Skip a row. - return Tile(input, input + 2 * cols); + return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4); } INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) { - return Tile(input, input + 8); + return Tile(input, input + 4, input + 8, input + 12); } INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) { - return TileU(input, input + 8); + return TileU(input, input + 4, input + 8, input + 12); } + INTGEMM_SSSE3 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + const float* inputs[4]; + for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { + while (cols_left < sizeof(Integer) / sizeof(float)) { + input += cols * (row_step - 1); + cols_left += cols; + } + inputs[i] = input; + input += sizeof(Integer) / sizeof(float); + cols_left -= sizeof(Integer) / sizeof(float); + } + return Tile(inputs[0], inputs[1], inputs[2], inputs[3]); + } private: // Quantize 16xfloat into 16xint8_t - INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1) { + INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) { const __m128i neg128 = _mm_set1_epi8(-128); __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_); - __m128i g2 = QuantizerGrab(input1, mult_reg_); - __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_); + __m128i g1 = QuantizerGrab(input1, mult_reg_); + __m128i g2 = QuantizerGrab(input2, mult_reg_); + __m128i g3 = QuantizerGrab(input3, mult_reg_); __m128i packed0 = _mm_packs_epi32(g0, g1); __m128i packed1 = _mm_packs_epi32(g2, g3); __m128i packed = _mm_packs_epi16(packed0, packed1); @@ -64,13 +77,13 @@ class QuantizeTile8 { // No permute needed. packs is in order for SSE. } - INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1) { + INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) { const __m128i neg128 = _mm_set1_epi8(-128); const __m128i pos127 = _mm_set1_epi8(127); __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_); - __m128i g2 = QuantizerGrab(input1, mult_reg_); - __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_); + __m128i g1 = QuantizerGrab(input1, mult_reg_); + __m128i g2 = QuantizerGrab(input2, mult_reg_); + __m128i g3 = QuantizerGrab(input3, mult_reg_); __m128i packed0 = _mm_packs_epi32(g0, g1); __m128i packed1 = _mm_packs_epi32(g2, g3); __m128i packed = _mm_packs_epi16(packed0, packed1); @@ -137,6 +150,7 @@ struct SSSE3_8bit { INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, CPUType::SSE2, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t) INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); @@ -145,7 +159,7 @@ struct SSSE3_8bit { INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2) INTGEMM_MULTIPLY8SHIFT(__m128i, INTGEMM_SSSE3, CPUType::SSE2) - + INTGEMM_PREPAREBIASFOR8(__m128i, INTGEMM_SSSE3, CPUType::SSE2) constexpr static const char *const kName = "8-bit SSSE3"; diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc index b7b2257..a096c21 100644 --- a/test/prepare_b_quantized_transposed.cc +++ b/test/prepare_b_quantized_transposed.cc @@ -13,27 +13,27 @@ namespace intgemm { namespace { template <typename Backend> -void PrepareBQuantizedTransposedRef(const typename Backend::Integer* input, typename Backend::Integer* output, Index rows, Index cols) { +void PrepareBQuantizedTransposedRef(const typename Backend::Integer* input, typename Backend::Integer* output, Index B_transposed_cols, Index B_transposed_rows) { using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>; constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer); auto output_it = output; - for (Index r = 0; r < rows; r += 8) - for (Index c = 0; c < cols; c += vec_len) + for (Index r = 0; r < B_transposed_rows; r += 8) + for (Index c = 0; c < B_transposed_cols; c += vec_len) for (Index ri = 0; ri < 8; ++ri) for (Index ci = 0; ci < vec_len; ++ci) - *output_it++ = input[(r + ri) * cols + c + ci]; + *output_it++ = input[(r + ri) * B_transposed_cols + c + ci]; } template <typename Backend> -bool Test(const AlignedVector<typename Backend::Integer>& input, Index rows, Index cols) { +bool Test(const AlignedVector<typename Backend::Integer>& input, Index B_rows, Index B_cols) { bool success = true; AlignedVector<typename Backend::Integer> output(input.size()); - Backend::PrepareBQuantizedTransposed(input.begin(), output.begin(), cols, rows); + Backend::PrepareBQuantizedTransposed(input.begin(), output.begin(), B_rows, B_cols); AlignedVector<typename Backend::Integer> reference(input.size()); - PrepareBQuantizedTransposedRef<Backend>(input.begin(), reference.begin(), rows, cols); + PrepareBQuantizedTransposedRef<Backend>(input.begin(), reference.begin(), B_rows, B_cols); for (std::size_t i = 0; i < output.size(); ++i) { if (output[i] != reference[i]) { @@ -46,10 +46,8 @@ bool Test(const AlignedVector<typename Backend::Integer>& input, Index rows, Ind } template <typename Backend> -bool TestMany() { - const static Index rows = 128; - const static Index cols = 128; - AlignedVector<typename Backend::Integer> input(rows * cols); +bool TestMany(Index B_rows, Index B_cols) { + AlignedVector<typename Backend::Integer> input(B_rows * B_cols); std::generate(input.begin(), input.end(), []() { static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer); @@ -57,29 +55,29 @@ bool TestMany() { return (value++) % divider; }); - return Test<Backend>(input, rows, cols); + return Test<Backend>(input, B_rows, B_cols); } TEST_CASE("PrepareBQuantizedTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany<SSE2_16bit>()); + CHECK(TestMany<SSE2_16bit>(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany<SSSE3_8bit>()); + CHECK(TestMany<SSSE3_8bit>(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany<AVX2_8bit>()); - CHECK(TestMany<AVX2_16bit>()); + CHECK(TestMany<AVX2_8bit>(32, 128)); + CHECK(TestMany<AVX2_16bit>(32, 128)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 @@ -87,8 +85,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany<AVX512_8bit>()); - CHECK(TestMany<AVX512_16bit>()); + CHECK(TestMany<AVX512_8bit>(32, 128)); + CHECK(TestMany<AVX512_16bit>(32, 128)); } #endif diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc new file mode 100644 index 0000000..219e56a --- /dev/null +++ b/test/prepare_b_transposed.cc @@ -0,0 +1,95 @@ +#include "test.h" +#include "../aligned.h" +#include "../avx2_gemm.h" +#include "../avx512_gemm.h" +#include "../sse2_gemm.h" +#include "../ssse3_gemm.h" + +#include <cstring> +#include <iostream> +#include <math.h> + +namespace intgemm { +namespace { + +template <typename Backend> +void PrepareBTransposedRef(const float* input, typename Backend::Integer* output, float quant_mult, Index B_transposed_cols, Index B_transposed_rows) { + using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>; + constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer); + + for (Index i = 0; i < B_transposed_rows * B_transposed_cols / 8; i += vec_len) + for (Index j = 0; j < 8; ++j) + for (Index k = 0; k < vec_len; ++k) { + Index col = (i + k) % B_transposed_cols; + Index row = 8 * ((i + k) / B_transposed_cols) + j; + *output++ = input[row * B_transposed_cols + col] * quant_mult; + } +} + +template <typename Backend> +bool Test(const AlignedVector<float>& input, Index B_rows, Index B_cols, float quant_mult) { + bool success = true; + + AlignedVector<typename Backend::Integer> output(input.size()); + Backend::PrepareBTransposed(input.begin(), output.begin(), quant_mult, B_rows, B_cols); + + AlignedVector<typename Backend::Integer> reference(input.size()); + PrepareBTransposedRef<Backend>(input.begin(), reference.begin(), quant_mult, B_rows, B_cols); + + for (std::size_t i = 0; i < output.size(); ++i) { + if (output[i] != reference[i]) { + UNSCOPED_INFO("Error at " << i << ", output = " << int(output[i]) << ", reference = " << int(reference[i])); + success = false; + break; + } + } + return success; +} + +template <typename Backend> +bool TestMany(Index B_rows, Index B_cols, float quant_mult) { + AlignedVector<float> input(B_rows * B_cols); + + std::generate(input.begin(), input.end(), []() { + static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer); + static int value = 0; + return (value++) % divider; + }); + + return Test<Backend>(input, B_rows, B_cols, quant_mult); +} + +TEST_CASE("PrepareBTransposed SSE2", "") { + if (kCPU < CPUType::SSE2) + return; + + CHECK(TestMany<SSE2_16bit>(4, 128, 2.0f)); +} + +TEST_CASE("PrepareBTransposed SSSE3", "") { + if (kCPU < CPUType::SSSE3) + return; + + CHECK(TestMany<SSSE3_8bit>(4, 128, 2.0f)); +} + +TEST_CASE("PrepareBTransposed AVX2", "") { + if (kCPU < CPUType::AVX2) + return; + + CHECK(TestMany<AVX2_8bit>(8, 128, 2.0f)); + CHECK(TestMany<AVX2_16bit>(8, 128, 2.0f)); +} + +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 + TEST_CASE("PrepareBTransposed AVX512", "") { + if (kCPU < CPUType::AVX512BW) + return; + + CHECK(TestMany<AVX512_8bit>(16, 128, 2.0f)); + CHECK(TestMany<AVX512_16bit>(16, 128, 2.0f)); + } +#endif + +} +} |