author | Nikolay Bogoychev <nheart@gmail.com> | 2019-04-11 20:43:16 +0300
---|---|---
committer | Nikolay Bogoychev <nheart@gmail.com> | 2019-04-11 20:43:16 +0300
commit | aaf3291466b7d04b830fca2cc53f93ba0b661d8b (patch) |
tree | d2cd122b38f002faaf53ea0de61112903a4535ba |
parent | 7755f35f0492a92aa736d1a66e01b440fda7c2c2 (diff) |
WiP
-rw-r--r-- | CMakeLists.txt | 14
-rw-r--r-- | avx2_gemm.cc | 154
-rw-r--r-- | avx2_gemm.h | 151
-rw-r--r-- | avx512_gemm.cc | 281
-rw-r--r-- | avx512_gemm.h | 274
-rw-r--r-- | cops.h | 19
-rw-r--r-- | intgemm.cc | 117
-rw-r--r-- | intgemm.h | 103
-rw-r--r-- | intrinsics.h | 7
-rw-r--r-- | multiply.h | 10
-rw-r--r-- | sse2_gemm.cc | 75
-rw-r--r-- | sse2_gemm.h | 64
-rw-r--r-- | ssse3_gemm.cc | 88
-rw-r--r-- | ssse3_gemm.h | 83
-rw-r--r-- | test/multiply_test.cc | 15
-rw-r--r-- | types.h | 15
16 files changed, 667 insertions(+), 803 deletions(-)
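The diff that follows is a work-in-progress refactor: the SIMD kernels move out of the per-ISA .cc files into their headers, and every moved function gains an SSE2, SSSE3, AVX2, or AVX512F prefix. Those tokens are macros from types.h, which this commit does not touch; on GCC and Clang such annotations are conventionally per-function target attributes, which let a single translation unit carry kernels for several instruction sets. A minimal sketch of that convention, as an assumption about what types.h provides rather than a quote from it:

```cpp
// Hypothetical sketch of the AVX2 / AVX512F / SSE2 / SSSE3 annotations used
// throughout this diff. The real definitions live in types.h (not shown in
// this commit), so treat these as assumptions about their shape.
#include <immintrin.h>

#if defined(__GNUC__) || defined(__clang__)
#define SSE2    __attribute__((target("sse2")))
#define SSSE3   __attribute__((target("ssse3")))
#define AVX2    __attribute__((target("avx2")))
#define AVX512F __attribute__((target("avx512f")))
#else
// Compilers without per-function target attributes build everything plainly.
#define SSE2
#define SSSE3
#define AVX2
#define AVX512F
#endif

// With such annotations, an AVX2 kernel can live in a header included from a
// translation unit compiled with plain -O2, no -mavx2 flag required:
AVX2 inline __m256i DoubleEpi32(__m256i x) {
  return _mm256_add_epi32(x, x);
}
```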
diff --git a/CMakeLists.txt b/CMakeLists.txt index d9aa674..05e395a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,15 +29,15 @@ else() set_source_files_properties(intgemm.cc test/quantize_test.cc test/multiply_test.cc benchmark.cc PROPERTIES COMPILE_DEFINITIONS "INTGEMM_NO_AVX512") endif() -add_library(intgemm STATIC ${GEMMS} intgemm.cc) -foreach(exe example benchmark) - add_executable(${exe} ${exe}.cc) - target_link_libraries(${exe} intgemm) -endforeach() +#add_library(intgemm STATIC ${GEMMS} intgemm.cc) +#foreach(exe example benchmark) +# add_executable(${exe} ${exe}.cc) +# target_link_libraries(${exe} intgemm) +#endforeach() include_directories(.) -add_executable(tests test/multiply_test.cc test/quantize_test.cc) -target_link_libraries(tests intgemm) +add_executable(tests test/multiply_test.cc test/quantize_test.cc avx2_gemm.cc) # avx512_gemm.cc) +#target_link_libraries(tests intgemm) #CTest integration with Catch2 include(CMake/Catch.cmake) diff --git a/avx2_gemm.cc b/avx2_gemm.cc index 955f64c..541b5c2 100644 --- a/avx2_gemm.cc +++ b/avx2_gemm.cc @@ -1,155 +1 @@ #include "avx2_gemm.h" -#include "cops.h" -#include "interleave.h" -#include "multiply.h" - -#include <cassert> -#include <emmintrin.h> -#include <immintrin.h> -#include <tmmintrin.h> -#include <xmmintrin.h> -#include <stdint.h> - -namespace intgemm { - -// PREPARE A: just quantization in the same memory order. - -namespace { -// Read a vector of floats, multiply them, and cast to 32-bit integer. -inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) { - return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg)); -} - -class QuantizeTile16 { - public: - typedef __m256i Integer; - - explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {} - - Integer Consecutive(const float *input) { - return Tile(input, input + 8); - } - - Integer ForReshape(const float *input, Index cols) { - // 8 rows in the first 128-bit register, 8 in the second register. - return Tile(input, input + 8 * cols); - } - - private: - __m256i Tile(const float *input0, const float *input1) { - __m256i g0 = QuantizerGrab(input0, mult_); - __m256i g1 = QuantizerGrab(input1, mult_); - __m256i packed = _mm256_packs_epi32(g0, g1); - // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15. - // Technically this could be removed if the PrepareB did the same reordering internally. - return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); - } - - const __m256 mult_; -}; - -} // namespace - -// Just quantize everything in order. -void AVX2_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) { - assert(size % 16 == 0); - assert(reinterpret_cast<uintptr_t>(input) % 32 == 0); - QuantizeTile16 q(quant_mult); - const float *end = input + size; - for (; input != end; input += 16, output += 16) { - *reinterpret_cast<__m256i*>(output) = q.Consecutive(input); - } -} - -namespace { -/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize - * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate - * the result into one register and return it. 
- */ -class QuantizeTile8 { - public: - typedef __m256i Integer; - - explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {} - - inline __m256i Consecutive(const float *input) { - return Tile(input, input + 8, input + 16, input + 24); - } - - inline __m256i ForReshape(const float *input, Index cols) { - // Put higher rows in the second half of the register. These will jumble - // around in the same way then conveniently land in the right place. - return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols); - } - - private: - inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) { - // Looking at the assembly, gcc has pulled this outside the loops calling this. - const __m256i neg127 = _mm256_set1_epi8(-127); - const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); - // Grab 4 registers at a time in 32-bit format. - __m256i g0 = QuantizerGrab(input0, mult_); - __m256i g1 = QuantizerGrab(input1, mult_); - __m256i g2 = QuantizerGrab(input2, mult_); - __m256i g3 = QuantizerGrab(input3, mult_); - // Pack 32-bit to 16-bit. - __m256i packed0 = _mm256_packs_epi32(g0, g1); - __m256i packed1 = _mm256_packs_epi32(g2, g3); - // Pack 16-bit to 8-bit. - __m256i packed = _mm256_packs_epi16(packed0, packed1); - // Ban -128. - packed = _mm256_max_epi8(packed, neg127); - // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 - // Or as 32-bit integers 0 2 4 6 1 3 5 7 - // Technically this could be removed so long as the rows are bigger than 16 - // and the values are only used for GEMM. - return _mm256_permutevar8x32_epi32(packed, shuffle_param); - } - - const __m256 mult_; -}; -} // namespace - -// Just quantize everything in order. 
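Before the deleted AVX2_8bit::Quantize below, a scalar model of what this 8-bit quantizer computes may help: scale by quant_mult, round, saturate, and clamp -128 up to -127. The "Ban -128" step keeps the byte range symmetric, so the abs/sign tricks in the multiply kernels cannot overflow (neither -(-128) nor abs(-128) fits in an int8_t). A sketch under those assumptions, not intgemm's API:

```cpp
// Scalar model of the 8-bit quantizer in this hunk: multiply by quant_mult,
// round to nearest, saturate to int8, then clamp -128 up to -127.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int8_t QuantizeOne(float x, float quant_mult) {
  float scaled = x * quant_mult;
  // _mm256_cvtps_epi32 rounds to nearest in the default rounding mode,
  // which std::lrintf mirrors.
  long rounded = std::lrintf(scaled);
  // packs_epi16 saturates to [-128, 127]; max_epi8 with -127 bans -128.
  rounded = std::min(127L, std::max(-127L, rounded));
  return static_cast<int8_t>(rounded);
}

int main() {
  const float quant_mult = 127.0f / 2.0f;  // e.g. 127 / MaxAbsolute(input)
  for (float x : {-2.5f, -2.0f, -0.01f, 0.0f, 1.0f, 2.0f}) {
    std::printf("%+.2f -> %+d\n", x, QuantizeOne(x, quant_mult));
  }
}
```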
-void AVX2_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) { - assert(size % 32 == 0); - assert(reinterpret_cast<uintptr_t>(input) % 32 == 0); - QuantizeTile8 q(quant_mult); - const float *end = input + size; - for (; input != end; input += 32, output += 32) { - *reinterpret_cast<__m256i*>(output) = q.Consecutive(input); - } -} - -void AVX2_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols); -} - -void AVX2_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); -} - -void AVX2_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols); -} - -void AVX2_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); -} - -void AVX2_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { - Multiply16<__m256i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols); -} - -void AVX2_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { - Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols); -} - -const char *const AVX2_16bit::kName = "16-bit AVX2"; -const char *const AVX2_8bit::kName = "8-bit AVX2"; - -float AVX2_MaxAbsolute(const float *begin, const float *end) { - return MaxAbsoluteBackend<__m256>(begin, end); -} - -} // namespace intgemm diff --git a/avx2_gemm.h b/avx2_gemm.h index 4b0b001..32b1a5e 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -3,59 +3,184 @@ #include <cstdint> #include <stdint.h> +#include "cops.h" +#include "interleave.h" +#include "multiply.h" + namespace intgemm { +// PREPARE A: just quantization in the same memory order. + +namespace avx2 { +// Read a vector of floats, multiply them, and cast to 32-bit integer. +// EVIL EVIL CODE DUPLICATION, FIX +AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) { + return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg)); +} + +class QuantizeTile16 { + public: + typedef __m256i Integer; + + explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {} + + AVX2 Integer Consecutive(const float *input) { + return Tile(input, input + 8); + } + + AVX2 Integer ForReshape(const float *input, Index cols) { + // 8 rows in the first 128-bit register, 8 in the second register. + return Tile(input, input + 8 * cols); + } + + private: + AVX2 __m256i Tile(const float *input0, const float *input1) { + __m256i g0 = QuantizerGrab(input0, mult_); + __m256i g1 = QuantizerGrab(input1, mult_); + __m256i packed = _mm256_packs_epi32(g0, g1); + // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15. + // Technically this could be removed if the PrepareB did the same reordering internally. 
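The lane reordering mentioned above exists because _mm256_packs_epi32 packs within each 128-bit lane rather than across the whole register, so two inputs of eight int32 each come out interleaved as 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15. A scalar model of that ordering, and of why the 0xd8 permute restores it:

```cpp
// Scalar model of why the 0xd8 permute is needed: _mm256_packs_epi32 packs
// within each 128-bit lane, so the 16 outputs come out lane-interleaved.
#include <cstdio>

int main() {
  int out[16], k = 0;
  // Lane 0 takes elements 0..3 of the first input and 0..3 of the second
  // (labeled 8..11 here); lane 1 takes 4..7 and 12..15.
  for (int lane = 0; lane < 2; ++lane) {
    for (int i = 0; i < 4; ++i) out[k++] = lane * 4 + i;      // from g0
    for (int i = 0; i < 4; ++i) out[k++] = 8 + lane * 4 + i;  // from g1
  }
  for (int i = 0; i < 16; ++i) std::printf("%d ", out[i]);
  std::printf("\n");  // 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
  // _mm256_permute4x64_epi64(x, 0xd8) reorders the 64-bit quarters as
  // 0, 2, 1, 3, i.e. it swaps 16-bit elements 4..7 with 8..11, restoring
  // 0..15 in order.
}
```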
+ return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); + } + + const __m256 mult_; +}; + +} // namespace + + struct AVX2_16bit { typedef int16_t Integer; // Currently A is prepared by quantization but this could theoretically change. - static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { + AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { Quantize(input, output, quant_mult, rows * cols); } - static void Quantize(const float *input, int16_t *output, float quant_mult, Index size); + // Just quantize everything in order. + AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) { + assert(size % 16 == 0); + assert(reinterpret_cast<uintptr_t>(input) % 32 == 0); + avx2::QuantizeTile16 q(quant_mult); + const float *end = input + size; + for (; input != end; input += 16, output += 16) { + *reinterpret_cast<__m256i*>(output) = q.Consecutive(input); + } + } // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 16; static const Index kBTileCol = 8; - static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols); + AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { + PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols); + } - static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end); + AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); + } - static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols); + AVX2 static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { + Multiply16<__m256i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols); + } - static const char *const kName; + constexpr static const char *const kName = "16-bit AVX2"; static const CPUType kUses = CPU_AVX2; }; +namespace avx2 { +/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize + * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate + * the result into one register and return it. + */ +class QuantizeTile8 { + public: + typedef __m256i Integer; + + explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {} + + AVX2 inline __m256i Consecutive(const float *input) { + return Tile(input, input + 8, input + 16, input + 24); + } + + AVX2 inline __m256i ForReshape(const float *input, Index cols) { + // Put higher rows in the second half of the register. These will jumble + // around in the same way then conveniently land in the right place. + return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols); + } + + private: + AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) { + // Looking at the assembly, gcc has pulled this outside the loops calling this. + const __m256i neg127 = _mm256_set1_epi8(-127); + const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + // Grab 4 registers at a time in 32-bit format. 
+ __m256i g0 = avx2::QuantizerGrab(input0, mult_); + __m256i g1 = avx2::QuantizerGrab(input1, mult_); + __m256i g2 = avx2::QuantizerGrab(input2, mult_); + __m256i g3 = avx2::QuantizerGrab(input3, mult_); + // Pack 32-bit to 16-bit. + __m256i packed0 = _mm256_packs_epi32(g0, g1); + __m256i packed1 = _mm256_packs_epi32(g2, g3); + // Pack 16-bit to 8-bit. + __m256i packed = _mm256_packs_epi16(packed0, packed1); + // Ban -128. + packed = _mm256_max_epi8(packed, neg127); + // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31 + // Or as 32-bit integers 0 2 4 6 1 3 5 7 + // Technically this could be removed so long as the rows are bigger than 16 + // and the values are only used for GEMM. + return _mm256_permutevar8x32_epi32(packed, shuffle_param); + } + + const __m256 mult_; +}; +} // namespace + struct AVX2_8bit { typedef int8_t Integer; // Currently A is prepared by quantization but this could theoretically change. - static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { + AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { Quantize(input, output, quant_mult, rows * cols); } - static void Quantize(const float *input, int8_t *output, float quant_mult, Index size); + // Just quantize everything in order. + AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) { + assert(size % 32 == 0); + assert(reinterpret_cast<uintptr_t>(input) % 32 == 0); + avx2::QuantizeTile8 q(quant_mult); + const float *end = input + size; + for (; input != end; input += 32, output += 32) { + *reinterpret_cast<__m256i*>(output) = q.Consecutive(input); + } + } // Tile size for B; B must be a multiple of this block size. 
static const Index kBTileRow = 32; static const Index kBTileCol = 8; - static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols); + AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { + PrepareBFor8(input, output, avx2::QuantizeTile8(quant_mult), rows, cols); + } - static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end); + AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); + } - static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols); + AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { + Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols); + } - static const char *const kName; + constexpr static const char *const kName = "8-bit AVX2"; static const CPUType kUses = CPU_AVX2; }; // Technically only requires AVX -float AVX2_MaxAbsolute(const float *begin, const float *end); +AVX2 float AVX2_MaxAbsolute(const float *begin, const float *end) { + return MaxAbsoluteBackend<__m256>(begin, end); +} } // namespace intgemm diff --git a/avx512_gemm.cc b/avx512_gemm.cc index ff200bf..59da3e4 100644 --- a/avx512_gemm.cc +++ b/avx512_gemm.cc @@ -1,282 +1 @@ #include "avx512_gemm.h" -#include "interleave.h" -#include "multiply.h" -#include "cops.h" - -#include <cassert> -#include <cstddef> -#include <emmintrin.h> -#include <immintrin.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <tmmintrin.h> -#include <xmmintrin.h> - -namespace intgemm { - -namespace { - -// Load from memory, multiply, and convert to int32_t. -inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) { - // Multiply each by the quantization factor. - __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg); - // Cast to 32-bit int - return _mm512_cvtps_epi32(val); -} - -} // namespace - - -// AVX512 has combined collapse and store instructions: -// _mm512_mask_cvtsepi32_storeu_epi16 -// _mm512_mask_cvtsepi32_storeu_epi8 -// So conversion in memory uses these, but I also implement a wider version for -// rearranging B. -// -// Convert to 16-bit signed integers. -void AVX512_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) { - assert(size % 16 == 0); - assert(reinterpret_cast<uintptr_t>(input) % 64 == 0); - // Fill with the quantization multiplier. - const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); - const float *end = input + size; - for (; input != end; input += 16, output += 16) { - // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg)); - } -} - -// Convert to 8-bit signed integers. 
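The 16-bit AVX512 quantizer above leans on _mm512_mask_cvtsepi32_storeu_epi16, which narrows sixteen int32 values to int16 with signed saturation and stores them in one instruction; the 0xffff mask merely enables all lanes because no unmasked form exists. A scalar model of the saturating narrow:

```cpp
// Scalar model of the masked saturating store used by the AVX512 quantizer:
// narrow int32 to int16 with signed saturation, all 16 lanes enabled.
#include <cstdint>
#include <cstdio>

int16_t SaturateToInt16(int32_t v) {
  if (v > 32767) return 32767;
  if (v < -32768) return -32768;
  return static_cast<int16_t>(v);
}

int main() {
  const int32_t in[4] = {70000, -70000, 123, -1};
  for (int32_t v : in) std::printf("%d -> %d\n", v, SaturateToInt16(v));
}
```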
-void AVX512_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) { - assert(size % 16 == 0); - assert(reinterpret_cast<uintptr_t>(input) % 64 == 0); - const __m512i neg127 = _mm512_set1_epi32(-127); - const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); - const float *end = input + size; - for (; input < end; input += 16, output += 16) { - __m512i asint = QuantizerGrab(input, quant_mult_reg); - asint = _mm512_max_epi32(asint, neg127); - // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint); - } -} - -namespace { - -// For PrepareB we want to read 8 columns at a time. When converting 32-bit -// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes -// wide so it reads off the edge of the tile. We could expand the tile size -// but then the memory written to won't be contiguous anyway so we'd be doing a -// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits -// concatenate. -inline __m512 Concat(const __m256 first, const __m256 second) { - // AVX512DQ but that goes with AVX512BW anyway. - return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1); -} - -// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently. -inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) { - __m512 appended = Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1)); - appended = _mm512_mul_ps(appended, quant_mult_reg); - return _mm512_cvtps_epi32(appended); -} - -// These are only used for reshaping due to the AVX512 instructions -// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8 -// being used for the quantizer. -class QuantizeTile16 { - public: - typedef __m512i Integer; - - explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - - inline __m512i ForReshape(const float *input, Index cols) { - __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_); - __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_); - __m512i packed = _mm512_packs_epi32(g0, g1); - // Permute within 256-bit lanes, so same as AVX2 - return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); - } - - private: - const __m512 mult_reg_; -}; - -class QuantizeTile8 { - public: - typedef __m512i Integer; - - explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - - inline __m512i ForReshape(const float *input, Index cols) { - // TODO: try alternative: _mm512_cvtsepi32_epi8 ? - const __m512i neg127 = _mm512_set1_epi8(-127); - // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc. - const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); - - // 32-bit format. - __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_); - __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_); - __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_); - __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_); - // Pack 32-bit to 16-bit. - __m512i packed0 = _mm512_packs_epi32(g0, g1); - __m512i packed1 = _mm512_packs_epi32(g2, g3); - // Pack 16-bit to 8-bit. - __m512i packed = _mm512_packs_epi16(packed0, packed1); - // Ban -128. 
- packed = _mm512_max_epi8(packed, neg127); - // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63 - return _mm512_permutexvar_epi32(shuffle_param, packed); - } - - private: - const __m512 mult_reg_; -}; - -} // namespace - -void AVX512_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols); -} - -void AVX512_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); -} - -void AVX512_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols); -} - -void AVX512_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); -} - -void AVX512_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { - // The unquantization is only 256-bit wide because there are 8 results. - Multiply16<__m512i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols); -} - -// Special AVX512 implementation due to having 32 registers (so I don't have to -// allocate registers manually) and no sign instruction. -void AVX512_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { - typedef __m512i Integer; - typedef __m256 Float; // For quantization we only do 8 at a time. - // This is copy-paste from Multiply8_SSE2OrAVX2. - assert(width % sizeof(Integer) == 0); - assert(B_cols % 8 == 0); - assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); - assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); - assert(reinterpret_cast<uintptr_t>(C) % sizeof(Integer) == 0); - Float unquant_reg = set1_ps<Float>(unquant_mult); - const int simd_width = width / sizeof(Integer); - const Integer *B0_col = reinterpret_cast<const Integer*>(B); - // Added for AVX512. - Integer zeros = setzero_si<Integer>(); - // Go over 8 columns of B at a time. - for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { - // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once. - for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { - // Iterate over shared (inner) dimension. - const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); - const Integer *A_end = A_live + simd_width; - const Integer *B_live = B0_col; - - // Do the first iteration to initialize the sums. - __m512i a = *A_live; - __mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128)); - __m512i a_positive = _mm512_abs_epi8(a); - // These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A. 
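The kernel above works around maddubs taking one unsigned and one signed operand: A is signed, so it uses |a| as the unsigned input and flips the sign of the matching B bytes wherever a was negative. AVX512 lacks SSSE3's sign_epi8, hence the negative-byte mask plus masked subtract from zero. A scalar check of the identity being exploited, over the banned-(-128) byte range both quantizers produce:

```cpp
// Scalar model of the sign trick behind the AVX512 8-bit kernel: a signed
// a*b is rewritten as |a| * (a < 0 ? -b : b) so |a| can feed the
// unsigned-by-signed maddubs multiply.
#include <cstdio>
#include <cstdlib>

int main() {
  for (int a = -127; a <= 127; ++a) {
    for (int b = -127; b <= 127; ++b) {
      int direct = a * b;
      int via_trick = std::abs(a) * (a < 0 ? -b : b);
      if (direct != via_trick) {
        std::printf("mismatch at a=%d b=%d\n", a, b);
        return 1;
      }
    }
  }
  std::printf("sign trick matches for every byte pair with -128 banned\n");
}
```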
- Integer sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0])); - Integer sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1])); - Integer sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2])); - Integer sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3])); - Integer sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4])); - Integer sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5])); - Integer sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6])); - Integer sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7])); - - ++A_live; - B_live += 8; - - // Use A as the loop variable so the add can be done where gcc likes it - // for branch prediction. - for (; A_live != A_end; ++A_live, B_live += 8) { - // Unique code here: can we do an inline function? - // Retrieve a. We will use this as the unsigned part. - a = *A_live; - // Retrieve the conveniently consecutive values of B. - __m512i b0 = *B_live; - __m512i b1 = *(B_live + 1); - __m512i b2 = *(B_live + 2); - __m512i b3 = *(B_live + 3); - __m512i b4 = *(B_live + 4); - __m512i b5 = *(B_live + 5); - __m512i b6 = *(B_live + 6); - __m512i b7 = *(B_live + 7); - - // Get a mask where a is negative. - // Didn't seem to make a difference definining sign bits here vs at top - neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128)); - a_positive = _mm512_abs_epi8(a); - - // Negate by subtracting from zero with a mask. - b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0); - b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1); - b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2); - b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3); - b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4); - b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5); - b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6); - b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7); - // The magic 8-bit multiply then horizontal sum into 16-bit. - b0 = _mm512_maddubs_epi16(a_positive, b0); - b1 = _mm512_maddubs_epi16(a_positive, b1); - b2 = _mm512_maddubs_epi16(a_positive, b2); - b3 = _mm512_maddubs_epi16(a_positive, b3); - b4 = _mm512_maddubs_epi16(a_positive, b4); - b5 = _mm512_maddubs_epi16(a_positive, b5); - b6 = _mm512_maddubs_epi16(a_positive, b6); - b7 = _mm512_maddubs_epi16(a_positive, b7); - // Now we have 16-bit results that are the sum of two multiplies. - // Choosing to approximate and do adds. - // Perhaps every so often we could accumulate by upcasting. - sum0 = _mm512_adds_epi16(sum0, b0); - sum1 = _mm512_adds_epi16(sum1, b1); - sum2 = _mm512_adds_epi16(sum2, b2); - sum3 = _mm512_adds_epi16(sum3, b3); - sum4 = _mm512_adds_epi16(sum4, b4); - sum5 = _mm512_adds_epi16(sum5, b5); - sum6 = _mm512_adds_epi16(sum6, b6); - sum7 = _mm512_adds_epi16(sum7, b7); - // Unique code ends: can we do an inline function? - } - // Upcast to 32-bit and horizontally add. 
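Accumulation in the loop above uses saturating 16-bit adds, which the comments acknowledge is an approximation; the final madd_epi16 against a vector of ones then widens the packed accumulators by multiplying each element by 1 and summing adjacent pairs into 32 bits. A scalar model of that widening step:

```cpp
// Scalar model of the upcast step: madd_epi16(sum, set1_epi16(1)) multiplies
// each 16-bit element by 1 and adds adjacent pairs, widening the packed
// accumulators to 32 bits while halving the element count.
#include <cstdint>
#include <cstdio>

int main() {
  const int16_t sum[8] = {30000, 30000, -5, 7, 100, -100, 32767, 1};
  int32_t widened[4];
  for (int i = 0; i < 4; ++i) {
    widened[i] = int32_t{sum[2 * i]} * 1 + int32_t{sum[2 * i + 1]} * 1;
  }
  for (int32_t v : widened) std::printf("%d ", v);  // 60000 2 0 32768
  std::printf("\n");
}
```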
- Integer ones = set1_epi16<Integer>(1); - sum0 = madd_epi16(sum0, ones); - sum1 = madd_epi16(sum1, ones); - sum2 = madd_epi16(sum2, ones); - sum3 = madd_epi16(sum3, ones); - sum4 = madd_epi16(sum4, ones); - sum5 = madd_epi16(sum5, ones); - sum6 = madd_epi16(sum6, ones); - sum7 = madd_epi16(sum7, ones); - Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); - Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); - - auto total = PermuteSummer(pack0123, pack4567); - WriteC(C + A_rowidx * B_cols + B0_colidx, total, unquant_reg); - } - } -} - -const char *const AVX512_16bit::kName = "16-bit AVX512"; -const char *const AVX512_8bit::kName = "8-bit AVX512"; - -float AVX512_MaxAbsolute(const float *begin, const float *end) { - return MaxAbsoluteBackend<__m512>(begin, end); -} - -} // namespace intgemm diff --git a/avx512_gemm.h b/avx512_gemm.h index f9b0f81..e226686 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -1,6 +1,15 @@ #pragma once #include <stdint.h> #include <cstdint> +#include <cassert> +#include <cstddef> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "interleave.h" +#include "multiply.h" +#include "cops.h" #include "types.h" @@ -15,31 +24,140 @@ namespace intgemm { +// AVX512 has combined collapse and store instructions: +// _mm512_mask_cvtsepi32_storeu_epi16 +// _mm512_mask_cvtsepi32_storeu_epi8 +// So conversion in memory uses these, but I also implement a wider version for +// rearranging B. + +// Convert to 16-bit signed integers. +namespace avx512f { + +// Load from memory, multiply, and convert to int32_t. +AVX512F inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) { + // Multiply each by the quantization factor. + __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg); + // Cast to 32-bit int + return _mm512_cvtps_epi32(val); +} + +// For PrepareB we want to read 8 columns at a time. When converting 32-bit +// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes +// wide so it reads off the edge of the tile. We could expand the tile size +// but then the memory written to won't be contiguous anyway so we'd be doing a +// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits +// concatenate. +AVX512F inline __m512 Concat(const __m256 first, const __m256 second) { + // AVX512DQ but that goes with AVX512BW anyway. + return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1); +} + +// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently. +AVX512F inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) { + __m512 appended = avx512f::Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1)); + appended = _mm512_mul_ps(appended, quant_mult_reg); + return _mm512_cvtps_epi32(appended); +} + +// These are only used for reshaping due to the AVX512 instructions +// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8 +// being used for the quantizer. 
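The MaxAbsolute helpers that each ISA exports here exist to pick a quantization multiplier from the data. A scalar reference, with the common 127/max choice for 8-bit shown as an assumption rather than something this commit specifies:

```cpp
// Scalar reference for the per-ISA MaxAbsolute helpers in this diff, plus a
// typical way the result feeds a quantization multiplier. The 127/max choice
// is a convention assumed for illustration.
#include <algorithm>
#include <cmath>
#include <cstdio>

float MaxAbsolute(const float *begin, const float *end) {
  float max_abs = 0.0f;
  for (const float *p = begin; p != end; ++p) {
    max_abs = std::max(max_abs, std::fabs(*p));
  }
  return max_abs;
}

int main() {
  const float data[] = {0.25f, -3.5f, 1.0f, 2.75f};
  float max_abs = MaxAbsolute(data, data + 4);
  float quant_mult = 127.0f / max_abs;  // maps the largest |x| to +/-127
  std::printf("max |x| = %.2f, quant_mult = %.3f\n", max_abs, quant_mult);
}
```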
+class QuantizeTile16 { + public: + typedef __m512i Integer; + + explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} + + AVX512F inline __m512i ForReshape(const float *input, Index cols) { + __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_); + __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_); + __m512i packed = _mm512_packs_epi32(g0, g1); + // Permute within 256-bit lanes, so same as AVX2 + return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); + } + + private: + const __m512 mult_reg_; +}; + +class QuantizeTile8 { + public: + typedef __m512i Integer; + + explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} + + AVX512F inline __m512i ForReshape(const float *input, Index cols) { + // TODO: try alternative: _mm512_cvtsepi32_epi8 ? + const __m512i neg127 = _mm512_set1_epi8(-127); + // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc. + const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + // 32-bit format. + __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_); + __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_); + __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_); + __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_); + // Pack 32-bit to 16-bit. + __m512i packed0 = _mm512_packs_epi32(g0, g1); + __m512i packed1 = _mm512_packs_epi32(g2, g3); + // Pack 16-bit to 8-bit. + __m512i packed = _mm512_packs_epi16(packed0, packed1); + // Ban -128. + packed = _mm512_max_epi8(packed, neg127); + // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63 + return _mm512_permutexvar_epi32(shuffle_param, packed); + } + + private: + const __m512 mult_reg_; +}; + +} // namespace + struct AVX512_16bit { typedef int16_t Integer; // Currently A is prepared by quantization but this could theoretically change. // rows * cols must be a multiple of 16. - static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { + AVX512F static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { Quantize(input, output, quant_mult, rows * cols); } // Technically output can be unaligned in Quantize. // But then it will need to be aligned for Multiply. // size must be a multiple of 16. - static void Quantize(const float *input, int16_t *output, float quant_mult, Index size); + // Convert to 16-bit signed integers. + AVX512F static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) { + assert(size % 16 == 0); + assert(reinterpret_cast<uintptr_t>(input) % 64 == 0); + // Fill with the quantization multiplier. + const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); + const float *end = input + size; + for (; input != end; input += 16, output += 16) { + // There doesn't seem to be an unmasked version. + _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg)); + } + } + // Tile size for B; B must be a multiple of this block size. 
static const Index kBTileRow = 32; static const Index kBTileCol = 8; - static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols); + AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { + PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols); + } - static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end); + AVX512F static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); + } - static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols); + AVX512F static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { + // The unquantization is only 256-bit wide because there are 8 results. + Multiply16<__m512i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols); + } - static const char *const kName; + constexpr static const char *const kName = "16-bit AVX512"; static const CPUType kUses = CPU_AVX512BW; }; @@ -48,29 +166,159 @@ struct AVX512_8bit { typedef int8_t Integer; // Currently A is prepared by quantization but this could theoretically change. - static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { + AVX512F static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { Quantize(input, output, quant_mult, rows * cols); } // Technically output can be unaligned in Quantize. // But then it will need to be aligned for Multiply. - static void Quantize(const float *input, int8_t *output, float quant_mult, Index size); + // Convert to 8-bit signed integers. + AVX512F static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) { + assert(size % 16 == 0); + assert(reinterpret_cast<uintptr_t>(input) % 64 == 0); + const __m512i neg127 = _mm512_set1_epi32(-127); + const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); + const float *end = input + size; + for (; input < end; input += 16, output += 16) { + __m512i asint = avx512f::QuantizerGrab(input, quant_mult_reg); + asint = _mm512_max_epi32(asint, neg127); + // There doesn't seem to be an unmasked version. + _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint); + } + } // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 64; static const Index kBTileCol = 8; - static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols); + AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { + PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols); + } + + AVX512F static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { + SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); + } + + // Special AVX512 implementation due to having 32 registers (so I don't have to + // allocate registers manually) and no sign instruction. 
+ AVX512F static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) { + typedef __m512i Integer; + typedef __m256 Float; // For quantization we only do 8 at a time. + // This is copy-paste from Multiply8_SSE2OrAVX2. + assert(width % sizeof(Integer) == 0); + assert(B_cols % 8 == 0); + assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); + assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); + assert(reinterpret_cast<uintptr_t>(C) % sizeof(Integer) == 0); + Float unquant_reg = set1_ps<Float>(unquant_mult); + const int simd_width = width / sizeof(Integer); + const Integer *B0_col = reinterpret_cast<const Integer*>(B); + // Added for AVX512. + Integer zeros = setzero_si<Integer>(); + // Go over 8 columns of B at a time. + for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { + // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once. + for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { + // Iterate over shared (inner) dimension. + const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); + const Integer *A_end = A_live + simd_width; + const Integer *B_live = B0_col; + + // Do the first iteration to initialize the sums. + __m512i a = *A_live; + __mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128)); + __m512i a_positive = _mm512_abs_epi8(a); + // These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A. + Integer sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0])); + Integer sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1])); + Integer sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2])); + Integer sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3])); + Integer sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4])); + Integer sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5])); + Integer sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6])); + Integer sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7])); - static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end); + ++A_live; + B_live += 8; - static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols); + // Use A as the loop variable so the add can be done where gcc likes it + // for branch prediction. + for (; A_live != A_end; ++A_live, B_live += 8) { + // Unique code here: can we do an inline function? + // Retrieve a. We will use this as the unsigned part. + a = *A_live; + // Retrieve the conveniently consecutive values of B. + __m512i b0 = *B_live; + __m512i b1 = *(B_live + 1); + __m512i b2 = *(B_live + 2); + __m512i b3 = *(B_live + 3); + __m512i b4 = *(B_live + 4); + __m512i b5 = *(B_live + 5); + __m512i b6 = *(B_live + 6); + __m512i b7 = *(B_live + 7); + + // Get a mask where a is negative. + // Didn't seem to make a difference definining sign bits here vs at top + neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128)); + a_positive = _mm512_abs_epi8(a); + + // Negate by subtracting from zero with a mask. 
+ b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0); + b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1); + b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2); + b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3); + b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4); + b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5); + b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6); + b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7); + // The magic 8-bit multiply then horizontal sum into 16-bit. + b0 = _mm512_maddubs_epi16(a_positive, b0); + b1 = _mm512_maddubs_epi16(a_positive, b1); + b2 = _mm512_maddubs_epi16(a_positive, b2); + b3 = _mm512_maddubs_epi16(a_positive, b3); + b4 = _mm512_maddubs_epi16(a_positive, b4); + b5 = _mm512_maddubs_epi16(a_positive, b5); + b6 = _mm512_maddubs_epi16(a_positive, b6); + b7 = _mm512_maddubs_epi16(a_positive, b7); + // Now we have 16-bit results that are the sum of two multiplies. + // Choosing to approximate and do adds. + // Perhaps every so often we could accumulate by upcasting. + sum0 = _mm512_adds_epi16(sum0, b0); + sum1 = _mm512_adds_epi16(sum1, b1); + sum2 = _mm512_adds_epi16(sum2, b2); + sum3 = _mm512_adds_epi16(sum3, b3); + sum4 = _mm512_adds_epi16(sum4, b4); + sum5 = _mm512_adds_epi16(sum5, b5); + sum6 = _mm512_adds_epi16(sum6, b6); + sum7 = _mm512_adds_epi16(sum7, b7); + // Unique code ends: can we do an inline function? + } + // Upcast to 32-bit and horizontally add. + Integer ones = set1_epi16<Integer>(1); + sum0 = madd_epi16(sum0, ones); + sum1 = madd_epi16(sum1, ones); + sum2 = madd_epi16(sum2, ones); + sum3 = madd_epi16(sum3, ones); + sum4 = madd_epi16(sum4, ones); + sum5 = madd_epi16(sum5, ones); + sum6 = madd_epi16(sum6, ones); + sum7 = madd_epi16(sum7, ones); + Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); + Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); + + auto total = PermuteSummer(pack0123, pack4567); + WriteC(C + A_rowidx * B_cols + B0_colidx, total, unquant_reg); + } + } +} - static const char *const kName; + constexpr static const char *const kName = "8-bit AVX512"; static const CPUType kUses = CPU_AVX512BW; }; -float AVX512_MaxAbsolute(const float *begin_float, const float *end_float); +AVX512F float AVX512_MaxAbsolute(const float *begin, const float *end) { + return MaxAbsoluteBackend<__m512>(begin, end); +} } // namespace intgemm @@ -1,27 +1,10 @@ +#pragma once #include "intrinsics.h" #include <exception> namespace intgemm { -// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3). 
-class UnsupportedCPU : public std::exception { - public: - UnsupportedCPU(); - - ~UnsupportedCPU() throw(); - - const char *what() const throw() override; -}; - -UnsupportedCPU::UnsupportedCPU() {} - -UnsupportedCPU::~UnsupportedCPU() throw() {} - -const char *UnsupportedCPU::what() const throw() { - return "Integer matrix multiplication has not been efficiently implemented for your CPU."; -} - class JustUnquantizeC { public: JustUnquantizeC(float *C, float unquant_mult); @@ -1,118 +1 @@ #include "intgemm.h" - -#include "types.h" -#include "sse2_gemm.h" -#include "ssse3_gemm.h" -#include "avx2_gemm.h" -#ifndef INTGEMM_NO_AVX512 -#include "avx512_gemm.h" -#endif - -namespace intgemm { - -UnsupportedCPU::UnsupportedCPU() {} - -UnsupportedCPU::~UnsupportedCPU() throw() {} - -const char *UnsupportedCPU::what() const throw() { - return "Integer matrix multiplication has not been efficiently implemented for your CPU."; -} - -namespace { - -struct Unsupported_16bit { - static void Quantize(const float *, int16_t *, float, Index) { - throw UnsupportedCPU(); - } - static void PrepareB(const float *, int16_t *, float, Index, Index) { - throw UnsupportedCPU(); - } - static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) { - throw UnsupportedCPU(); - } - static void Multiply(const int16_t *, const int16_t *, float *, float, Index, Index, Index) { - throw UnsupportedCPU(); - } - static const char *const kName; -}; -const char *const Unsupported_16bit::kName = "16-bit Unsupported"; - -struct Unsupported_8bit { - static void Quantize(const float *, int8_t *, float, Index) { - throw UnsupportedCPU(); - } - static void PrepareB(const float *, int8_t *, float, Index, Index) { - throw UnsupportedCPU(); - } - static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) { - throw UnsupportedCPU(); - } - static void Multiply(const int8_t *, const int8_t *, float *, float, Index, Index, Index) { - throw UnsupportedCPU(); - } - static const char *const kName; -}; -const char *const Unsupported_8bit::kName = "8-bit Unsupported"; - -float Unsupported_MaxAbsolute(const float *begin, const float *end) { - throw UnsupportedCPU(); -} - -/* Returns: - * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but - * cloud providers lie). TODO: don't catch Knights processors with this. - * - * avx2 if the CPU supports AVX2 - * - * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit) - * - * sse2 if the CPU supports SSE2 - * - * unsupported otherwise - */ -template <class T> T ChooseCPU(T avx512, T avx2, T ssse3, T sse2, T unsupported) { - // TODO: don't catch Knights processors here! -#ifndef INTGEMM_NO_AVX512 - if (__builtin_cpu_supports("avx512f")) { - return avx512; - } -#endif - if (__builtin_cpu_supports("avx2")) { - return avx2; - } else if (__builtin_cpu_supports("ssse3")) { - return ssse3; - } else if (__builtin_cpu_supports("sse2")) { - return sse2; - } else { - return unsupported; - } -} - -#ifdef INTGEMM_NO_AVX512 -// These won't ever be called in this capacity, but it does let the code below compile. 
-typedef Unsupported_16bit AVX512_16bit; -typedef Unsupported_8bit AVX512_8bit; -float AVX512_MaxAbsolute(const float *begin, const float *end) { - throw UnsupportedCPU(); -} -#endif - -} // namespace - -void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize); -void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB); -void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB); -void (*Int16::Multiply)(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_16bit::Multiply, AVX2_16bit::Multiply, SSE2_16bit::Multiply, SSE2_16bit::Multiply, Unsupported_16bit::Multiply); -const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName); - -void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); -void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); -void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); -void (*Int8::Multiply)(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_8bit::Multiply, AVX2_8bit::Multiply, SSSE3_8bit::Multiply, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply); -const char *const Int8::kName = ChooseCPU(AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); - -const CPUType kCPU = ChooseCPU(CPU_AVX512BW, CPU_AVX2, CPU_SSSE3, CPU_SSE2, CPU_UNSUPPORTED); - -float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512_MaxAbsolute, AVX2_MaxAbsolute, SSE2_MaxAbsolute, SSE2_MaxAbsolute, Unsupported_MaxAbsolute); - -} // namespace intgemm @@ -43,24 +43,94 @@ // Yes, both headers due to the debacle about int32_t #include <cstdint> #include <stdint.h> -#include <exception> #include "types.h" +#include "sse2_gemm.h" +#include "ssse3_gemm.h" +#include "avx2_gemm.h" +#ifndef INTGEMM_NO_AVX512 +#include "avx512_gemm.h" +#endif /* Dispatch to functions based on runtime CPUID. This adds one call-by-variable to each call. */ namespace intgemm { -// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3). 
-class UnsupportedCPU : public std::exception { - public: - UnsupportedCPU(); - - ~UnsupportedCPU() throw(); +struct Unsupported_16bit { + static void Quantize(const float *, int16_t *, float, Index) { + throw UnsupportedCPU(); + } + static void PrepareB(const float *, int16_t *, float, Index, Index) { + throw UnsupportedCPU(); + } + static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) { + throw UnsupportedCPU(); + } + static void Multiply(const int16_t *, const int16_t *, float *, float, Index, Index, Index) { + throw UnsupportedCPU(); + } + constexpr static const char *const kName = "16-bit Unsupported"; +}; - const char *what() const throw() override; +struct Unsupported_8bit { + static void Quantize(const float *, int8_t *, float, Index) { + throw UnsupportedCPU(); + } + static void PrepareB(const float *, int8_t *, float, Index, Index) { + throw UnsupportedCPU(); + } + static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) { + throw UnsupportedCPU(); + } + static void Multiply(const int8_t *, const int8_t *, float *, float, Index, Index, Index) { + throw UnsupportedCPU(); + } + constexpr static const char *const kName = "8-bit Unsupported"; }; +float Unsupported_MaxAbsolute(const float *begin, const float *end) { + throw UnsupportedCPU(); +} + +#ifdef INTGEMM_NO_AVX512 +// These won't ever be called in this capacity, but it does let the code below compile. +typedef Unsupported_16bit AVX512_16bit; +typedef Unsupported_8bit AVX512_8bit; +float AVX512_MaxAbsolute(const float *begin, const float *end) { + throw UnsupportedCPU(); +} +#endif + +/* Returns: + * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but + * cloud providers lie). TODO: don't catch Knights processors with this. + * + * avx2 if the CPU supports AVX2 + * + * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit) + * + * sse2 if the CPU supports SSE2 + * + * unsupported otherwise + */ +template <class T> T ChooseCPU(T avx512, T avx2, T ssse3, T sse2, T unsupported) { + // TODO: don't catch Knights processors here! +#ifndef INTGEMM_NO_AVX512 + if (__builtin_cpu_supports("avx512f")) { + return avx512; + } +#endif + if (__builtin_cpu_supports("avx2")) { + return avx2; + } else if (__builtin_cpu_supports("ssse3")) { + return ssse3; + } else if (__builtin_cpu_supports("sse2")) { + return sse2; + } else { + return unsupported; + } +} + /* 16-bit matrix multiplication. 
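ChooseCPU above resolves each operation once, at static-initialization time, by probing the CPU with GCC/Clang's __builtin_cpu_supports and returning the widest supported candidate. A standalone model of the same dispatch pattern:

```cpp
// Minimal standalone model of the ChooseCPU dispatch added to the header:
// probe the CPU once and bind function pointers to the widest supported
// implementation.
#include <cstdio>

static void QuantizeAVX2()  { std::puts("avx2 kernel"); }
static void QuantizeSSSE3() { std::puts("ssse3 kernel"); }
static void QuantizeSSE2()  { std::puts("sse2 kernel"); }

template <class T> T ChooseCPU(T avx2, T ssse3, T sse2) {
  if (__builtin_cpu_supports("avx2"))  return avx2;
  if (__builtin_cpu_supports("ssse3")) return ssse3;
  return sse2;
}

// Resolved once during static initialization, exactly like Int16::Quantize
// and friends in this diff; every call then costs one indirect jump.
void (*Quantize)() = ChooseCPU(QuantizeAVX2, QuantizeSSSE3, QuantizeSSE2);

int main() { Quantize(); }
```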
*/ struct Int16 { typedef int16_t Integer; @@ -96,6 +166,13 @@ struct Int16 { static const char *const kName; }; +void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize); +void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB); +void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB); +void (*Int16::Multiply)(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_16bit::Multiply, AVX2_16bit::Multiply, SSE2_16bit::Multiply, SSE2_16bit::Multiply, Unsupported_16bit::Multiply); +const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName); + + /* 8-bit matrix multiplication */ struct Int8 { typedef int8_t Integer; @@ -130,7 +207,17 @@ struct Int8 { static const char *const kName; }; +void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); +void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); +void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); +void (*Int8::Multiply)(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_8bit::Multiply, AVX2_8bit::Multiply, SSSE3_8bit::Multiply, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply); +const char *const Int8::kName = ChooseCPU(AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); + +const CPUType kCPU = ChooseCPU(CPU_AVX512BW, CPU_AVX2, CPU_SSSE3, CPU_SSE2, CPU_UNSUPPORTED); + // Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and 64-byte aligned. 
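Note that the Int16 and Int8 function-pointer definitions now sit in the header, so a second translation unit including intgemm.h would define them twice; presumably acceptable for a WiP commit built as a single unit. For reference, a C++17 inline variable is one way such header-resident definitions can stay ODR-clean; the names below are hypothetical stand-ins, not intgemm code:

```cpp
// Hypothetical sketch: C++17 "inline" on a static data member lets a
// function-pointer definition live in a header without violating the
// one-definition rule across translation units.
#include <cstdint>

using Index = unsigned int;  // stand-in for intgemm's Index

struct Int16Like {
  // One definition shared by every includer; in intgemm's case it would be
  // initialized with ChooseCPU(...).
  static inline void (*Quantize)(const float *, int16_t *, float, Index) =
      nullptr;
};

int main() { return Int16Like::Quantize == nullptr ? 0 : 1; }
```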
extern float (*MaxAbsolute)(const float *begin, const float *end); +float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512_MaxAbsolute, AVX2_MaxAbsolute, SSE2_MaxAbsolute, SSE2_MaxAbsolute, Unsupported_MaxAbsolute); + } // namespace intgemm diff --git a/intrinsics.h b/intrinsics.h index 7c2cf57..fcc4752 100644 --- a/intrinsics.h +++ b/intrinsics.h @@ -33,13 +33,14 @@ template <> SSE2 inline __m128 set1_ps<__m128>(float to) { SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) { return _mm_madd_epi16(first, second); } -SSE2 static inline __m128i maddubs_epi16(__m128i first, __m128i second) { +SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) { return _mm_maddubs_epi16(first, second); } -SSE2 static inline __m128i sign_epi8(__m128i first, __m128i second) { +SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) { return _mm_sign_epi8(first, second); } -SSE2 static inline __m128i abs_epi8(__m128i arg) { + +SSSE3 static inline __m128i abs_epi8(__m128i arg) { return _mm_abs_epi8(arg); } SSE2 static inline __m128 max_ps(__m128 first, __m128 second) { @@ -17,7 +17,7 @@ static inline float MaxFloat32(__m128 a) { return *reinterpret_cast<float*>(&a); } -static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) { +SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) { // No op for 128 bits: already reduced fully. MultiplyResult128 ret; ret.pack0123 = pack0123; @@ -38,7 +38,7 @@ static inline float MaxFloat32(__m256 a) { return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))); } -static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) { +AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) { // This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f __m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21); // This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s @@ -54,7 +54,7 @@ static inline void WriteC(float *to, __m256i total, __m256 unquant_reg) { #endif #ifdef __AVX512BW__ -static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) { +AVX512F static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) { // Form [0th 128-bit register of pack0123, 0st 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567] __m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6)); // Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567] @@ -123,8 +123,8 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1, // A_rows can be anything non-negative. // width must be a multiple of the register size. // B_cols must be a multiple of 8. 
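The intrinsics.h hunk above corrects the annotations on maddubs_epi16, sign_epi8, and abs_epi8 from SSE2 to SSSE3, matching the instruction set that introduced them. A scalar model of _mm_sign_epi8's per-byte behavior, the instruction the SSE/AVX2 8-bit kernels rely on and the AVX512 kernel has to emulate with masked subtracts:

```cpp
// Scalar model of _mm_sign_epi8 (SSSE3): each byte of a is negated, kept,
// or zeroed according to the sign of the corresponding byte of b.
#include <cstdint>
#include <cstdio>

int8_t SignEpi8(int8_t a, int8_t b) {
  if (b < 0) return static_cast<int8_t>(-a);
  if (b == 0) return 0;
  return a;
}

int main() {
  std::printf("%d %d %d\n", SignEpi8(5, -3), SignEpi8(5, 0), SignEpi8(5, 9));
  // prints: -5 0 5
}
```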
diff --git a/multiply.h b/multiply.h
--- a/multiply.h
+++ b/multiply.h
@@ -17,7 +17,7 @@ static inline float MaxFloat32(__m128 a) {
   return *reinterpret_cast<float*>(&a);
 }
 
-static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
+SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
   // No op for 128 bits: already reduced fully.
   MultiplyResult128 ret;
   ret.pack0123 = pack0123;
@@ -38,7 +38,7 @@ static inline float MaxFloat32(__m256 a) {
   return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
 }
 
-static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
+AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
   // This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
   __m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
   // This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
@@ -54,7 +54,7 @@ static inline void WriteC(float *to, __m256i total, __m256 unquant_reg) {
 #endif
 
 #ifdef __AVX512BW__
-static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
+AVX512F static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
   // Form [0th 128-bit register of pack0123, 0th 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
   __m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
   // Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
@@ -123,8 +123,8 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1,
 // A_rows can be anything non-negative.
 // width must be a multiple of the register size.
 // B_cols must be a multiple of 8.
-//#define Multiply16(Integer, Annotate) \
-  template <class WriteC> Annotate inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
+//#define Multiply16(Integer, Annotate) \ //fd
+// template <class WriteC> Annotate inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
 // template <class Integer, class WriteC> inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
   assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0);
diff --git a/sse2_gemm.cc b/sse2_gemm.cc
index 0ef353a..2fa5596 100644
--- a/sse2_gemm.cc
+++ b/sse2_gemm.cc
@@ -1,77 +1,2 @@
 // This is only 16-bit. 8-bit is in ssse3_gemm.cc since it requires that.
 #include "sse2_gemm.h"
-#include "cops.h"
-
-#include "interleave.h"
-#include "multiply.h"
-
-#include <stdint.h>
-#include <cassert>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-namespace intgemm {
-
-namespace {
-// Same implementation as AVX512, just width. Grabs 4 32-bit values.
-inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
-  return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
-}
-
-class QuantizeTile16 {
-  public:
-    typedef __m128i Integer;
-
-    explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
-    // Quantize 8xfloat into 8xint16_t
-    inline __m128i Consecutive(const float *input) {
-      __m128i g0 = QuantizerGrab(input, mult_reg_);
-      __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
-      return _mm_packs_epi32(g0, g1);
-    }
-
-    inline __m128i ForReshape(const float *input, int) {
-      return Consecutive(input);
-    }
-
-  private:
-    const __m128 mult_reg_;
-};
-} // namespace
-
-/* I also tried an implementation based on _mm_cvtps_pi16 but it was slower:
- * For size 1048576, run 10x in seconds on i7-6700:
- * This code: 0.00228409, 0.00204906
- * With _mm_cvtps_pi16 basis: 0.00391884, 0.00390869
- */
-void SSE2_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
-  assert(size % 8 == 0);
-  assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
-  assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
-  QuantizeTile16 q(quant_mult);
-  const float *end = input + size;
-  for (; input != end; input += 8, output += 8) {
-    *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
-  }
-}
-
-void SSE2_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
-  PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void SSE2_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
-  SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void SSE2_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
-  Multiply16<__m128i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-const char *const SSE2_16bit::kName = "16-bit SSE2";
-
-float SSE2_MaxAbsolute(const float *begin, const float *end) {
-  return MaxAbsoluteBackend<__m128>(begin, end);
-}
-
-} // namespace intgemm
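For reference, the deleted Quantize is the vector form of a simple scalar loop: scale each float by quant_mult, round to the nearest integer (cvtps_epi32 uses the default round-to-nearest-even mode), and saturate into int16_t range (packs_epi32 saturates rather than truncates). A scalar sketch of the same contract, as an illustration rather than library code:

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Scalar equivalent of SSE2_16bit::Quantize: scale, round, saturate.
    // Note the vector code rounds ties to even via cvtps_epi32, while
    // std::lround rounds halves away from zero; the two differ only on ties.
    void QuantizeReference(const float *input, int16_t *output, float quant_mult, std::size_t size) {
      for (std::size_t i = 0; i < size; ++i) {
        long rounded = std::lround(input[i] * quant_mult);
        if (rounded > 32767) rounded = 32767;
        if (rounded < -32768) rounded = -32768;
        output[i] = static_cast<int16_t>(rounded);
      }
    }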
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 0f26362..7a5e5ce 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -2,37 +2,87 @@
 #include "types.h"
 #include <cstdint>
 #include <stdint.h>
+#include "cops.h"
+#include "multiply.h"
 
 // 8 bit is in ssse3_gemm.h
 namespace intgemm {
 
+namespace sse2 {
+// Same implementation as AVX512, just narrower. Grabs 4 32-bit values.
+// TODO: duplicated function; deduplicating requires removing the anonymous namespace.
+SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+  return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
+}
+
+class QuantizeTile16 {
+  public:
+    typedef __m128i Integer;
+
+    explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+
+    // Quantize 8xfloat into 8xint16_t
+    SSE2 inline __m128i Consecutive(const float *input) {
+      __m128i g0 = QuantizerGrab(input, mult_reg_);
+      __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
+      return _mm_packs_epi32(g0, g1);
+    }
+
+    SSE2 inline __m128i ForReshape(const float *input, int) {
+      return Consecutive(input);
+    }
+
+  private:
+    const __m128 mult_reg_;
+};
+} // namespace sse2
 
 // This should be pure SSE2 (and below).
 struct SSE2_16bit {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
-  static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+  SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+    assert(size % 8 == 0);
+    assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
+    assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
+    sse2::QuantizeTile16 q(quant_mult);
+    const float *end = input + size;
+    for (; input != end; input += 8, output += 8) {
+      *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+    }
+  }
 
   // Tile size for B; B must be a multiple of this block size.
   static const Index kBTileRow = 8;
   static const Index kBTileCol = 8;
 
-  static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+  SSE2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+    // TODO: turn into a #define shared across architectures.
+    PrepareBFor16(input, output, sse2::QuantizeTile16(quant_mult), rows, cols);
+  }
 
-  static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+  SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    // TODO: turn into a #define shared across architectures.
+    SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
+  }
 
-  static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+  SSE2 static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+    // TODO: turn into a #define shared across architectures.
+    Multiply16<__m128i, JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+  }
 
-  static const char *const kName;
+  constexpr static const char *const kName = "16-bit SSE2";
 
   static const CPUType kUses = CPU_SSE2;
 };
 
 // Technically only requires SSE
-float SSE2_MaxAbsolute(const float *begin, const float *end);
+SSE2 float SSE2_MaxAbsolute(const float *begin, const float *end) {
+  return MaxAbsoluteBackend<__m128>(begin, end);
+}
 
 } // namespace intgemm
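With the bodies now inlined in the header, the preconditions the asserts encode become the caller's contract: size must be a multiple of 8 and both pointers 16-byte aligned, because each iteration moves one full __m128i. A caller-side sketch using C++17 std::aligned_alloc (assumes this repo's sse2_gemm.h is on the include path; the quant_mult of 2.0f is an arbitrary example value):

    #include <cstdint>
    #include <cstdlib>
    #include "sse2_gemm.h"

    int main() {
      const std::size_t size = 64;  // must be a multiple of 8 floats
      // 16-byte alignment so Quantize can load/store whole __m128i registers.
      float *in = static_cast<float*>(std::aligned_alloc(16, size * sizeof(float)));
      int16_t *out = static_cast<int16_t*>(std::aligned_alloc(16, size * sizeof(int16_t)));
      for (std::size_t i = 0; i < size; ++i) in[i] = static_cast<float>(i) * 0.25f;
      intgemm::SSE2_16bit::Quantize(in, out, 2.0f, size);
      std::free(in);
      std::free(out);
      return 0;
    }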
diff --git a/ssse3_gemm.cc b/ssse3_gemm.cc
index d5de13d..32e5bd2 100644
--- a/ssse3_gemm.cc
+++ b/ssse3_gemm.cc
@@ -1,89 +1 @@
 #include "ssse3_gemm.h"
-
-#include "interleave.h"
-#include "multiply.h"
-
-#include <stdint.h>
-#include <cassert>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-namespace intgemm {
-
-namespace {
-// Same implementation as AVX512, just width. Grabs 4 32-bit values.
-inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
-  return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
-}
-
-class QuantizeTile8 {
-  public:
-    typedef __m128i Integer;
-
-    explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
-    inline __m128i ForReshape(const float *input, Index cols) {
-      // Skip a row.
-      return Tile(input, input + 2 * cols);
-    }
-
-    inline __m128i Consecutive(const float *input) {
-      return Tile(input, input + 8);
-    }
-
-  private:
-    // Quantize 16xfloat into 16xint8_t
-    inline __m128i Tile(const float *input0, const float *input1) {
-      const __m128i neg128 = _mm_set1_epi8(-128);
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
-      __m128i g2 = QuantizerGrab(input1, mult_reg_);
-      __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
-      __m128i packed0 = _mm_packs_epi32(g0, g1);
-      __m128i packed1 = _mm_packs_epi32(g2, g3);
-      __m128i packed = _mm_packs_epi16(packed0, packed1);
-      /* Ban -128.
-       * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
-       * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
-       * The first generates 0xff for fields -128.
-       * The second subtracts 0xff from -128 which has the effect of converting
-       * to -127.
-       */
-      // packed = _mm_max_epi8(packed, neg127);
-      __m128i evils = _mm_cmpeq_epi8(packed, neg128);
-      return _mm_sub_epi8(packed, evils);
-      // No permute needed. packs is in order for SSE.
-    }
-
-  private:
-    const __m128 mult_reg_;
-};
-
-} // namespace
-
-void SSSE3_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
-  assert(size % 16 == 0);
-  assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
-  assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
-  QuantizeTile8 q(quant_mult);
-  const float *end = input + size;
-  for (; input != end; input += 16, output += 16) {
-    *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
-  }
-}
-
-void SSSE3_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
-  PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void SSSE3_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
-  SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
-}
-
-void SSSE3_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
-  Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
-}
-
-const char *const SSSE3_8bit::kName = "8-bit SSSE3";
-
-} // namespace intgemm
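ssse3_gemm.cc is likewise emptied into its header. The 8-bit path pins its floor at SSSE3 because of pmaddubsw (_mm_maddubs_epi16), which multiplies unsigned 8-bit lanes of one operand by signed 8-bit lanes of the other and sums adjacent pairs with signed saturation. A scalar model of one 16-bit output lane (illustrative):

    #include <cstdint>

    // Scalar model of one output lane of _mm_maddubs_epi16: a is unsigned,
    // b is signed, and the two adjacent products are summed with signed
    // 16-bit saturation.
    int16_t MaddubsLane(uint8_t a0, int8_t b0, uint8_t a1, int8_t b1) {
      int32_t sum = static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
      if (sum > 32767) sum = 32767;
      if (sum < -32768) sum = -32768;
      return static_cast<int16_t>(sum);
    }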
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 4993ef6..69ac298 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -3,32 +3,103 @@
 #include <cstdint>
 #include <stdint.h>
 
+#include "interleave.h"
+#include "multiply.h"
+
 // 16-bit is in sse2_gemm.h
 namespace intgemm {
 
+namespace ssse3 {
+// Same implementation as AVX512, just narrower. Grabs 4 32-bit values.
+// TODO: duplicated function; deduplicating requires removing the anonymous namespace.
+SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+  return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
+}
+
+class QuantizeTile8 {
+  public:
+    typedef __m128i Integer;
+
+    SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+
+    SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+      // Skip a row.
+      return Tile(input, input + 2 * cols);
+    }
+
+    SSSE3 inline __m128i Consecutive(const float *input) {
+      return Tile(input, input + 8);
+    }
+
+  private:
+    // Quantize 16xfloat into 16xint8_t
+    SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+      const __m128i neg128 = _mm_set1_epi8(-128);
+      __m128i g0 = QuantizerGrab(input0, mult_reg_);
+      __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
+      __m128i g2 = QuantizerGrab(input1, mult_reg_);
+      __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
+      __m128i packed0 = _mm_packs_epi32(g0, g1);
+      __m128i packed1 = _mm_packs_epi32(g2, g3);
+      __m128i packed = _mm_packs_epi16(packed0, packed1);
+      /* Ban -128.
+       * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
+       * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
+       * The first generates 0xff for fields -128.
+       * The second subtracts 0xff from -128 which has the effect of converting
+       * to -127.
+       */
+      // packed = _mm_max_epi8(packed, neg127);
+      __m128i evils = _mm_cmpeq_epi8(packed, neg128);
+      return _mm_sub_epi8(packed, evils);
+      // No permute needed. packs is in order for SSE.
+    }
+
+  private:
+    const __m128 mult_reg_;
+};
+
+} // namespace ssse3
+
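The Tile comment above describes an SSE2-only clamp: after packing, any byte equal to -128 must become -127, likely because the sign/abs trick in the 8-bit multiply cannot represent |-128| in int8_t. _mm_cmpeq_epi8 yields 0xFF, which is -1, exactly in those bytes, and subtracting that mask adds one back. A scalar sketch of the identity:

    #include <cstdint>

    // Scalar model of the ban--128 trick: cmpeq produces -1 (0xFF) where the
    // byte equals -128; subtracting that -1 turns -128 into -127 and leaves
    // every other byte unchanged.
    int8_t BanNeg128(int8_t packed) {
      int8_t evils = (packed == -128) ? -1 : 0;    // _mm_cmpeq_epi8
      return static_cast<int8_t>(packed - evils);  // _mm_sub_epi8
    }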
 // pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
 struct SSSE3_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
-  static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+  SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+    assert(size % 16 == 0);
+    assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
+    assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
+    ssse3::QuantizeTile8 q(quant_mult);
+    const float *end = input + size;
+    for (; input != end; input += 16, output += 16) {
+      *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+    }
+  }
 
   // Tile size for B; B must be a multiple of this block size.
   static const Index kBTileRow = 16;
   static const Index kBTileCol = 8;
 
-  static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+  SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+    PrepareBFor8(input, output, ssse3::QuantizeTile8(quant_mult), rows, cols);
+  }
 
-  static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+  SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+    SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+  }
 
-  static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+  SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+    Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
+  }
 
-  static const char *const kName;
+  constexpr static const char *const kName = "8-bit SSSE3";
 
   static const CPUType kUses = CPU_SSSE3;
 };
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 301254a..8d2f50d 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -1,4 +1,4 @@
-#include "avx512_gemm.h"
+//#include "avx512_gemm.h"
 #include "avx2_gemm.h"
 #include "ssse3_gemm.h"
 #include "sse2_gemm.h"
@@ -73,7 +73,7 @@ TEST_CASE("Transpose 16", "[transpose]") {
   }
 }
 
-TEST_CASE("Transpose 8", "[transpose]") {
+SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
   if (kCPU < CPU_SSSE3) return;
   AlignedVector<int8_t> input(16 * 16);
   for (int i = 0; i < 16 * 16; ++i) {
@@ -127,7 +127,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
     "Quantized Input" << '\n' << PrintMatrix(quantized.get(), rows, cols) <<
     "Reference" << '\n' << PrintMatrix(reference.get(), rows, cols) <<
     "Routine" << '\n' << PrintMatrix(test.get(), rows, cols));
 }
-
+/*
 TEST_CASE("Prepare AVX512", "[prepare]") {
   if (kCPU < CPU_AVX512BW) return;
 #ifndef INTGEMM_NO_AVX512
@@ -137,7 +137,7 @@ TEST_CASE("Prepare AVX512", "[prepare]") {
   TestPrepare<AVX512_16bit>(256, 32);
 #endif
 }
-
+*/
 TEST_CASE("Prepare AVX2", "[prepare]") {
   if (kCPU < CPU_AVX2) return;
   TestPrepare<AVX2_8bit>(64, 32);
@@ -192,7 +192,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
   CHECK_MESSAGE(memcmp(ref.get(), test.get(), sizeof(Integer) * rows * kSelectCols) == 0,
     "Reference:\n" << PrintMatrix(ref.get(), rows, kSelectCols) << PrintMatrix(test.get(), rows, kSelectCols));
 }
-
+/*
TEST_CASE("SelectColumnsB AVX512", "[select]") { if (kCPU < CPU_AVX512BW) return; #ifndef INTGEMM_NO_AVX512 @@ -200,7 +200,7 @@ TEST_CASE("SelectColumnsB AVX512", "[select]") { TestSelectColumnsB<AVX512_16bit>(256, 256); #endif } - +*/ TEST_CASE("SelectColumnsB AVX2", "[select]") { if (kCPU < CPU_AVX2) return; TestSelectColumnsB<AVX2_8bit>(256, 256); @@ -411,7 +411,7 @@ TEST_CASE ("Multiply AVX2 16bit", "[multiply]") { TestMultiply<AVX2_16bit>(248, 256, 256, .1, 1, 0.01); TestMultiply<AVX2_16bit>(200, 256, 256, .1, 1, 0.01); } - +/* #ifndef INTGEMM_NO_AVX512 TEST_CASE ("Multiply AVX512 8bit", "[multiply]") { if (kCPU < CPU_AVX512BW) return; @@ -433,6 +433,7 @@ TEST_CASE ("Multiply AVX2 16bit", "[multiply]") { TestMultiply<AVX512_16bit>(200, 256, 256, .1, 1, 0.01); } #endif + */ } // namespace intgemm int main(int argc, char ** argv) { @@ -1,4 +1,5 @@ #pragma once +#include <exception> #define DEFAULT __attribute__ ((target ("default"))) #define SSE2 __attribute__ ((target ("sse2"))) @@ -6,10 +7,22 @@ #define SSSE3 __attribute__ ((target ("ssse3"))) #define AVX2 __attribute__ ((target ("avx2"))) //#define AVX2_512F __attribute__ ((target ("avx2"), target("avx512f"))) //Not supported by clang -#define AVX512F __attribute__ ((target ("avx512f"))) +#define AVX512F __attribute__ ((target ("avx512bw"))) namespace intgemm { +// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3). +class UnsupportedCPU : public std::exception { + public: + UnsupportedCPU() {} + + ~UnsupportedCPU() throw() {} + + const char *what() const throw() override { + return "Integer matrix multiplication has not been efficiently implemented for your CPU."; + } +}; + typedef unsigned int Index; // If you want to detect the CPU and dispatch yourself, here's what to use: |