diff options
Diffstat (limited to 'intgemm')
-rw-r--r-- | intgemm/avx2_gemm.h       | 34 |
-rw-r--r-- | intgemm/avx512_gemm.h     |  4 |
-rw-r--r-- | intgemm/avx512vnni_gemm.h |  6 |
-rw-r--r-- | intgemm/intgemm.cc        | 64 |
-rw-r--r-- | intgemm/intgemm.h         | 26 |
-rw-r--r-- | intgemm/sse2_gemm.h       |  4 |
-rw-r--r-- | intgemm/ssse3_gemm.h      |  8 |
-rw-r--r-- | intgemm/stats.inl         |  6 |
-rw-r--r-- | intgemm/types.h           | 20 |
9 files changed, 86 insertions, 86 deletions
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h index 6e01679..d93ac8e 100644 --- a/intgemm/avx2_gemm.h +++ b/intgemm/avx2_gemm.h @@ -13,7 +13,7 @@ #include <cstring> namespace intgemm { -namespace avx2 { +namespace AVX2 { INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) { return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg); @@ -73,14 +73,14 @@ struct Kernels16 { static const Index kBTileCol = 8; /* INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols); + PrepareBFor16(input, output, AVX2::QuantizeTile16(quant_mult), rows, cols); }*/ - INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16) + INTGEMM_PREPARE_B_16(INTGEMM_AVX2, AVX2::QuantizeTile16) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile16, int16_t) INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); + AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); } INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2) @@ -129,10 +129,10 @@ class QuantizeTile8 { const __m256i neg127 = _mm256_set1_epi8(-127); const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); // Grab 4 registers at a time in 32-bit format. 
- __m256i g0 = avx2::QuantizerGrab(input0, quant_mult); - __m256i g1 = avx2::QuantizerGrab(input1, quant_mult); - __m256i g2 = avx2::QuantizerGrab(input2, quant_mult); - __m256i g3 = avx2::QuantizerGrab(input3, quant_mult); + __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult); + __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult); + __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult); + __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult); // Pack 32-bit to 16-bit. __m256i packed0 = _mm256_packs_epi32(g0, g1); __m256i packed1 = _mm256_packs_epi32(g2, g3); @@ -155,10 +155,10 @@ class QuantizeTile8 { const __m256i pos127 = _mm256_set1_epi8(127); const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); // Grab 4 registers at a time in 32-bit format. - __m256i g0 = avx2::QuantizerGrab(input0, quant_mult); - __m256i g1 = avx2::QuantizerGrab(input1, quant_mult); - __m256i g2 = avx2::QuantizerGrab(input2, quant_mult); - __m256i g3 = avx2::QuantizerGrab(input3, quant_mult); + __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult); + __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult); + __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult); + __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult); // Pack 32-bit to 16-bit. 
__m256i packed0 = _mm256_packs_epi32(g0, g1); __m256i packed1 = _mm256_packs_epi32(g2, g3); @@ -207,12 +207,12 @@ struct Kernels8 { static const Index kBTileRow = 32; static const Index kBTileCol = 8; - INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8) + INTGEMM_PREPARE_B_8(INTGEMM_AVX2, AVX2::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile8, int8_t) INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); + AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); } INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2) @@ -226,7 +226,7 @@ struct Kernels8 { static const CPUType kUses = CPUType::AVX2; }; -} // namespace avx2 +} // namespace AVX2 } // namespace intgemm #endif diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h index f9fb1eb..a69b2dc 100644 --- a/intgemm/avx512_gemm.h +++ b/intgemm/avx512_gemm.h @@ -31,7 +31,7 @@ namespace intgemm { // So conversion in memory uses these, but I also implement a wider version for // rearranging B. -namespace avx512bw { +namespace AVX512BW { // Load from memory, multiply, and convert to int32_t. 
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ @@ -405,7 +405,7 @@ struct Kernels8 { static const CPUType kUses = CPUType::AVX512BW; }; -} // namespace avx512bw +} // namespace AVX512BW } // namespace intgemm #endif diff --git a/intgemm/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h index c660168..747bdf9 100644 --- a/intgemm/avx512vnni_gemm.h +++ b/intgemm/avx512vnni_gemm.h @@ -7,7 +7,7 @@ #include "types.h" namespace intgemm { -namespace avx512vnni { +namespace AVX512VNNI { // Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663 INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { @@ -18,7 +18,7 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { #endif } -struct Kernels8 : public avx512bw::Kernels8 { +struct Kernels8 : public AVX512BW::Kernels8 { template <typename Callback> INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { assert(width % sizeof(Register) == 0); @@ -162,7 +162,7 @@ struct Kernels8 : public avx512bw::Kernels8 { static const CPUType kUses = CPUType::AVX512VNNI; }; -} // namespace avx512vnni +} // namespace AVX512VNNI } // namespace intgemm #endif diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc index d45cf60..82ad750 100644 --- a/intgemm/intgemm.cc +++ b/intgemm/intgemm.cc @@ -11,69 +11,69 @@ MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/ throw UnsupportedCPU(); } -void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize); +void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, 
AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize); -void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB); +void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512BW::Kernels16::PrepareB, AVX512BW::Kernels16::PrepareB, AVX2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, Unsupported_16bit::PrepareB); -void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); +void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); -void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); +void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, 
Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBTransposed, AVX512BW::Kernels16::PrepareBTransposed, AVX2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); -void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB); +void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512BW::Kernels16::SelectColumnsB, AVX512BW::Kernels16::SelectColumnsB, AVX2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB); -const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName); +const char *const Int16::kName = ChooseCPU(AVX512BW::Kernels16::kName, AVX512BW::Kernels16::kName, AVX2::Kernels16::kName, SSE2::Kernels16::kName, SSE2::Kernels16::kName, Unsupported_16bit::kName); -void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); +void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::Quantize, AVX512BW::Kernels8::Quantize, AVX2::Kernels8::Quantize, SSSE3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); -void (*Int8::QuantizeU)(const float *input, uint8_t *output, 
float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); +void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); -void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); +void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI::Kernels8::PrepareB, AVX512BW::Kernels8::PrepareB, AVX2::Kernels8::PrepareB, SSSE3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); -void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); +void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX2::Kernels8::PrepareBQuantizedTransposed, SSSE3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); -void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index 
inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); +void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBTransposed, AVX512BW::Kernels8::PrepareBTransposed, AVX2::Kernels8::PrepareBTransposed, SSSE3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); -void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); +void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI::Kernels8::SelectColumnsB, AVX512BW::Kernels8::SelectColumnsB, AVX2::Kernels8::SelectColumnsB, SSSE3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); -const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); +const char *const Int8::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); -void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, 
Unsupported_8bit::QuantizeU); +void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); -const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); +const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED); #if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2) -namespace avx2{ -using sse2::MaxAbsolute; -using sse2::VectorMeanStd; -} // namespace avx2 +namespace AVX2{ +using SSE2::MaxAbsolute; +using SSE2::VectorMeanStd; +} // namespace AVX2 #endif #if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW) -namespace avx512bw { -using avx2::MaxAbsolute; -using avx2::VectorMeanStd; -} // namespace avx512bw +namespace AVX512BW { +using AVX2::MaxAbsolute; +using AVX2::VectorMeanStd; +} // namespace AVX512BW #endif -float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute); +float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512BW::MaxAbsolute, AVX512BW::MaxAbsolute, AVX2::MaxAbsolute, SSE2::MaxAbsolute, SSE2::MaxAbsolute, Unsupported_MaxAbsolute); -MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd); +MeanStd 
(*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(AVX512BW::VectorMeanStd, AVX512BW::VectorMeanStd, AVX2::VectorMeanStd, SSE2::VectorMeanStd, SSE2::VectorMeanStd, Unsupported_VectorMeanStd); constexpr const char *const Unsupported_16bit::kName; constexpr const char *const Unsupported_8bit::kName; -constexpr const char *const sse2::Kernels16::kName; -constexpr const char *const ssse3::Kernels8::kName; +constexpr const char *const SSE2::Kernels16::kName; +constexpr const char *const SSSE3::Kernels8::kName; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 -constexpr const char *const avx2::Kernels8::kName; -constexpr const char *const avx2::Kernels16::kName; +constexpr const char *const AVX2::Kernels8::kName; +constexpr const char *const AVX2::Kernels16::kName; #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -constexpr const char *const avx512bw::Kernels8::kName; -constexpr const char *const avx512bw::Kernels16::kName; +constexpr const char *const AVX512BW::Kernels8::kName; +constexpr const char *const AVX512BW::Kernels16::kName; #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI -constexpr const char *const avx512vnni::Kernels8::kName; +constexpr const char *const AVX512VNNI::Kernels8::kName; #endif } diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h index a354b60..029a8ec 100644 --- a/intgemm/intgemm.h +++ b/intgemm/intgemm.h @@ -127,21 +127,21 @@ struct Unsupported_8bit { #ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI // These won't ever be called in this capacity, but it does let the code below compile. 
-namespace avx512vnni { +namespace AVX512VNNI { typedef Unsupported_8bit Kernels8; -} // namespace avx512vnni +} // namespace AVX512VNNI #endif #ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW -namespace avx512bw { +namespace AVX512BW { typedef Unsupported_8bit Kernels8; typedef Unsupported_16bit Kernels16; -} // namespace avx512bw +} // namespace AVX512BW #endif #ifndef INTGEMM_COMPILER_SUPPORTS_AVX2 -namespace avx2 { +namespace AVX2 { typedef Unsupported_8bit Kernels8; typedef Unsupported_16bit Kernels16; -} // namespace avx2 +} // namespace AVX2 #endif @@ -309,7 +309,7 @@ private: }; template <typename Callback> -void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512vnni::Kernels8>, OMPParallelWrap<Callback, avx512bw::Kernels8>, OMPParallelWrap<Callback, avx2::Kernels8>, OMPParallelWrap<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>); +void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI::Kernels8>, OMPParallelWrap<Callback, AVX512BW::Kernels8>, OMPParallelWrap<Callback, AVX2::Kernels8>, OMPParallelWrap<Callback, SSSE3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>); /* * 8-bit matrix multiplication with shifting A by 127 @@ -373,14 +373,14 @@ private: template <class Callback> void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU( - OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>, - OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>, - OMPParallelWrap8Shift<Callback, avx2::Kernels8>, - OMPParallelWrap8Shift<Callback, ssse3::Kernels8>, + OMPParallelWrap8Shift<Callback, AVX512VNNI::Kernels8>, + 
OMPParallelWrap8Shift<Callback, AVX512BW::Kernels8>, + OMPParallelWrap8Shift<Callback, AVX2::Kernels8>, + OMPParallelWrap8Shift<Callback, SSSE3::Kernels8>, Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>); template <class Callback> -void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias<Callback>, avx512bw::Kernels8::PrepareBias<Callback>, avx2::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); +void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI::Kernels8::PrepareBias<Callback>, AVX512BW::Kernels8::PrepareBias<Callback>, AVX2::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); /* * 16-bit matrix multiplication @@ -436,7 +436,7 @@ private: }; template <typename Callback> -void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, avx512bw::Kernels16>, OMPParallelWrap<Callback, avx2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, Unsupported_16bit::Multiply<Callback>); +void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512BW::Kernels16> /*TODO VNNI 16-bit. 
*/, OMPParallelWrap<Callback, AVX512BW::Kernels16>, OMPParallelWrap<Callback, AVX2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, Unsupported_16bit::Multiply<Callback>); extern const CPUType kCPU; diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h index cd49efe..cd855a6 100644 --- a/intgemm/sse2_gemm.h +++ b/intgemm/sse2_gemm.h @@ -9,7 +9,7 @@ // 8 bit is in ssse3_gemm.h namespace intgemm { -namespace sse2 { +namespace SSE2 { INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg); @@ -80,5 +80,5 @@ struct Kernels16 { static const CPUType kUses = CPUType::SSE2; }; -} // namespace sse2 +} // namespace SSE2 } // namespace intgemm diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h index 865fe12..db403bd 100644 --- a/intgemm/ssse3_gemm.h +++ b/intgemm/ssse3_gemm.h @@ -11,7 +11,7 @@ // 16-bit is in sse2_gemm.h namespace intgemm { -namespace ssse3 { +namespace SSSE3 { INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg); @@ -131,12 +131,12 @@ struct Kernels8 { static const Index kBTileRow = 16; static const Index kBTileCol = 8; - INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8) + INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, SSSE3::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t) INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t) INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); + SSSE3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); } INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2) @@ -150,5 +150,5 @@ struct Kernels8 { static 
const CPUType kUses = CPUType::SSSE3; }; -} // namespace ssse3 +} // namespace SSSE3 } // namespace intgemm diff --git a/intgemm/stats.inl b/intgemm/stats.inl index d6a850e..68a5b8e 100644 --- a/intgemm/stats.inl +++ b/intgemm/stats.inl @@ -1,12 +1,12 @@ /* This file is included multiple times, once per architecture. */ #if defined(INTGEMM_THIS_IS_AVX512DQ) -#define INTGEMM_ARCH avx512bw +#define INTGEMM_ARCH AVX512BW #define INTGEMM_TARGET INTGEMM_AVX512DQ #elif defined(INTGEMM_THIS_IS_AVX2) -#define INTGEMM_ARCH avx2 +#define INTGEMM_ARCH AVX2 #define INTGEMM_TARGET INTGEMM_AVX2 #elif defined(INTGEMM_THIS_IS_SSE2) -#define INTGEMM_ARCH sse2 +#define INTGEMM_ARCH SSE2 #define INTGEMM_TARGET INTGEMM_SSE2 #else #error Included with unexpected architecture diff --git a/intgemm/types.h b/intgemm/types.h index 174f1ff..602130a 100644 --- a/intgemm/types.h +++ b/intgemm/types.h @@ -70,30 +70,30 @@ struct MeanStd { }; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI -namespace avx512vnni { +namespace AVX512VNNI { typedef __m512i Register; typedef __m512 FRegister; -} // namespace avx512vnni +} // namespace AVX512VNNI #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -namespace avx512bw { +namespace AVX512BW { typedef __m512i Register; typedef __m512 FRegister; -} // namespace avx512bw +} // namespace AVX512BW #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 -namespace avx2 { +namespace AVX2 { typedef __m256i Register; typedef __m256 FRegister; -} // namespace avx2 +} // namespace AVX2 #endif -namespace ssse3 { +namespace SSSE3 { typedef __m128i Register; typedef __m128 FRegister; -} // namespace ssse3 -namespace sse2 { +} // namespace SSSE3 +namespace SSE2 { typedef __m128i Register; typedef __m128 FRegister; -} // namespace sse2 +} // namespace SSE2 } // namespace intgemm |