github.com/marian-nmt/intgemm/intgemm.git
Diffstat (limited to 'intgemm')
 intgemm/avx2_gemm.h       | 34
 intgemm/avx512_gemm.h     |  4
 intgemm/avx512vnni_gemm.h |  6
 intgemm/intgemm.cc        | 64
 intgemm/intgemm.h         | 26
 intgemm/sse2_gemm.h       |  4
 intgemm/ssse3_gemm.h      |  8
 intgemm/stats.inl         |  6
 intgemm/types.h           | 20
 9 files changed, 86 insertions(+), 86 deletions(-)
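
This patch is a pure mechanical rename: intgemm's internal architecture namespaces move from lowercase to uppercase (avx2 -> AVX2, avx512bw -> AVX512BW, avx512vnni -> AVX512VNNI, sse2 -> SSE2, ssse3 -> SSSE3), which is why insertions and deletions balance exactly. As a minimal sketch of the effect on a call site (the Quantize entry point appears in the diff below; the argument names are illustrative):

    // Before this patch:
    //   intgemm::avx2::Kernels8::Quantize(input, output, quant_mult, size);
    // After it:
    //   intgemm::AVX2::Kernels8::Quantize(input, output, quant_mult, size);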
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h
index 6e01679..d93ac8e 100644
--- a/intgemm/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -13,7 +13,7 @@
#include <cstring>
namespace intgemm {
-namespace avx2 {
+namespace AVX2 {
INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
@@ -73,14 +73,14 @@ struct Kernels16 {
static const Index kBTileCol = 8;
/*
INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
+ PrepareBFor16(input, output, AVX2::QuantizeTile16(quant_mult), rows, cols);
}*/
- INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
+ INTGEMM_PREPARE_B_16(INTGEMM_AVX2, AVX2::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile16, int16_t)
INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
+ AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
@@ -129,10 +129,10 @@ class QuantizeTile8 {
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
- __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
- __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
- __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
+ __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -155,10 +155,10 @@ class QuantizeTile8 {
const __m256i pos127 = _mm256_set1_epi8(127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
- __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
- __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
- __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
+ __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -207,12 +207,12 @@ struct Kernels8 {
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
- INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
+ INTGEMM_PREPARE_B_8(INTGEMM_AVX2, AVX2::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile8, int8_t)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
+ AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
@@ -226,7 +226,7 @@ struct Kernels8 {
static const CPUType kUses = CPUType::AVX2;
};
-} // namespace avx2
+} // namespace AVX2
} // namespace intgemm
#endif
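
For orientation, QuantizerGrab is the load-scale-round step that the QuantizeTile kernels above build on, and the packs/permute pair finishes the narrowing. A standalone sketch of the same steps, assuming AVX2 and the default round-to-nearest mode (kernels::quantize is the library's wrapper; the intrinsics below are an inlined equivalent, and you would compile with -mavx2 where intgemm instead tags functions with INTGEMM_AVX2):

    #include <immintrin.h>

    // Load 8 floats, scale by the quantization multiplier, round to int32.
    inline __m256i quantizer_grab_sketch(const float *input, __m256 quant_mult) {
      return _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(input), quant_mult));
    }

    // The 8-bit tile then saturates 32->16 and 16->8 bits:
    inline __m256i pack_sketch(__m256i g0, __m256i g1, __m256i g2, __m256i g3) {
      __m256i packed = _mm256_packs_epi16(_mm256_packs_epi32(g0, g1),
                                          _mm256_packs_epi32(g2, g3));
      // AVX2 packs within 128-bit lanes; the shuffle_param permute seen in
      // the diff above restores sequential order across lanes.
      return _mm256_permutevar8x32_epi32(
          packed, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
    }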
diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h
index f9fb1eb..a69b2dc 100644
--- a/intgemm/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -31,7 +31,7 @@ namespace intgemm {
// So conversion in memory uses these, but I also implement a wider version for
// rearranging B.
-namespace avx512bw {
+namespace AVX512BW {
// Load from memory, multiply, and convert to int32_t.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
@@ -405,7 +405,7 @@ struct Kernels8 {
static const CPUType kUses = CPUType::AVX512BW;
};
-} // namespace avx512bw
+} // namespace AVX512BW
} // namespace intgemm
#endif
diff --git a/intgemm/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h
index c660168..747bdf9 100644
--- a/intgemm/avx512vnni_gemm.h
+++ b/intgemm/avx512vnni_gemm.h
@@ -7,7 +7,7 @@
#include "types.h"
namespace intgemm {
-namespace avx512vnni {
+namespace AVX512VNNI {
// Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
@@ -18,7 +18,7 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
#endif
}
-struct Kernels8 : public avx512bw::Kernels8 {
+struct Kernels8 : public AVX512BW::Kernels8 {
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
@@ -162,7 +162,7 @@ struct Kernels8 : public avx512bw::Kernels8 {
static const CPUType kUses = CPUType::AVX512VNNI;
};
-} // namespace avx512vnni
+} // namespace AVX512VNNI
} // namespace intgemm
#endif
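
The VNNI8 helper whose signature appears above fuses the multiply-accumulate into one vpdpbusds instruction; the linked GCC bug makes the intrinsic emit an extra vmovdqa64. A hedged sketch of that workaround shape (the exact asm constraints in intgemm may differ; compile with -mavx512vnni):

    #include <immintrin.h>

    // c += per-32-bit-lane dot product of unsigned bytes of a with signed
    // bytes of b (vpdpbusds, saturating).
    static inline void vnni8_sketch(__m512i &c, __m512i a, __m512i b) {
    #if defined(__GNUC__) && !defined(__clang__)
      // Plain asm sidesteps the extra register move from GCC bug 94663.
      asm ("vpdpbusds %2, %1, %0" : "+x"(c) : "x"(a), "mx"(b));
    #else
      c = _mm512_dpbusds_epi32(c, a, b);
    #endif
    }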
diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc
index d45cf60..82ad750 100644
--- a/intgemm/intgemm.cc
+++ b/intgemm/intgemm.cc
@@ -11,69 +11,69 @@ MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/
throw UnsupportedCPU();
}
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize);
+void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize);
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
+void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512BW::Kernels16::PrepareB, AVX512BW::Kernels16::PrepareB, AVX2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
-void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
+void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
-void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
+void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBTransposed, AVX512BW::Kernels16::PrepareBTransposed, AVX2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
+void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512BW::Kernels16::SelectColumnsB, AVX512BW::Kernels16::SelectColumnsB, AVX2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName);
+const char *const Int16::kName = ChooseCPU(AVX512BW::Kernels16::kName, AVX512BW::Kernels16::kName, AVX2::Kernels16::kName, SSE2::Kernels16::kName, SSE2::Kernels16::kName, Unsupported_16bit::kName);
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
+void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::Quantize, AVX512BW::Kernels8::Quantize, AVX2::Kernels8::Quantize, SSSE3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
+void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
+void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI::Kernels8::PrepareB, AVX512BW::Kernels8::PrepareB, AVX2::Kernels8::PrepareB, SSSE3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
+void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX2::Kernels8::PrepareBQuantizedTransposed, SSSE3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
-void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
+void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBTransposed, AVX512BW::Kernels8::PrepareBTransposed, AVX2::Kernels8::PrepareBTransposed, SSSE3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
+void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI::Kernels8::SelectColumnsB, AVX512BW::Kernels8::SelectColumnsB, AVX2::Kernels8::SelectColumnsB, SSSE3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+const char *const Int8::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
+void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2)
-namespace avx2{
-using sse2::MaxAbsolute;
-using sse2::VectorMeanStd;
-} // namespace avx2
+namespace AVX2{
+using SSE2::MaxAbsolute;
+using SSE2::VectorMeanStd;
+} // namespace AVX2
#endif
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
-namespace avx512bw {
-using avx2::MaxAbsolute;
-using avx2::VectorMeanStd;
-} // namespace avx512bw
+namespace AVX512BW {
+using AVX2::MaxAbsolute;
+using AVX2::VectorMeanStd;
+} // namespace AVX512BW
#endif
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
+float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512BW::MaxAbsolute, AVX512BW::MaxAbsolute, AVX2::MaxAbsolute, SSE2::MaxAbsolute, SSE2::MaxAbsolute, Unsupported_MaxAbsolute);
-MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd);
+MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(AVX512BW::VectorMeanStd, AVX512BW::VectorMeanStd, AVX2::VectorMeanStd, SSE2::VectorMeanStd, SSE2::VectorMeanStd, Unsupported_VectorMeanStd);
constexpr const char *const Unsupported_16bit::kName;
constexpr const char *const Unsupported_8bit::kName;
-constexpr const char *const sse2::Kernels16::kName;
-constexpr const char *const ssse3::Kernels8::kName;
+constexpr const char *const SSE2::Kernels16::kName;
+constexpr const char *const SSSE3::Kernels8::kName;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
-constexpr const char *const avx2::Kernels8::kName;
-constexpr const char *const avx2::Kernels16::kName;
+constexpr const char *const AVX2::Kernels8::kName;
+constexpr const char *const AVX2::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-constexpr const char *const avx512bw::Kernels8::kName;
-constexpr const char *const avx512bw::Kernels16::kName;
+constexpr const char *const AVX512BW::Kernels8::kName;
+constexpr const char *const AVX512BW::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-constexpr const char *const avx512vnni::Kernels8::kName;
+constexpr const char *const AVX512VNNI::Kernels8::kName;
#endif
}
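
Every ChooseCPU call in this file follows one convention: candidates are listed from newest ISA to oldest (AVX512VNNI, AVX512BW, AVX2, SSSE3, SSE2, then an Unsupported_* fallback), and the first one the running CPU supports wins. ChooseCPU itself is defined elsewhere in the library; the following only illustrates that contract, with the detected tier passed in explicitly:

    // Hypothetical sketch of first-supported-wins selection.
    template <class F> F choose_cpu_sketch(CPUType detected, F vnni, F bw,
                                           F avx2, F ssse3, F sse2, F none) {
      switch (detected) {
        case CPUType::AVX512VNNI: return vnni;
        case CPUType::AVX512BW:   return bw;
        case CPUType::AVX2:       return avx2;
        case CPUType::SSSE3:      return ssse3;
        case CPUType::SSE2:       return sse2;
        default:                  return none;
      }
    }

Note how the 16-bit tables above pass the AVX512BW kernel in the VNNI slot and the SSE2 kernel in the SSSE3 slot: a tier without a dedicated kernel simply reuses the next one that works.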
diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h
index a354b60..029a8ec 100644
--- a/intgemm/intgemm.h
+++ b/intgemm/intgemm.h
@@ -127,21 +127,21 @@ struct Unsupported_8bit {
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
// These won't ever be called in this capacity, but it does let the code below compile.
-namespace avx512vnni {
+namespace AVX512VNNI {
typedef Unsupported_8bit Kernels8;
-} // namespace avx512vnni
+} // namespace AVX512VNNI
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-namespace avx512bw {
+namespace AVX512BW {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
-} // namespace avx512bw
+} // namespace AVX512BW
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX2
-namespace avx2 {
+namespace AVX2 {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
-} // namespace avx2
+} // namespace AVX2
#endif
@@ -309,7 +309,7 @@ private:
};
template <typename Callback>
-void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512vnni::Kernels8>, OMPParallelWrap<Callback, avx512bw::Kernels8>, OMPParallelWrap<Callback, avx2::Kernels8>, OMPParallelWrap<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
+void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI::Kernels8>, OMPParallelWrap<Callback, AVX512BW::Kernels8>, OMPParallelWrap<Callback, AVX2::Kernels8>, OMPParallelWrap<Callback, SSSE3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
/*
* 8-bit matrix multiplication with shifting A by 127
@@ -373,14 +373,14 @@ private:
template <class Callback>
void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
- OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>,
- OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>,
- OMPParallelWrap8Shift<Callback, avx2::Kernels8>,
- OMPParallelWrap8Shift<Callback, ssse3::Kernels8>,
+ OMPParallelWrap8Shift<Callback, AVX512VNNI::Kernels8>,
+ OMPParallelWrap8Shift<Callback, AVX512BW::Kernels8>,
+ OMPParallelWrap8Shift<Callback, AVX2::Kernels8>,
+ OMPParallelWrap8Shift<Callback, SSSE3::Kernels8>,
Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>);
template <class Callback>
-void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias<Callback>, avx512bw::Kernels8::PrepareBias<Callback>, avx2::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
+void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI::Kernels8::PrepareBias<Callback>, AVX512BW::Kernels8::PrepareBias<Callback>, AVX2::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
/*
* 16-bit matrix multiplication
@@ -436,7 +436,7 @@ private:
};
template <typename Callback>
-void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, avx512bw::Kernels16>, OMPParallelWrap<Callback, avx2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
+void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512BW::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512BW::Kernels16>, OMPParallelWrap<Callback, AVX2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
extern const CPUType kCPU;
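
One subtlety in the templated entries above: dispatch happens per Callback instantiation. Each MultiplyImpl<Callback> owns a static function pointer, initialized via ChooseCPU at static-initialization time, so every callback type gets its own CPU-selected entry point. Stripped to the idiom (names are placeholders, and the nullptr stands in for the ChooseCPU expressions in the diff):

    #include <cstdint>

    template <class Callback> struct MultiplyImplSketch {
      static void (*run)(const std::int8_t *A, const std::int8_t *B, Callback cb);
    };

    // One pointer per instantiation; intgemm initializes it with ChooseCPU(...).
    template <class Callback>
    void (*MultiplyImplSketch<Callback>::run)(const std::int8_t *,
                                              const std::int8_t *, Callback) = nullptr;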
diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h
index cd49efe..cd855a6 100644
--- a/intgemm/sse2_gemm.h
+++ b/intgemm/sse2_gemm.h
@@ -9,7 +9,7 @@
// 8 bit is in ssse3_gemm.h
namespace intgemm {
-namespace sse2 {
+namespace SSE2 {
INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
@@ -80,5 +80,5 @@ struct Kernels16 {
static const CPUType kUses = CPUType::SSE2;
};
-} // namespace sse2
+} // namespace SSE2
} // namespace intgemm
diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h
index 865fe12..db403bd 100644
--- a/intgemm/ssse3_gemm.h
+++ b/intgemm/ssse3_gemm.h
@@ -11,7 +11,7 @@
// 16-bit is in sse2_gemm.h
namespace intgemm {
-namespace ssse3 {
+namespace SSSE3 {
INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
@@ -131,12 +131,12 @@ struct Kernels8 {
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
- INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
+ INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, SSSE3::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+ SSSE3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
@@ -150,5 +150,5 @@ struct Kernels8 {
static const CPUType kUses = CPUType::SSSE3;
};
-} // namespace ssse3
+} // namespace SSSE3
} // namespace intgemm
diff --git a/intgemm/stats.inl b/intgemm/stats.inl
index d6a850e..68a5b8e 100644
--- a/intgemm/stats.inl
+++ b/intgemm/stats.inl
@@ -1,12 +1,12 @@
/* This file is included multiple times, once per architecture. */
#if defined(INTGEMM_THIS_IS_AVX512DQ)
-#define INTGEMM_ARCH avx512bw
+#define INTGEMM_ARCH AVX512BW
#define INTGEMM_TARGET INTGEMM_AVX512DQ
#elif defined(INTGEMM_THIS_IS_AVX2)
-#define INTGEMM_ARCH avx2
+#define INTGEMM_ARCH AVX2
#define INTGEMM_TARGET INTGEMM_AVX2
#elif defined(INTGEMM_THIS_IS_SSE2)
-#define INTGEMM_ARCH sse2
+#define INTGEMM_ARCH SSE2
#define INTGEMM_TARGET INTGEMM_SSE2
#else
#error Included with unexpected architecture
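
stats.inl is compiled through the include-per-architecture pattern: the same file is textually included once per ISA, each time with a different INTGEMM_THIS_IS_* macro defined, and INTGEMM_ARCH/INTGEMM_TARGET steer its definitions into the matching namespace with the matching target attribute. A sketch of an include site under that assumption (the real one lives in the library's headers):

    // Inside stats.inl, definitions are wrapped roughly as:
    //   namespace intgemm { namespace INTGEMM_ARCH { /* INTGEMM_TARGET fns */ } }
    #define INTGEMM_THIS_IS_AVX2
    #include "intgemm/stats.inl"   // emits intgemm::AVX2::VectorMeanStd etc.
    #undef INTGEMM_THIS_IS_AVX2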
diff --git a/intgemm/types.h b/intgemm/types.h
index 174f1ff..602130a 100644
--- a/intgemm/types.h
+++ b/intgemm/types.h
@@ -70,30 +70,30 @@ struct MeanStd {
};
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-namespace avx512vnni {
+namespace AVX512VNNI {
typedef __m512i Register;
typedef __m512 FRegister;
-} // namespace avx512vnni
+} // namespace AVX512VNNI
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-namespace avx512bw {
+namespace AVX512BW {
typedef __m512i Register;
typedef __m512 FRegister;
-} // namespace avx512bw
+} // namespace AVX512BW
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
-namespace avx2 {
+namespace AVX2 {
typedef __m256i Register;
typedef __m256 FRegister;
-} // namespace avx2
+} // namespace AVX2
#endif
-namespace ssse3 {
+namespace SSSE3 {
typedef __m128i Register;
typedef __m128 FRegister;
-} // namespace ssse3
-namespace sse2 {
+} // namespace SSSE3
+namespace SSE2 {
typedef __m128i Register;
typedef __m128 FRegister;
-} // namespace sse2
+} // namespace SSE2
} // namespace intgemm
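
Finally, the types.h hunk shows why the rename could be purely mechanical: every architecture namespace exposes the same two typedefs, Register (the integer SIMD type) and FRegister (the float one), so shared kernel code written against those names compiles unchanged at any width. Schematically (the two namespaces below are copied from the diff; the comment shows the intended use):

    #include <immintrin.h>

    namespace SSE2 { typedef __m128i Register; typedef __m128 FRegister; }
    namespace AVX2 { typedef __m256i Register; typedef __m256 FRegister; }

    // A shared body such as
    //   INTGEMM_TARGET static void Quantize(const float *in, Register *out, ...);
    // means 128-bit registers when compiled inside SSE2 and 256-bit inside AVX2.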