From ed081ba03efb2826a082038404d10b044a45f27e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 25 Nov 2020 13:19:48 +0000 Subject: Change architecture namespace to capitals. Matches static branch, fewer defines for inline files --- benchmarks/benchmark.cc | 28 ++--- benchmarks/benchmark_quantizer.cc | 6 +- benchmarks/biasmultiply.cc | 168 +++++++++++++------------- intgemm/avx2_gemm.h | 34 +++--- intgemm/avx512_gemm.h | 4 +- intgemm/avx512vnni_gemm.h | 6 +- intgemm/intgemm.cc | 64 +++++----- intgemm/intgemm.h | 26 ++-- intgemm/sse2_gemm.h | 4 +- intgemm/ssse3_gemm.h | 8 +- intgemm/stats.inl | 6 +- intgemm/types.h | 20 ++-- test/add127_test.cc | 210 ++++++++++++++++---------------- test/multiply_test.cc | 212 ++++++++++++++++----------------- test/prepare_b_quantized_transposed.cc | 12 +- test/prepare_b_transposed.cc | 12 +- test/quantize_test.cc | 84 ++++++------- 17 files changed, 452 insertions(+), 452 deletions(-) diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 2dbe483..512d3ec 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -145,46 +145,46 @@ int main(int, char ** argv) { std::cerr << "SSSE3 8bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; - RunAll(matrices, end, stats.ssse3_8bit); + RunAll(matrices, end, stats.ssse3_8bit); } std::cerr << "SSE2 16bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; - RunAll(matrices, end, stats.sse2_16bit); + RunAll(matrices, end, stats.sse2_16bit); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 std::cerr << "AVX2 8bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; - RunAll(matrices, end, stats.avx2_8bit); + RunAll(matrices, end, stats.avx2_8bit); } std::cerr << "AVX2 16bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; - RunAll(matrices, end, stats.avx2_16bit); + RunAll(matrices, end, stats.avx2_16bit); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW std::cerr << "AVX512 8bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; - RunAll(matrices, end, stats.avx512_8bit); + RunAll(matrices, end, stats.avx512_8bit); } std::cerr << "AVX512 16bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; - RunAll(matrices, end, stats.avx512_16bit); + RunAll(matrices, end, stats.avx512_16bit); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? 
matrices_end : full_sample; - RunAll(matrices, end, stats.avx512vnni_8bit); + RunAll(matrices, end, stats.avx512vnni_8bit); } #endif @@ -194,18 +194,18 @@ int main(int, char ** argv) { } for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) { std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n'; - Print(stats.ssse3_8bit, i); - Print(stats.avx2_8bit, i); + Print(stats.ssse3_8bit, i); + Print(stats.avx2_8bit, i); #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - Print(stats.avx512_8bit, i); + Print(stats.avx512_8bit, i); #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI - Print(stats.avx512vnni_8bit, i); + Print(stats.avx512vnni_8bit, i); #endif - Print(stats.sse2_16bit, i); - Print(stats.avx2_16bit, i); + Print(stats.sse2_16bit, i); + Print(stats.avx2_16bit, i); #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - Print(stats.avx512_16bit, i); + Print(stats.avx512_16bit, i); #endif } return 0; diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc index 86d90dc..5235b1e 100644 --- a/benchmarks/benchmark_quantizer.cc +++ b/benchmarks/benchmark_quantizer.cc @@ -63,12 +63,12 @@ int main() { for (float &element : in) { element = dist(gen); } - QuantizerBench(in.begin(), out.begin(), static_cast(count)); + QuantizerBench(in.begin(), out.begin(), static_cast(count)); #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 - QuantizerBench(in.begin(), out.begin(), static_cast(count)); + QuantizerBench(in.begin(), out.begin(), static_cast(count)); #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - QuantizerBench(in.begin(), out.begin(), static_cast(count)); + QuantizerBench(in.begin(), out.begin(), static_cast(count)); #endif } } diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc index 65deadb..c835b61 100644 --- a/benchmarks/biasmultiply.cc +++ b/benchmarks/biasmultiply.cc @@ -125,151 +125,151 @@ int main(int argc, char ** argv) { repeat = atoi(argv[1]); } - std::chrono::duration oldSSSE3_nobias = testOld_nobias(1, 64, 8); + std::chrono::duration oldSSSE3_nobias = testOld_nobias(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldSSSE3_nobias += testOld_nobias(8, 2048, 256); - oldSSSE3_nobias += testOld_nobias(320, 256, 256); - oldSSSE3_nobias += testOld_nobias(472, 256, 256); - oldSSSE3_nobias += testOld_nobias(248, 256, 256); - oldSSSE3_nobias += testOld_nobias(200, 256, 256); + oldSSSE3_nobias += testOld_nobias(8, 256, 256); + oldSSSE3_nobias += testOld_nobias(8, 2048, 256); + oldSSSE3_nobias += testOld_nobias(320, 256, 256); + oldSSSE3_nobias += testOld_nobias(472, 256, 256); + oldSSSE3_nobias += testOld_nobias(248, 256, 256); + oldSSSE3_nobias += testOld_nobias(200, 256, 256); } std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." 
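
A note on the benchmark hunks above: RunAll and QuantizerBench are function templates in benchmarks/, instantiated once per kernel struct and wrapped in the INTGEMM_COMPILER_SUPPORTS_* guards so the files still build on compilers that cannot emit the newer instruction sets. Below is a self-contained miniature of that gating pattern; the Backend tags and the RunAll body are illustrative stand-ins, not intgemm's real benchmark harness.

#include <iostream>

struct SSSE3Tag { static const char *Name() { return "SSSE3 8bit"; } };
struct AVX2Tag  { static const char *Name() { return "AVX2 8bit"; } };

template <class Backend> void RunAll(int samples) {
  // The real helper times Backend::Multiply over a set of RandomMatrices;
  // this stub only records which backend was exercised.
  std::cout << Backend::Name() << ", " << samples << " samples\n";
}

int main() {
  const int kSamples = 100;
  RunAll<SSSE3Tag>(kSamples);            // SSE2/SSSE3 paths are always compiled
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2    // defined by intgemm's build when the compiler can target AVX2
  RunAll<AVX2Tag>(kSamples);
#endif
  return 0;
}
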
<< std::endl; - std::chrono::duration oldSSSE3 = testOld(1, 64, 8); + std::chrono::duration oldSSSE3 = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldSSSE3 += testOld(8, 2048, 256); - oldSSSE3 += testOld(320, 256, 256); - oldSSSE3 += testOld(472, 256, 256); - oldSSSE3 += testOld(248, 256, 256); - oldSSSE3 += testOld(200, 256, 256); + oldSSSE3 += testOld(8, 256, 256); + oldSSSE3 += testOld(8, 2048, 256); + oldSSSE3 += testOld(320, 256, 256); + oldSSSE3 += testOld(472, 256, 256); + oldSSSE3 += testOld(248, 256, 256); + oldSSSE3 += testOld(200, 256, 256); } std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl; - std::chrono::duration newTimeSSSE3 = testOld(1, 64, 8); + std::chrono::duration newTimeSSSE3 = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - newTimeSSSE3 += testNew(8, 2048, 256); - newTimeSSSE3 += testNew(320, 256, 256); - newTimeSSSE3 += testNew(472, 256, 256); - newTimeSSSE3 += testNew(248, 256, 256); - newTimeSSSE3 += testNew(200, 256, 256); + newTimeSSSE3 += testNew(8, 256, 256); + newTimeSSSE3 += testNew(8, 2048, 256); + newTimeSSSE3 += testNew(320, 256, 256); + newTimeSSSE3 += testNew(472, 256, 256); + newTimeSSSE3 += testNew(248, 256, 256); + newTimeSSSE3 += testNew(200, 256, 256); } std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 - std::chrono::duration oldAVX2_nobias = testOld_nobias(1, 64, 8); + std::chrono::duration oldAVX2_nobias = testOld_nobias(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldAVX2_nobias += testOld_nobias(8, 2048, 256); - oldAVX2_nobias += testOld_nobias(320, 256, 256); - oldAVX2_nobias += testOld_nobias(472, 256, 256); - oldAVX2_nobias += testOld_nobias(248, 256, 256); - oldAVX2_nobias += testOld_nobias(200, 256, 256); + oldAVX2_nobias += testOld_nobias(8, 256, 256); + oldAVX2_nobias += testOld_nobias(8, 2048, 256); + oldAVX2_nobias += testOld_nobias(320, 256, 256); + oldAVX2_nobias += testOld_nobias(472, 256, 256); + oldAVX2_nobias += testOld_nobias(248, 256, 256); + oldAVX2_nobias += testOld_nobias(200, 256, 256); } std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl; - std::chrono::duration oldAVX2 = testOld(1, 64, 8); + std::chrono::duration oldAVX2 = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldAVX2 += testOld(8, 2048, 256); - oldAVX2 += testOld(320, 256, 256); - oldAVX2 += testOld(472, 256, 256); - oldAVX2 += testOld(248, 256, 256); - oldAVX2 += testOld(200, 256, 256); + oldAVX2 += testOld(8, 256, 256); + oldAVX2 += testOld(8, 2048, 256); + oldAVX2 += testOld(320, 256, 256); + oldAVX2 += testOld(472, 256, 256); + oldAVX2 += testOld(248, 256, 256); + oldAVX2 += testOld(200, 256, 256); } std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." 
<< std::endl; - std::chrono::duration newTimeAVX2 = testOld(1, 64, 8); + std::chrono::duration newTimeAVX2 = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - newTimeAVX2 += testNew(8, 2048, 256); - newTimeAVX2 += testNew(320, 256, 256); - newTimeAVX2 += testNew(472, 256, 256); - newTimeAVX2 += testNew(248, 256, 256); - newTimeAVX2 += testNew(200, 256, 256); + newTimeAVX2 += testNew(8, 256, 256); + newTimeAVX2 += testNew(8, 2048, 256); + newTimeAVX2 += testNew(320, 256, 256); + newTimeAVX2 += testNew(472, 256, 256); + newTimeAVX2 += testNew(248, 256, 256); + newTimeAVX2 += testNew(200, 256, 256); } std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl; #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW if (kCPU < CPUType::AVX512BW) return 0; - std::chrono::duration oldAVX512_nobias = testOld_nobias(1, 64, 8); + std::chrono::duration oldAVX512_nobias = testOld_nobias(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldAVX512_nobias += testOld_nobias(8, 2048, 256); - oldAVX512_nobias += testOld_nobias(320, 256, 256); - oldAVX512_nobias += testOld_nobias(472, 256, 256); - oldAVX512_nobias += testOld_nobias(248, 256, 256); - oldAVX512_nobias += testOld_nobias(200, 256, 256); + oldAVX512_nobias += testOld_nobias(8, 256, 256); + oldAVX512_nobias += testOld_nobias(8, 2048, 256); + oldAVX512_nobias += testOld_nobias(320, 256, 256); + oldAVX512_nobias += testOld_nobias(472, 256, 256); + oldAVX512_nobias += testOld_nobias(248, 256, 256); + oldAVX512_nobias += testOld_nobias(200, 256, 256); } std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl; - std::chrono::duration oldAVX512 = testOld(1, 64, 8); + std::chrono::duration oldAVX512 = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldAVX512 += testOld(8, 2048, 256); - oldAVX512 += testOld(320, 256, 256); - oldAVX512 += testOld(472, 256, 256); - oldAVX512 += testOld(248, 256, 256); - oldAVX512 += testOld(200, 256, 256); + oldAVX512 += testOld(8, 256, 256); + oldAVX512 += testOld(8, 2048, 256); + oldAVX512 += testOld(320, 256, 256); + oldAVX512 += testOld(472, 256, 256); + oldAVX512 += testOld(248, 256, 256); + oldAVX512 += testOld(200, 256, 256); } std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl; - std::chrono::duration newTimeAVX512 = testOld(1, 64, 8); + std::chrono::duration newTimeAVX512 = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - newTimeAVX512 += testNew(8, 2048, 256); - newTimeAVX512 += testNew(320, 256, 256); - newTimeAVX512 += testNew(472, 256, 256); - newTimeAVX512 += testNew(248, 256, 256); - newTimeAVX512 += testNew(200, 256, 256); + newTimeAVX512 += testNew(8, 256, 256); + newTimeAVX512 += testNew(8, 2048, 256); + newTimeAVX512 += testNew(320, 256, 256); + newTimeAVX512 += testNew(472, 256, 256); + newTimeAVX512 += testNew(248, 256, 256); + newTimeAVX512 += testNew(200, 256, 256); } std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." 
<< std::endl; #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI if (kCPU < CPUType::AVX512VNNI) return 0; - std::chrono::duration oldAVX512VNNI_nobias = testOld_nobias(1, 64, 8); + std::chrono::duration oldAVX512VNNI_nobias = testOld_nobias(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldAVX512VNNI_nobias += testOld_nobias(8, 2048, 256); - oldAVX512VNNI_nobias += testOld_nobias(320, 256, 256); - oldAVX512VNNI_nobias += testOld_nobias(472, 256, 256); - oldAVX512VNNI_nobias += testOld_nobias(248, 256, 256); - oldAVX512VNNI_nobias += testOld_nobias(200, 256, 256); + oldAVX512VNNI_nobias += testOld_nobias(8, 256, 256); + oldAVX512VNNI_nobias += testOld_nobias(8, 2048, 256); + oldAVX512VNNI_nobias += testOld_nobias(320, 256, 256); + oldAVX512VNNI_nobias += testOld_nobias(472, 256, 256); + oldAVX512VNNI_nobias += testOld_nobias(248, 256, 256); + oldAVX512VNNI_nobias += testOld_nobias(200, 256, 256); } std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl; - std::chrono::duration oldAVX512VNNI = testOld(1, 64, 8); + std::chrono::duration oldAVX512VNNI = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - oldAVX512VNNI += testOld(8, 2048, 256); - oldAVX512VNNI += testOld(320, 256, 256); - oldAVX512VNNI += testOld(472, 256, 256); - oldAVX512VNNI += testOld(248, 256, 256); - oldAVX512VNNI += testOld(200, 256, 256); + oldAVX512VNNI += testOld(8, 256, 256); + oldAVX512VNNI += testOld(8, 2048, 256); + oldAVX512VNNI += testOld(320, 256, 256); + oldAVX512VNNI += testOld(472, 256, 256); + oldAVX512VNNI += testOld(248, 256, 256); + oldAVX512VNNI += testOld(200, 256, 256); } std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl; - std::chrono::duration newTimeAVX512VNNI = testOld(1, 64, 8); + std::chrono::duration newTimeAVX512VNNI = testOld(1, 64, 8); for (int i = 0; i(8, 256, 256); - newTimeAVX512VNNI += testNew(8, 2048, 256); - newTimeAVX512VNNI += testNew(320, 256, 256); - newTimeAVX512VNNI += testNew(472, 256, 256); - newTimeAVX512VNNI += testNew(248, 256, 256); - newTimeAVX512VNNI += testNew(200, 256, 256); + newTimeAVX512VNNI += testNew(8, 256, 256); + newTimeAVX512VNNI += testNew(8, 2048, 256); + newTimeAVX512VNNI += testNew(320, 256, 256); + newTimeAVX512VNNI += testNew(472, 256, 256); + newTimeAVX512VNNI += testNew(248, 256, 256); + newTimeAVX512VNNI += testNew(200, 256, 256); } std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." 
<< std::endl; diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h index 6e01679..d93ac8e 100644 --- a/intgemm/avx2_gemm.h +++ b/intgemm/avx2_gemm.h @@ -13,7 +13,7 @@ #include namespace intgemm { -namespace avx2 { +namespace AVX2 { INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) { return kernels::quantize(loadu_ps(input), quant_mult_reg); @@ -73,14 +73,14 @@ struct Kernels16 { static const Index kBTileCol = 8; /* INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols); + PrepareBFor16(input, output, AVX2::QuantizeTile16(quant_mult), rows, cols); }*/ - INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16) + INTGEMM_PREPARE_B_16(INTGEMM_AVX2, AVX2::QuantizeTile16) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile16, int16_t) INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); + AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end); } INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2) @@ -129,10 +129,10 @@ class QuantizeTile8 { const __m256i neg127 = _mm256_set1_epi8(-127); const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); // Grab 4 registers at a time in 32-bit format. - __m256i g0 = avx2::QuantizerGrab(input0, quant_mult); - __m256i g1 = avx2::QuantizerGrab(input1, quant_mult); - __m256i g2 = avx2::QuantizerGrab(input2, quant_mult); - __m256i g3 = avx2::QuantizerGrab(input3, quant_mult); + __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult); + __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult); + __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult); + __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult); // Pack 32-bit to 16-bit. __m256i packed0 = _mm256_packs_epi32(g0, g1); __m256i packed1 = _mm256_packs_epi32(g2, g3); @@ -155,10 +155,10 @@ class QuantizeTile8 { const __m256i pos127 = _mm256_set1_epi8(127); const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); // Grab 4 registers at a time in 32-bit format. - __m256i g0 = avx2::QuantizerGrab(input0, quant_mult); - __m256i g1 = avx2::QuantizerGrab(input1, quant_mult); - __m256i g2 = avx2::QuantizerGrab(input2, quant_mult); - __m256i g3 = avx2::QuantizerGrab(input3, quant_mult); + __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult); + __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult); + __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult); + __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult); // Pack 32-bit to 16-bit. 
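
For orientation in the avx2_gemm.h hunk above: QuantizerGrab loads eight floats, scales them by the quantization multiplier, and round-converts them to 32-bit integers; QuantizeTile16/QuantizeTile8 then pack those registers down to 16-bit and 8-bit lanes, which is why the packs_epi32 calls and the cross-lane shuffle_param constant appear in the surrounding code. The following is a standalone restatement of the grab step, assuming the usual round-to-nearest conversion (compile with -mavx2); in intgemm the function carries the INTGEMM_AVX2 target macro and calls kernels::quantize rather than raw intrinsics.

#include <immintrin.h>

// Load 8 floats, scale by the quantization multiplier, convert to int32.
static inline __m256i QuantizerGrabSketch(const float *input, __m256 quant_mult_reg) {
  const __m256 scaled = _mm256_mul_ps(_mm256_loadu_ps(input), quant_mult_reg);
  return _mm256_cvtps_epi32(scaled);  // rounds to nearest under the default MXCSR mode
}
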
__m256i packed0 = _mm256_packs_epi32(g0, g1); __m256i packed1 = _mm256_packs_epi32(g2, g3); @@ -207,12 +207,12 @@ struct Kernels8 { static const Index kBTileRow = 32; static const Index kBTileCol = 8; - INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8) + INTGEMM_PREPARE_B_8(INTGEMM_AVX2, AVX2::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile8, int8_t) INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); + AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end); } INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2) @@ -226,7 +226,7 @@ struct Kernels8 { static const CPUType kUses = CPUType::AVX2; }; -} // namespace avx2 +} // namespace AVX2 } // namespace intgemm #endif diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h index f9fb1eb..a69b2dc 100644 --- a/intgemm/avx512_gemm.h +++ b/intgemm/avx512_gemm.h @@ -31,7 +31,7 @@ namespace intgemm { // So conversion in memory uses these, but I also implement a wider version for // rearranging B. -namespace avx512bw { +namespace AVX512BW { // Load from memory, multiply, and convert to int32_t. /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ @@ -405,7 +405,7 @@ struct Kernels8 { static const CPUType kUses = CPUType::AVX512BW; }; -} // namespace avx512bw +} // namespace AVX512BW } // namespace intgemm #endif diff --git a/intgemm/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h index c660168..747bdf9 100644 --- a/intgemm/avx512vnni_gemm.h +++ b/intgemm/avx512vnni_gemm.h @@ -7,7 +7,7 @@ #include "types.h" namespace intgemm { -namespace avx512vnni { +namespace AVX512VNNI { // Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663 INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { @@ -18,7 +18,7 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { #endif } -struct Kernels8 : public avx512bw::Kernels8 { +struct Kernels8 : public AVX512BW::Kernels8 { template INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { assert(width % sizeof(Register) == 0); @@ -162,7 +162,7 @@ struct Kernels8 : public avx512bw::Kernels8 { static const CPUType kUses = CPUType::AVX512VNNI; }; -} // namespace avx512vnni +} // namespace AVX512VNNI } // namespace intgemm #endif diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc index d45cf60..82ad750 100644 --- a/intgemm/intgemm.cc +++ b/intgemm/intgemm.cc @@ -11,69 +11,69 @@ MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/ throw UnsupportedCPU(); } -void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize); +void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize); -void 
(*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB); +void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512BW::Kernels16::PrepareB, AVX512BW::Kernels16::PrepareB, AVX2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, Unsupported_16bit::PrepareB); -void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); +void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); -void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); +void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBTransposed, AVX512BW::Kernels16::PrepareBTransposed, AVX2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); -void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB); +void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512BW::Kernels16::SelectColumnsB, AVX512BW::Kernels16::SelectColumnsB, AVX2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB); -const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName); +const char *const Int16::kName = ChooseCPU(AVX512BW::Kernels16::kName, AVX512BW::Kernels16::kName, AVX2::Kernels16::kName, SSE2::Kernels16::kName, SSE2::Kernels16::kName, Unsupported_16bit::kName); -void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); +void 
(*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::Quantize, AVX512BW::Kernels8::Quantize, AVX2::Kernels8::Quantize, SSSE3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); -void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); +void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); -void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); +void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI::Kernels8::PrepareB, AVX512BW::Kernels8::PrepareB, AVX2::Kernels8::PrepareB, SSSE3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); -void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); +void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX2::Kernels8::PrepareBQuantizedTransposed, SSSE3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); -void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); +void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBTransposed, AVX512BW::Kernels8::PrepareBTransposed, AVX2::Kernels8::PrepareBTransposed, SSSE3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); -void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); +void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI::Kernels8::SelectColumnsB, AVX512BW::Kernels8::SelectColumnsB, 
AVX2::Kernels8::SelectColumnsB, SSSE3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); -const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); +const char *const Int8::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); -void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); +void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); -const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); +const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED); #if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2) -namespace avx2{ -using sse2::MaxAbsolute; -using sse2::VectorMeanStd; -} // namespace avx2 +namespace AVX2{ +using SSE2::MaxAbsolute; +using SSE2::VectorMeanStd; +} // namespace AVX2 #endif #if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW) -namespace avx512bw { -using avx2::MaxAbsolute; -using avx2::VectorMeanStd; -} // namespace avx512bw +namespace AVX512BW { +using AVX2::MaxAbsolute; +using AVX2::VectorMeanStd; +} // namespace AVX512BW #endif -float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute); +float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512BW::MaxAbsolute, AVX512BW::MaxAbsolute, AVX2::MaxAbsolute, SSE2::MaxAbsolute, SSE2::MaxAbsolute, Unsupported_MaxAbsolute); -MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd); +MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(AVX512BW::VectorMeanStd, AVX512BW::VectorMeanStd, AVX2::VectorMeanStd, SSE2::VectorMeanStd, SSE2::VectorMeanStd, Unsupported_VectorMeanStd); constexpr const char *const Unsupported_16bit::kName; constexpr const char *const Unsupported_8bit::kName; -constexpr const char *const sse2::Kernels16::kName; -constexpr const char *const ssse3::Kernels8::kName; +constexpr const char *const SSE2::Kernels16::kName; +constexpr const char *const SSSE3::Kernels8::kName; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 -constexpr const char *const avx2::Kernels8::kName; -constexpr const char *const avx2::Kernels16::kName; +constexpr const char *const AVX2::Kernels8::kName; +constexpr const char *const 
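
The intgemm.cc hunks above only change namespace spelling; the mechanism is unchanged: each public entry point (Int8::Quantize, Int16::PrepareB, and so on) is a function pointer bound once through ChooseCPU, which takes one candidate per CPU tier, best first, and returns the first one the running machine supports. Here is an illustrative selector in that style; the detection function is a placeholder, intgemm's real kCPU comes from CPUID-based detection, and the CPUType enum is re-declared only to keep the sketch self-contained.

enum class CPUType { UNSUPPORTED = 0, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

// Placeholder: the real library detects the best supported tier via CPUID at startup.
static CPUType DetectCPUSketch() { return CPUType::SSSE3; }

template <class T>
T ChooseCPUSketch(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
  switch (DetectCPUSketch()) {
    case CPUType::AVX512VNNI: return avx512vnni;
    case CPUType::AVX512BW:   return avx512bw;
    case CPUType::AVX2:       return avx2;
    case CPUType::SSSE3:      return ssse3;
    case CPUType::SSE2:       return sse2;
    default:                  return unsupported;
  }
}
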
AVX2::Kernels16::kName; #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -constexpr const char *const avx512bw::Kernels8::kName; -constexpr const char *const avx512bw::Kernels16::kName; +constexpr const char *const AVX512BW::Kernels8::kName; +constexpr const char *const AVX512BW::Kernels16::kName; #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI -constexpr const char *const avx512vnni::Kernels8::kName; +constexpr const char *const AVX512VNNI::Kernels8::kName; #endif } diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h index a354b60..029a8ec 100644 --- a/intgemm/intgemm.h +++ b/intgemm/intgemm.h @@ -127,21 +127,21 @@ struct Unsupported_8bit { #ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI // These won't ever be called in this capacity, but it does let the code below compile. -namespace avx512vnni { +namespace AVX512VNNI { typedef Unsupported_8bit Kernels8; -} // namespace avx512vnni +} // namespace AVX512VNNI #endif #ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW -namespace avx512bw { +namespace AVX512BW { typedef Unsupported_8bit Kernels8; typedef Unsupported_16bit Kernels16; -} // namespace avx512bw +} // namespace AVX512BW #endif #ifndef INTGEMM_COMPILER_SUPPORTS_AVX2 -namespace avx2 { +namespace AVX2 { typedef Unsupported_8bit Kernels8; typedef Unsupported_16bit Kernels16; -} // namespace avx2 +} // namespace AVX2 #endif @@ -309,7 +309,7 @@ private: }; template -void (*Int8::MultiplyImpl::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply); +void (*Int8::MultiplyImpl::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply); /* * 8-bit matrix multiplication with shifting A by 127 @@ -373,14 +373,14 @@ private: template void (*Int8Shift::MultiplyImpl::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU( - OMPParallelWrap8Shift, - OMPParallelWrap8Shift, - OMPParallelWrap8Shift, - OMPParallelWrap8Shift, + OMPParallelWrap8Shift, + OMPParallelWrap8Shift, + OMPParallelWrap8Shift, + OMPParallelWrap8Shift, Unsupported_8bit::Multiply8Shift, Unsupported_8bit::Multiply8Shift); template -void (*Int8Shift::PrepareBiasImpl::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias, avx512bw::Kernels8::PrepareBias, avx2::Kernels8::PrepareBias, ssse3::Kernels8::PrepareBias, ssse3::Kernels8::PrepareBias, Unsupported_8bit::PrepareBias); +void (*Int8Shift::PrepareBiasImpl::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI::Kernels8::PrepareBias, AVX512BW::Kernels8::PrepareBias, AVX2::Kernels8::PrepareBias, SSSE3::Kernels8::PrepareBias, SSSE3::Kernels8::PrepareBias, Unsupported_8bit::PrepareBias); /* * 16-bit matrix multiplication @@ -436,7 +436,7 @@ private: }; template -void (*Int16::MultiplyImpl::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap /*TODO VNNI 16-bit. 
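
The intgemm.h hunks above also show why the uppercase namespaces still work on older compilers: when a compiler cannot target an instruction set, the corresponding namespace is still declared, but its Kernels8/Kernels16 types alias the Unsupported_* stubs, so every ChooseCPU argument list keeps compiling and simply resolves to a kernel that throws at run time. A simplified stand-in for that fallback (the names below are not intgemm's):

#include <stdexcept>

struct Unsupported8 {
  static void Quantize(const float *, signed char *, float, int) {
    throw std::runtime_error("CPU does not support this backend");  // intgemm throws UnsupportedCPU()
  }
};

#ifndef COMPILER_SUPPORTS_AVX512BW   // stand-in for INTGEMM_COMPILER_SUPPORTS_AVX512BW
namespace AVX512BW { typedef Unsupported8 Kernels8; }
#endif
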
*/, OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, Unsupported_16bit::Multiply); +void (*Int16::MultiplyImpl::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap /*TODO VNNI 16-bit. */, OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, OMPParallelWrap, Unsupported_16bit::Multiply); extern const CPUType kCPU; diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h index cd49efe..cd855a6 100644 --- a/intgemm/sse2_gemm.h +++ b/intgemm/sse2_gemm.h @@ -9,7 +9,7 @@ // 8 bit is in ssse3_gemm.h namespace intgemm { -namespace sse2 { +namespace SSE2 { INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg); @@ -80,5 +80,5 @@ struct Kernels16 { static const CPUType kUses = CPUType::SSE2; }; -} // namespace sse2 +} // namespace SSE2 } // namespace intgemm diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h index 865fe12..db403bd 100644 --- a/intgemm/ssse3_gemm.h +++ b/intgemm/ssse3_gemm.h @@ -11,7 +11,7 @@ // 16-bit is in sse2_gemm.h namespace intgemm { -namespace ssse3 { +namespace SSSE3 { INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg); @@ -131,12 +131,12 @@ struct Kernels8 { static const Index kBTileRow = 16; static const Index kBTileCol = 8; - INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8) + INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, SSSE3::QuantizeTile8) INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t) INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t) INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); + SSSE3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); } INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2) @@ -150,5 +150,5 @@ struct Kernels8 { static const CPUType kUses = CPUType::SSSE3; }; -} // namespace ssse3 +} // namespace SSSE3 } // namespace intgemm diff --git a/intgemm/stats.inl b/intgemm/stats.inl index d6a850e..68a5b8e 100644 --- a/intgemm/stats.inl +++ b/intgemm/stats.inl @@ -1,12 +1,12 @@ /* This file is included multiple times, once per architecture. 
*/ #if defined(INTGEMM_THIS_IS_AVX512DQ) -#define INTGEMM_ARCH avx512bw +#define INTGEMM_ARCH AVX512BW #define INTGEMM_TARGET INTGEMM_AVX512DQ #elif defined(INTGEMM_THIS_IS_AVX2) -#define INTGEMM_ARCH avx2 +#define INTGEMM_ARCH AVX2 #define INTGEMM_TARGET INTGEMM_AVX2 #elif defined(INTGEMM_THIS_IS_SSE2) -#define INTGEMM_ARCH sse2 +#define INTGEMM_ARCH SSE2 #define INTGEMM_TARGET INTGEMM_SSE2 #else #error Included with unexpected architecture diff --git a/intgemm/types.h b/intgemm/types.h index 174f1ff..602130a 100644 --- a/intgemm/types.h +++ b/intgemm/types.h @@ -70,30 +70,30 @@ struct MeanStd { }; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI -namespace avx512vnni { +namespace AVX512VNNI { typedef __m512i Register; typedef __m512 FRegister; -} // namespace avx512vnni +} // namespace AVX512VNNI #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -namespace avx512bw { +namespace AVX512BW { typedef __m512i Register; typedef __m512 FRegister; -} // namespace avx512bw +} // namespace AVX512BW #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 -namespace avx2 { +namespace AVX2 { typedef __m256i Register; typedef __m256 FRegister; -} // namespace avx2 +} // namespace AVX2 #endif -namespace ssse3 { +namespace SSSE3 { typedef __m128i Register; typedef __m128 FRegister; -} // namespace ssse3 -namespace sse2 { +} // namespace SSSE3 +namespace SSE2 { typedef __m128i Register; typedef __m128 FRegister; -} // namespace sse2 +} // namespace SSE2 } // namespace intgemm diff --git a/test/add127_test.cc b/test/add127_test.cc index 723e143..c31732c 100644 --- a/test/add127_test.cc +++ b/test/add127_test.cc @@ -282,55 +282,55 @@ template void TestMultiplyShiftInt(Index A_rows, Index width, In // Bias TEST_CASE("PrepareBias SSSE3", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestPrepareBias(256,256); - TestPrepareBias(2048,256); - TestPrepareBias(512,512); + TestPrepareBias(256,256); + TestPrepareBias(2048,256); + TestPrepareBias(512,512); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE("PrepareBias AVX2", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestPrepareBias(256,256); - TestPrepareBias(2048,256); - TestPrepareBias(512,512); + TestPrepareBias(256,256); + TestPrepareBias(2048,256); + TestPrepareBias(512,512); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("PrepareBias AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestPrepareBias(256,256); - TestPrepareBias(2048,256); - TestPrepareBias(512,512); + TestPrepareBias(256,256); + TestPrepareBias(2048,256); + TestPrepareBias(512,512); } #endif //A TEST_CASE("PrepareA SSSE3", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestPrepareA(64,64); - TestPrepareA(256,256); - TestPrepareA(512,512); - TestPrepareA(2048,256); + TestPrepareA(64,64); + TestPrepareA(256,256); + TestPrepareA(512,512); + TestPrepareA(2048,256); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE("PrepareA AVX2", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestPrepareA(64,64); - TestPrepareA(256,256); - TestPrepareA(512,512); - TestPrepareA(2048,256); + TestPrepareA(64,64); + TestPrepareA(256,256); + TestPrepareA(512,512); + TestPrepareA(2048,256); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("PrepareA AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestPrepareA(64,64); - TestPrepareA(256,256); - TestPrepareA(512,512); - TestPrepareA(2048,256); + TestPrepareA(64,64); + TestPrepareA(256,256); + TestPrepareA(512,512); + TestPrepareA(2048,256); } #endif @@ -338,153 +338,153 @@ TEST_CASE("PrepareA AVX512F", "[Add127]") { 
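
The stats.inl and types.h hunks above are presumably what the commit message's "fewer defines for inline files" refers to: a shared .inl body is compiled once per architecture with INTGEMM_ARCH mapped to the matching (now uppercase) namespace, and types.h gives each namespace its own Register/FRegister width. Below is a self-contained miniature of that stamp-out idea; intgemm actually re-includes the .inl with INTGEMM_ARCH/INTGEMM_TARGET redefined, so the single macro and the RegisterBytes helper here are only compact, hypothetical stand-ins.

#include <cstddef>
#include <immintrin.h>

// One body stamped into several UPPERCASE architecture namespaces,
// each with its own Register type.
#define DEFINE_ARCH(ARCH, REG)                                        \
  namespace ARCH {                                                    \
    typedef REG Register;                                             \
    inline std::size_t RegisterBytes() { return sizeof(Register); }   \
  }

DEFINE_ARCH(SSE2, __m128i)
#ifdef __AVX2__
DEFINE_ARCH(AVX2, __m256i)
#endif
#undef DEFINE_ARCH
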
TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyBiasNew(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f); - TestMultiplyBiasNew(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f); - TestMultiplyBiasNew(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f); - TestMultiplyBiasNew(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); - TestMultiplyBiasNew(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); + TestMultiplyBiasNew(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f); + TestMultiplyBiasNew(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f); + TestMultiplyBiasNew(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f); + TestMultiplyBiasNew(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); + TestMultiplyBiasNew(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBiasNew(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f); - TestMultiplyBiasNew(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f); - TestMultiplyBiasNew(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f); - TestMultiplyBiasNew(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); - TestMultiplyBiasNew(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); + TestMultiplyBiasNew(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f); + TestMultiplyBiasNew(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f); + TestMultiplyBiasNew(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f); + TestMultiplyBiasNew(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); + TestMultiplyBiasNew(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBiasNew(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyBiasNew(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyBiasNew(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); - TestMultiplyBiasNew(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyBiasNew(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyBiasNew(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyBiasNew(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyBiasNew(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); + TestMultiplyBiasNew(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyBiasNew(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyBiasNew(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyBiasNew(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyBiasNew(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); - TestMultiplyBiasNew(320, 256, 256, 
0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyBiasNew(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyBiasNew(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyBiasNew(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyBiasNew(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); + TestMultiplyBiasNew(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyBiasNew(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif //Multiply old vs new TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); - TestMultiplyShiftNonShift(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad - TestMultiplyShiftNonShift(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f); - TestMultiplyShiftNonShift(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f); - TestMultiplyShiftNonShift(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f); - TestMultiplyShiftNonShift(200, 256, 256, 1, 0.74f, 0.17f, 0.006f); + TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); + TestMultiplyShiftNonShift(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad + TestMultiplyShiftNonShift(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f); + TestMultiplyShiftNonShift(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f); + TestMultiplyShiftNonShift(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f); + TestMultiplyShiftNonShift(200, 256, 256, 1, 0.74f, 0.17f, 0.006f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); - TestMultiplyShiftNonShift(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad - TestMultiplyShiftNonShift(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftNonShift(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftNonShift(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftNonShift(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); + TestMultiplyShiftNonShift(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad + TestMultiplyShiftNonShift(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftNonShift(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftNonShift(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyShiftNonShift(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyShiftNonShift(8, 256, 
256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyShiftNonShift(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f); - TestMultiplyShiftNonShift(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyShiftNonShift(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyShiftNonShift(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyShiftNonShift(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyShiftNonShift(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyShiftNonShift(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyShiftNonShift(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f); + TestMultiplyShiftNonShift(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyShiftNonShift(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyShiftNonShift(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyShiftNonShift(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f); - TestMultiplyShiftNonShift(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftNonShift(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f); + TestMultiplyShiftNonShift(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f); } #endif //Multiply Shift vs int shift implementation TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f); - TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); - TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f); - TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f); + TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); + TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f); + TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f); - TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); - 
TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f); - TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f); + TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); + TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f); + TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); - TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); - TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); - TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); + TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); + TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); + TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); + TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); - TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); - TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); - TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); + TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); + TestMultiplyShiftInt(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); + TestMultiplyShiftInt(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftInt(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); + TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); } #endif diff --git a/test/multiply_test.cc b/test/multiply_test.cc index 47201c0..5395d40 100644 --- a/test/multiply_test.cc +++ b/test/multiply_test.cc @@ -85,32 +85,32 @@ template void TestPrepare(Index rows = 32, Index cols = 16) { #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("Prepare AVX512", "[prepare]") { if (kCPU < CPUType::AVX512BW) return; - TestPrepare(64, 8); - TestPrepare(256, 32); - TestPrepare(64, 8); - TestPrepare(256, 
32); + TestPrepare(64, 8); + TestPrepare(256, 32); + TestPrepare(64, 8); + TestPrepare(256, 32); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE("Prepare AVX2", "[prepare]") { if (kCPU < CPUType::AVX2) return; - TestPrepare(64, 32); - TestPrepare(64, 32); + TestPrepare(64, 32); + TestPrepare(64, 32); } #endif TEST_CASE("Prepare SSSE3", "[prepare]") { if (kCPU < CPUType::SSSE3) return; - TestPrepare(16, 8); - TestPrepare(32, 16); - TestPrepare(32, 32); + TestPrepare(16, 8); + TestPrepare(32, 16); + TestPrepare(32, 32); } TEST_CASE("Prepare SSE2", "[prepare]") { if (kCPU < CPUType::SSE2) return; - TestPrepare(8, 8); - TestPrepare(32, 32); + TestPrepare(8, 8); + TestPrepare(32, 32); } template void TestSelectColumnsB(Index rows = 64, Index cols = 16) { @@ -152,29 +152,29 @@ template void TestSelectColumnsB(Index rows = 64, Index cols = 1 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("SelectColumnsB AVX512", "[select]") { if (kCPU < CPUType::AVX512BW) return; - TestSelectColumnsB(); - TestSelectColumnsB(256, 256); + TestSelectColumnsB(); + TestSelectColumnsB(256, 256); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("SelectColumnsB AVX2", "[select]") { if (kCPU < CPUType::AVX2) return; - TestSelectColumnsB(256, 256); - TestSelectColumnsB(256, 256); + TestSelectColumnsB(256, 256); + TestSelectColumnsB(256, 256); } #endif TEST_CASE("SelectColumnsB SSSE3", "[select]") { if (kCPU < CPUType::SSSE3) return; - TestSelectColumnsB(); - TestSelectColumnsB(256, 256); + TestSelectColumnsB(); + TestSelectColumnsB(256, 256); } TEST_CASE("SelectColumnsB SSE2", "[select]") { if (kCPU < CPUType::SSE2) return; - TestSelectColumnsB(); - TestSelectColumnsB(256, 256); + TestSelectColumnsB(); + TestSelectColumnsB(256, 256); } template void TestMax() { @@ -219,20 +219,20 @@ template void TestMaxAbsolute( TEST_CASE("MaxAbsolute SSE2", "[max]") { if (kCPU < CPUType::SSE2) return; - TestMaxAbsolute(); + TestMaxAbsolute(); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE("MaxAbsolute AVX2", "[max]") { if (kCPU < CPUType::AVX2) return; - TestMaxAbsolute(); + TestMaxAbsolute(); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("MaxAbsolute AVX512BW", "[max]") { if (kCPU < CPUType::AVX512BW) return; - TestMaxAbsolute(); + TestMaxAbsolute(); } #endif @@ -364,148 +364,148 @@ template void TestMultiplyBias(Index A_rows, Index width, Index TEST_CASE ("Multiply SSE2 16bit", "[multiply]") { if (kCPU < CPUType::SSE2) return; - TestMultiply(8, 256, 256, .1f, 1, 0.01f); - TestMultiply(8, 2048, 256, .1f, 1, 0.02f); - TestMultiply(320, 256, 256, .1f, 1, 0.01f); - TestMultiply(472, 256, 256, .1f, 1, 0.01f); - TestMultiply(248, 256, 256, .1f, 1, 0.01f); - TestMultiply(200, 256, 256, .1f, 1, 0.01f); + TestMultiply(8, 256, 256, .1f, 1, 0.01f); + TestMultiply(8, 2048, 256, .1f, 1, 0.02f); + TestMultiply(320, 256, 256, .1f, 1, 0.01f); + TestMultiply(472, 256, 256, .1f, 1, 0.01f); + TestMultiply(248, 256, 256, .1f, 1, 0.01f); + TestMultiply(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::SSE2) return; - TestMultiplyBias(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(8, 2048, 256, .1f, 1, 0.02f); - TestMultiplyBias(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(8, 2048, 256, .1f, 1, 0.02f); + TestMultiplyBias(320, 256, 256, .1f, 1, 
0.01f); + TestMultiplyBias(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") { if (kCPU < CPUType::SSSE3) return; - TestMultiply(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); - TestMultiply(8, 2048, 256, 33, 33, 4.4f, 4.4f); - TestMultiply(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); - TestMultiply(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); - TestMultiply(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); - TestMultiply(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); + TestMultiply(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); + TestMultiply(8, 2048, 256, 33, 33, 4.4f, 4.4f); + TestMultiply(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); + TestMultiply(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); + TestMultiply(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); + TestMultiply(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); } TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyBias(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); - TestMultiplyBias(8, 2048, 256, 33, 33, 4.4f, 4.4f); - TestMultiplyBias(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); - TestMultiplyBias(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); - TestMultiplyBias(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); - TestMultiplyBias(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); + TestMultiplyBias(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); + TestMultiplyBias(8, 2048, 256, 33, 33, 4.4f, 4.4f); + TestMultiplyBias(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); + TestMultiplyBias(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); + TestMultiplyBias(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); + TestMultiplyBias(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE ("Multiply AVX2 8bit", "[multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiply(8, 256, 256, .1f, 1, 0.1f); - TestMultiply(8, 2048, 256, 19, 19, 1.8f, 1.8f); - TestMultiply(320, 256, 256, .1f, 1, 0.1f); - TestMultiply(472, 256, 256, .1f, 1, 0.1f); - TestMultiply(248, 256, 256, .1f, 1, 0.1f); - TestMultiply(200, 256, 256, .1f, 1, 0.1f); + TestMultiply(8, 256, 256, .1f, 1, 0.1f); + TestMultiply(8, 2048, 256, 19, 19, 1.8f, 1.8f); + TestMultiply(320, 256, 256, .1f, 1, 0.1f); + TestMultiply(472, 256, 256, .1f, 1, 0.1f); + TestMultiply(248, 256, 256, .1f, 1, 0.1f); + TestMultiply(200, 256, 256, .1f, 1, 0.1f); } TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBias(8, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias(8, 2048, 256, 19, 19, 1.8f, 1.8f); - TestMultiplyBias(320, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias(472, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias(248, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias(200, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias(8, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias(8, 2048, 256, 19, 19, 1.8f, 1.8f); + TestMultiplyBias(320, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias(472, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias(248, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias(200, 256, 256, .1f, 1, 0.1f); } TEST_CASE ("Multiply AVX2 16bit", "[multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiply(8, 256, 256, .1f, 1, 0.01f); - TestMultiply(8, 2048, 256, .1f, 1, 0.02f); - TestMultiply(320, 256, 256, .1f, 1, 0.01f); - TestMultiply(472, 256, 256, .1f, 1, 0.01f); - TestMultiply(248, 256, 256, .1f, 1, 0.01f); - TestMultiply(200, 256, 256, .1f, 1, 0.01f); + TestMultiply(8, 256, 256, .1f, 1, 0.01f); + TestMultiply(8, 2048, 256, .1f, 1, 0.02f); + 
TestMultiply(320, 256, 256, .1f, 1, 0.01f); + TestMultiply(472, 256, 256, .1f, 1, 0.01f); + TestMultiply(248, 256, 256, .1f, 1, 0.01f); + TestMultiply(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBias(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(8, 2048, 256, .1f, 1, 0.02f); - TestMultiplyBias(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(8, 2048, 256, .1f, 1, 0.02f); + TestMultiplyBias(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512 8bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiply(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiply(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); - TestMultiply(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiply(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiply(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiply(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiply(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiply(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); + TestMultiply(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiply(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiply(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiply(200, 256, 256, 0, 0.28f, 0.06f); } TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBias(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiplyBias(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); - TestMultiplyBias(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiplyBias(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiplyBias(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiplyBias(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); + TestMultiplyBias(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiplyBias(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias(200, 256, 256, 0, 0.28f, 0.06f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiply(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiply(8, 2048, 256, 0, 0.55f, 0.25f); - TestMultiply(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiply(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiply(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiply(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiply(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiply(8, 2048, 256, 0, 0.55f, 0.25f); + TestMultiply(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiply(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiply(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiply(200, 256, 256, 0, 0.28f, 0.06f); } TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyBias(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiplyBias(8, 2048, 256, 0, 0.55f, 0.25f); - TestMultiplyBias(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiplyBias(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiplyBias(8, 256, 256, 0, 0.25f, 0.062f); + 
TestMultiplyBias(8, 2048, 256, 0, 0.55f, 0.25f); + TestMultiplyBias(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiplyBias(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias(200, 256, 256, 0, 0.28f, 0.06f); } #endif TEST_CASE ("Multiply AVX512 16bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiply(8, 256, 256, .1f, 1, 0.01f); - TestMultiply(8, 2048, 256, .1f, 1, 0.011f); - TestMultiply(320, 256, 256, .1f, 1, 0.01f); - TestMultiply(472, 256, 256, .1f, 1, 0.01f); - TestMultiply(248, 256, 256, .1f, 1, 0.01f); - TestMultiply(200, 256, 256, .1f, 1, 0.01f); + TestMultiply(8, 256, 256, .1f, 1, 0.01f); + TestMultiply(8, 2048, 256, .1f, 1, 0.011f); + TestMultiply(320, 256, 256, .1f, 1, 0.01f); + TestMultiply(472, 256, 256, .1f, 1, 0.01f); + TestMultiply(248, 256, 256, .1f, 1, 0.01f); + TestMultiply(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBias(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(8, 2048, 256, .1f, 1, 0.011f); - TestMultiplyBias(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(8, 2048, 256, .1f, 1, 0.011f); + TestMultiplyBias(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f); } #endif diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc index e27992a..defe9a0 100644 --- a/test/prepare_b_quantized_transposed.cc +++ b/test/prepare_b_quantized_transposed.cc @@ -62,14 +62,14 @@ TEST_CASE("PrepareBQuantizedTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany(32, 128)); + CHECK(TestMany(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany(32, 128)); + CHECK(TestMany(32, 128)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 @@ -77,8 +77,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany(32, 128)); - CHECK(TestMany(32, 128)); + CHECK(TestMany(32, 128)); + CHECK(TestMany(32, 128)); } #endif @@ -87,8 +87,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany(64, 128)); - CHECK(TestMany(64, 128)); + CHECK(TestMany(64, 128)); + CHECK(TestMany(64, 128)); } #endif diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc index 1ba88df..1c11fbe 100644 --- a/test/prepare_b_transposed.cc +++ b/test/prepare_b_transposed.cc @@ -63,14 +63,14 @@ TEST_CASE("PrepareBTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany(4, 128, 2.0f)); + CHECK(TestMany(4, 128, 2.0f)); } TEST_CASE("PrepareBTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany(4, 128, 2.0f)); + CHECK(TestMany(4, 128, 2.0f)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 @@ -78,8 +78,8 @@ TEST_CASE("PrepareBTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany(8, 128, 2.0f)); - CHECK(TestMany(8, 128, 2.0f)); + CHECK(TestMany(8, 128, 2.0f)); + CHECK(TestMany(8, 128, 2.0f)); } #endif @@ -88,8 +88,8 @@ TEST_CASE("PrepareBTransposed AVX512", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany(16, 128, 2.0f)); - CHECK(TestMany(16, 
128, 2.0f)); + CHECK(TestMany(16, 128, 2.0f)); + CHECK(TestMany(16, 128, 2.0f)); } #endif diff --git a/test/quantize_test.cc b/test/quantize_test.cc index e9f7980..622ff71 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -120,78 +120,78 @@ template void TestMany(std::size_t grow) { TEST_CASE ("Quantize SSE2", "[quantize]") { if (kCPU < CPUType::SSE2) return; - TestMany(8); + TestMany(8); } TEST_CASE ("Quantize SSSE3", "[quantize]") { if (kCPU < CPUType::SSSE3) return; - TestMany(1); + TestMany(1); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE ("Quantize AVX2", "[quantize]") { if (kCPU < CPUType::AVX2) return; - TestMany(1); - TestMany(16); + TestMany(1); + TestMany(16); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Quantize AVX512", "[quantize]") { if (kCPU < CPUType::AVX512BW) return; - TestMany(1); - TestMany(16); + TestMany(1); + TestMany(16); } #endif TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") { if (kCPU < CPUType::SSSE3) return; - testVectorMeanStd(64); - testVectorMeanStd(64, true); - testVectorMeanStd(256); - testVectorMeanStd(256, true); - testVectorMeanStd(2048); - testVectorMeanStd(2048, true); - testVectorMeanStd(65536); - testVectorMeanStd(65536, true); - testVectorMeanStd(81920); - testVectorMeanStd(81920, true); - testVectorMeanStd(120832); - testVectorMeanStd(120832, true); + testVectorMeanStd(64); + testVectorMeanStd(64, true); + testVectorMeanStd(256); + testVectorMeanStd(256, true); + testVectorMeanStd(2048); + testVectorMeanStd(2048, true); + testVectorMeanStd(65536); + testVectorMeanStd(65536, true); + testVectorMeanStd(81920); + testVectorMeanStd(81920, true); + testVectorMeanStd(120832); + testVectorMeanStd(120832, true); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") { if (kCPU < CPUType::AVX2) return; - testVectorMeanStd(64); - testVectorMeanStd(64, true); - testVectorMeanStd(256); - testVectorMeanStd(256, true); - testVectorMeanStd(2048); - testVectorMeanStd(2048, true); - testVectorMeanStd(65536); - testVectorMeanStd(65536, true); - testVectorMeanStd(81920); - testVectorMeanStd(81920, true); - testVectorMeanStd(120832); - testVectorMeanStd(120832, true); + testVectorMeanStd(64); + testVectorMeanStd(64, true); + testVectorMeanStd(256); + testVectorMeanStd(256, true); + testVectorMeanStd(2048); + testVectorMeanStd(2048, true); + testVectorMeanStd(65536); + testVectorMeanStd(65536, true); + testVectorMeanStd(81920); + testVectorMeanStd(81920, true); + testVectorMeanStd(120832); + testVectorMeanStd(120832, true); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("QuantizeStd AVX512BW", "[VectorMeanStd]") { if (kCPU < CPUType::AVX512BW) return; - testVectorMeanStd(64); - testVectorMeanStd(64, true); - testVectorMeanStd(256); - testVectorMeanStd(256, true); - testVectorMeanStd(2048); - testVectorMeanStd(2048, true); - testVectorMeanStd(65536); - testVectorMeanStd(65536, true); - testVectorMeanStd(81920); - testVectorMeanStd(81920, true); - testVectorMeanStd(120832); - testVectorMeanStd(120832, true); + testVectorMeanStd(64); + testVectorMeanStd(64, true); + testVectorMeanStd(256); + testVectorMeanStd(256, true); + testVectorMeanStd(2048); + testVectorMeanStd(2048, true); + testVectorMeanStd(65536); + testVectorMeanStd(65536, true); + testVectorMeanStd(81920); + testVectorMeanStd(81920, true); + testVectorMeanStd(120832); + testVectorMeanStd(120832, true); } #endif -- cgit v1.2.3
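Each hunk above rewrites only the architecture namespace inside the template argument of the test call, capitalizing it (e.g. an ssse3-style namespace becomes SSSE3); the numeric arguments at each call stay the same. Below is a minimal, self-contained sketch of one such call site before and after the change. The Kernels8 type name, the Index alias, and the TestMultiply body are assumptions made for illustration and are not quoted from the patch.

// Illustrative sketch only: shows the kind of call-site change the hunks above make.
// The namespace/type names (ssse3::Kernels8 -> SSSE3::Kernels8) are assumed, and the
// TestMultiply body is a stub rather than intgemm's real test harness.
#include <cstddef>

using Index = std::size_t;  // stand-in for intgemm's Index type

namespace ssse3 { struct Kernels8 {}; }   // pre-patch: lowercase architecture namespace
namespace SSSE3 { struct Kernels8 {}; }   // post-patch: capitalized architecture namespace

template <class Routine>
void TestMultiply(Index A_rows, Index width, Index B_cols,
                  float int_tolerance, float float_tolerance,
                  float MSE_float_tolerance, float MSE_int_tolerance = 0) {
  // Stub: the real test prepares quantized A and B for Routine, multiplies them,
  // and checks the result against a float reference within the given tolerances.
  (void)A_rows; (void)width; (void)B_cols;
  (void)int_tolerance; (void)float_tolerance;
  (void)MSE_float_tolerance; (void)MSE_int_tolerance;
}

int main() {
  // Before this patch (arguments taken from the SSSE3 8bit case above):
  TestMultiply<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
  // After this patch: the same call, differing only in namespace capitalization.
  TestMultiply<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
  return 0;
}

The same substitution pattern applies throughout the other hunks: TestPrepare, TestSelectColumnsB, TestMaxAbsolute, TestMany, testVectorMeanStd, and the Shift-vs-Int tests keep their argument lists and only swap the lowercase architecture namespace in the template argument for its capitalized counterpart.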