github.com/marian-nmt/intgemm.git
author    Kenneth Heafield <github@kheafield.com>    2020-11-25 16:19:48 +0300
committer Kenneth Heafield <github@kheafield.com>    2020-11-25 16:19:48 +0300
commit    ed081ba03efb2826a082038404d10b044a45f27e (patch)
tree      b74ea0bc99218807beb38c7156d2d041ab6a2c36
parent    b7a513372bf9d81bad5b00568aa1daec13054d34 (diff)
Change architecture namespaces to capitals.
Matches the static branch; fewer defines are needed for inline files.
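
A minimal illustration of the rename this commit applies across the tree (declarations shortened; the compatibility alias at the end is hypothetical and not part of this commit):

    // Before: lowercase architecture namespaces.
    namespace intgemm { namespace avx2 { struct Kernels8; struct Kernels16; } }

    // After: capitalized namespaces, matching the static branch.
    namespace intgemm { namespace AVX2 { struct Kernels8; struct Kernels16; } }

    // Call sites change accordingly, e.g. RunAll<avx2::Kernels8>(...) becomes
    // RunAll<AVX2::Kernels8>(...). Out-of-tree code that still spells the old
    // names could bridge with an alias (hypothetical, not added by this commit):
    // namespace avx2 = intgemm::AVX2;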
-rw-r--r--  benchmarks/benchmark.cc                  28
-rw-r--r--  benchmarks/benchmark_quantizer.cc         6
-rw-r--r--  benchmarks/biasmultiply.cc              168
-rw-r--r--  intgemm/avx2_gemm.h                      34
-rw-r--r--  intgemm/avx512_gemm.h                     4
-rw-r--r--  intgemm/avx512vnni_gemm.h                 6
-rw-r--r--  intgemm/intgemm.cc                       64
-rw-r--r--  intgemm/intgemm.h                        26
-rw-r--r--  intgemm/sse2_gemm.h                       4
-rw-r--r--  intgemm/ssse3_gemm.h                      8
-rw-r--r--  intgemm/stats.inl                         6
-rw-r--r--  intgemm/types.h                          20
-rw-r--r--  test/add127_test.cc                     210
-rw-r--r--  test/multiply_test.cc                   212
-rw-r--r--  test/prepare_b_quantized_transposed.cc   12
-rw-r--r--  test/prepare_b_transposed.cc             12
-rw-r--r--  test/quantize_test.cc                    84
17 files changed, 452 insertions, 452 deletions
diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc
index 2dbe483..512d3ec 100644
--- a/benchmarks/benchmark.cc
+++ b/benchmarks/benchmark.cc
@@ -145,46 +145,46 @@ int main(int, char ** argv) {
std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<ssse3::Kernels8>(matrices, end, stats.ssse3_8bit);
+ RunAll<SSSE3::Kernels8>(matrices, end, stats.ssse3_8bit);
}
std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<sse2::Kernels16>(matrices, end, stats.sse2_16bit);
+ RunAll<SSE2::Kernels16>(matrices, end, stats.sse2_16bit);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<avx2::Kernels8>(matrices, end, stats.avx2_8bit);
+ RunAll<AVX2::Kernels8>(matrices, end, stats.avx2_8bit);
}
std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<avx2::Kernels16>(matrices, end, stats.avx2_16bit);
+ RunAll<AVX2::Kernels16>(matrices, end, stats.avx2_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<avx512bw::Kernels8>(matrices, end, stats.avx512_8bit);
+ RunAll<AVX512BW::Kernels8>(matrices, end, stats.avx512_8bit);
}
std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<avx512bw::Kernels16>(matrices, end, stats.avx512_16bit);
+ RunAll<AVX512BW::Kernels16>(matrices, end, stats.avx512_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<avx512vnni::Kernels8>(matrices, end, stats.avx512vnni_8bit);
+ RunAll<AVX512VNNI::Kernels8>(matrices, end, stats.avx512vnni_8bit);
}
#endif
@@ -194,18 +194,18 @@ int main(int, char ** argv) {
}
for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
- Print<ssse3::Kernels8>(stats.ssse3_8bit, i);
- Print<avx2::Kernels8>(stats.avx2_8bit, i);
+ Print<SSSE3::Kernels8>(stats.ssse3_8bit, i);
+ Print<AVX2::Kernels8>(stats.avx2_8bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- Print<avx512bw::Kernels8>(stats.avx512_8bit, i);
+ Print<AVX512BW::Kernels8>(stats.avx512_8bit, i);
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
- Print<avx512vnni::Kernels8>(stats.avx512vnni_8bit, i);
+ Print<AVX512VNNI::Kernels8>(stats.avx512vnni_8bit, i);
#endif
- Print<sse2::Kernels16>(stats.sse2_16bit, i);
- Print<avx2::Kernels16>(stats.avx2_16bit, i);
+ Print<SSE2::Kernels16>(stats.sse2_16bit, i);
+ Print<AVX2::Kernels16>(stats.avx2_16bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- Print<avx512bw::Kernels16>(stats.avx512_16bit, i);
+ Print<AVX512BW::Kernels16>(stats.avx512_16bit, i);
#endif
}
return 0;
diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc
index 86d90dc..5235b1e 100644
--- a/benchmarks/benchmark_quantizer.cc
+++ b/benchmarks/benchmark_quantizer.cc
@@ -63,12 +63,12 @@ int main() {
for (float &element : in) {
element = dist(gen);
}
- QuantizerBench<intgemm::ssse3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+ QuantizerBench<intgemm::SSSE3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
- QuantizerBench<intgemm::avx2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+ QuantizerBench<intgemm::AVX2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- QuantizerBench<intgemm::avx512bw::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+ QuantizerBench<intgemm::AVX512BW::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
}
}
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index 65deadb..c835b61 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -125,151 +125,151 @@ int main(int argc, char ** argv) {
repeat = atoi(argv[1]);
}
- std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<ssse3::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 256, 256);
- oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 2048, 256);
- oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(320, 256, 256);
- oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(472, 256, 256);
- oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(248, 256, 256);
- oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(200, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 2048, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(320, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(472, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(248, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldSSSE3 = testOld<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldSSSE3 += testOld<ssse3::Kernels8>(8, 256, 256);
- oldSSSE3 += testOld<ssse3::Kernels8>(8, 2048, 256);
- oldSSSE3 += testOld<ssse3::Kernels8>(320, 256, 256);
- oldSSSE3 += testOld<ssse3::Kernels8>(472, 256, 256);
- oldSSSE3 += testOld<ssse3::Kernels8>(248, 256, 256);
- oldSSSE3 += testOld<ssse3::Kernels8>(200, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(8, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(8, 2048, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(320, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(472, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(248, 256, 256);
+ oldSSSE3 += testOld<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> newTimeSSSE3 = testOld<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 256, 256);
- newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 2048, 256);
- newTimeSSSE3 += testNew<ssse3::Kernels8>(320, 256, 256);
- newTimeSSSE3 += testNew<ssse3::Kernels8>(472, 256, 256);
- newTimeSSSE3 += testNew<ssse3::Kernels8>(248, 256, 256);
- newTimeSSSE3 += testNew<ssse3::Kernels8>(200, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 2048, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(320, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(472, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(248, 256, 256);
+ newTimeSSSE3 += testNew<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
- std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<avx2::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 256, 256);
- oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 2048, 256);
- oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(320, 256, 256);
- oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(472, 256, 256);
- oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(248, 256, 256);
- oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(200, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 2048, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(320, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(472, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(248, 256, 256);
+ oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldAVX2 = testOld<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX2 += testOld<avx2::Kernels8>(8, 256, 256);
- oldAVX2 += testOld<avx2::Kernels8>(8, 2048, 256);
- oldAVX2 += testOld<avx2::Kernels8>(320, 256, 256);
- oldAVX2 += testOld<avx2::Kernels8>(472, 256, 256);
- oldAVX2 += testOld<avx2::Kernels8>(248, 256, 256);
- oldAVX2 += testOld<avx2::Kernels8>(200, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(8, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(8, 2048, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(320, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(472, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(248, 256, 256);
+ oldAVX2 += testOld<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> newTimeAVX2 = testOld<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeAVX2 += testNew<avx2::Kernels8>(8, 256, 256);
- newTimeAVX2 += testNew<avx2::Kernels8>(8, 2048, 256);
- newTimeAVX2 += testNew<avx2::Kernels8>(320, 256, 256);
- newTimeAVX2 += testNew<avx2::Kernels8>(472, 256, 256);
- newTimeAVX2 += testNew<avx2::Kernels8>(248, 256, 256);
- newTimeAVX2 += testNew<avx2::Kernels8>(200, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(8, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(8, 2048, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(320, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(472, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(248, 256, 256);
+ newTimeAVX2 += testNew<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (kCPU < CPUType::AVX512BW) return 0;
- std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 256, 256);
- oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 2048, 256);
- oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(320, 256, 256);
- oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(472, 256, 256);
- oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(248, 256, 256);
- oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(200, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 2048, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(320, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(472, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(248, 256, 256);
+ oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512 = testOld<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512 += testOld<avx512bw::Kernels8>(8, 256, 256);
- oldAVX512 += testOld<avx512bw::Kernels8>(8, 2048, 256);
- oldAVX512 += testOld<avx512bw::Kernels8>(320, 256, 256);
- oldAVX512 += testOld<avx512bw::Kernels8>(472, 256, 256);
- oldAVX512 += testOld<avx512bw::Kernels8>(248, 256, 256);
- oldAVX512 += testOld<avx512bw::Kernels8>(200, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(8, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(8, 2048, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(320, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(472, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(248, 256, 256);
+ oldAVX512 += testOld<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> newTimeAVX512 = testOld<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 256, 256);
- newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 2048, 256);
- newTimeAVX512 += testNew<avx512bw::Kernels8>(320, 256, 256);
- newTimeAVX512 += testNew<avx512bw::Kernels8>(472, 256, 256);
- newTimeAVX512 += testNew<avx512bw::Kernels8>(248, 256, 256);
- newTimeAVX512 += testNew<avx512bw::Kernels8>(200, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 2048, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(320, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(472, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(248, 256, 256);
+ newTimeAVX512 += testNew<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (kCPU < CPUType::AVX512VNNI) return 0;
- std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 2048, 256);
- oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(320, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(472, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(248, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(200, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 2048, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(320, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(472, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(248, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 256, 256);
- oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 2048, 256);
- oldAVX512VNNI += testOld<avx512vnni::Kernels8>(320, 256, 256);
- oldAVX512VNNI += testOld<avx512vnni::Kernels8>(472, 256, 256);
- oldAVX512VNNI += testOld<avx512vnni::Kernels8>(248, 256, 256);
- oldAVX512VNNI += testOld<avx512vnni::Kernels8>(200, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 2048, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(320, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(472, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(248, 256, 256);
+ oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
+ std::chrono::duration<double> newTimeAVX512VNNI = testOld<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 256, 256);
- newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 2048, 256);
- newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(320, 256, 256);
- newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(472, 256, 256);
- newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(248, 256, 256);
- newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(200, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 2048, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(320, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(472, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(248, 256, 256);
+ newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h
index 6e01679..d93ac8e 100644
--- a/intgemm/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -13,7 +13,7 @@
#include <cstring>
namespace intgemm {
-namespace avx2 {
+namespace AVX2 {
INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
@@ -73,14 +73,14 @@ struct Kernels16 {
static const Index kBTileCol = 8;
/*
INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
+ PrepareBFor16(input, output, AVX2::QuantizeTile16(quant_mult), rows, cols);
}*/
- INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
+ INTGEMM_PREPARE_B_16(INTGEMM_AVX2, AVX2::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile16, int16_t)
INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
+ AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
@@ -129,10 +129,10 @@ class QuantizeTile8 {
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
- __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
- __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
- __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
+ __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -155,10 +155,10 @@ class QuantizeTile8 {
const __m256i pos127 = _mm256_set1_epi8(127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
- __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
- __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
- __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
+ __m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -207,12 +207,12 @@ struct Kernels8 {
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
- INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
+ INTGEMM_PREPARE_B_8(INTGEMM_AVX2, AVX2::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile8, int8_t)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
+ AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
@@ -226,7 +226,7 @@ struct Kernels8 {
static const CPUType kUses = CPUType::AVX2;
};
-} // namespace avx2
+} // namespace AVX2
} // namespace intgemm
#endif
diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h
index f9fb1eb..a69b2dc 100644
--- a/intgemm/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -31,7 +31,7 @@ namespace intgemm {
// So conversion in memory uses these, but I also implement a wider version for
// rearranging B.
-namespace avx512bw {
+namespace AVX512BW {
// Load from memory, multiply, and convert to int32_t.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
@@ -405,7 +405,7 @@ struct Kernels8 {
static const CPUType kUses = CPUType::AVX512BW;
};
-} // namespace avx512bw
+} // namespace AVX512BW
} // namespace intgemm
#endif
diff --git a/intgemm/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h
index c660168..747bdf9 100644
--- a/intgemm/avx512vnni_gemm.h
+++ b/intgemm/avx512vnni_gemm.h
@@ -7,7 +7,7 @@
#include "types.h"
namespace intgemm {
-namespace avx512vnni {
+namespace AVX512VNNI {
// Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
@@ -18,7 +18,7 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
#endif
}
-struct Kernels8 : public avx512bw::Kernels8 {
+struct Kernels8 : public AVX512BW::Kernels8 {
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
@@ -162,7 +162,7 @@ struct Kernels8 : public avx512bw::Kernels8 {
static const CPUType kUses = CPUType::AVX512VNNI;
};
-} // namespace avx512vnni
+} // namespace AVX512VNNI
} // namespace intgemm
#endif
diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc
index d45cf60..82ad750 100644
--- a/intgemm/intgemm.cc
+++ b/intgemm/intgemm.cc
@@ -11,69 +11,69 @@ MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/
throw UnsupportedCPU();
}
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize);
+void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize);
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
+void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512BW::Kernels16::PrepareB, AVX512BW::Kernels16::PrepareB, AVX2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
-void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
+void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
-void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
+void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBTransposed, AVX512BW::Kernels16::PrepareBTransposed, AVX2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
+void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512BW::Kernels16::SelectColumnsB, AVX512BW::Kernels16::SelectColumnsB, AVX2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName);
+const char *const Int16::kName = ChooseCPU(AVX512BW::Kernels16::kName, AVX512BW::Kernels16::kName, AVX2::Kernels16::kName, SSE2::Kernels16::kName, SSE2::Kernels16::kName, Unsupported_16bit::kName);
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
+void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::Quantize, AVX512BW::Kernels8::Quantize, AVX2::Kernels8::Quantize, SSSE3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
+void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
+void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI::Kernels8::PrepareB, AVX512BW::Kernels8::PrepareB, AVX2::Kernels8::PrepareB, SSSE3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
+void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX2::Kernels8::PrepareBQuantizedTransposed, SSSE3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
-void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
+void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBTransposed, AVX512BW::Kernels8::PrepareBTransposed, AVX2::Kernels8::PrepareBTransposed, SSSE3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
+void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI::Kernels8::SelectColumnsB, AVX512BW::Kernels8::SelectColumnsB, AVX2::Kernels8::SelectColumnsB, SSSE3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+const char *const Int8::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
+void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2)
-namespace avx2{
-using sse2::MaxAbsolute;
-using sse2::VectorMeanStd;
-} // namespace avx2
+namespace AVX2{
+using SSE2::MaxAbsolute;
+using SSE2::VectorMeanStd;
+} // namespace AVX2
#endif
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
-namespace avx512bw {
-using avx2::MaxAbsolute;
-using avx2::VectorMeanStd;
-} // namespace avx512bw
+namespace AVX512BW {
+using AVX2::MaxAbsolute;
+using AVX2::VectorMeanStd;
+} // namespace AVX512BW
#endif
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
+float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512BW::MaxAbsolute, AVX512BW::MaxAbsolute, AVX2::MaxAbsolute, SSE2::MaxAbsolute, SSE2::MaxAbsolute, Unsupported_MaxAbsolute);
-MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd);
+MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(AVX512BW::VectorMeanStd, AVX512BW::VectorMeanStd, AVX2::VectorMeanStd, SSE2::VectorMeanStd, SSE2::VectorMeanStd, Unsupported_VectorMeanStd);
constexpr const char *const Unsupported_16bit::kName;
constexpr const char *const Unsupported_8bit::kName;
-constexpr const char *const sse2::Kernels16::kName;
-constexpr const char *const ssse3::Kernels8::kName;
+constexpr const char *const SSE2::Kernels16::kName;
+constexpr const char *const SSSE3::Kernels8::kName;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
-constexpr const char *const avx2::Kernels8::kName;
-constexpr const char *const avx2::Kernels16::kName;
+constexpr const char *const AVX2::Kernels8::kName;
+constexpr const char *const AVX2::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-constexpr const char *const avx512bw::Kernels8::kName;
-constexpr const char *const avx512bw::Kernels16::kName;
+constexpr const char *const AVX512BW::Kernels8::kName;
+constexpr const char *const AVX512BW::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-constexpr const char *const avx512vnni::Kernels8::kName;
+constexpr const char *const AVX512VNNI::Kernels8::kName;
#endif
}
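
The hunks above rewire the ChooseCPU dispatch tables to the capitalized namespaces. As a rough orientation only, a ChooseCPU-style selector returns the first of its candidates that the running CPU can execute, in the argument order used at the call sites above (AVX512VNNI, AVX512BW, AVX2, SSSE3, SSE2, then the unsupported fallback); the cpu_has_* probes below are illustrative names, not intgemm's actual detection code:

    template <class T>
    T ChooseCPU(T vnni, T avx512bw, T avx2, T ssse3, T sse2, T fallback) {
      if (cpu_has_avx512vnni()) return vnni;      // hypothetical feature probes
      if (cpu_has_avx512bw())   return avx512bw;
      if (cpu_has_avx2())       return avx2;
      if (cpu_has_ssse3())      return ssse3;
      if (cpu_has_sse2())       return sse2;
      return fallback;                            // Unsupported_* stub or CPUType::UNSUPPORTED
    }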
diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h
index a354b60..029a8ec 100644
--- a/intgemm/intgemm.h
+++ b/intgemm/intgemm.h
@@ -127,21 +127,21 @@ struct Unsupported_8bit {
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
// These won't ever be called in this capacity, but it does let the code below compile.
-namespace avx512vnni {
+namespace AVX512VNNI {
typedef Unsupported_8bit Kernels8;
-} // namespace avx512vnni
+} // namespace AVX512VNNI
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-namespace avx512bw {
+namespace AVX512BW {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
-} // namespace avx512bw
+} // namespace AVX512BW
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX2
-namespace avx2 {
+namespace AVX2 {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
-} // namespace avx2
+} // namespace AVX2
#endif
@@ -309,7 +309,7 @@ private:
};
template <typename Callback>
-void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512vnni::Kernels8>, OMPParallelWrap<Callback, avx512bw::Kernels8>, OMPParallelWrap<Callback, avx2::Kernels8>, OMPParallelWrap<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
+void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI::Kernels8>, OMPParallelWrap<Callback, AVX512BW::Kernels8>, OMPParallelWrap<Callback, AVX2::Kernels8>, OMPParallelWrap<Callback, SSSE3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
/*
* 8-bit matrix multiplication with shifting A by 127
@@ -373,14 +373,14 @@ private:
template <class Callback>
void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
- OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>,
- OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>,
- OMPParallelWrap8Shift<Callback, avx2::Kernels8>,
- OMPParallelWrap8Shift<Callback, ssse3::Kernels8>,
+ OMPParallelWrap8Shift<Callback, AVX512VNNI::Kernels8>,
+ OMPParallelWrap8Shift<Callback, AVX512BW::Kernels8>,
+ OMPParallelWrap8Shift<Callback, AVX2::Kernels8>,
+ OMPParallelWrap8Shift<Callback, SSSE3::Kernels8>,
Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>);
template <class Callback>
-void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias<Callback>, avx512bw::Kernels8::PrepareBias<Callback>, avx2::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
+void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI::Kernels8::PrepareBias<Callback>, AVX512BW::Kernels8::PrepareBias<Callback>, AVX2::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
/*
* 16-bit matrix multiplication
@@ -436,7 +436,7 @@ private:
};
template <typename Callback>
-void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, avx512bw::Kernels16>, OMPParallelWrap<Callback, avx2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
+void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512BW::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512BW::Kernels16>, OMPParallelWrap<Callback, AVX2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
extern const CPUType kCPU;
diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h
index cd49efe..cd855a6 100644
--- a/intgemm/sse2_gemm.h
+++ b/intgemm/sse2_gemm.h
@@ -9,7 +9,7 @@
// 8 bit is in ssse3_gemm.h
namespace intgemm {
-namespace sse2 {
+namespace SSE2 {
INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
@@ -80,5 +80,5 @@ struct Kernels16 {
static const CPUType kUses = CPUType::SSE2;
};
-} // namespace sse2
+} // namespace SSE2
} // namespace intgemm
diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h
index 865fe12..db403bd 100644
--- a/intgemm/ssse3_gemm.h
+++ b/intgemm/ssse3_gemm.h
@@ -11,7 +11,7 @@
// 16-bit is in sse2_gemm.h
namespace intgemm {
-namespace ssse3 {
+namespace SSSE3 {
INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
@@ -131,12 +131,12 @@ struct Kernels8 {
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
- INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
+ INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, SSSE3::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+ SSSE3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
@@ -150,5 +150,5 @@ struct Kernels8 {
static const CPUType kUses = CPUType::SSSE3;
};
-} // namespace ssse3
+} // namespace SSSE3
} // namespace intgemm
diff --git a/intgemm/stats.inl b/intgemm/stats.inl
index d6a850e..68a5b8e 100644
--- a/intgemm/stats.inl
+++ b/intgemm/stats.inl
@@ -1,12 +1,12 @@
/* This file is included multiple times, once per architecture. */
#if defined(INTGEMM_THIS_IS_AVX512DQ)
-#define INTGEMM_ARCH avx512bw
+#define INTGEMM_ARCH AVX512BW
#define INTGEMM_TARGET INTGEMM_AVX512DQ
#elif defined(INTGEMM_THIS_IS_AVX2)
-#define INTGEMM_ARCH avx2
+#define INTGEMM_ARCH AVX2
#define INTGEMM_TARGET INTGEMM_AVX2
#elif defined(INTGEMM_THIS_IS_SSE2)
-#define INTGEMM_ARCH sse2
+#define INTGEMM_ARCH SSE2
#define INTGEMM_TARGET INTGEMM_SSE2
#else
#error Included with unexpected architecture
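
The macro change above follows the comment at the top of stats.inl: the file is compiled once per architecture, and INTGEMM_ARCH now names the capitalized namespace the functions are emitted into. A hedged sketch of that include-per-architecture pattern (the including translation unit shown here is illustrative, not the library's actual file layout):

    #define INTGEMM_THIS_IS_SSE2
    #include "intgemm/stats.inl"   // definitions land in namespace intgemm::SSE2
    #undef INTGEMM_THIS_IS_SSE2

    #define INTGEMM_THIS_IS_AVX2
    #include "intgemm/stats.inl"   // the same source compiles again into intgemm::AVX2
    #undef INTGEMM_THIS_IS_AVX2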
diff --git a/intgemm/types.h b/intgemm/types.h
index 174f1ff..602130a 100644
--- a/intgemm/types.h
+++ b/intgemm/types.h
@@ -70,30 +70,30 @@ struct MeanStd {
};
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-namespace avx512vnni {
+namespace AVX512VNNI {
typedef __m512i Register;
typedef __m512 FRegister;
-} // namespace avx512vnni
+} // namespace AVX512VNNI
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-namespace avx512bw {
+namespace AVX512BW {
typedef __m512i Register;
typedef __m512 FRegister;
-} // namespace avx512bw
+} // namespace AVX512BW
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
-namespace avx2 {
+namespace AVX2 {
typedef __m256i Register;
typedef __m256 FRegister;
-} // namespace avx2
+} // namespace AVX2
#endif
-namespace ssse3 {
+namespace SSSE3 {
typedef __m128i Register;
typedef __m128 FRegister;
-} // namespace ssse3
-namespace sse2 {
+} // namespace SSSE3
+namespace SSE2 {
typedef __m128i Register;
typedef __m128 FRegister;
-} // namespace sse2
+} // namespace SSE2
} // namespace intgemm
diff --git a/test/add127_test.cc b/test/add127_test.cc
index 723e143..c31732c 100644
--- a/test/add127_test.cc
+++ b/test/add127_test.cc
@@ -282,55 +282,55 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In
// Bias
TEST_CASE("PrepareBias SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestPrepareBias<ssse3::Kernels8>(256,256);
- TestPrepareBias<ssse3::Kernels8>(2048,256);
- TestPrepareBias<ssse3::Kernels8>(512,512);
+ TestPrepareBias<SSSE3::Kernels8>(256,256);
+ TestPrepareBias<SSSE3::Kernels8>(2048,256);
+ TestPrepareBias<SSSE3::Kernels8>(512,512);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBias AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestPrepareBias<avx2::Kernels8>(256,256);
- TestPrepareBias<avx2::Kernels8>(2048,256);
- TestPrepareBias<avx2::Kernels8>(512,512);
+ TestPrepareBias<AVX2::Kernels8>(256,256);
+ TestPrepareBias<AVX2::Kernels8>(2048,256);
+ TestPrepareBias<AVX2::Kernels8>(512,512);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBias AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestPrepareBias<avx512bw::Kernels8>(256,256);
- TestPrepareBias<avx512bw::Kernels8>(2048,256);
- TestPrepareBias<avx512bw::Kernels8>(512,512);
+ TestPrepareBias<AVX512BW::Kernels8>(256,256);
+ TestPrepareBias<AVX512BW::Kernels8>(2048,256);
+ TestPrepareBias<AVX512BW::Kernels8>(512,512);
}
#endif
//A
TEST_CASE("PrepareA SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestPrepareA<ssse3::Kernels8>(64,64);
- TestPrepareA<ssse3::Kernels8>(256,256);
- TestPrepareA<ssse3::Kernels8>(512,512);
- TestPrepareA<ssse3::Kernels8>(2048,256);
+ TestPrepareA<SSSE3::Kernels8>(64,64);
+ TestPrepareA<SSSE3::Kernels8>(256,256);
+ TestPrepareA<SSSE3::Kernels8>(512,512);
+ TestPrepareA<SSSE3::Kernels8>(2048,256);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareA AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestPrepareA<avx2::Kernels8>(64,64);
- TestPrepareA<avx2::Kernels8>(256,256);
- TestPrepareA<avx2::Kernels8>(512,512);
- TestPrepareA<avx2::Kernels8>(2048,256);
+ TestPrepareA<AVX2::Kernels8>(64,64);
+ TestPrepareA<AVX2::Kernels8>(256,256);
+ TestPrepareA<AVX2::Kernels8>(512,512);
+ TestPrepareA<AVX2::Kernels8>(2048,256);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareA AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestPrepareA<avx512bw::Kernels8>(64,64);
- TestPrepareA<avx512bw::Kernels8>(256,256);
- TestPrepareA<avx512bw::Kernels8>(512,512);
- TestPrepareA<avx512bw::Kernels8>(2048,256);
+ TestPrepareA<AVX512BW::Kernels8>(64,64);
+ TestPrepareA<AVX512BW::Kernels8>(256,256);
+ TestPrepareA<AVX512BW::Kernels8>(512,512);
+ TestPrepareA<AVX512BW::Kernels8>(2048,256);
}
#endif
@@ -338,153 +338,153 @@ TEST_CASE("PrepareA AVX512F", "[Add127]") {
TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyBiasNew<ssse3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f);
- TestMultiplyBiasNew<ssse3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f);
- TestMultiplyBiasNew<ssse3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f);
- TestMultiplyBiasNew<ssse3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<ssse3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
- TestMultiplyBiasNew<ssse3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<ssse3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<SSSE3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyBiasNew<avx2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
- TestMultiplyBiasNew<avx2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f);
- TestMultiplyBiasNew<avx2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f);
- TestMultiplyBiasNew<avx2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<avx2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
- TestMultiplyBiasNew<avx2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<avx2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<AVX2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyBiasNew<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
- TestMultiplyBiasNew<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
- TestMultiplyBiasNew<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyBiasNew<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
- TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
- TestMultiplyBiasNew<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<AVX512VNNI::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
//Multiply old vs new
TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyShiftNonShift<ssse3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
- TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad
- TestMultiplyShiftNonShift<ssse3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f);
- TestMultiplyShiftNonShift<ssse3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f);
- TestMultiplyShiftNonShift<ssse3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f);
- TestMultiplyShiftNonShift<ssse3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f);
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f);
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f);
+ TestMultiplyShiftNonShift<SSSE3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyShiftNonShift<avx2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<avx2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
- TestMultiplyShiftNonShift<avx2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad
- TestMultiplyShiftNonShift<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftNonShift<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
- TestMultiplyShiftNonShift<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftNonShift<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftNonShift<AVX2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f);
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f);
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<avx512vnni::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f);
}
#endif
//Multiply Shift vs int shift implementation
TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyShiftInt<ssse3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<ssse3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<ssse3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f);
- TestMultiplyShiftInt<ssse3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<ssse3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<ssse3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<ssse3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<SSSE3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyShiftInt<avx2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<avx2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f);
- TestMultiplyShiftInt<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<AVX2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyShiftInt<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
- TestMultiplyShiftInt<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyShiftInt<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
- TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<AVX512VNNI::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
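
The [Add127] cases above exercise the shifted (unsigned-A) 8-bit path against its non-shifted counterpart and against the integer shift implementation. As a scalar reference for what the shift does, a minimal sketch follows; it is illustrative only, assumes quantized values clamped to [-127, 127], and is not code from add127_test.cc.

#include <cstdint>

// Shifting signed 8-bit A values by +127 moves them into unsigned 8-bit range,
// so the kernels can use unsigned*signed multiply-add instructions; the extra
// 127 * column-sum term is folded back in through the prepared bias.
static int32_t DotViaShift(const int8_t *a_signed, const int8_t *b, int width) {
  int32_t dot_shifted = 0, column_sum = 0;
  for (int k = 0; k < width; ++k) {
    uint8_t a_shifted = static_cast<uint8_t>(a_signed[k] + 127);  // in [0, 254]
    dot_shifted += static_cast<int32_t>(a_shifted) * b[k];
    column_sum += b[k];
  }
  return dot_shifted - 127 * column_sum;  // equals sum_k a_signed[k] * b[k]
}
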
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 47201c0..5395d40 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -85,32 +85,32 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("Prepare AVX512", "[prepare]") {
if (kCPU < CPUType::AVX512BW) return;
- TestPrepare<avx512bw::Kernels8>(64, 8);
- TestPrepare<avx512bw::Kernels8>(256, 32);
- TestPrepare<avx512bw::Kernels16>(64, 8);
- TestPrepare<avx512bw::Kernels16>(256, 32);
+ TestPrepare<AVX512BW::Kernels8>(64, 8);
+ TestPrepare<AVX512BW::Kernels8>(256, 32);
+ TestPrepare<AVX512BW::Kernels16>(64, 8);
+ TestPrepare<AVX512BW::Kernels16>(256, 32);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("Prepare AVX2", "[prepare]") {
if (kCPU < CPUType::AVX2) return;
- TestPrepare<avx2::Kernels8>(64, 32);
- TestPrepare<avx2::Kernels16>(64, 32);
+ TestPrepare<AVX2::Kernels8>(64, 32);
+ TestPrepare<AVX2::Kernels16>(64, 32);
}
#endif
TEST_CASE("Prepare SSSE3", "[prepare]") {
if (kCPU < CPUType::SSSE3) return;
- TestPrepare<ssse3::Kernels8>(16, 8);
- TestPrepare<ssse3::Kernels8>(32, 16);
- TestPrepare<ssse3::Kernels8>(32, 32);
+ TestPrepare<SSSE3::Kernels8>(16, 8);
+ TestPrepare<SSSE3::Kernels8>(32, 16);
+ TestPrepare<SSSE3::Kernels8>(32, 32);
}
TEST_CASE("Prepare SSE2", "[prepare]") {
if (kCPU < CPUType::SSE2) return;
- TestPrepare<sse2::Kernels16>(8, 8);
- TestPrepare<sse2::Kernels16>(32, 32);
+ TestPrepare<SSE2::Kernels16>(8, 8);
+ TestPrepare<SSE2::Kernels16>(32, 32);
}
template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 16) {
@@ -152,29 +152,29 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("SelectColumnsB AVX512", "[select]") {
if (kCPU < CPUType::AVX512BW) return;
- TestSelectColumnsB<avx512bw::Kernels8>();
- TestSelectColumnsB<avx512bw::Kernels16>(256, 256);
+ TestSelectColumnsB<AVX512BW::Kernels8>();
+ TestSelectColumnsB<AVX512BW::Kernels16>(256, 256);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("SelectColumnsB AVX2", "[select]") {
if (kCPU < CPUType::AVX2) return;
- TestSelectColumnsB<avx2::Kernels8>(256, 256);
- TestSelectColumnsB<avx2::Kernels16>(256, 256);
+ TestSelectColumnsB<AVX2::Kernels8>(256, 256);
+ TestSelectColumnsB<AVX2::Kernels16>(256, 256);
}
#endif
TEST_CASE("SelectColumnsB SSSE3", "[select]") {
if (kCPU < CPUType::SSSE3) return;
- TestSelectColumnsB<ssse3::Kernels8>();
- TestSelectColumnsB<ssse3::Kernels8>(256, 256);
+ TestSelectColumnsB<SSSE3::Kernels8>();
+ TestSelectColumnsB<SSSE3::Kernels8>(256, 256);
}
TEST_CASE("SelectColumnsB SSE2", "[select]") {
if (kCPU < CPUType::SSE2) return;
- TestSelectColumnsB<sse2::Kernels16>();
- TestSelectColumnsB<sse2::Kernels16>(256, 256);
+ TestSelectColumnsB<SSE2::Kernels16>();
+ TestSelectColumnsB<SSE2::Kernels16>(256, 256);
}
template <class Register> void TestMax() {
@@ -219,20 +219,20 @@ template <float (*Backend) (const float *, const float *)> void TestMaxAbsolute(
TEST_CASE("MaxAbsolute SSE2", "[max]") {
if (kCPU < CPUType::SSE2) return;
- TestMaxAbsolute<sse2::MaxAbsolute>();
+ TestMaxAbsolute<SSE2::MaxAbsolute>();
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("MaxAbsolute AVX2", "[max]") {
if (kCPU < CPUType::AVX2) return;
- TestMaxAbsolute<avx2::MaxAbsolute>();
+ TestMaxAbsolute<AVX2::MaxAbsolute>();
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("MaxAbsolute AVX512BW", "[max]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMaxAbsolute<avx512bw::MaxAbsolute>();
+ TestMaxAbsolute<AVX512BW::MaxAbsolute>();
}
#endif
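
MaxAbsolute is the reduction typically used to pick the quantization multiplier before the matrices are prepared. A minimal usage sketch, assuming the dispatched intgemm::MaxAbsolute entry point; the input and size names are placeholders, not taken from the test:

// Usage sketch (assumed public entry point; input and size are placeholders).
float absmax = intgemm::MaxAbsolute(input, input + size);
float quant_mult = 127.0f / absmax;  // largest-magnitude value maps to +/-127
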
@@ -364,148 +364,148 @@ template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index
TEST_CASE ("Multiply SSE2 16bit", "[multiply]") {
if (kCPU < CPUType::SSE2) return;
- TestMultiply<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiply<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiply<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiply<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiply<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiply<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiply<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSE2) return;
- TestMultiplyBias<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiplyBias<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiplyBias<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiply<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
- TestMultiply<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
- TestMultiply<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
- TestMultiply<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
- TestMultiply<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
- TestMultiply<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
+ TestMultiply<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
+ TestMultiply<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
+ TestMultiply<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
+ TestMultiply<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
+ TestMultiply<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
+ TestMultiply<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyBias<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
- TestMultiplyBias<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
- TestMultiplyBias<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
- TestMultiplyBias<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
- TestMultiplyBias<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
- TestMultiplyBias<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
+ TestMultiplyBias<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
+ TestMultiplyBias<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
+ TestMultiplyBias<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
+ TestMultiplyBias<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
+ TestMultiplyBias<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
+ TestMultiplyBias<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiply<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
- TestMultiply<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
- TestMultiply<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
- TestMultiply<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
- TestMultiply<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
- TestMultiply<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
+ TestMultiply<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyBias<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
- TestMultiplyBias<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
+ TestMultiplyBias<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiply<avx2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiply<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiply<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyBias<avx2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiplyBias<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiplyBias<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiply<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiply<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
- TestMultiply<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiply<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiply<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiply<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
+ TestMultiply<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiply<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyBias<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiplyBias<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
- TestMultiplyBias<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiplyBias<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiplyBias<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiplyBias<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
+ TestMultiplyBias<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiplyBias<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiply<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiply<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
- TestMultiply<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiply<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiply<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiply<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
+ TestMultiply<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiply<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyBias<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiplyBias<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
- TestMultiplyBias<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiplyBias<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiplyBias<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiplyBias<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
+ TestMultiplyBias<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiplyBias<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#endif
TEST_CASE ("Multiply AVX512 16bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiply<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
- TestMultiply<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiply<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
+ TestMultiply<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyBias<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
- TestMultiplyBias<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
+ TestMultiplyBias<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
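
Every multiply case above follows the same selection pattern after the rename: an INTGEMM_COMPILER_SUPPORTS_* guard decides whether the case is compiled, the kCPU comparison decides whether it runs, and the capitalized architecture namespace picks the kernel at compile time. Condensed into one hypothetical harness fragment (not part of multiply_test.cc):

// Guard-then-dispatch pattern, condensed; the TEST_CASE bodies above are the real tests.
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
if (kCPU >= CPUType::AVX2) {                                 // runtime CPU check
  TestMultiply<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);   // compile-time kernel choice
  TestMultiply<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
}
#endif
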
diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc
index e27992a..defe9a0 100644
--- a/test/prepare_b_quantized_transposed.cc
+++ b/test/prepare_b_quantized_transposed.cc
@@ -62,14 +62,14 @@ TEST_CASE("PrepareBQuantizedTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
- CHECK(TestMany<sse2::Kernels16>(32, 128));
+ CHECK(TestMany<SSE2::Kernels16>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
- CHECK(TestMany<ssse3::Kernels8>(32, 128));
+ CHECK(TestMany<SSSE3::Kernels8>(32, 128));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
@@ -77,8 +77,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
- CHECK(TestMany<avx2::Kernels8>(32, 128));
- CHECK(TestMany<avx2::Kernels16>(32, 128));
+ CHECK(TestMany<AVX2::Kernels8>(32, 128));
+ CHECK(TestMany<AVX2::Kernels16>(32, 128));
}
#endif
@@ -87,8 +87,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX512BW)
return;
- CHECK(TestMany<avx512bw::Kernels8>(64, 128));
- CHECK(TestMany<avx512bw::Kernels16>(64, 128));
+ CHECK(TestMany<AVX512BW::Kernels8>(64, 128));
+ CHECK(TestMany<AVX512BW::Kernels16>(64, 128));
}
#endif
diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc
index 1ba88df..1c11fbe 100644
--- a/test/prepare_b_transposed.cc
+++ b/test/prepare_b_transposed.cc
@@ -63,14 +63,14 @@ TEST_CASE("PrepareBTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
- CHECK(TestMany<sse2::Kernels16>(4, 128, 2.0f));
+ CHECK(TestMany<SSE2::Kernels16>(4, 128, 2.0f));
}
TEST_CASE("PrepareBTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
- CHECK(TestMany<ssse3::Kernels8>(4, 128, 2.0f));
+ CHECK(TestMany<SSSE3::Kernels8>(4, 128, 2.0f));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
@@ -78,8 +78,8 @@ TEST_CASE("PrepareBTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
- CHECK(TestMany<avx2::Kernels8>(8, 128, 2.0f));
- CHECK(TestMany<avx2::Kernels16>(8, 128, 2.0f));
+ CHECK(TestMany<AVX2::Kernels8>(8, 128, 2.0f));
+ CHECK(TestMany<AVX2::Kernels16>(8, 128, 2.0f));
}
#endif
@@ -88,8 +88,8 @@ TEST_CASE("PrepareBTransposed AVX512", "") {
if (kCPU < CPUType::AVX512BW)
return;
- CHECK(TestMany<avx512bw::Kernels8>(16, 128, 2.0f));
- CHECK(TestMany<avx512bw::Kernels16>(16, 128, 2.0f));
+ CHECK(TestMany<AVX512BW::Kernels8>(16, 128, 2.0f));
+ CHECK(TestMany<AVX512BW::Kernels16>(16, 128, 2.0f));
}
#endif
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index e9f7980..622ff71 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -120,78 +120,78 @@ template <class Backend> void TestMany(std::size_t grow) {
TEST_CASE ("Quantize SSE2", "[quantize]") {
if (kCPU < CPUType::SSE2) return;
- TestMany<sse2::Kernels16>(8);
+ TestMany<SSE2::Kernels16>(8);
}
TEST_CASE ("Quantize SSSE3", "[quantize]") {
if (kCPU < CPUType::SSSE3) return;
- TestMany<ssse3::Kernels8>(1);
+ TestMany<SSSE3::Kernels8>(1);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Quantize AVX2", "[quantize]") {
if (kCPU < CPUType::AVX2) return;
- TestMany<avx2::Kernels8>(1);
- TestMany<avx2::Kernels16>(16);
+ TestMany<AVX2::Kernels8>(1);
+ TestMany<AVX2::Kernels16>(16);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Quantize AVX512", "[quantize]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMany<avx512bw::Kernels8>(1);
- TestMany<avx512bw::Kernels16>(16);
+ TestMany<AVX512BW::Kernels8>(1);
+ TestMany<AVX512BW::Kernels16>(16);
}
#endif
TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") {
if (kCPU < CPUType::SSSE3) return;
- testVectorMeanStd<sse2::VectorMeanStd>(64);
- testVectorMeanStd<sse2::VectorMeanStd>(64, true);
- testVectorMeanStd<sse2::VectorMeanStd>(256);
- testVectorMeanStd<sse2::VectorMeanStd>(256, true);
- testVectorMeanStd<sse2::VectorMeanStd>(2048);
- testVectorMeanStd<sse2::VectorMeanStd>(2048, true);
- testVectorMeanStd<sse2::VectorMeanStd>(65536);
- testVectorMeanStd<sse2::VectorMeanStd>(65536, true);
- testVectorMeanStd<sse2::VectorMeanStd>(81920);
- testVectorMeanStd<sse2::VectorMeanStd>(81920, true);
- testVectorMeanStd<sse2::VectorMeanStd>(120832);
- testVectorMeanStd<sse2::VectorMeanStd>(120832, true);
+ testVectorMeanStd<SSE2::VectorMeanStd>(64);
+ testVectorMeanStd<SSE2::VectorMeanStd>(64, true);
+ testVectorMeanStd<SSE2::VectorMeanStd>(256);
+ testVectorMeanStd<SSE2::VectorMeanStd>(256, true);
+ testVectorMeanStd<SSE2::VectorMeanStd>(2048);
+ testVectorMeanStd<SSE2::VectorMeanStd>(2048, true);
+ testVectorMeanStd<SSE2::VectorMeanStd>(65536);
+ testVectorMeanStd<SSE2::VectorMeanStd>(65536, true);
+ testVectorMeanStd<SSE2::VectorMeanStd>(81920);
+ testVectorMeanStd<SSE2::VectorMeanStd>(81920, true);
+ testVectorMeanStd<SSE2::VectorMeanStd>(120832);
+ testVectorMeanStd<SSE2::VectorMeanStd>(120832, true);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") {
if (kCPU < CPUType::AVX2) return;
- testVectorMeanStd<avx2::VectorMeanStd>(64);
- testVectorMeanStd<avx2::VectorMeanStd>(64, true);
- testVectorMeanStd<avx2::VectorMeanStd>(256);
- testVectorMeanStd<avx2::VectorMeanStd>(256, true);
- testVectorMeanStd<avx2::VectorMeanStd>(2048);
- testVectorMeanStd<avx2::VectorMeanStd>(2048, true);
- testVectorMeanStd<avx2::VectorMeanStd>(65536);
- testVectorMeanStd<avx2::VectorMeanStd>(65536, true);
- testVectorMeanStd<avx2::VectorMeanStd>(81920);
- testVectorMeanStd<avx2::VectorMeanStd>(81920, true);
- testVectorMeanStd<avx2::VectorMeanStd>(120832);
- testVectorMeanStd<avx2::VectorMeanStd>(120832, true);
+ testVectorMeanStd<AVX2::VectorMeanStd>(64);
+ testVectorMeanStd<AVX2::VectorMeanStd>(64, true);
+ testVectorMeanStd<AVX2::VectorMeanStd>(256);
+ testVectorMeanStd<AVX2::VectorMeanStd>(256, true);
+ testVectorMeanStd<AVX2::VectorMeanStd>(2048);
+ testVectorMeanStd<AVX2::VectorMeanStd>(2048, true);
+ testVectorMeanStd<AVX2::VectorMeanStd>(65536);
+ testVectorMeanStd<AVX2::VectorMeanStd>(65536, true);
+ testVectorMeanStd<AVX2::VectorMeanStd>(81920);
+ testVectorMeanStd<AVX2::VectorMeanStd>(81920, true);
+ testVectorMeanStd<AVX2::VectorMeanStd>(120832);
+ testVectorMeanStd<AVX2::VectorMeanStd>(120832, true);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("QuantizeStd AVX512BW", "[VectorMeanStd]") {
if (kCPU < CPUType::AVX512BW) return;
- testVectorMeanStd<avx512bw::VectorMeanStd>(64);
- testVectorMeanStd<avx512bw::VectorMeanStd>(64, true);
- testVectorMeanStd<avx512bw::VectorMeanStd>(256);
- testVectorMeanStd<avx512bw::VectorMeanStd>(256, true);
- testVectorMeanStd<avx512bw::VectorMeanStd>(2048);
- testVectorMeanStd<avx512bw::VectorMeanStd>(2048, true);
- testVectorMeanStd<avx512bw::VectorMeanStd>(65536);
- testVectorMeanStd<avx512bw::VectorMeanStd>(65536, true);
- testVectorMeanStd<avx512bw::VectorMeanStd>(81920);
- testVectorMeanStd<avx512bw::VectorMeanStd>(81920, true);
- testVectorMeanStd<avx512bw::VectorMeanStd>(120832);
- testVectorMeanStd<avx512bw::VectorMeanStd>(120832, true);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(64);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(64, true);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(256);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(256, true);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(2048);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(2048, true);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(65536);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(65536, true);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(81920);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(81920, true);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(120832);
+ testVectorMeanStd<AVX512BW::VectorMeanStd>(120832, true);
}
#endif
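
The VectorMeanStd cases run each backend over a range of sizes, with and without the second flag set. A scalar reference for the statistics being checked is sketched below; the population-variance formula and the reading of that flag as "statistics over absolute values" are assumptions, not taken from quantize_test.cc.

#include <cmath>
#include <cstddef>

// Scalar reference sketch: mean and standard deviation of a float vector,
// optionally over absolute values (assumed meaning of the boolean flag).
static void MeanStdReference(const float *x, std::size_t n, bool absolute,
                             float &mean, float &stddev) {
  double sum = 0.0, sum_sq = 0.0;
  for (std::size_t i = 0; i < n; ++i) {
    double v = absolute ? std::fabs(x[i]) : static_cast<double>(x[i]);
    sum += v;
    sum_sq += v * v;
  }
  double m = sum / static_cast<double>(n);
  mean = static_cast<float>(m);
  stddev = static_cast<float>(std::sqrt(sum_sq / static_cast<double>(n) - m * m));
}
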