diff options
author | Kenneth Heafield <kheafiel@amazon.com> | 2020-03-17 13:44:38 +0300 |
---|---|---|
committer | Kenneth Heafield <kheafiel@amazon.com> | 2020-03-17 13:44:38 +0300 |
commit | 79a3be9e7b78a4cbb231b5fc4e23dc0593f20240 (patch) | |
tree | 3dd60dbf7b979db3eeb46083477d00433c78c63a | |
parent | cf230bea70434809aedb0f7dba454037a3a709f1 (diff) | |
parent | 261a5fbcf7558fc3c2ac22b33fe0c2930d440fc3 (diff) |
Merge branch 'master' of github.com:kpu/intgemm
35 files changed, 74 insertions, 74 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 8fcdfd7..32c19ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,15 +16,13 @@ else() add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-unknown-pragmas) endif() -# Check if compiler supports AVX512 -try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512 +# Check if compiler supports AVX512BW +try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512BW ${CMAKE_CURRENT_BINARY_DIR}/compile_tests - ${CMAKE_CURRENT_SOURCE_DIR}/compile_test_avx512.cc - #Hack: pass compiler arguments as definitions because the test code overrides CXX_FLAGS :'( - COMPILE_DEFINITIONS -mavx512f -mavx512bw -mavx512dq) + ${CMAKE_CURRENT_SOURCE_DIR}/compile_test_avx512bw.cc) -if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512) - message(WARNING "${Orange}Not building AVX512-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}") +if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512BW) + message(WARNING "${Orange}Not building AVX512BW-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}") endif() try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512VNNI diff --git a/avx512_gemm.h b/avx512_gemm.h index c592e1c..6286ccc 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -2,7 +2,7 @@ #include "intgemm_config.h" -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW #include "interleave.h" #include "kernels.h" diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 26b0ac5..6063d5c 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -196,7 +196,7 @@ int main(int, char ** argv) { RunAll<AVX2_16bit>(matrices, end, stats.avx2_16bit); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW std::cerr << "AVX512 8bit, 100 samples..." << std::endl; for (int samples = 0; samples < kSamples; ++samples) { RandomMatrices *end = (samples < 4) ? matrices_end : full_sample; @@ -225,7 +225,7 @@ int main(int, char ** argv) { std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n'; Print<SSSE3_8bit>(stats.ssse3_8bit, i); Print<AVX2_8bit>(stats.avx2_8bit, i); -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW Print<AVX512_8bit>(stats.avx512_8bit, i); #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI @@ -233,7 +233,7 @@ int main(int, char ** argv) { #endif Print<SSE2_16bit>(stats.sse2_16bit, i); Print<AVX2_16bit>(stats.avx2_16bit, i); -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW Print<AVX512_16bit>(stats.avx512_16bit, i); #endif } diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc index b9b0782..eb96499 100644 --- a/benchmarks/benchmark_quantizer.cc +++ b/benchmarks/benchmark_quantizer.cc @@ -36,7 +36,7 @@ int main() { } QuantizerBench<intgemm::SSSE3_8bit>(in.begin(), out.begin(), count); QuantizerBench<intgemm::AVX2_8bit>(in.begin(), out.begin(), count); -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW QuantizerBench<intgemm::AVX512_8bit>(in.begin(), out.begin(), count); #endif } diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc index f3a0bb8..8f0816f 100644 --- a/benchmarks/biasmultiply.cc +++ b/benchmarks/biasmultiply.cc @@ -196,7 +196,7 @@ int main(int argc, char ** argv) { } std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl; -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW if (kCPU < CPUType::AVX512BW) return 0; std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8); for (int i = 0; i<repeat; i++) { diff --git a/callbacks.h b/callbacks.h index c8a29df..24f9009 100644 --- a/callbacks.h +++ b/callbacks.h @@ -18,7 +18,7 @@ #include "callbacks/implementations.inl" #undef CALLBACKS_THIS_IS_AVX2 -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW #define CALLBACKS_THIS_IS_AVX512BW #include "callbacks/implementations.inl" #undef CALLBACKS_THIS_IS_AVX512BW diff --git a/compile_test_avx512.cc b/compile_test_avx512.cc deleted file mode 100644 index f56cc12..0000000 --- a/compile_test_avx512.cc +++ /dev/null @@ -1,16 +0,0 @@ -// Some compilers don't have AVX512BW support. Test for them. -#include <immintrin.h> - -#include <iostream> - -int main() { - // AVX512F - __m512i value = _mm512_set1_epi32(1); - // AVX512BW - value = _mm512_maddubs_epi16(value, value); - - __m256i value2 = _mm256_set1_epi8(1); - // AVX512DQ - value = _mm512_inserti32x8(value, value2, 1); - return *(int*)&value && __builtin_cpu_supports("avx512f"); -} diff --git a/compile_test_avx512bw.cc b/compile_test_avx512bw.cc new file mode 100644 index 0000000..8d07551 --- /dev/null +++ b/compile_test_avx512bw.cc @@ -0,0 +1,18 @@ +// Some compilers don't have AVX512BW support. Test for them. +#include <immintrin.h> + +#if defined __INTEL_COMPILER +#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f"))) +#else +#define INTGEMM_AVX512BW __attribute__ ((target ("avx512bw"))) +#endif + +INTGEMM_AVX512BW int Test() { + // AVX512BW + __m512i value = _mm512_set1_epi32(1); + value = _mm512_maddubs_epi16(value, value); + return *(int*)&value; +} + +int main() { +} diff --git a/interleave.h b/interleave.h index 9bd4fdd..231be46 100644 --- a/interleave.h +++ b/interleave.h @@ -28,7 +28,7 @@ INTGEMM_INTERLEAVE_N(target, type, 64) INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i) INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i) -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i) #endif @@ -44,7 +44,7 @@ target static inline void Swap(Register &a, Register &b) { \ INTGEMM_SWAP(INTGEMM_SSE2, __m128i) INTGEMM_SWAP(INTGEMM_AVX2, __m256i) -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i) #endif @@ -97,7 +97,7 @@ target static inline void Transpose16InLane(Register &r0, Register &r1, Register INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i) INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i) -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i) #endif @@ -46,7 +46,7 @@ constexpr const char *const SSE2_16bit::kName; constexpr const char *const SSSE3_8bit::kName; constexpr const char *const AVX2_8bit::kName; constexpr const char *const AVX2_16bit::kName; -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW constexpr const char *const AVX512_8bit::kName; constexpr const char *const AVX512_16bit::kName; #endif @@ -119,7 +119,7 @@ struct Unsupported_8bit { constexpr static const char *const kName = "8-bit Unsupported"; }; -#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW // These won't ever be called in this capacity, but it does let the code below compile. typedef Unsupported_16bit AVX512_16bit; typedef Unsupported_8bit AVX512_8bit; @@ -153,7 +153,7 @@ template <class T> T ChooseCPU(T avx512vnni #endif , T -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW avx512bw #endif , T avx2, T ssse3, T sse2, T unsupported) { @@ -174,7 +174,7 @@ template <class T> T ChooseCPU(T # ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI if (ecx & (1 << 11)) return avx512vnni; # endif -# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW if (ebx & (1 << 30)) return avx512bw; # endif if (ebx & (1 << 5)) return avx2; @@ -196,7 +196,7 @@ template <class T> T ChooseCPU(T # endif ) return vnni; # endif -# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW if ( # ifdef __INTEL_COMPILER _may_i_use_cpu_feature(_FEATURE_AVX512BW) diff --git a/intgemm_config.h.in b/intgemm_config.h.in index 11be91c..920e9ae 100644 --- a/intgemm_config.h.in +++ b/intgemm_config.h.in @@ -1,4 +1,4 @@ #pragma once -#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512 +#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512BW #cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512VNNI diff --git a/intrinsics.h b/intrinsics.h index 5fe3159..d662472 100644 --- a/intrinsics.h +++ b/intrinsics.h @@ -381,7 +381,7 @@ INTGEMM_AVX2 static inline __m256i xor_si(__m256i a, __m256i b) { * AVX512 * */ -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW INTGEMM_AVX512BW static inline __m512i abs_epi8(__m512i arg) { return _mm512_abs_epi8(arg); @@ -16,7 +16,7 @@ #include "kernels/implementations.inl" #undef KERNELS_THIS_IS_AVX2 -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW #define KERNELS_THIS_IS_AVX512BW #include "kernels/implementations.inl" #undef KERNELS_THIS_IS_AVX512BW @@ -36,7 +36,7 @@ INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4 return _mm256_add_epi32(rev, blended); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) { // Form [0th 128-bit register of pack0123, 0st 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567] @@ -115,7 +115,7 @@ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Reg INTGEMM_PACK0123(INTGEMM_SSE2, __m128i) INTGEMM_PACK0123(INTGEMM_AVX2, __m256i) -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i) #endif diff --git a/test/add127_test.cc b/test/add127_test.cc index c271a0c..d959b14 100644 --- a/test/add127_test.cc +++ b/test/add127_test.cc @@ -295,7 +295,7 @@ TEST_CASE("PrepareBias AVX2", "[Add127]") { TEST_CASE("PrepareBias AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 + #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TestPrepareBias<AVX512_8bit>(256,256); TestPrepareBias<AVX512_8bit>(2048,256); TestPrepareBias<AVX512_8bit>(512,512); @@ -321,7 +321,7 @@ TEST_CASE("PrepareA AVX2", "[Add127]") { TEST_CASE("PrepareA AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 + #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TestPrepareA<AVX512_8bit>(64,64); TestPrepareA<AVX512_8bit>(256,256); TestPrepareA<AVX512_8bit>(512,512); @@ -352,7 +352,7 @@ TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") { TestMultiplyBiasNew<AVX2_8bit>(248, 256, 256, 0.48, 0.64, 0.16, 0.15); TestMultiplyBiasNew<AVX2_8bit>(200, 256, 256, 0.55, 0.74, 0.17, 0.16); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; TestMultiplyBiasNew<AVX512_8bit>(1, 64, 8, 0.0001, 0.05, 0.03, 0.001); @@ -400,7 +400,7 @@ TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") { TestMultiplyShiftNonShift<AVX2_8bit>(248, 256, 256, 0.0001, 0.64, 0.16, 0.0001); TestMultiplyShiftNonShift<AVX2_8bit>(200, 256, 256, 0.0001, 0.74, 0.17, 0.0001); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; TestMultiplyShiftNonShift<AVX512_8bit>(1, 64, 8, 0.0001, 0.05, 0.03, 0.001); @@ -448,7 +448,7 @@ TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") { TestMultiplyShiftInt<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64, 0.16, 0.0001f); TestMultiplyShiftInt<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74, 0.17, 0.0001f); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; TestMultiplyShiftInt<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05, 0.03, 0.0001f); diff --git a/test/kernels/add_bias_test.cc b/test/kernels/add_bias_test.cc index 7b10b56..2dd4e3d 100644 --- a/test/kernels/add_bias_test.cc +++ b/test/kernels/add_bias_test.cc @@ -48,7 +48,7 @@ KERNEL_TEST_CASE("add_bias/int AVX2") { return kernel_add_bias_test<CPUType::AVX KERNEL_TEST_CASE("add_bias/float AVX2") { return kernel_add_bias_test<CPUType::AVX2, float>(); } KERNEL_TEST_CASE("add_bias/double AVX2") { return kernel_add_bias_test<CPUType::AVX2, double>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int8_t>(); template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int16_t>(); template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int>(); diff --git a/test/kernels/bitwise_not_test.cc b/test/kernels/bitwise_not_test.cc index 02b700b..309be7e 100644 --- a/test/kernels/bitwise_not_test.cc +++ b/test/kernels/bitwise_not_test.cc @@ -30,7 +30,7 @@ KERNEL_TEST_CASE("bitwise_not SSE2") { return kernel_bitwise_not_test<CPUType::S template INTGEMM_AVX2 void kernel_bitwise_not_test<CPUType::AVX2>(); KERNEL_TEST_CASE("bitwise_not AVX2") { return kernel_bitwise_not_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_bitwise_not_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("bitwise_not AVX512BW") { return kernel_bitwise_not_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/downcast_test.cc b/test/kernels/downcast_test.cc index 5ecc084..d3261c7 100644 --- a/test/kernels/downcast_test.cc +++ b/test/kernels/downcast_test.cc @@ -32,7 +32,7 @@ KERNEL_TEST_CASE("downcast32to8 SSE2") { return kernel_downcast32to8_test<CPUTyp template INTGEMM_AVX2 void kernel_downcast32to8_test<CPUType::AVX2>(); KERNEL_TEST_CASE("downcast32to8 AVX2") { return kernel_downcast32to8_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_downcast32to8_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("downcast32to8 AVX512BW") { return kernel_downcast32to8_test<CPUType::AVX512BW>(); } #endif @@ -62,7 +62,7 @@ KERNEL_TEST_CASE("downcast32to16 SSE2") { return kernel_downcast32to16_test<CPUT template INTGEMM_AVX2 void kernel_downcast32to16_test<CPUType::AVX2>(); KERNEL_TEST_CASE("downcast32to16 AVX2") { return kernel_downcast32to16_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_downcast32to16_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("downcast32to16 AVX512BW") { return kernel_downcast32to16_test<CPUType::AVX512BW>(); } #endif @@ -92,7 +92,7 @@ KERNEL_TEST_CASE("downcast16to8 SSE2") { return kernel_downcast16to8_test<CPUTyp template INTGEMM_AVX2 void kernel_downcast16to8_test<CPUType::AVX2>(); KERNEL_TEST_CASE("downcast16to8 AVX2") { return kernel_downcast16to8_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_downcast16to8_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("downcast16to8 AVX512BW") { return kernel_downcast16to8_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/exp_test.cc b/test/kernels/exp_test.cc index d54f2ca..cf85562 100644 --- a/test/kernels/exp_test.cc +++ b/test/kernels/exp_test.cc @@ -27,7 +27,7 @@ void kernel_exp_approx_taylor_test() { template INTGEMM_AVX2 void kernel_exp_approx_taylor_test<CPUType::AVX2>(); KERNEL_TEST_CASE("exp_approx_taylor AVX2") { return kernel_exp_approx_taylor_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_exp_approx_taylor_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("exp_approx_taylor AVX512BW") { return kernel_exp_approx_taylor_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/floor_test.cc b/test/kernels/floor_test.cc index 10914a3..365d16d 100644 --- a/test/kernels/floor_test.cc +++ b/test/kernels/floor_test.cc @@ -30,7 +30,7 @@ KERNEL_TEST_CASE("floor SSE2") { return kernel_floor_test<CPUType::SSE2>(); } template INTGEMM_AVX2 void kernel_floor_test<CPUType::AVX2>(); KERNEL_TEST_CASE("floor AVX2") { return kernel_floor_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_floor_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("floor AVX512BW") { return kernel_floor_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc index 0e86c0e..edea772 100644 --- a/test/kernels/multiply_sat_test.cc +++ b/test/kernels/multiply_sat_test.cc @@ -41,7 +41,7 @@ template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); KERNEL_TEST_CASE("multiply_sat/int8 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int8_t>(); } KERNEL_TEST_CASE("multiply_sat/int16 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>(); KERNEL_TEST_CASE("multiply_sat/int8 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); } diff --git a/test/kernels/multiply_test.cc b/test/kernels/multiply_test.cc index 0eea965..30f1640 100644 --- a/test/kernels/multiply_test.cc +++ b/test/kernels/multiply_test.cc @@ -48,7 +48,7 @@ KERNEL_TEST_CASE("multiply/int AVX2") { return kernel_multiply_test<CPUType::AVX KERNEL_TEST_CASE("multiply/float AVX2") { return kernel_multiply_test<CPUType::AVX2, float>(); } KERNEL_TEST_CASE("multiply/double AVX2") { return kernel_multiply_test<CPUType::AVX2, double>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int8_t>(); template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int16_t>(); template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int>(); diff --git a/test/kernels/quantize_test.cc b/test/kernels/quantize_test.cc index c9eae0a..07bfe21 100644 --- a/test/kernels/quantize_test.cc +++ b/test/kernels/quantize_test.cc @@ -31,7 +31,7 @@ KERNEL_TEST_CASE("quantize SSE2") { return kernel_quantize_test<CPUType::SSE2>() template INTGEMM_AVX2 void kernel_quantize_test<CPUType::AVX2>(); KERNEL_TEST_CASE("quantize AVX2") { return kernel_quantize_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_quantize_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("quantize AVX512BW") { return kernel_quantize_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/relu_test.cc b/test/kernels/relu_test.cc index 25a212a..c291dea 100644 --- a/test/kernels/relu_test.cc +++ b/test/kernels/relu_test.cc @@ -46,7 +46,7 @@ KERNEL_TEST_CASE("relu/int AVX2") { return kernel_relu_test<CPUType::AVX2, int>( KERNEL_TEST_CASE("relu/float AVX2") { return kernel_relu_test<CPUType::AVX2, float>(); } KERNEL_TEST_CASE("relu/double AVX2") { return kernel_relu_test<CPUType::AVX2, double>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int8_t>(); template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int16_t>(); template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int>(); diff --git a/test/kernels/rescale_test.cc b/test/kernels/rescale_test.cc index d380c8d..2f79d39 100644 --- a/test/kernels/rescale_test.cc +++ b/test/kernels/rescale_test.cc @@ -32,7 +32,7 @@ KERNEL_TEST_CASE("rescale SSE2") { return kernel_rescale_test<CPUType::SSE2>(); template INTGEMM_AVX2 void kernel_rescale_test<CPUType::AVX2>(); KERNEL_TEST_CASE("rescale AVX2") { return kernel_rescale_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_rescale_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("rescale AVX512BW") { return kernel_rescale_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/sigmoid_test.cc b/test/kernels/sigmoid_test.cc index e4743e2..a8b0b3c 100644 --- a/test/kernels/sigmoid_test.cc +++ b/test/kernels/sigmoid_test.cc @@ -34,7 +34,7 @@ void kernel_sigmoid_test() { template INTGEMM_AVX2 void kernel_sigmoid_test<CPUType::AVX2>(); KERNEL_TEST_CASE("sigmoid AVX2") { return kernel_sigmoid_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_sigmoid_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("sigmoid AVX512BW") { return kernel_sigmoid_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/tanh_test.cc b/test/kernels/tanh_test.cc index 737ac9b..2d688ea 100644 --- a/test/kernels/tanh_test.cc +++ b/test/kernels/tanh_test.cc @@ -27,7 +27,7 @@ void kernel_tanh_test() { template INTGEMM_AVX2 void kernel_tanh_test<CPUType::AVX2>(); KERNEL_TEST_CASE("tanh AVX2") { return kernel_tanh_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_tanh_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("tanh AVX512BW") { return kernel_tanh_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/unquantize_test.cc b/test/kernels/unquantize_test.cc index 6f40da6..20c3d6a 100644 --- a/test/kernels/unquantize_test.cc +++ b/test/kernels/unquantize_test.cc @@ -31,7 +31,7 @@ KERNEL_TEST_CASE("unquantize SSE2") { return kernel_unquantize_test<CPUType::SSE template INTGEMM_AVX2 void kernel_unquantize_test<CPUType::AVX2>(); KERNEL_TEST_CASE("unquantize AVX2") { return kernel_unquantize_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_unquantize_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("unquantize AVX512BW") { return kernel_unquantize_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/upcast_test.cc b/test/kernels/upcast_test.cc index df6a62e..497c734 100644 --- a/test/kernels/upcast_test.cc +++ b/test/kernels/upcast_test.cc @@ -33,7 +33,7 @@ KERNEL_TEST_CASE("upcast8to16 SSE2") { return kernel_upcast8to16_test<CPUType::S template INTGEMM_AVX2 void kernel_upcast8to16_test<CPUType::AVX2>(); KERNEL_TEST_CASE("upcast8to16 AVX2") { return kernel_upcast8to16_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_upcast8to16_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("upcast8to16 AVX512BW") { return kernel_upcast8to16_test<CPUType::AVX512BW>(); } #endif @@ -65,7 +65,7 @@ KERNEL_TEST_CASE("upcast16to32 SSE2") { return kernel_upcast16to32_test<CPUType: template INTGEMM_AVX2 void kernel_upcast16to32_test<CPUType::AVX2>(); KERNEL_TEST_CASE("upcast16to32 AVX2") { return kernel_upcast16to32_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_upcast16to32_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("upcast16to32 AVX512BW") { return kernel_upcast16to32_test<CPUType::AVX512BW>(); } #endif @@ -99,7 +99,7 @@ KERNEL_TEST_CASE("upcast8to32 SSE2") { return kernel_upcast8to32_test<CPUType::S template INTGEMM_AVX2 void kernel_upcast8to32_test<CPUType::AVX2>(); KERNEL_TEST_CASE("upcast8to32 AVX2") { return kernel_upcast8to32_test<CPUType::AVX2>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_upcast8to32_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("upcast8to32 AVX512BW") { return kernel_upcast8to32_test<CPUType::AVX512BW>(); } #endif diff --git a/test/kernels/write_test.cc b/test/kernels/write_test.cc index aeaafcb..f2e3d35 100644 --- a/test/kernels/write_test.cc +++ b/test/kernels/write_test.cc @@ -46,7 +46,7 @@ KERNEL_TEST_CASE("write/int AVX2") { return kernel_write_test<CPUType::AVX2, int KERNEL_TEST_CASE("write/float AVX2") { return kernel_write_test<CPUType::AVX2, float>(); } KERNEL_TEST_CASE("write/double AVX2") { return kernel_write_test<CPUType::AVX2, double>(); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int8_t>(); template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int16_t>(); template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int>(); diff --git a/test/multiply_test.cc b/test/multiply_test.cc index 7316bb0..260dd76 100644 --- a/test/multiply_test.cc +++ b/test/multiply_test.cc @@ -82,7 +82,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) { TEST_CASE("Prepare AVX512", "[prepare]") { if (kCPU < CPUType::AVX512BW) return; -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TestPrepare<AVX512_8bit>(64, 8); TestPrepare<AVX512_8bit>(256, 32); TestPrepare<AVX512_16bit>(64, 8); @@ -147,7 +147,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 TEST_CASE("SelectColumnsB AVX512", "[select]") { if (kCPU < CPUType::AVX512BW) return; -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TestSelectColumnsB<AVX512_8bit>(); TestSelectColumnsB<AVX512_16bit>(256, 256); #endif @@ -223,7 +223,7 @@ TEST_CASE("MaxAbsolute AVX2", "[max]") { TEST_CASE("MaxAbsolute AVX512F", "[max]") { if (kCPU < CPUType::AVX512BW) return; - #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 + #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TestMaxAbsolute<avx512f::MaxAbsolute>(); #endif } @@ -434,7 +434,7 @@ TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") { TestMultiplyBias<AVX2_16bit>(200, 256, 256, .1, 1, 0.01); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512 8bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; TestMultiply<AVX512_8bit>(8, 256, 256, 0, 0.25, 0.062); diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc index 68c0a29..3a5faaf 100644 --- a/test/prepare_b_quantized_transposed.cc +++ b/test/prepare_b_quantized_transposed.cc @@ -80,7 +80,7 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { CHECK(TestMany<AVX2_16bit>(32, 128)); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("PrepareBQuantizedTransposed AVX512", "") { if (kCPU < CPUType::AVX512BW) return; diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc index 219e56a..1a4ed88 100644 --- a/test/prepare_b_transposed.cc +++ b/test/prepare_b_transposed.cc @@ -81,7 +81,7 @@ TEST_CASE("PrepareBTransposed AVX2", "") { CHECK(TestMany<AVX2_16bit>(8, 128, 2.0f)); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("PrepareBTransposed AVX512", "") { if (kCPU < CPUType::AVX512BW) return; diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 3263812..ee27261 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -86,7 +86,7 @@ TEST_CASE ("Quantize AVX2", "[quantize]") { TestMany<AVX2_8bit>(1); TestMany<AVX2_16bit>(16); } -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512 +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Quantize AVX512", "[quantize]") { if (kCPU < CPUType::AVX512BW) return; TestMany<AVX512_8bit>(1); |