From d85e98ddceb02989551170b993b0f27fc5a109eb Mon Sep 17 00:00:00 2001
From: Kenneth Heafield
Date: Tue, 17 Nov 2020 21:10:45 +0000
Subject: Make AVX2 optional if compiler doesn't support it

Mostly for emscripten
---
 CMakeLists.txt                         | 11 +++++++++--
 benchmarks/benchmark.cc                |  3 ++-
 benchmarks/benchmark_quantizer.cc      |  2 ++
 benchmarks/biasmultiply.cc             |  2 ++
 compile_test_avx2.cc                   | 17 +++++++++++++++++
 intgemm/avx2_gemm.h                    |  6 ++++++
 intgemm/callbacks.h                    |  2 ++
 intgemm/interleave.h                   |  7 +++++++
 intgemm/intgemm.cc                     |  8 ++++++++
 intgemm/intgemm.h                      |  7 +++++++
 intgemm/intgemm_config.h.in            |  1 +
 intgemm/intrinsics.h                   |  3 +++
 intgemm/kernels.h                      |  2 ++
 intgemm/multiply.h                     | 11 ++++++++---
 intgemm/stats.h                        |  4 ++++
 test/add127_test.cc                    | 21 +++++++++++++++++----
 test/kernels/add_bias_test.cc          |  2 ++
 test/kernels/bitwise_not_test.cc       |  2 ++
 test/kernels/downcast_test.cc          |  6 ++++++
 test/kernels/exp_test.cc               |  2 ++
 test/kernels/floor_test.cc             |  2 ++
 test/kernels/multiply_test.cc          |  2 ++
 test/kernels/quantize_test.cc          |  2 ++
 test/kernels/relu_test.cc              |  2 ++
 test/kernels/rescale_test.cc           |  2 ++
 test/kernels/sigmoid_test.cc           |  2 ++
 test/kernels/tanh_test.cc              |  2 ++
 test/kernels/unquantize_test.cc        |  2 ++
 test/kernels/upcast_test.cc            |  6 ++++++
 test/kernels/write_test.cc             |  2 ++
 test/multiply_test.cc                  | 21 +++++++++++++++------
 test/prepare_b_quantized_transposed.cc |  2 ++
 test/prepare_b_transposed.cc           | 14 ++++++++------
 test/quantize_test.cc                  | 14 +++++++++-----
 34 files changed, 167 insertions(+), 27 deletions(-)
 create mode 100644 compile_test_avx2.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d1885f5..7c3ccfa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,19 +16,26 @@ else()
   add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-unknown-pragmas)
 endif()
 
+# Check if compiler supports AVX2 (this should only catch emscripten)
+try_compile(INTGEMM_COMPILER_SUPPORTS_AVX2
+  ${CMAKE_CURRENT_BINARY_DIR}/compile_tests
+  ${CMAKE_CURRENT_SOURCE_DIR}/compile_test_avx2.cc)
+if(NOT INTGEMM_COMPILER_SUPPORTS_AVX2)
+  message(WARNING "${Orange}Not building AVX2-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
+endif()
+
 # Check if compiler supports AVX512BW
 try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512BW
   ${CMAKE_CURRENT_BINARY_DIR}/compile_tests
   ${CMAKE_CURRENT_SOURCE_DIR}/compile_test_avx512bw.cc)
-
 if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512BW)
   message(WARNING "${Orange}Not building AVX512BW-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
 endif()
 
+# Check if the compiler supports AVX512VNNI
 try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
   ${CMAKE_CURRENT_BINARY_DIR}/compile_tests
   ${CMAKE_CURRENT_SOURCE_DIR}/compile_test_avx512vnni.cc)
-#No compiler flags for this test; that's part of the test!
 if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
   message(WARNING "${Orange}Not building AVX512VNNI-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
 endif()
diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc
index c6133bf..2dbe483 100644
--- a/benchmarks/benchmark.cc
+++ b/benchmarks/benchmark.cc
@@ -154,6 +154,7 @@ int main(int, char ** argv) {
     RunAll(matrices, end, stats.sse2_16bit);
   }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
   std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
@@ -165,7 +166,7 @@
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
     RunAll(matrices, end, stats.avx2_16bit);
   }
-
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc
index 5f36bd7..86d90dc 100644
--- a/benchmarks/benchmark_quantizer.cc
+++ b/benchmarks/benchmark_quantizer.cc
@@ -64,7 +64,9 @@ int main() {
     element = dist(gen);
   }
   QuantizerBench(in.begin(), out.begin(), static_cast(count));
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
   QuantizerBench(in.begin(), out.begin(), static_cast(count));
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   QuantizerBench(in.begin(), out.begin(), static_cast(count));
 #endif
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index 490bf3b..65deadb 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -161,6 +161,7 @@ int main(int argc, char ** argv) {
   std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
   std::chrono::duration oldAVX2_nobias = testOld_nobias(1, 64, 8);
   for (int i = 0; i(8, 256, 256);
@@ -196,6 +197,7 @@ int main(int argc, char ** argv) {
   }
   std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
 
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   if (kCPU < CPUType::AVX512BW) return 0;
   std::chrono::duration oldAVX512_nobias = testOld_nobias(1, 64, 8);
diff --git a/compile_test_avx2.cc b/compile_test_avx2.cc
new file mode 100644
index 0000000..794983b
--- /dev/null
+++ b/compile_test_avx2.cc
@@ -0,0 +1,17 @@
+// Some compilers don't have AVX2 support. Test for them.
+#include <immintrin.h>
+
+#if defined(_MSC_VER)
+#define INTGEMM_AVX2
+#else
+#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
+#endif
+
+INTGEMM_AVX2 int Test() {
+  __m256i value = _mm256_set1_epi32(1);
+  value = _mm256_abs_epi8(value);
+  return *(int*)&value;
+}
+
+int main() {
+}
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h
index 5e81475..6e01679 100644
--- a/intgemm/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#include "intgemm/intgemm_config.h"
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
+
 #include "interleave.h"
 #include "kernels.h"
 #include "multiply.h"
@@ -224,3 +228,5 @@ struct Kernels8 {
 
 } // namespace avx2
 } // namespace intgemm
+
+#endif
diff --git a/intgemm/callbacks.h b/intgemm/callbacks.h
index 23d3be1..c304466 100644
--- a/intgemm/callbacks.h
+++ b/intgemm/callbacks.h
@@ -14,9 +14,11 @@
 #include "callbacks/implementations.inl"
 #undef CALLBACKS_THIS_IS_SSE2
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 #define CALLBACKS_THIS_IS_AVX2
 #include "callbacks/implementations.inl"
 #undef CALLBACKS_THIS_IS_AVX2
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 #define CALLBACKS_THIS_IS_AVX512BW
diff --git a/intgemm/interleave.h b/intgemm/interleave.h
index 1ec686b..95f05ce 100644
--- a/intgemm/interleave.h
+++ b/intgemm/interleave.h
@@ -26,7 +26,10 @@
 INTGEMM_INTERLEAVE_N(target, type, 32) \
 INTGEMM_INTERLEAVE_N(target, type, 64)
 INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i)
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i)
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i)
 #endif
@@ -42,7 +45,9 @@ target static inline void Swap(Register &a, Register &b) { \
 } \
 
 INTGEMM_SWAP(INTGEMM_SSE2, __m128i)
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_SWAP(INTGEMM_AVX2, __m256i)
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
 INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i)
@@ -95,7 +100,9 @@ target static inline void Transpose16InLane(Register &r0, Register &r1, Register
 } \
 
 INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i)
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i)
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
 INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i)
diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc
index f859b9a..d45cf60 100644
--- a/intgemm/intgemm.cc
+++ b/intgemm/intgemm.cc
@@ -43,6 +43,12 @@ const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx5
 
 const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
 
+#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2)
+namespace avx2{
+using sse2::MaxAbsolute;
+using sse2::VectorMeanStd;
+} // namespace avx2
+#endif
 #if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
 namespace avx512bw {
 using avx2::MaxAbsolute;
@@ -58,8 +64,10 @@ constexpr const char *const Unsupported_16bit::kName;
 constexpr const char *const Unsupported_8bit::kName;
 constexpr const char *const sse2::Kernels16::kName;
 constexpr const char *const ssse3::Kernels8::kName;
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 constexpr const char *const avx2::Kernels8::kName;
 constexpr const char *const avx2::Kernels16::kName;
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 constexpr const char *const avx512bw::Kernels8::kName;
 constexpr const char *const avx512bw::Kernels16::kName;
diff --git a/intgemm/intgemm.h b/intgemm/intgemm.h
index 8e2da02..b53387e 100644
--- a/intgemm/intgemm.h
+++ b/intgemm/intgemm.h
@@ -134,6 +134,13 @@ typedef Unsupported_8bit Kernels8;
 typedef Unsupported_16bit Kernels16;
 } // namespace avx512bw
 #endif
+#ifndef INTGEMM_COMPILER_SUPPORTS_AVX2
+namespace avx2 {
+typedef Unsupported_8bit Kernels8;
+typedef Unsupported_16bit Kernels16;
+} // namespace avx2
+#endif
+
 
 /* Returns:
  * avx512vnni if the CPU supports AVX512VNNI
diff --git a/intgemm/intgemm_config.h.in b/intgemm/intgemm_config.h.in
index 920e9ae..a2c8cbd 100644
--- a/intgemm/intgemm_config.h.in
+++ b/intgemm/intgemm_config.h.in
@@ -1,4 +1,5 @@
 #pragma once
 
+#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX2
 #cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512BW
 #cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
diff --git a/intgemm/intrinsics.h b/intgemm/intrinsics.h
index 480f421..31957bc 100644
--- a/intgemm/intrinsics.h
+++ b/intgemm/intrinsics.h
@@ -215,6 +215,8 @@ INTGEMM_SSE2 static inline __m128i xor_si(__m128i a, __m128i b) {
  *
  * AVX2
  *
  */
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_AVX2 static inline __m256i abs_epi8(__m256i arg) {
   return _mm256_abs_epi8(arg);
 }
@@ -390,6 +392,7 @@ INTGEMM_AVX2 static inline __m256i unpackhi_epi64(__m256i a, __m256i b) {
 INTGEMM_AVX2 static inline __m256i xor_si(__m256i a, __m256i b) {
   return _mm256_xor_si256(a, b);
 }
+#endif
 
 /*
  *
diff --git a/intgemm/kernels.h b/intgemm/kernels.h
index ee35966..57036f4 100644
--- a/intgemm/kernels.h
+++ b/intgemm/kernels.h
@@ -12,9 +12,11 @@
 #include "kernels/implementations.inl"
 #undef KERNELS_THIS_IS_SSE2
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 #define KERNELS_THIS_IS_AVX2
 #include "kernels/implementations.inl"
 #undef KERNELS_THIS_IS_AVX2
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 #define KERNELS_THIS_IS_AVX512BW
diff --git a/intgemm/multiply.h b/intgemm/multiply.h
index e201e09..84c0655 100644
--- a/intgemm/multiply.h
+++ b/intgemm/multiply.h
@@ -13,6 +13,7 @@ INTGEMM_SSE2 static inline dvector_t PermuteSummer(__m128i p
   return { pack0123, pack4567 };
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
   // This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
   __m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
@@ -20,7 +21,7 @@ INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4
   __m256i blended = _mm256_blend_epi32(pack0123, pack4567, 0xf0);
   return _mm256_add_epi32(rev, blended);
 }
-
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
@@ -99,7 +100,9 @@ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Reg
 } \
 
 INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
+#endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
 INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
@@ -111,10 +114,12 @@ INTGEMM_SSE2 static inline void RunCallback(Callback& callback_impl, dvector_t
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t total, Index row_idx, Index col_idx, Index rows, Index cols) {
   callback_impl(total, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
 }
+#endif
 
 // 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
 // C = A * B * unquant_mult
@@ -374,7 +379,7 @@ template target static void Multiply(const int16_t *A, const
  * 256-bit. We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
  * vpmaddubsw. That's why this code is generic over 128-bit or 256-bit. */
-
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
     __m256i a, const __m256i *b,
     __m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
@@ -514,7 +519,7 @@ INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
     sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
 #endif
 }
-
+#endif
 
 // For INTGEMM_SSSE3 without AVX
 INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
diff --git a/intgemm/stats.h b/intgemm/stats.h
index 6f9eda2..9573c4b 100644
--- a/intgemm/stats.h
+++ b/intgemm/stats.h
@@ -32,12 +32,14 @@ INTGEMM_SSE2 static inline float AddFloat32(__m128 a) {
   return *reinterpret_cast(&a);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 INTGEMM_AVX2 static inline float MaxFloat32(__m256 a) {
   return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
 }
 INTGEMM_AVX2 static inline float AddFloat32(__m256 a) {
   return AddFloat32(add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
 }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 // Find the maximum float.
@@ -61,9 +63,11 @@ constexpr int32_t kFloatAbsoluteMask = 0x7fffffff;
 #include "stats.inl"
 #undef INTGEMM_THIS_IS_SSE2
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 #define INTGEMM_THIS_IS_AVX2
 #include "stats.inl"
 #undef INTGEMM_THIS_IS_AVX2
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 #define INTGEMM_THIS_IS_AVX512DQ
diff --git a/test/add127_test.cc b/test/add127_test.cc
index b7ce49b..723e143 100644
--- a/test/add127_test.cc
+++ b/test/add127_test.cc
@@ -287,21 +287,23 @@ TEST_CASE("PrepareBias SSSE3", "[Add127]") {
   TestPrepareBias(512,512);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("PrepareBias AVX2", "[Add127]") {
   if (kCPU < CPUType::AVX2) return;
   TestPrepareBias(256,256);
   TestPrepareBias(2048,256);
   TestPrepareBias(512,512);
 }
+#endif
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE("PrepareBias AVX512F", "[Add127]") {
   if (kCPU < CPUType::AVX512BW) return;
-  #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   TestPrepareBias(256,256);
   TestPrepareBias(2048,256);
   TestPrepareBias(512,512);
-  #endif
 }
+#endif
 
 //A
 
@@ -312,6 +314,7 @@ TEST_CASE("PrepareA SSSE3", "[Add127]") {
   TestPrepareA(2048,256);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("PrepareA AVX2", "[Add127]") {
   if (kCPU < CPUType::AVX2) return;
   TestPrepareA(64,64);
@@ -319,16 +322,17 @@ TEST_CASE("PrepareA AVX2", "[Add127]") {
   TestPrepareA(256,256);
   TestPrepareA(512,512);
   TestPrepareA(2048,256);
 }
+#endif
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE("PrepareA AVX512F", "[Add127]") {
   if (kCPU < CPUType::AVX512BW) return;
-  #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   TestPrepareA(64,64);
   TestPrepareA(256,256);
   TestPrepareA(512,512);
   TestPrepareA(2048,256);
-  #endif
 }
+#endif
 
 // Multiply
 
@@ -343,6 +347,7 @@ TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") {
   TestMultiplyBiasNew(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") {
   if (kCPU < CPUType::AVX2) return;
   TestMultiplyBiasNew(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
@@ -353,6 +358,8 @@ TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") {
   TestMultiplyBiasNew(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
   TestMultiplyBiasNew(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
 }
+#endif
+
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") {
   if (kCPU < CPUType::AVX512BW) return;
@@ -391,6 +398,7 @@ TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") {
   TestMultiplyShiftNonShift(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") {
   if (kCPU < CPUType::AVX2) return;
   TestMultiplyShiftNonShift(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
@@ -401,6 +409,8 @@ TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") {
   TestMultiplyShiftNonShift(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
   TestMultiplyShiftNonShift(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
 }
+#endif
+
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") {
   if (kCPU < CPUType::AVX512BW) return;
@@ -439,6 +449,7 @@ TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") {
   TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") {
   if (kCPU < CPUType::AVX2) return;
   TestMultiplyShiftInt(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
@@ -449,6 +460,8 @@ TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") {
   TestMultiplyShiftInt(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
   TestMultiplyShiftInt(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
 }
+#endif
+
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") {
   if (kCPU < CPUType::AVX512BW) return;
diff --git a/test/kernels/add_bias_test.cc b/test/kernels/add_bias_test.cc
index 492c669..b9e5fd9 100644
--- a/test/kernels/add_bias_test.cc
+++ b/test/kernels/add_bias_test.cc
@@ -37,6 +37,7 @@ KERNEL_TEST_CASE("add_bias/int SSE2") { return kernel_add_bias_test(); }
 KERNEL_TEST_CASE("add_bias/double SSE2") { return kernel_add_bias_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_add_bias_test();
 template INTGEMM_AVX2 void kernel_add_bias_test();
 template INTGEMM_AVX2 void kernel_add_bias_test();
@@ -47,6 +48,7 @@ KERNEL_TEST_CASE("add_bias/int16 AVX2") { return kernel_add_bias_test(); }
 KERNEL_TEST_CASE("add_bias/float AVX2") { return kernel_add_bias_test(); }
 KERNEL_TEST_CASE("add_bias/double AVX2") { return kernel_add_bias_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_add_bias_test();
diff --git a/test/kernels/bitwise_not_test.cc b/test/kernels/bitwise_not_test.cc
index e908c43..6c28c95 100644
--- a/test/kernels/bitwise_not_test.cc
+++ b/test/kernels/bitwise_not_test.cc
@@ -28,8 +28,10 @@ void kernel_bitwise_not_test() {
 template INTGEMM_SSE2 void kernel_bitwise_not_test();
 KERNEL_TEST_CASE("bitwise_not SSE2") { return kernel_bitwise_not_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_bitwise_not_test();
 KERNEL_TEST_CASE("bitwise_not AVX2") { return kernel_bitwise_not_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_bitwise_not_test();
diff --git a/test/kernels/downcast_test.cc b/test/kernels/downcast_test.cc
index 5f9db66..0f2ccd0 100644
--- a/test/kernels/downcast_test.cc
+++ b/test/kernels/downcast_test.cc
@@ -30,8 +30,10 @@ void kernel_downcast32to8_test() {
 template INTGEMM_SSE2 void kernel_downcast32to8_test();
 KERNEL_TEST_CASE("downcast32to8 SSE2") { return kernel_downcast32to8_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_downcast32to8_test();
 KERNEL_TEST_CASE("downcast32to8 AVX2") { return kernel_downcast32to8_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_downcast32to8_test();
@@ -60,8 +62,10 @@ void kernel_downcast32to16_test() {
 template INTGEMM_SSE2 void kernel_downcast32to16_test();
 KERNEL_TEST_CASE("downcast32to16 SSE2") { return kernel_downcast32to16_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_downcast32to16_test();
 KERNEL_TEST_CASE("downcast32to16 AVX2") { return kernel_downcast32to16_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_downcast32to16_test();
@@ -90,8 +94,10 @@ void kernel_downcast16to8_test() {
 template INTGEMM_SSE2 void kernel_downcast16to8_test();
 KERNEL_TEST_CASE("downcast16to8 SSE2") { return kernel_downcast16to8_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_downcast16to8_test();
 KERNEL_TEST_CASE("downcast16to8 AVX2") { return kernel_downcast16to8_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_downcast16to8_test();
diff --git a/test/kernels/exp_test.cc b/test/kernels/exp_test.cc
index 838e228..9f535f2 100644
--- a/test/kernels/exp_test.cc
+++ b/test/kernels/exp_test.cc
@@ -25,8 +25,10 @@ void kernel_exp_approx_taylor_test() {
     CHECK_EPS(output[i], exp(input[i]), 0.001f);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_exp_approx_taylor_test();
 KERNEL_TEST_CASE("exp_approx_taylor AVX2") { return kernel_exp_approx_taylor_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_exp_approx_taylor_test();
diff --git a/test/kernels/floor_test.cc b/test/kernels/floor_test.cc
index 2659c3f..9b7a214 100644
--- a/test/kernels/floor_test.cc
+++ b/test/kernels/floor_test.cc
@@ -28,8 +28,10 @@ void kernel_floor_test() {
 template INTGEMM_SSE2 void kernel_floor_test();
 KERNEL_TEST_CASE("floor SSE2") { return kernel_floor_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_floor_test();
 KERNEL_TEST_CASE("floor AVX2") { return kernel_floor_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_floor_test();
diff --git a/test/kernels/multiply_test.cc b/test/kernels/multiply_test.cc
index 029e3ac..fc1a51e 100644
--- a/test/kernels/multiply_test.cc
+++ b/test/kernels/multiply_test.cc
@@ -38,6 +38,7 @@ KERNEL_TEST_CASE("multiply/int SSE2") { return kernel_multiply_test(); }
 KERNEL_TEST_CASE("multiply/double SSE2") { return kernel_multiply_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_multiply_test();
 template INTGEMM_AVX2 void kernel_multiply_test();
 template INTGEMM_AVX2 void kernel_multiply_test();
@@ -48,6 +49,7 @@ KERNEL_TEST_CASE("multiply/int16 AVX2") { return kernel_multiply_test(); }
 KERNEL_TEST_CASE("multiply/float AVX2") { return kernel_multiply_test(); }
 KERNEL_TEST_CASE("multiply/double AVX2") { return kernel_multiply_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_multiply_test();
diff --git a/test/kernels/quantize_test.cc b/test/kernels/quantize_test.cc
index ae3c068..93280f7 100644
--- a/test/kernels/quantize_test.cc
+++ b/test/kernels/quantize_test.cc
@@ -28,8 +28,10 @@ void kernel_quantize_test() {
 template INTGEMM_SSE2 void kernel_quantize_test();
 KERNEL_TEST_CASE("quantize SSE2") { return kernel_quantize_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_quantize_test();
 KERNEL_TEST_CASE("quantize AVX2") { return kernel_quantize_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_quantize_test();
diff --git a/test/kernels/relu_test.cc b/test/kernels/relu_test.cc
index 6fcef98..8fd30ae 100644
--- a/test/kernels/relu_test.cc
+++ b/test/kernels/relu_test.cc
@@ -36,6 +36,7 @@ KERNEL_TEST_CASE("relu/int SSE2") { return kernel_relu_test(
 KERNEL_TEST_CASE("relu/float SSE2") { return kernel_relu_test(); }
 KERNEL_TEST_CASE("relu/double SSE2") { return kernel_relu_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_relu_test();
 template INTGEMM_AVX2 void kernel_relu_test();
 template INTGEMM_AVX2 void kernel_relu_test();
@@ -46,6 +47,7 @@ KERNEL_TEST_CASE("relu/int16 AVX2") { return kernel_relu_test(); }
 KERNEL_TEST_CASE("relu/float AVX2") { return kernel_relu_test(); }
 KERNEL_TEST_CASE("relu/double AVX2") { return kernel_relu_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_relu_test();
diff --git a/test/kernels/rescale_test.cc b/test/kernels/rescale_test.cc
index 280b513..13937ed 100644
--- a/test/kernels/rescale_test.cc
+++ b/test/kernels/rescale_test.cc
@@ -30,8 +30,10 @@ void kernel_rescale_test() {
 template INTGEMM_SSE2 void kernel_rescale_test();
 KERNEL_TEST_CASE("rescale SSE2") { return kernel_rescale_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_rescale_test();
 KERNEL_TEST_CASE("rescale AVX2") { return kernel_rescale_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_rescale_test();
diff --git a/test/kernels/sigmoid_test.cc b/test/kernels/sigmoid_test.cc
index af9dad1..7827593 100644
--- a/test/kernels/sigmoid_test.cc
+++ b/test/kernels/sigmoid_test.cc
@@ -32,8 +32,10 @@ void kernel_sigmoid_test() {
     CHECK_EPS(output[i], sigmoid_ref(input[i]), 0.001f);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_sigmoid_test();
 KERNEL_TEST_CASE("sigmoid AVX2") { return kernel_sigmoid_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_sigmoid_test();
diff --git a/test/kernels/tanh_test.cc b/test/kernels/tanh_test.cc
index e2c36f5..1d00042 100644
--- a/test/kernels/tanh_test.cc
+++ b/test/kernels/tanh_test.cc
@@ -25,8 +25,10 @@ void kernel_tanh_test() {
     CHECK_EPS(output[i], tanh(input[i]), 0.001f);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_tanh_test();
 KERNEL_TEST_CASE("tanh AVX2") { return kernel_tanh_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_tanh_test();
diff --git a/test/kernels/unquantize_test.cc b/test/kernels/unquantize_test.cc
index ee4bc80..edfafa5 100644
--- a/test/kernels/unquantize_test.cc
+++ b/test/kernels/unquantize_test.cc
@@ -28,8 +28,10 @@ void kernel_unquantize_test() {
 template INTGEMM_SSE2 void kernel_unquantize_test();
 KERNEL_TEST_CASE("unquantize SSE2") { return kernel_unquantize_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_unquantize_test();
 KERNEL_TEST_CASE("unquantize AVX2") { return kernel_unquantize_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_unquantize_test();
diff --git a/test/kernels/upcast_test.cc b/test/kernels/upcast_test.cc
index 92be1bd..0733922 100644
--- a/test/kernels/upcast_test.cc
+++ b/test/kernels/upcast_test.cc
@@ -33,8 +33,10 @@ void kernel_upcast8to16_test() {
 template INTGEMM_SSE2 void kernel_upcast8to16_test();
 KERNEL_TEST_CASE("upcast8to16 SSE2") { return kernel_upcast8to16_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_upcast8to16_test();
 KERNEL_TEST_CASE("upcast8to16 AVX2") { return kernel_upcast8to16_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_upcast8to16_test();
@@ -65,8 +67,10 @@ void kernel_upcast16to32_test() {
 template INTGEMM_SSE2 void kernel_upcast16to32_test();
 KERNEL_TEST_CASE("upcast16to32 SSE2") { return kernel_upcast16to32_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_upcast16to32_test();
 KERNEL_TEST_CASE("upcast16to32 AVX2") { return kernel_upcast16to32_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_upcast16to32_test();
@@ -100,8 +104,10 @@ void kernel_upcast8to32_test() {
 template INTGEMM_SSE2 void kernel_upcast8to32_test();
 KERNEL_TEST_CASE("upcast8to32 SSE2") { return kernel_upcast8to32_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_upcast8to32_test();
 KERNEL_TEST_CASE("upcast8to32 AVX2") { return kernel_upcast8to32_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_upcast8to32_test();
diff --git a/test/kernels/write_test.cc b/test/kernels/write_test.cc
index c263fca..a136a86 100644
--- a/test/kernels/write_test.cc
+++ b/test/kernels/write_test.cc
@@ -36,6 +36,7 @@ KERNEL_TEST_CASE("write/int SSE2") { return kernel_write_test(); }
 KERNEL_TEST_CASE("write/double SSE2") { return kernel_write_test(); }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 template INTGEMM_AVX2 void kernel_write_test();
 template INTGEMM_AVX2 void kernel_write_test();
 template INTGEMM_AVX2 void kernel_write_test();
@@ -46,6 +47,7 @@ KERNEL_TEST_CASE("write/int16 AVX2") { return kernel_write_test(); }
 KERNEL_TEST_CASE("write/float AVX2") { return kernel_write_test(); }
 KERNEL_TEST_CASE("write/double AVX2") { return kernel_write_test(); }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 template INTGEMM_AVX512BW void kernel_write_test();
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 6c16edd..47201c0 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -82,21 +82,23 @@ template void TestPrepare(Index rows = 32, Index cols = 16) {
     PrintMatrix(reference.begin(), rows, cols) << "Routine" << '\n' << PrintMatrix(test.begin(), rows, cols));
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE("Prepare AVX512", "[prepare]") {
   if (kCPU < CPUType::AVX512BW) return;
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   TestPrepare(64, 8);
   TestPrepare(256, 32);
   TestPrepare(64, 8);
   TestPrepare(256, 32);
-#endif
 }
+#endif
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("Prepare AVX2", "[prepare]") {
   if (kCPU < CPUType::AVX2) return;
   TestPrepare(64, 32);
   TestPrepare(64, 32);
 }
+#endif
 
 TEST_CASE("Prepare SSSE3", "[prepare]") {
   if (kCPU < CPUType::SSSE3) return;
@@ -147,19 +149,21 @@ template void TestSelectColumnsB(Index rows = 64, Index cols = 1
     PrintMatrix(ref.begin(), rows, kSelectCols) << PrintMatrix(test.begin(), rows, kSelectCols));
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE("SelectColumnsB AVX512", "[select]") {
   if (kCPU < CPUType::AVX512BW) return;
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   TestSelectColumnsB();
   TestSelectColumnsB(256, 256);
-#endif
 }
+#endif
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("SelectColumnsB AVX2", "[select]") {
   if (kCPU < CPUType::AVX2) return;
   TestSelectColumnsB(256, 256);
   TestSelectColumnsB(256, 256);
 }
+#endif
 
 TEST_CASE("SelectColumnsB SSSE3", "[select]") {
   if (kCPU < CPUType::SSSE3) return;
@@ -218,17 +222,19 @@ TEST_CASE("MaxAbsolute SSE2", "[max]") {
   TestMaxAbsolute();
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("MaxAbsolute AVX2", "[max]") {
   if (kCPU < CPUType::AVX2) return;
   TestMaxAbsolute();
 }
+#endif
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE("MaxAbsolute AVX512BW", "[max]") {
   if (kCPU < CPUType::AVX512BW) return;
-  #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   TestMaxAbsolute();
-  #endif
 }
+#endif
 
 // Based on https://arxiv.org/abs/1705.01991
 
@@ -396,6 +402,8 @@ TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") {
   TestMultiplyBias(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
 }
 
+
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE ("Multiply AVX2 8bit", "[multiply]") {
   if (kCPU < CPUType::AVX2) return;
   TestMultiply(8, 256, 256, .1f, 1, 0.1f);
@@ -435,6 +443,7 @@ TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") {
   TestMultiplyBias(248, 256, 256, .1f, 1, 0.01f);
   TestMultiplyBias(200, 256, 256, .1f, 1, 0.01f);
 }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc
index 1437e0a..e27992a 100644
--- a/test/prepare_b_quantized_transposed.cc
+++ b/test/prepare_b_quantized_transposed.cc
@@ -72,6 +72,7 @@ TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") {
   CHECK(TestMany(32, 128));
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
   if (kCPU < CPUType::AVX2)
     return;
@@ -79,6 +80,7 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
   CHECK(TestMany(32, 128));
   CHECK(TestMany(32, 128));
 }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
 TEST_CASE("PrepareBQuantizedTransposed AVX512", "") {
diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc
index bc35138..1ba88df 100644
--- a/test/prepare_b_transposed.cc
+++ b/test/prepare_b_transposed.cc
@@ -73,6 +73,7 @@ TEST_CASE("PrepareBTransposed SSSE3", "") {
   CHECK(TestMany(4, 128, 2.0f));
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE("PrepareBTransposed AVX2", "") {
   if (kCPU < CPUType::AVX2)
     return;
@@ -80,15 +81,16 @@ TEST_CASE("PrepareBTransposed AVX2", "") {
   CHECK(TestMany(8, 128, 2.0f));
   CHECK(TestMany(8, 128, 2.0f));
 }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-  TEST_CASE("PrepareBTransposed AVX512", "") {
-    if (kCPU < CPUType::AVX512BW)
-      return;
+TEST_CASE("PrepareBTransposed AVX512", "") {
+  if (kCPU < CPUType::AVX512BW)
+    return;
 
-    CHECK(TestMany(16, 128, 2.0f));
-    CHECK(TestMany(16, 128, 2.0f));
-  }
+  CHECK(TestMany(16, 128, 2.0f));
+  CHECK(TestMany(16, 128, 2.0f));
+}
 #endif
 
 }
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index 550ec66..e9f7980 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -128,17 +128,19 @@ TEST_CASE ("Quantize SSSE3", "[quantize]") {
   TestMany(1);
 }
 
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
 TEST_CASE ("Quantize AVX2", "[quantize]") {
   if (kCPU < CPUType::AVX2) return;
   TestMany(1);
   TestMany(16);
 }
+#endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
AVX512", "[quantize]") { + if (kCPU < CPUType::AVX512BW) return; + TestMany(1); + TestMany(16); +} #endif TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") { @@ -157,6 +159,7 @@ TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") { testVectorMeanStd(120832, true); } +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2 TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") { if (kCPU < CPUType::AVX2) return; testVectorMeanStd(64); @@ -172,6 +175,7 @@ TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") { testVectorMeanStd(120832); testVectorMeanStd(120832, true); } +#endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE("QuantizeStd AVX512BW", "[VectorMeanStd]") { -- cgit v1.2.3