diff options
author | Kenneth Heafield <kpu@users.noreply.github.com> | 2020-03-17 14:02:44 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-17 14:02:44 +0300 |
commit | 8033cdf974f69599edde6ba9126bff01b91f2435 (patch) | |
tree | 3dd60dbf7b979db3eeb46083477d00433c78c63a | |
parent | 261a5fbcf7558fc3c2ac22b33fe0c2930d440fc3 (diff) | |
parent | 79a3be9e7b78a4cbb231b5fc4e23dc0593f20240 (diff) |
Merge pull request #71 from kpuatamazon/master
Improve compiler support
-rw-r--r-- | CMakeLists.txt | 3 | ||||
-rw-r--r-- | avx2_gemm.h | 4 | ||||
-rw-r--r-- | avx512_gemm.h | 42 | ||||
-rw-r--r-- | benchmarks/biasmultiply.cc | 1 | ||||
-rw-r--r-- | compile_test_avx512vnni.cc | 10 | ||||
-rw-r--r-- | intgemm.cc | 8 | ||||
-rw-r--r-- | intgemm.h | 90 | ||||
-rw-r--r-- | multiply.h | 19 | ||||
-rw-r--r-- | ssse3_gemm.h | 3 | ||||
-rw-r--r-- | test/kernels/multiply_sat_test.cc | 14 |
10 files changed, 128 insertions, 66 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index bbb9b83..32c19ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,8 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_library(intgemm STATIC intgemm.cc) -if (OPENMP) +option(USE_OPENMP "Use OpenMP" OFF) +if (USE_OPENMP) message(STATUS "Compiling with OpenMP") find_package(OpenMP) if (NOT ${OpenMP_CXX_FOUND}) diff --git a/avx2_gemm.h b/avx2_gemm.h index 335fa0d..25866bb 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -201,7 +201,9 @@ struct AVX2_8bit { INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { Quantize(input, output, quant_mult, rows * cols); } - + private: + INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2) + public: INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2) // Currently A is prepared by quantization but this could theoretically change. diff --git a/avx512_gemm.h b/avx512_gemm.h index cdbfff5..6286ccc 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -224,26 +224,48 @@ struct AVX512_8bit { Quantize(input, output, quant_mult, rows * cols); } + private: + /* g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 does not carry target attributes + * to the hidden function it creates in implementing #pragma omp parallel for. + * So intrinstics were not working inside the for loop when compiled with + * OMP. Also, passing register types across #pragma omp parallel for + * generated an internal compiler error. + * The problem does not occur in g++-8 (Ubuntu 8.3.0-6ubuntu1~18.04.1) 8.3.0. + * As a workaround, I split into #pragma omp parallel with boring types + * passed across the boundary then call this function with target attributes. + */ + INTGEMM_AVX512BW static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { + const __m512i neg127 = _mm512_set1_epi32(-127); + const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); + const std::size_t kBatch = sizeof(__m512i) / sizeof(float); +#pragma omp for + for (std::size_t i = 0; i < count; i += kBatch) { + __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg); + asint = _mm512_max_epi32(asint, neg127); + // There doesn't seem to be an unmasked version. + _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint); + } + } + + public: // Technically output can be unaligned in Quantize. // But then it will need to be aligned for Multiply. // Convert to 8-bit signed integers. /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) { assert(reinterpret_cast<uintptr_t>(input) % sizeof(__m512i) == 0); - const __m512i neg127 = _mm512_set1_epi32(-127); - const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); const std::size_t kBatch = sizeof(__m512i) / sizeof(float); - const float *fast_input_end = input + (size & ~(kBatch - 1)); - int8_t *fast_output_end = output + (size & ~(kBatch - 1)); -#pragma omp parallel for - for (const float *input_it = input; input_it < fast_input_end; input_it += kBatch) { - __m512i asint = avx512f::QuantizerGrab(input_it, quant_mult_reg); - asint = _mm512_max_epi32(asint, neg127); - // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi8(output + (input_it - input), 0xffff, asint); + std::size_t fast_size = (size & ~(kBatch - 1)); + const float *fast_input_end = input + fast_size; + int8_t *fast_output_end = output + fast_size; +#pragma omp parallel + { + QuantizeThread(input, output, quant_mult, fast_size); } std::size_t overhang = size & (kBatch - 1); if (!overhang) return; // We needed a branch anyway for the empty case. + const __m512i neg127 = _mm512_set1_epi32(-127); + const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); __m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg); asint = _mm512_max_epi32(asint, neg127); _mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint); diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc index 515536e..8f0816f 100644 --- a/benchmarks/biasmultiply.cc +++ b/benchmarks/biasmultiply.cc @@ -8,7 +8,6 @@ using namespace intgemm; template <class Routine> void testOld(Index /*rows*/, Index /*cols*/) { - } template <class Routine> diff --git a/compile_test_avx512vnni.cc b/compile_test_avx512vnni.cc index 611cc53..deb0f88 100644 --- a/compile_test_avx512vnni.cc +++ b/compile_test_avx512vnni.cc @@ -19,11 +19,13 @@ bool Foo() { } int main() { - return Foo() && -#ifdef __INTEL_COMPILER - _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI) + return Foo() +#if defined(__GNUC__) || defined(__clang__) + // uses cpuid +#elif defined(__INTEL_COMPILER) + && _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI) #else - __builtin_cpu_supports("avx512vnni") + && __builtin_cpu_supports("avx512vnni") #endif ; } @@ -40,12 +40,18 @@ const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType:: float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512f::MaxAbsolute, avx512f::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute); +constexpr const char *const Unsupported_16bit::kName; +constexpr const char *const Unsupported_8bit::kName; constexpr const char *const SSE2_16bit::kName; constexpr const char *const SSSE3_8bit::kName; constexpr const char *const AVX2_8bit::kName; constexpr const char *const AVX2_16bit::kName; +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW constexpr const char *const AVX512_8bit::kName; -constexpr const char *const AVX512VNNI_8bit::kName; constexpr const char *const AVX512_16bit::kName; +#endif +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI +constexpr const char *const AVX512VNNI_8bit::kName; +#endif } @@ -51,7 +51,7 @@ #include "avx512_gemm.h" #include "avx512vnni_gemm.h" -#if defined(__GNUC__) && defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW) +#if defined(__GNUC__) || defined(__clang__) #include "cpuid.h" #endif @@ -124,7 +124,7 @@ struct Unsupported_8bit { typedef Unsupported_16bit AVX512_16bit; typedef Unsupported_8bit AVX512_8bit; namespace avx512f { -static inline float MaxAbsolute(const float *begin, const float *end) { +static inline float MaxAbsolute(const float * /*begin*/, const float * /*end*/) { throw UnsupportedCPU(); } } //namespace @@ -135,26 +135,6 @@ static inline float MaxAbsolute(const float *begin, const float *end) { typedef Unsupported_8bit AVX512VNNI_8bit; #endif - -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -// gcc 5.4.0 bizarrely supports avx512bw targets but not __builtin_cpu_supports("avx512bw"). So implement it manually. -inline bool CheckAVX512BW() { - __builtin_cpu_init (); -#ifdef __INTEL_COMPILER - return _may_i_use_cpu_feature(_FEATURE_AVX512BW) -#elif __GNUC__ - unsigned int m = __get_cpuid_max(0, NULL); - if (m < 7) return false; - unsigned int eax, ebx, ecx, edx; - __cpuid_count(7, 0, eax, ebx, ecx, edx); - const unsigned int avx512bw_bit = (1 << 30); - return ebx & avx512bw_bit; -#else - return __builtin_cpu_supports("avx512bw"); -#endif -} -#endif - /* Returns: * axx512vnni if the CPU supports AVX512VNNI * @@ -172,24 +152,59 @@ template <class T> T ChooseCPU(T #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI avx512vnni #endif - , T avx512bw, T avx2, T ssse3, T sse2, T unsupported) { - __builtin_cpu_init (); -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI - if ( -#ifdef __INTEL_COMPILER - _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI) -#else - __builtin_cpu_supports("avx512vnni") + , T +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW + avx512bw #endif - ) { - return avx512vnni; + , T avx2, T ssse3, T sse2, T unsupported) { + /* If intgemm is compiled by gcc 6.4.1 then dlopened into an executable + * compiled by gcc 7.3.0, there will be a undefined symbol __cpu_info. + * Work around this by calling the intrinsics more directly instead of + * __builtin_cpu_supports. + * + * clang 6.0.0-1ubuntu2 supports vnni but doesn't have + * __builtin_cpu_supports("avx512vnni") + * so use the hand-coded CPUID for clang. + */ +#if defined(__GNUC__) || defined(__clang__) + unsigned int m = __get_cpuid_max(0, NULL); + unsigned int eax, ebx, ecx, edx; + if (m >= 7) { + __cpuid_count(7, 0, eax, ebx, ecx, edx); +# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI + if (ecx & (1 << 11)) return avx512vnni; +# endif +# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW + if (ebx & (1 << 30)) return avx512bw; +# endif + if (ebx & (1 << 5)) return avx2; } -#endif -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - if (CheckAVX512BW()) { - return avx512bw; + if (m >= 1) { + __cpuid_count(1, 0, eax, ebx, ecx, edx); + if (ecx & (1 << 9)) return ssse3; + if (edx & (1 << 26)) return sse2; } -#endif + return unsupported; +#else // not gcc or clang. + __builtin_cpu_init(); +# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI + if ( +# ifdef __INTEL_COMPILER + _may_i_use_cpu_feature(_FEATURE_AVX512_VNNI) +# else + __builtin_cpu_supports("avx512vnni") +# endif + ) return vnni; +# endif +# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW + if ( +# ifdef __INTEL_COMPILER + _may_i_use_cpu_feature(_FEATURE_AVX512BW) +# else + __builtin_cpu_supports("avx512bw") +# endif + ) return avx512bw; +# endif if (__builtin_cpu_supports("avx2")) { return avx2; } else if (__builtin_cpu_supports("ssse3")) { @@ -199,6 +214,7 @@ template <class T> T ChooseCPU(T } else { return unsupported; } +#endif } struct TileInfo { @@ -60,19 +60,30 @@ static inline INTGEMM_AVX512F float MaxFloat32(__m512 a) { #endif // Quantize function used for SSSE3 and AVX2. +// Separate function for thread to work around gcc 7 bug that doesn't imbue +// target attributes across #pragma omp parallel. +#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \ +target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \ + name::QuantizeTile8 q(quant_mult); \ + _Pragma("omp for") \ + for (std::size_t i = 0; i < count; i += sizeof(Register)) { \ + *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \ + } \ +} + #define INTGEMM_QUANTIZE(target, Register, name) \ target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ - name::QuantizeTile8 q(quant_mult); \ const std::size_t kBatch = sizeof(Register); \ const std::size_t fast_end = size & ~(kBatch - 1); \ - _Pragma("omp parallel for") \ - for (std::size_t i = 0; i < fast_end; i += kBatch) { \ - *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \ + _Pragma("omp parallel") \ + { \ + QuantizeThread(input, output, quant_mult, fast_end); \ } \ std::size_t overhang = size & (kBatch - 1); \ if (!overhang) return; \ + name::QuantizeTile8 q(quant_mult); \ /* Each does size(Register) / 32 == kBatch / 4 floats at a time. * If we're allowed to read one of them, then we can read the whole register. */ \ const float *inputs[4]; \ diff --git a/ssse3_gemm.h b/ssse3_gemm.h index 2cf341e..fd3ab8c 100644 --- a/ssse3_gemm.h +++ b/ssse3_gemm.h @@ -116,6 +116,9 @@ struct SSSE3_8bit { Quantize(input, output, quant_mult, rows * cols); } + private: + INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3) + public: INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3) // Version with unsigned int + 127 diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc index 36a9b9f..edea772 100644 --- a/test/kernels/multiply_sat_test.cc +++ b/test/kernels/multiply_sat_test.cc @@ -21,13 +21,13 @@ void kernel_multiply_sat_test() { std::iota(input1.begin(), input1.end(), -int(VECTOR_LENGTH / 2)); std::iota(input2.begin(), input2.end(), -int(VECTOR_LENGTH / 3)); - for (std::size_t shift = 0; shift <= 2 * 8 * sizeof(Type_); ++shift) { - *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift); - for (std::size_t i = 0; i < output.size(); ++i) { - auto ref = (int64_t(input1[i]) * input2[i]) >> shift; - auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref))); - CHECK(output[i] == ref_sat); - } + // TODO: try all shifts. The shift must be an immediate. + std::size_t shift = 1; + *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift); + for (std::size_t i = 0; i < output.size(); ++i) { + auto ref = (int64_t(input1[i]) * input2[i]) >> shift; + auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref))); + CHECK(output[i] == ref_sat); } } |