From 197acffac6c4288a47a55af2010bebec18a5a341 Mon Sep 17 00:00:00 2001
From: Young Jin Kim
Date: Fri, 18 Oct 2019 11:47:58 -0700
Subject: Change AVX2 compile check to runtime check

---
 src/OptimizedKernelsAvx2.cc |  51 ++++++------
 src/QuantUtilsAvx2.cc       | 190 ++++++++++++++++++++++----------------------
 2 files changed, 123 insertions(+), 118 deletions(-)

diff --git a/src/OptimizedKernelsAvx2.cc b/src/OptimizedKernelsAvx2.cc
index e8c65c3..326bd72 100644
--- a/src/OptimizedKernelsAvx2.cc
+++ b/src/OptimizedKernelsAvx2.cc
@@ -7,6 +7,7 @@
 #include "OptimizedKernelsAvx2.h"
 
 #include <immintrin.h>
+#include "fbgemm/Utils.h"
 
 using namespace std;
 
@@ -14,37 +15,37 @@ namespace fbgemm {
 
 int32_t reduceAvx2(const uint8_t* A, int len) {
   int32_t row_sum = 0;
-#if defined(__AVX2__)
-  __m256i sum_v = _mm256_setzero_si256();
-  __m256i one_epi16_v = _mm256_set1_epi16(1);
-  __m256i one_epi8_v = _mm256_set1_epi8(1);
+  if (fbgemm::fbgemmHasAvx2Support()) {
+    __m256i sum_v = _mm256_setzero_si256();
+    __m256i one_epi16_v = _mm256_set1_epi16(1);
+    __m256i one_epi8_v = _mm256_set1_epi8(1);
 
-  int i;
-  // vectorized
-  for (i = 0; i < len / 32 * 32; i += 32) {
-    __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(A + i));
-    sum_v = _mm256_add_epi32(
+    int i;
+    // vectorized
+    for (i = 0; i < len / 32 * 32; i += 32) {
+      __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(A + i));
+      sum_v = _mm256_add_epi32(
         sum_v,
         _mm256_madd_epi16(
-            _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
-  }
-
-  alignas(64) int32_t temp[8];
-  _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
-  for (int k = 0; k < 8; ++k) {
-    row_sum += temp[k];
-  }
+            _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+    }
 
-  // scalar
-  for (; i < len; ++i) {
-    row_sum += A[i];
-  }
+    alignas(64) int32_t temp[8];
+    _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
+    for (int k = 0; k < 8; ++k) {
+      row_sum += temp[k];
+    }
 
-#else
-  for (int i = 0; i < len; ++i) {
-    row_sum += A[i];
+    // scalar
+    for (; i < len; ++i) {
+      row_sum += A[i];
+    }
+  } else {
+    for (int i = 0; i < len; ++i) {
+      row_sum += A[i];
+    }
   }
-#endif
+
   return row_sum;
 }
 
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index 9381f0c..ac1853a 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -24,13 +24,14 @@ void QuantizeAvx2(
     T* dst,
     int len,
     const TensorQuantizationParams& qparams) {
-#if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER))
-  constexpr int VLEN = 8;
-  constexpr float min_val = std::numeric_limits<T>::min();
-  constexpr float max_val = std::numeric_limits<T>::max();
-  std::size_t i = 0;
-  __m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale);
-  __m256i shuffle_mask_v = _mm256_set_epi8(
+  // original compile condition - #if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER))
+  if (fbgemm::fbgemmHasAvx2Support()) {
+    constexpr int VLEN = 8;
+    constexpr float min_val = std::numeric_limits<T>::min();
+    constexpr float max_val = std::numeric_limits<T>::max();
+    std::size_t i = 0;
+    __m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale);
+    __m256i shuffle_mask_v = _mm256_set_epi8(
       0xff,
       0xff,
       0xff,
@@ -63,39 +64,39 @@ void QuantizeAvx2(
       0x08,
       0x04,
       0x00);
-  __m256i permute_mask_v =
+    __m256i permute_mask_v =
       _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
 
-  for (; i < len / VLEN * VLEN; i += VLEN) {
-    __m256 src_v = _mm256_loadu_ps(src + i);
-    __m256 transformed_v = _mm256_fmadd_ps(
+    for (; i < len / VLEN * VLEN; i += VLEN) {
+      __m256 src_v = _mm256_loadu_ps(src + i);
+      __m256 transformed_v = _mm256_fmadd_ps(
         src_v, inverse_scale_v, _mm256_set1_ps(qparams.zero_point));
-    __m256 clipped_v = _mm256_min_ps(
+      __m256 clipped_v = _mm256_min_ps(
         _mm256_max_ps(transformed_v, _mm256_set1_ps(min_val)),
         _mm256_set1_ps(max_val));
-    __m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
+      __m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
 
-    // An instruction sequence to save 8 32-bit integers as 8 8-bit integers
-    rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
-    rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
-    _mm_storel_epi64(
+      // An instruction sequence to save 8 32-bit integers as 8 8-bit integers
+      rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
+      rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
+      _mm_storel_epi64(
         reinterpret_cast<__m128i*>(dst + i),
         _mm256_castsi256_si128(rounded_v));
-  }
+    }
 
-  for (; i < len; ++i) {
-    float transformed = qparams.zero_point + src[i] / qparams.scale;
-    float clipped = std::min(std::max(transformed, min_val), max_val);
-    // Not exactly the same behavior as the vectorized code.
-    // The vectorized code above always rounds to even in halfway cases
-    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
-    // does the same only when the current rounding mode is FE_TONEAREST.
-    // However, in practice, this should not be a problem because most cases
-    // use the default rounding mode FE_TONEAREST.
-    // Note that we cannot implement the same behavior as the vectorized code
-    // using std::round because it does rounding away from zero in halfway
-    // cases.
-    dst[i] = nearbyint(clipped);
+    for (; i < len; ++i) {
+      float transformed = qparams.zero_point + src[i] / qparams.scale;
+      float clipped = std::min(std::max(transformed, min_val), max_val);
+      // Not exactly the same behavior as the vectorized code.
+      // The vectorized code above always rounds to even in halfway cases
+      // (https://software.intel.com/en-us/node/523819), but std::nearbyint
+      // does the same only when the current rounding mode is FE_TONEAREST.
+      // However, in practice, this should not be a problem because most cases
+      // use the default rounding mode FE_TONEAREST.
+      // Note that we cannot implement the same behavior as the vectorized code
+      // using std::round because it does rounding away from zero in halfway
+      // cases.
+      dst[i] = nearbyint(clipped);
+    }
   }
-#endif
 }
 
 // Instantiate QuantizeAvx2 for known datatypes
@@ -123,24 +124,24 @@ void FindMinMax(const float* a, float* min, float* max, int len) {
   float temp_min = *a, temp_max = *a;
   int i = 0;
 
-#ifdef __AVX2__
-  __m256 min_v = _mm256_set1_ps(*a), max_v = _mm256_set1_ps(*a);
-  constexpr int VLEN = 8;
-  if (len >= VLEN) {
-    for (; i < len / VLEN * VLEN; i += VLEN) {
-      min_v = _mm256_min_ps(min_v, _mm256_loadu_ps(a + i));
-      max_v = _mm256_max_ps(max_v, _mm256_loadu_ps(a + i));
-    }
+  if (fbgemm::fbgemmHasAvx2Support()) {
+    __m256 min_v = _mm256_set1_ps(*a), max_v = _mm256_set1_ps(*a);
+    constexpr int VLEN = 8;
+    if (len >= VLEN) {
+      for (; i < len / VLEN * VLEN; i += VLEN) {
+        min_v = _mm256_min_ps(min_v, _mm256_loadu_ps(a + i));
+        max_v = _mm256_max_ps(max_v, _mm256_loadu_ps(a + i));
+      }
 
-    float min_buf[VLEN], max_buf[VLEN];
-    _mm256_storeu_ps(min_buf, min_v);
-    _mm256_storeu_ps(max_buf, max_v);
-    for (int j = 0; j < VLEN; ++j) {
-      temp_min = std::min(temp_min, min_buf[j]);
-      temp_max = std::max(temp_max, max_buf[j]);
+      float min_buf[VLEN], max_buf[VLEN];
+      _mm256_storeu_ps(min_buf, min_v);
+      _mm256_storeu_ps(max_buf, max_v);
+      for (int j = 0; j < VLEN; ++j) {
+        temp_min = std::min(temp_min, min_buf[j]);
+        temp_max = std::max(temp_max, max_buf[j]);
+      }
     }
   }
-#endif
 
   for (; i < len; i++) {
     temp_min = std::min(temp_min, a[i]);
@@ -153,15 +154,15 @@ void FindMinMax(const float* a, float* min, float* max, int len) {
 ////////////////////////////////////////////////////////////////////////////////
 // Requantization (with floats)
 
-#ifdef __AVX2__
 void RequantizeAvx2(
     const int32_t* src,
     uint8_t* dst,
     int len,
     const RequantizationParams& params) {
-  DoNothing<> doNothingObj{};
-  int32_t Bq_zero_point[] = { 0 };
-  ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
+  if (fbgemm::fbgemmHasAvx2Support()) {
+    DoNothing<> doNothingObj{};
+    int32_t Bq_zero_point[] = { 0 };
+    ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
       doNothingObj,
       &params.real_multiplier,
       params.target_qparams.zero_point,
@@ -171,7 +172,8 @@ void RequantizeAvx2(
       nullptr, // col_offsets
       nullptr, // bias
       len); // ncol
-  requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0);
+    requantizeObj.f<inst_set_t::avx2>(dst, src, { 0, 1, 0, len }, 0, 0);
+  }
 }
 
 void RequantizeFixedPointAvx2(
@@ -179,24 +181,26 @@ void RequantizeFixedPointAvx2(
     const int32_t* src,
     uint8_t* dst,
     int len,
     const RequantizationParams& params) {
-  constexpr int VLEN = 8;
+  if (fbgemm::fbgemmHasAvx2Support())
+  {
+    constexpr int VLEN = 8;
 
-  __m256i b = _mm256_set1_epi32(params.multiplier);
+    __m256i b = _mm256_set1_epi32(params.multiplier);
 
-  // AVX2 doesn't support arithmetic right shift.
-  // As a work around, we convert 64-bit multiplied results to uint64_t by
-  // adding 0x8000000000000000ULL, logical right shift, and subtract by
-  // (0x8000000000000000ULL >> right_shift).
-  __m256i pre_shift_nudge = _mm256_set1_epi64x(
+    // AVX2 doesn't support arithmetic right shift.
+    // As a work around, we convert 64-bit multiplied results to uint64_t by
+    // adding 0x8000000000000000ULL, logical right shift, and subtract by
+    // (0x8000000000000000ULL >> right_shift).
+    __m256i pre_shift_nudge = _mm256_set1_epi64x(
       (1ll << (params.right_shift - 1)) + 0x8000000000000000ULL);
-  __m256i post_shift_nudge = _mm256_set1_epi64x(
+    __m256i post_shift_nudge = _mm256_set1_epi64x(
       params.target_qparams.zero_point -
       (0x8000000000000000ULL >> params.right_shift));
-  __m256i min_v = _mm256_set1_epi32(numeric_limits<uint8_t>::min());
-  __m256i max_v = _mm256_set1_epi32(numeric_limits<uint8_t>::max());
+    __m256i min_v = _mm256_set1_epi32(numeric_limits<uint8_t>::min());
+    __m256i max_v = _mm256_set1_epi32(numeric_limits<uint8_t>::max());
 
-  __m256i shuffle_mask_v = _mm256_set_epi8(
+    __m256i shuffle_mask_v = _mm256_set_epi8(
       0xff,
       0xff,
       0xff,
@@ -229,53 +233,53 @@ void RequantizeFixedPointAvx2(
       0x08,
       0x04,
       0x00);
-  __m256i permute_mask_v =
+    __m256i permute_mask_v =
       _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
 
-  int i = 0;
-  for (; i < len / VLEN * VLEN; i += VLEN) {
-    __m256i a_v = _mm256_loadu_si256((const __m256i*)(src + i));
+    int i = 0;
+    for (; i < len / VLEN * VLEN; i += VLEN) {
+      __m256i a_v = _mm256_loadu_si256((const __m256i*)(src + i));
 
-    // a = a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7
-    // b = b0 | b1 | b3 | b3 | b4 | b5 | b6 | b7
-    __m256i a_even_v = a_v;
-    __m256i a_odd_v = _mm256_srli_si256(a_v, 4);
+      // a = a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7
+      // b = b0 | b1 | b3 | b3 | b4 | b5 | b6 | b7
+      __m256i a_even_v = a_v;
+      __m256i a_odd_v = _mm256_srli_si256(a_v, 4);
 
-    __m256i ab_even_v = _mm256_mul_epi32(a_even_v, b);
-    __m256i ab_odd_v = _mm256_mul_epi32(a_odd_v, b);
+      __m256i ab_even_v = _mm256_mul_epi32(a_even_v, b);
+      __m256i ab_odd_v = _mm256_mul_epi32(a_odd_v, b);
 
-    __m256i even_rounded_v = _mm256_add_epi64(ab_even_v, pre_shift_nudge);
-    __m256i odd_rounded_v = _mm256_add_epi64(ab_odd_v, pre_shift_nudge);
+      __m256i even_rounded_v = _mm256_add_epi64(ab_even_v, pre_shift_nudge);
+      __m256i odd_rounded_v = _mm256_add_epi64(ab_odd_v, pre_shift_nudge);
 
-    __m256i even_result_v = _mm256_add_epi64(
+      __m256i even_result_v = _mm256_add_epi64(
         _mm256_srli_epi64(even_rounded_v, params.right_shift),
         post_shift_nudge);
-    __m256i odd_result_v = _mm256_add_epi64(
+      __m256i odd_result_v = _mm256_add_epi64(
         _mm256_srli_epi64(odd_rounded_v, params.right_shift),
         post_shift_nudge);
-    odd_result_v = _mm256_slli_si256(odd_result_v, 4);
+      odd_result_v = _mm256_slli_si256(odd_result_v, 4);
 
-    // even_result_v has numbers we want in its even 32-bit SIMD lanes, and
-    // odd_result_v has numbers we want in its odd 32-bit SIMD lanes.
-    // Use blend to combine them.
-    __m256i result_v = _mm256_blend_epi32(even_result_v, odd_result_v, 0xaa);
-    __m256i clipped_v =
+      // even_result_v has numbers we want in its even 32-bit SIMD lanes, and
+      // odd_result_v has numbers we want in its odd 32-bit SIMD lanes.
+      // Use blend to combine them.
+      __m256i result_v = _mm256_blend_epi32(even_result_v, odd_result_v, 0xaa);
+      __m256i clipped_v =
         _mm256_max_epi32(min_v, _mm256_min_epi32(max_v, result_v));
-    clipped_v = _mm256_shuffle_epi8(clipped_v, shuffle_mask_v);
-    clipped_v = _mm256_permutevar8x32_epi32(clipped_v, permute_mask_v);
-    *(int64_t*)(dst + i) = _mm256_extract_epi64(clipped_v, 0);
-  }
+      clipped_v = _mm256_shuffle_epi8(clipped_v, shuffle_mask_v);
+      clipped_v = _mm256_permutevar8x32_epi32(clipped_v, permute_mask_v);
+      *(int64_t*)(dst + i) = _mm256_extract_epi64(clipped_v, 0);
+    }
 
-  for (; i < len; ++i) {
-    int64_t ab_64 =
-        static_cast<int64_t>(src[i]) * static_cast<int64_t>(params.multiplier);
-    int64_t nudge = 1ll << std::max(0, params.right_shift - 1);
-    int64_t quantized_down = params.target_qparams.zero_point +
-        ((ab_64 + nudge) >> params.right_shift);
-    dst[i] = std::min(std::max(quantized_down, 0l), 255l);
+    for (; i < len; ++i) {
+      int64_t ab_64 =
+          static_cast<int64_t>(src[i]) * static_cast<int64_t>(params.multiplier);
+      int64_t nudge = 1ll << std::max(0, params.right_shift - 1);
+      int64_t quantized_down = params.target_qparams.zero_point +
+          ((ab_64 + nudge) >> params.right_shift);
+      dst[i] = std::min(std::max(quantized_down, 0l), 255l);
+    }
   }
 }
-#endif
 
 template <
     bool A_SYMMETRIC,
-- 
cgit v1.2.3
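
Note on the dispatch pattern above: these translation units are still compiled with AVX2 code generation, but the intrinsics only execute when fbgemm::fbgemmHasAvx2Support() (declared in fbgemm/Utils.h) reports AVX2 on the running CPU. The sketch below is a minimal illustration of that shape using a hypothetical stand-in check, hasAvx2RuntimeSupport(), built on GCC/Clang's __builtin_cpu_supports; it is not FBGEMM's implementation, and the helper and function names are assumptions for illustration only.

// avx2_runtime_dispatch_sketch.cc (hypothetical, not part of FBGEMM)
// Build with AVX2 enabled (e.g. -mavx2 on GCC/Clang), mirroring how the
// patched files are compiled; the guarded branch is the only place the
// AVX2 intrinsics can execute.
#include <cstdint>
#include <immintrin.h>

static bool hasAvx2RuntimeSupport() {
#if defined(__GNUC__) || defined(__clang__)
  // CPUID-backed runtime query; assumed stand-in for fbgemmHasAvx2Support().
  return __builtin_cpu_supports("avx2");
#else
  return false; // conservative fallback on other toolchains
#endif
}

// Same shape as the patched reduceAvx2: vectorized path behind the runtime
// check, scalar tail for the remainder and for CPUs without AVX2.
int32_t sumBytes(const uint8_t* A, int len) {
  int32_t sum = 0;
  int i = 0;
  if (hasAvx2RuntimeSupport()) {
    __m256i acc = _mm256_setzero_si256();
    for (; i + 32 <= len; i += 32) {
      __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(A + i));
      // widen 32 bytes to eight 32-bit partial sums and accumulate
      acc = _mm256_add_epi32(
          acc,
          _mm256_madd_epi16(_mm256_maddubs_epi16(v, _mm256_set1_epi8(1)),
                            _mm256_set1_epi16(1)));
    }
    alignas(32) int32_t lane[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(lane), acc);
    for (int k = 0; k < 8; ++k) {
      sum += lane[k];
    }
  }
  for (; i < len; ++i) { // scalar tail, also the full path without AVX2
    sum += A[i];
  }
  return sum;
}

The usual motivation for this trade is that a single binary can serve both AVX2 and pre-AVX2 machines; the cost is one predictable branch per call, which is typically negligible next to the kernels themselves.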