github.com/marian-nmt/FBGEMM.git
author     Young Jin Kim <youki@microsoft.com>   2019-10-18 21:47:58 +0300
committer  Young Jin Kim <youki@microsoft.com>   2019-10-18 21:47:58 +0300
commit     197acffac6c4288a47a55af2010bebec18a5a341 (patch)
tree       37e337a1a7c7d836869ee0bc3378058e517835dc
parent     21f93c950b8b27918cd59c8f3139fb41ad1bd2c6 (diff)
Change AVX2 compile check to runtime check
-rw-r--r--  src/OptimizedKernelsAvx2.cc   51
-rw-r--r--  src/QuantUtilsAvx2.cc        190
2 files changed, 123 insertions, 118 deletions
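
Every hunk in this commit applies the same transformation: the compile-time guards (#if defined(__AVX2__), plus __FMA__/_MSC_VER in QuantUtilsAvx2.cc) are replaced by a call to fbgemm::fbgemmHasAvx2Support(), so both the vectorized and the scalar paths are compiled into the binary and the choice is made on the machine that actually runs it. As a hedged, standalone sketch of what such a runtime probe amounts to (not FBGEMM source; it assumes GCC/Clang, where __builtin_cpu_supports is available):

// Hedged sketch, not FBGEMM source: what a runtime AVX2 probe such as
// fbgemm::fbgemmHasAvx2Support() boils down to. Assumes GCC/Clang, where
// __builtin_cpu_supports() performs a CPUID-based feature query.
#include <cstdio>

static bool hasAvx2AtRuntime() {
  // Evaluated on the CPU the binary runs on, independent of whether the
  // translation unit was compiled with -mavx2 (i.e. whether __AVX2__ is set).
  return __builtin_cpu_supports("avx2");
}

int main() {
  std::printf("AVX2 %s at run time\n",
              hasAvx2AtRuntime() ? "available" : "not available");
  return 0;
}
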
diff --git a/src/OptimizedKernelsAvx2.cc b/src/OptimizedKernelsAvx2.cc
index e8c65c3..326bd72 100644
--- a/src/OptimizedKernelsAvx2.cc
+++ b/src/OptimizedKernelsAvx2.cc
@@ -7,6 +7,7 @@
#include "OptimizedKernelsAvx2.h"
#include <immintrin.h>
+#include "fbgemm/Utils.h"
using namespace std;
@@ -14,37 +15,37 @@ namespace fbgemm {
int32_t reduceAvx2(const uint8_t* A, int len) {
int32_t row_sum = 0;
-#if defined(__AVX2__)
- __m256i sum_v = _mm256_setzero_si256();
- __m256i one_epi16_v = _mm256_set1_epi16(1);
- __m256i one_epi8_v = _mm256_set1_epi8(1);
+ if (fbgemm::fbgemmHasAvx2Support()) {
+ __m256i sum_v = _mm256_setzero_si256();
+ __m256i one_epi16_v = _mm256_set1_epi16(1);
+ __m256i one_epi8_v = _mm256_set1_epi8(1);
- int i;
- // vectorized
- for (i = 0; i < len / 32 * 32; i += 32) {
- __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(A + i));
- sum_v = _mm256_add_epi32(
+ int i;
+ // vectorized
+ for (i = 0; i < len / 32 * 32; i += 32) {
+ __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(A + i));
+ sum_v = _mm256_add_epi32(
sum_v,
_mm256_madd_epi16(
- _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
- }
-
- alignas(64) int32_t temp[8];
- _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
- for (int k = 0; k < 8; ++k) {
- row_sum += temp[k];
- }
+ _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+ }
- // scalar
- for (; i < len; ++i) {
- row_sum += A[i];
- }
+ alignas(64) int32_t temp[8];
+ _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
+ for (int k = 0; k < 8; ++k) {
+ row_sum += temp[k];
+ }
-#else
- for (int i = 0; i < len; ++i) {
- row_sum += A[i];
+ // scalar
+ for (; i < len; ++i) {
+ row_sum += A[i];
+ }
+ } else {
+ for (int i = 0; i < len; ++i) {
+ row_sum += A[i];
+ }
}
-#endif
+
return row_sum;
}
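
The reshaped reduceAvx2 above keeps the intrinsics loop and the scalar fallback side by side inside one function, selected by the runtime check. A standalone sketch of that structure (not the FBGEMM source; it assumes GCC/Clang, using the target("avx2") attribute so the intrinsics compile without building the whole file with -mavx2, and __builtin_cpu_supports in place of fbgemmHasAvx2Support()):

#include <cstdint>
#include <immintrin.h>

// AVX2 kernel; the target attribute lets the intrinsics compile even when the
// translation unit itself is not built with -mavx2.
__attribute__((target("avx2")))
static int32_t sumBytesAvx2(const uint8_t* a, int len) {
  __m256i sum_v = _mm256_setzero_si256();
  const __m256i one_epi8_v = _mm256_set1_epi8(1);
  const __m256i one_epi16_v = _mm256_set1_epi16(1);
  int i = 0;
  for (; i + 32 <= len; i += 32) {
    __m256i src_v =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i));
    // u8 * 1 -> adjacent i16 sums, then i16 * 1 -> adjacent i32 sums.
    sum_v = _mm256_add_epi32(
        sum_v,
        _mm256_madd_epi16(_mm256_maddubs_epi16(src_v, one_epi8_v),
                          one_epi16_v));
  }
  alignas(32) int32_t lanes[8];
  _mm256_store_si256(reinterpret_cast<__m256i*>(lanes), sum_v);
  int32_t row_sum = 0;
  for (int k = 0; k < 8; ++k) row_sum += lanes[k];
  for (; i < len; ++i) row_sum += a[i];  // scalar tail for len % 32 leftovers
  return row_sum;
}

static int32_t sumBytesScalar(const uint8_t* a, int len) {
  int32_t row_sum = 0;
  for (int i = 0; i < len; ++i) row_sum += a[i];
  return row_sum;
}

// Runtime dispatch: one binary serves CPUs with and without AVX2.
int32_t sumBytes(const uint8_t* a, int len) {
  return __builtin_cpu_supports("avx2") ? sumBytesAvx2(a, len)
                                        : sumBytesScalar(a, len);
}
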
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index 9381f0c..ac1853a 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -24,13 +24,14 @@ void QuantizeAvx2(
T* dst,
int len,
const TensorQuantizationParams& qparams) {
-#if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER))
- constexpr int VLEN = 8;
- constexpr float min_val = std::numeric_limits<T>::min();
- constexpr float max_val = std::numeric_limits<T>::max();
- std::size_t i = 0;
- __m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale);
- __m256i shuffle_mask_v = _mm256_set_epi8(
+ // original compile condition - #if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER))
+ if (fbgemm::fbgemmHasAvx2Support()) {
+ constexpr int VLEN = 8;
+ constexpr float min_val = std::numeric_limits<T>::min();
+ constexpr float max_val = std::numeric_limits<T>::max();
+ std::size_t i = 0;
+ __m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale);
+ __m256i shuffle_mask_v = _mm256_set_epi8(
0xff,
0xff,
0xff,
@@ -63,39 +64,39 @@ void QuantizeAvx2(
0x08,
0x04,
0x00);
- __m256i permute_mask_v =
+ __m256i permute_mask_v =
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
- for (; i < len / VLEN * VLEN; i += VLEN) {
- __m256 src_v = _mm256_loadu_ps(src + i);
- __m256 transformed_v = _mm256_fmadd_ps(
+ for (; i < len / VLEN * VLEN; i += VLEN) {
+ __m256 src_v = _mm256_loadu_ps(src + i);
+ __m256 transformed_v = _mm256_fmadd_ps(
src_v, inverse_scale_v, _mm256_set1_ps(qparams.zero_point));
- __m256 clipped_v = _mm256_min_ps(
+ __m256 clipped_v = _mm256_min_ps(
_mm256_max_ps(transformed_v, _mm256_set1_ps(min_val)),
_mm256_set1_ps(max_val));
- __m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
+ __m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
- // An instruction sequence to save 8 32-bit integers as 8 8-bit integers
- rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
- rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
- _mm_storel_epi64(
+ // An instruction sequence to save 8 32-bit integers as 8 8-bit integers
+ rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
+ rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
+ _mm_storel_epi64(
reinterpret_cast<__m128i*>(dst + i), _mm256_castsi256_si128(rounded_v));
- }
+ }
- for (; i < len; ++i) {
- float transformed = qparams.zero_point + src[i] / qparams.scale;
- float clipped = std::min(std::max(transformed, min_val), max_val);
- // Not exactly the same behavior as the vectorized code.
- // The vectorized code above always rounds to even in halfway cases
- // (https://software.intel.com/en-us/node/523819), but std::nearbyint
- // does the same only when the current rounding mode is FE_TONEAREST.
- // However, in practice, this should not be a problem because most cases
- // use the default rounding mode FE_TONEAREST.
- // Note that we cannot implement the same behavior as the vectorized code
- // using std::round because it does rounding away from zero in halfway
- // cases.
- dst[i] = nearbyint(clipped);
+ for (; i < len; ++i) {
+ float transformed = qparams.zero_point + src[i] / qparams.scale;
+ float clipped = std::min(std::max(transformed, min_val), max_val);
+ // Not exactly the same behavior as the vectorized code.
+ // The vectorized code above always rounds to even in halfway cases
+ // (https://software.intel.com/en-us/node/523819), but std::nearbyint
+ // does the same only when the current rounding mode is FE_TONEAREST.
+ // However, in practice, this should not be a problem because most cases
+ // use the default rounding mode FE_TONEAREST.
+ // Note that we cannot implement the same behavior as the vectorized code
+ // using std::round because it does rounding away from zero in halfway
+ // cases.
+ dst[i] = nearbyint(clipped);
+ }
}
-#endif
}
// Instantiate QuantizeAvx2 for known datatypes
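
The comment kept in the scalar tail of QuantizeAvx2 above notes that _mm256_cvtps_epi32 always rounds halfway cases to even, while std::nearbyint only matches that under the default FE_TONEAREST rounding mode, and std::round breaks ties away from zero. A small illustration of the difference (plain C++, not repository code):

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  // 2.5f is exactly halfway between 2 and 3.
  std::fesetround(FE_TONEAREST);  // the usual default rounding mode
  std::printf("nearbyint(2.5) = %.1f\n", std::nearbyint(2.5f));  // 2.0, ties to even
  std::printf("round(2.5)     = %.1f\n", std::round(2.5f));      // 3.0, ties away from zero

  // Under a non-default mode, nearbyint stops matching the vectorized
  // cvtps conversion, which is exactly what the comment warns about.
  std::fesetround(FE_UPWARD);
  std::printf("nearbyint(2.5) = %.1f under FE_UPWARD\n", std::nearbyint(2.5f));  // 3.0
  return 0;
}
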
@@ -123,24 +124,24 @@ void FindMinMax(const float* a, float* min, float* max, int len) {
float temp_min = *a, temp_max = *a;
int i = 0;
-#ifdef __AVX2__
- __m256 min_v = _mm256_set1_ps(*a), max_v = _mm256_set1_ps(*a);
- constexpr int VLEN = 8;
- if (len >= VLEN) {
- for (; i < len / VLEN * VLEN; i += VLEN) {
- min_v = _mm256_min_ps(min_v, _mm256_loadu_ps(a + i));
- max_v = _mm256_max_ps(max_v, _mm256_loadu_ps(a + i));
- }
+ if (fbgemm::fbgemmHasAvx2Support()) {
+ __m256 min_v = _mm256_set1_ps(*a), max_v = _mm256_set1_ps(*a);
+ constexpr int VLEN = 8;
+ if (len >= VLEN) {
+ for (; i < len / VLEN * VLEN; i += VLEN) {
+ min_v = _mm256_min_ps(min_v, _mm256_loadu_ps(a + i));
+ max_v = _mm256_max_ps(max_v, _mm256_loadu_ps(a + i));
+ }
- float min_buf[VLEN], max_buf[VLEN];
- _mm256_storeu_ps(min_buf, min_v);
- _mm256_storeu_ps(max_buf, max_v);
- for (int j = 0; j < VLEN; ++j) {
- temp_min = std::min(temp_min, min_buf[j]);
- temp_max = std::max(temp_max, max_buf[j]);
+ float min_buf[VLEN], max_buf[VLEN];
+ _mm256_storeu_ps(min_buf, min_v);
+ _mm256_storeu_ps(max_buf, max_v);
+ for (int j = 0; j < VLEN; ++j) {
+ temp_min = std::min(temp_min, min_buf[j]);
+ temp_max = std::max(temp_max, max_buf[j]);
+ }
}
}
-#endif
for (; i < len; i++) {
temp_min = std::min(temp_min, a[i]);
@@ -153,15 +154,15 @@ void FindMinMax(const float* a, float* min, float* max, int len) {
////////////////////////////////////////////////////////////////////////////////
// Requantization (with floats)
-#ifdef __AVX2__
void RequantizeAvx2(
const int32_t* src,
uint8_t* dst,
int len,
const RequantizationParams& params) {
- DoNothing<> doNothingObj{};
- int32_t Bq_zero_point[] = { 0 };
- ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
+ if (fbgemm::fbgemmHasAvx2Support()) {
+ DoNothing<> doNothingObj{};
+ int32_t Bq_zero_point[] = { 0 };
+ ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
doNothingObj,
&params.real_multiplier,
params.target_qparams.zero_point,
@@ -171,7 +172,8 @@ void RequantizeAvx2(
nullptr, // col_offsets
nullptr, // bias
len); // ncol
- requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0);
+ requantizeObj.f<inst_set_t::avx2>(dst, src, { 0, 1, 0, len }, 0, 0);
+ }
}
void RequantizeFixedPointAvx2(
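
In the RequantizeFixedPointAvx2 hunk below, the retained comment explains a workaround for AVX2's missing 64-bit arithmetic right shift: bias the 64-bit products by 0x8000000000000000ULL, shift logically, then subtract the shifted bias. For values away from the int64 limits this reproduces the rounded arithmetic shift used in the scalar tail, (ab_64 + nudge) >> right_shift. A hedged scalar check of that equivalence (illustrative only, assumes right_shift >= 1; not repository code):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Scalar reference: rounded arithmetic right shift, as in the tail loop.
static int64_t arithShiftRound(int64_t ab_64, int right_shift) {
  const int64_t nudge = 1ll << (right_shift - 1);
  return (ab_64 + nudge) >> right_shift;
}

// Emulation of the vector trick: bias into the unsigned range so a logical
// shift behaves like an arithmetic one, then remove the shifted bias.
static int64_t logicalShiftTrick(int64_t ab_64, int right_shift) {
  const uint64_t pre_shift_nudge =
      (1ull << (right_shift - 1)) + 0x8000000000000000ULL;
  const uint64_t post_shift_nudge = 0x8000000000000000ULL >> right_shift;
  const uint64_t biased = static_cast<uint64_t>(ab_64) + pre_shift_nudge;
  return static_cast<int64_t>((biased >> right_shift) - post_shift_nudge);
}

int main() {
  const int right_shift = 10;
  for (int64_t x : {-123456789ll, -1024ll, -1ll, 0ll, 1ll, 513ll, 987654321ll}) {
    assert(arithShiftRound(x, right_shift) == logicalShiftTrick(x, right_shift));
  }
  std::printf("logical-shift emulation matches the rounded arithmetic shift\n");
  return 0;
}
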
@@ -179,24 +181,26 @@ void RequantizeFixedPointAvx2(
uint8_t* dst,
int len,
const RequantizationParams& params) {
- constexpr int VLEN = 8;
+ if (fbgemm::fbgemmHasAvx2Support())
+ {
+ constexpr int VLEN = 8;
- __m256i b = _mm256_set1_epi32(params.multiplier);
+ __m256i b = _mm256_set1_epi32(params.multiplier);
- // AVX2 doesn't support arithmetic right shift.
- // As a work around, we convert 64-bit multiplied results to uint64_t by
- // adding 0x8000000000000000ULL, logical right shift, and subtract by
- // (0x8000000000000000ULL >> right_shift).
- __m256i pre_shift_nudge = _mm256_set1_epi64x(
+ // AVX2 doesn't support arithmetic right shift.
+ // As a work around, we convert 64-bit multiplied results to uint64_t by
+ // adding 0x8000000000000000ULL, logical right shift, and subtract by
+ // (0x8000000000000000ULL >> right_shift).
+ __m256i pre_shift_nudge = _mm256_set1_epi64x(
(1ll << (params.right_shift - 1)) + 0x8000000000000000ULL);
- __m256i post_shift_nudge = _mm256_set1_epi64x(
+ __m256i post_shift_nudge = _mm256_set1_epi64x(
params.target_qparams.zero_point -
(0x8000000000000000ULL >> params.right_shift));
- __m256i min_v = _mm256_set1_epi32(numeric_limits<uint8_t>::min());
- __m256i max_v = _mm256_set1_epi32(numeric_limits<uint8_t>::max());
+ __m256i min_v = _mm256_set1_epi32(numeric_limits<uint8_t>::min());
+ __m256i max_v = _mm256_set1_epi32(numeric_limits<uint8_t>::max());
- __m256i shuffle_mask_v = _mm256_set_epi8(
+ __m256i shuffle_mask_v = _mm256_set_epi8(
0xff,
0xff,
0xff,
@@ -229,53 +233,53 @@ void RequantizeFixedPointAvx2(
0x08,
0x04,
0x00);
- __m256i permute_mask_v =
+ __m256i permute_mask_v =
_mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
- int i = 0;
- for (; i < len / VLEN * VLEN; i += VLEN) {
- __m256i a_v = _mm256_loadu_si256((const __m256i*)(src + i));
+ int i = 0;
+ for (; i < len / VLEN * VLEN; i += VLEN) {
+ __m256i a_v = _mm256_loadu_si256((const __m256i*)(src + i));
- // a = a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7
- // b = b0 | b1 | b3 | b3 | b4 | b5 | b6 | b7
- __m256i a_even_v = a_v;
- __m256i a_odd_v = _mm256_srli_si256(a_v, 4);
+ // a = a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7
+ // b = b0 | b1 | b3 | b3 | b4 | b5 | b6 | b7
+ __m256i a_even_v = a_v;
+ __m256i a_odd_v = _mm256_srli_si256(a_v, 4);
- __m256i ab_even_v = _mm256_mul_epi32(a_even_v, b);
- __m256i ab_odd_v = _mm256_mul_epi32(a_odd_v, b);
+ __m256i ab_even_v = _mm256_mul_epi32(a_even_v, b);
+ __m256i ab_odd_v = _mm256_mul_epi32(a_odd_v, b);
- __m256i even_rounded_v = _mm256_add_epi64(ab_even_v, pre_shift_nudge);
- __m256i odd_rounded_v = _mm256_add_epi64(ab_odd_v, pre_shift_nudge);
+ __m256i even_rounded_v = _mm256_add_epi64(ab_even_v, pre_shift_nudge);
+ __m256i odd_rounded_v = _mm256_add_epi64(ab_odd_v, pre_shift_nudge);
- __m256i even_result_v = _mm256_add_epi64(
+ __m256i even_result_v = _mm256_add_epi64(
_mm256_srli_epi64(even_rounded_v, params.right_shift),
post_shift_nudge);
- __m256i odd_result_v = _mm256_add_epi64(
+ __m256i odd_result_v = _mm256_add_epi64(
_mm256_srli_epi64(odd_rounded_v, params.right_shift), post_shift_nudge);
- odd_result_v = _mm256_slli_si256(odd_result_v, 4);
+ odd_result_v = _mm256_slli_si256(odd_result_v, 4);
- // even_result_v has numbers we want in its even 32-bit SIMD lanes, and
- // odd_result_v has numbers we want in its odd 32-bit SIMD lanes.
- // Use blend to combine them.
- __m256i result_v = _mm256_blend_epi32(even_result_v, odd_result_v, 0xaa);
- __m256i clipped_v =
+ // even_result_v has numbers we want in its even 32-bit SIMD lanes, and
+ // odd_result_v has numbers we want in its odd 32-bit SIMD lanes.
+ // Use blend to combine them.
+ __m256i result_v = _mm256_blend_epi32(even_result_v, odd_result_v, 0xaa);
+ __m256i clipped_v =
_mm256_max_epi32(min_v, _mm256_min_epi32(max_v, result_v));
- clipped_v = _mm256_shuffle_epi8(clipped_v, shuffle_mask_v);
- clipped_v = _mm256_permutevar8x32_epi32(clipped_v, permute_mask_v);
- *(int64_t*)(dst + i) = _mm256_extract_epi64(clipped_v, 0);
- }
+ clipped_v = _mm256_shuffle_epi8(clipped_v, shuffle_mask_v);
+ clipped_v = _mm256_permutevar8x32_epi32(clipped_v, permute_mask_v);
+ *(int64_t*)(dst + i) = _mm256_extract_epi64(clipped_v, 0);
+ }
- for (; i < len; ++i) {
- int64_t ab_64 =
+ for (; i < len; ++i) {
+ int64_t ab_64 =
static_cast<int64_t>(src[i]) * static_cast<int64_t>(params.multiplier);
- int64_t nudge = 1ll << std::max(0, params.right_shift - 1);
- int64_t quantized_down = params.target_qparams.zero_point +
+ int64_t nudge = 1ll << std::max(0, params.right_shift - 1);
+ int64_t quantized_down = params.target_qparams.zero_point +
((ab_64 + nudge) >> params.right_shift);
- dst[i] = std::min<int64_t>(std::max<int64_t>(quantized_down, 0l), 255l);
+ dst[i] = std::min<int64_t>(std::max<int64_t>(quantized_down, 0l), 255l);
+ }
}
}
-#endif
template <
bool A_SYMMETRIC,