diff options
author | Daya S Khudia <dskhudia@fb.com> | 2018-12-05 23:37:26 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-12-05 23:55:02 +0300 |
commit | a9198891b103a75c21b140eea9c89c2276431da4 (patch) | |
tree | dab460556db3f8d6bdc4dd2ea66d83125d2c8f77 | |
parent | f22e24d3dd9dad2d780b2ed822ea52f5fb5778ce (diff) |
clean up PackAWithQuantRowOffset from avx2 intrinsics (#36)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/36
Isolate avx2 usage from quantization fusion with packing.
Reviewed By: jianyuh
Differential Revision: D13311108
fbshipit-source-id: 3be39aa9c84efb6b4f2cc06d7abcab97c232098b
-rw-r--r-- | src/ExecuteKernel.cc | 1 | ||||
-rw-r--r-- | src/PackAWithQuantRowOffset.cc | 96 | ||||
-rw-r--r-- | src/QuantUtilsAvx2.cc | 47 | ||||
-rw-r--r-- | src/Utils.cc | 1 |
4 files changed, 59 insertions, 86 deletions
diff --git a/src/ExecuteKernel.cc b/src/ExecuteKernel.cc index 3bc7e36..d25179e 100644 --- a/src/ExecuteKernel.cc +++ b/src/ExecuteKernel.cc @@ -5,7 +5,6 @@ * LICENSE file in the root directory of this source tree. */ #include "ExecuteKernel.h" -#include <immintrin.h> #include "fbgemm/Fbgemm.h" #include "fbgemm/Utils.h" diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index c1e5b07..98e862b 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -12,6 +12,8 @@ #include <iostream> #include <stdexcept> #include "fbgemm/Fbgemm.h" +#include "fbgemm/QuantUtilsAvx2.h" +#include "OptimizedKernelsAvx2.h" namespace fbgemm { @@ -108,92 +110,30 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { tr ? smat_transposed : smat_ + block.row_start * ld_ + block.col_start; int32_t ld_temp = tr ? block.col_size : ld_; -#if defined(__AVX2__) && defined(__FMA__) - constexpr int VLEN = 8; - __m256 inverse_scale_v = _mm256_set1_ps(1.0f / scale_); - __m256i shuffle_mask_v = _mm256_set_epi8( - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00); - __m256i permute_mask_v = - _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); -#endif + static_assert( + std::is_same<T, uint8_t>::value, + "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t"); + + // Only scale and zero points are used in QuantizeAvx2 + TensorQuantizationParams qparams; + qparams.scale = scale_; + qparams.zero_point = zero_pt_; for (int i = 0; i < block.row_size; ++i) { + QuantizeAvx2( + smat_temp + i * ld_temp, + out + i * BaseType::blockColSize(), + block.col_size, + qparams); int32_t row_sum = row_offset_acc ? row_offset_buf[i] : 0; - int j = 0; -#if defined(__AVX2__) && defined(__FMA__) - static_assert( - std::is_same<T, uint8_t>::value, - "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t"); - for (; j < block.col_size / VLEN * VLEN; j += VLEN) { - __m256 val_v = _mm256_loadu_ps(smat_temp + i * ld_temp + j); - __m256 transformed_v = _mm256_fmadd_ps( - val_v, inverse_scale_v, _mm256_set1_ps(zero_pt_)); - __m256 clipped_v = _mm256_max_ps( - _mm256_set1_ps(std::numeric_limits<uint8_t>::min()), - _mm256_min_ps( - transformed_v, - _mm256_set1_ps(std::numeric_limits<uint8_t>::max()))); - __m256i res_v = _mm256_cvtps_epi32(clipped_v); - - // An instruction sequence to save 8 32-bit integers as 8 8-bit integers - res_v = _mm256_shuffle_epi8(res_v, shuffle_mask_v); - res_v = _mm256_permutevar8x32_epi32(res_v, permute_mask_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(out + i * BaseType::blockColSize() + j), - _mm256_castsi256_si128(res_v)); + row_sum += reduceAvx2(out + i * BaseType::blockColSize(), block.col_size); + row_offset_buf[i] = row_sum; - for (int j2 = j; j2 < j + VLEN; ++j2) { - row_sum += out[i * BaseType::blockColSize() + j2]; - } - } -#endif - for (; j < block.col_size; ++j) { - float val = smat_temp[i * ld_temp + j]; - float transformed = val / scale_ + zero_pt_; - float clipped = std::min<float>( - std::max<float>(transformed, std::numeric_limits<uint8_t>::min()), - std::numeric_limits<uint8_t>::max()); - T res = nearbyint(clipped); - row_sum += res; - out[i * BaseType::blockColSize() + j] = res; - } // zero fill // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. - for (; j < block_p.col_size; ++j) { + for (int j = block.col_start + block.col_size; j < block_p.col_size; ++j) { out[i * BaseType::blockColSize() + j] = 0; } - row_offset_buf[i] = row_sum; } } diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index 60a33f1..38a809f 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -18,7 +18,6 @@ using namespace std; //////////////////////////////////////////////////////////////////////////////// // Utility functions -// FIXME: code duplication with PackAWithQuantRowOffset void QuantizeAvx2( const float* src, uint8_t* dst, @@ -28,6 +27,41 @@ void QuantizeAvx2( constexpr int VLEN = 8; std::size_t i = 0; __m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale); + __m256i shuffle_mask_v = _mm256_set_epi8( + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0c, + 0x08, + 0x04, + 0x00, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0c, + 0x08, + 0x04, + 0x00); + __m256i permute_mask_v = + _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); for (; i < len / VLEN * VLEN; i += VLEN) { __m256 src_v = _mm256_loadu_ps(src + i); __m256 transformed_v = _mm256_fmadd_ps( @@ -36,11 +70,12 @@ void QuantizeAvx2( _mm256_max_ps(transformed_v, _mm256_set1_ps(0.f)), _mm256_set1_ps(255.f)); __m256i rounded_v = _mm256_cvtps_epi32(clipped_v); - alignas(64) std::int32_t temp_int32[VLEN]; - _mm256_store_si256((__m256i*)temp_int32, rounded_v); - for (int j = 0; j < VLEN; ++j) { - dst[i + j] = temp_int32[j]; - } + + // An instruction sequence to save 8 32-bit integers as 8 8-bit integers + rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v); + rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v); + _mm_storel_epi64( + reinterpret_cast<__m128i*>(dst + i), _mm256_castsi256_si128(rounded_v)); } for (; i < len; ++i) { diff --git a/src/Utils.cc b/src/Utils.cc index 3034f5b..88d029d 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -7,7 +7,6 @@ #include "fbgemm/Utils.h" #include "TransposeUtils.h" #include <cpuinfo.h> -#include <immintrin.h> #include <cassert> #include <cinttypes> #include <cmath> |