github.com/marian-nmt/FBGEMM.git
author     Daya S Khudia <dskhudia@fb.com>    2018-12-05 23:37:26 +0300
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2018-12-05 23:55:02 +0300
commit     a9198891b103a75c21b140eea9c89c2276431da4
tree       dab460556db3f8d6bdc4dd2ea66d83125d2c8f77
parent     f22e24d3dd9dad2d780b2ed822ea52f5fb5778ce
clean up PackAWithQuantRowOffset from avx2 intrinsics (#36)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/36

Isolate avx2 usage from quantization fusion with packing.

Reviewed By: jianyuh

Differential Revision: D13311108

fbshipit-source-id: 3be39aa9c84efb6b4f2cc06d7abcab97c232098b
-rw-r--r--  src/ExecuteKernel.cc            |  1
-rw-r--r--  src/PackAWithQuantRowOffset.cc  | 96
-rw-r--r--  src/QuantUtilsAvx2.cc           | 47
-rw-r--r--  src/Utils.cc                    |  1
4 files changed, 59 insertions(+), 86 deletions(-)
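
For context, the operation being consolidated into QuantizeAvx2 is plain affine quantization of floats to uint8, exactly what the removed scalar tail loop in PackAWithQuantRowOffset.cc computes. A minimal standalone restatement of that scalar semantics (QuantizeScalar is a hypothetical name for illustration, not FBGEMM API):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar reference for the per-element operation the AVX2 path vectorizes:
//   q = clamp(nearbyint(x / scale + zero_point), 0, 255)
inline std::uint8_t QuantizeScalar(float x, float scale, std::int32_t zero_point) {
  float transformed = x / scale + zero_point;
  float clipped = std::min(std::max(transformed, 0.0f), 255.0f);
  return static_cast<std::uint8_t>(std::nearbyint(clipped));
}
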
diff --git a/src/ExecuteKernel.cc b/src/ExecuteKernel.cc
index 3bc7e36..d25179e 100644
--- a/src/ExecuteKernel.cc
+++ b/src/ExecuteKernel.cc
@@ -5,7 +5,6 @@
* LICENSE file in the root directory of this source tree.
*/
#include "ExecuteKernel.h"
-#include <immintrin.h>
#include "fbgemm/Fbgemm.h"
#include "fbgemm/Utils.h"
diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc
index c1e5b07..98e862b 100644
--- a/src/PackAWithQuantRowOffset.cc
+++ b/src/PackAWithQuantRowOffset.cc
@@ -12,6 +12,8 @@
#include <iostream>
#include <stdexcept>
#include "fbgemm/Fbgemm.h"
+#include "fbgemm/QuantUtilsAvx2.h"
+#include "OptimizedKernelsAvx2.h"
namespace fbgemm {
@@ -108,92 +110,30 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) {
tr ? smat_transposed : smat_ + block.row_start * ld_ + block.col_start;
int32_t ld_temp = tr ? block.col_size : ld_;
-#if defined(__AVX2__) && defined(__FMA__)
- constexpr int VLEN = 8;
- __m256 inverse_scale_v = _mm256_set1_ps(1.0f / scale_);
- __m256i shuffle_mask_v = _mm256_set_epi8(
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0x0c,
- 0x08,
- 0x04,
- 0x00,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0xff,
- 0x0c,
- 0x08,
- 0x04,
- 0x00);
- __m256i permute_mask_v =
- _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
-#endif
+ static_assert(
+ std::is_same<T, uint8_t>::value,
+ "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t");
+
+ // Only scale and zero points are used in QuantizeAvx2
+ TensorQuantizationParams qparams;
+ qparams.scale = scale_;
+ qparams.zero_point = zero_pt_;
for (int i = 0; i < block.row_size; ++i) {
+ QuantizeAvx2(
+ smat_temp + i * ld_temp,
+ out + i * BaseType::blockColSize(),
+ block.col_size,
+ qparams);
int32_t row_sum = row_offset_acc ? row_offset_buf[i] : 0;
- int j = 0;
-#if defined(__AVX2__) && defined(__FMA__)
- static_assert(
- std::is_same<T, uint8_t>::value,
- "PackAWithQuantRowOffset<T, accT>::pack only works for T == uint8_t");
- for (; j < block.col_size / VLEN * VLEN; j += VLEN) {
- __m256 val_v = _mm256_loadu_ps(smat_temp + i * ld_temp + j);
- __m256 transformed_v = _mm256_fmadd_ps(
- val_v, inverse_scale_v, _mm256_set1_ps(zero_pt_));
- __m256 clipped_v = _mm256_max_ps(
- _mm256_set1_ps(std::numeric_limits<uint8_t>::min()),
- _mm256_min_ps(
- transformed_v,
- _mm256_set1_ps(std::numeric_limits<uint8_t>::max())));
- __m256i res_v = _mm256_cvtps_epi32(clipped_v);
-
- // An instruction sequence to save 8 32-bit integers as 8 8-bit integers
- res_v = _mm256_shuffle_epi8(res_v, shuffle_mask_v);
- res_v = _mm256_permutevar8x32_epi32(res_v, permute_mask_v);
- _mm_storel_epi64(
- reinterpret_cast<__m128i*>(out + i * BaseType::blockColSize() + j),
- _mm256_castsi256_si128(res_v));
+ row_sum += reduceAvx2(out + i * BaseType::blockColSize(), block.col_size);
+ row_offset_buf[i] = row_sum;
- for (int j2 = j; j2 < j + VLEN; ++j2) {
- row_sum += out[i * BaseType::blockColSize() + j2];
- }
- }
-#endif
- for (; j < block.col_size; ++j) {
- float val = smat_temp[i * ld_temp + j];
- float transformed = val / scale_ + zero_pt_;
- float clipped = std::min<float>(
- std::max<float>(transformed, std::numeric_limits<uint8_t>::min()),
- std::numeric_limits<uint8_t>::max());
- T res = nearbyint(clipped);
- row_sum += res;
- out[i * BaseType::blockColSize() + j] = res;
- }
// zero fill
// Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
- for (; j < block_p.col_size; ++j) {
+ for (int j = block.col_start + block.col_size; j < block_p.col_size; ++j) {
out[i * BaseType::blockColSize() + j] = 0;
}
- row_offset_buf[i] = row_sum;
}
}
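
After this refactor, the body of pack() reduces to a simple per-row pipeline: quantize the row with QuantizeAvx2, accumulate the row sum with reduceAvx2, and zero-fill the padding out to the packed block width. A simplified sketch of that flow (packRow is a hypothetical helper; the real loop above also folds in a previously accumulated row offset and block.col_start bookkeeping):

// Simplified per-row flow, assuming QuantizeAvx2/reduceAvx2 as in this diff.
void packRow(const float* src_row, std::uint8_t* dst_row,
             int col_size, int block_col_size,
             const TensorQuantizationParams& qparams,
             std::int32_t* row_offset) {
  QuantizeAvx2(src_row, dst_row, col_size, qparams); // fused quantize + store
  *row_offset = reduceAvx2(dst_row, col_size);       // sum of the quantized row
  for (int j = col_size; j < block_col_size; ++j) {  // zero vs zero_pt fill:
    dst_row[j] = 0;                                  // see PackAMatrix.cc comment
  }
}
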
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index 60a33f1..38a809f 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -18,7 +18,6 @@ using namespace std;
////////////////////////////////////////////////////////////////////////////////
// Utility functions
-// FIXME: code duplication with PackAWithQuantRowOffset
void QuantizeAvx2(
const float* src,
uint8_t* dst,
@@ -28,6 +27,41 @@ void QuantizeAvx2(
constexpr int VLEN = 8;
std::size_t i = 0;
__m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale);
+ __m256i shuffle_mask_v = _mm256_set_epi8(
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x0c,
+ 0x08,
+ 0x04,
+ 0x00,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x0c,
+ 0x08,
+ 0x04,
+ 0x00);
+ __m256i permute_mask_v =
+ _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
for (; i < len / VLEN * VLEN; i += VLEN) {
__m256 src_v = _mm256_loadu_ps(src + i);
__m256 transformed_v = _mm256_fmadd_ps(
@@ -36,11 +70,12 @@ void QuantizeAvx2(
_mm256_max_ps(transformed_v, _mm256_set1_ps(0.f)),
_mm256_set1_ps(255.f));
__m256i rounded_v = _mm256_cvtps_epi32(clipped_v);
- alignas(64) std::int32_t temp_int32[VLEN];
- _mm256_store_si256((__m256i*)temp_int32, rounded_v);
- for (int j = 0; j < VLEN; ++j) {
- dst[i + j] = temp_int32[j];
- }
+
+ // An instruction sequence to save 8 32-bit integers as 8 8-bit integers
+ rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v);
+ rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask_v);
+ _mm_storel_epi64(
+ reinterpret_cast<__m128i*>(dst + i), _mm256_castsi256_si128(rounded_v));
}
for (; i < len; ++i) {
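
The shuffle/permute pair moved into QuantizeAvx2 here implements a 32-bit-to-8-bit narrowing: within each 128-bit lane, _mm256_shuffle_epi8 gathers the low byte of every 32-bit element (lane bytes 0, 4, 8, 12) into the lane's low dword while 0xff mask bytes zero the rest, then _mm256_permutevar8x32_epi32 places the upper lane's dword next to the lower lane's so a single _mm_storel_epi64 writes all 8 bytes. A self-contained demonstration of the same trick (not FBGEMM code; compile with -mavx2, and it assumes each 32-bit value is already in [0, 255]):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const std::int32_t in[8] = {0, 1, 2, 3, 252, 253, 254, 255};
  __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));

  // Per 128-bit lane: pick byte 0 of each 32-bit element (bytes 0, 4, 8, 12)
  // into the lane's low 4 bytes; -1 (0xff) zeroes the remaining bytes.
  __m256i shuffle_mask = _mm256_set_epi8(
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
  v = _mm256_shuffle_epi8(v, shuffle_mask);

  // Move dword 4 (low dword of the upper lane) next to dword 0 so the
  // 8 narrowed bytes are contiguous in the low 64 bits of the register.
  __m256i permute_mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 4, 0);
  v = _mm256_permutevar8x32_epi32(v, permute_mask);

  std::uint8_t out[8];
  _mm_storel_epi64(reinterpret_cast<__m128i*>(out), _mm256_castsi256_si128(v));
  for (int i = 0; i < 8; ++i) {
    std::printf("%u ", out[i]);  // prints: 0 1 2 3 252 253 254 255
  }
  return 0;
}
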
diff --git a/src/Utils.cc b/src/Utils.cc
index 3034f5b..88d029d 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -7,7 +7,6 @@
#include "fbgemm/Utils.h"
#include "TransposeUtils.h"
#include <cpuinfo.h>
-#include <immintrin.h>
#include <cassert>
#include <cinttypes>
#include <cmath>