github.com/marian-nmt/FBGEMM.git
Diffstat (limited to 'src/QuantUtilsAvx2.cc')

 src/QuantUtilsAvx2.cc | 166 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+), 0 deletions(-)
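
For reference, the kernel added below vectorizes the following per-element
requantize-to-float math. This scalar sketch is illustrative only (a
hypothetical helper, not part of the patch); it mirrors the TENSOR-granularity
path with bias and fused ReLU:

    #include <algorithm>
    #include <cstdint>

    // Hypothetical scalar reference for one output element. acc is the int32
    // accumulator; col_offset and row_offset are the precomputed sums that the
    // AVX2 kernel reads from r.col_offsets and r.row_offsets.
    inline float requantizeForFloatScalarRef(
        std::int32_t acc,
        std::int32_t col_offset,
        std::int32_t row_offset,
        std::int32_t A_zero_point,
        std::int32_t B_zero_point,
        float A_scale,
        float B_scale,
        float bias) {
      std::int32_t x =
          acc - A_zero_point * col_offset - B_zero_point * row_offset;
      float y = A_scale * B_scale * static_cast<float>(x) + bias;
      return std::max(0.0f, y); // FUSE_RELU
    }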
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index be12142..875c9e1 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -662,6 +662,165 @@ template <
bool B_SYMMETRIC,
QuantizationGranularity Q_GRAN,
bool HAS_BIAS,
+ bool FUSE_RELU>
+void requantizeForFloatAvx2(
+ float* out,
+ const int32_t* inp,
+ const block_type_t& block,
+ int ld_out,
+ int ld_in,
+ const requantizationForFloatParams_t& r) {
+  // Adaptation of the implementation in QNNPACK/src/requantization/fp32-sse2.c,
+  // rewritten using AVX2 instructions
+ int quant_param_idx = 0;
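+  // For GROUP granularity, B's scale and zero point vary per column group;
+  // select the entry for the group that contains this block's columns.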
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ int ncol_per_group = r.ncols / r.groups;
+ int g = block.col_start / ncol_per_group;
+ quant_param_idx = g;
+ }
+ __m256 multiplier_v = _mm256_set1_ps(r.A_scale * r.B_scale[quant_param_idx]);
+
+ assert(
+ (A_SYMMETRIC == (r.A_zero_point == 0)) &&
+ "A_SYMMETRIC == true if and only if A_zero_point == 0");
+ assert(
+ (B_SYMMETRIC ==
+ ((Q_GRAN == QuantizationGranularity::TENSOR && r.B_zero_point[0] == 0) ||
+ r.row_offsets == nullptr)) &&
+ "B_SYMMETRIC == true if and only if B_zero_point == 0 "
+ "or r.row_offsets == nullptr");
+ assert(
+ (HAS_BIAS == (r.bias != nullptr)) &&
+ "HAS_BIAS == true if and only if bias != nullptr");
+
+ __m256i A_zero_point_v = _mm256_set1_epi32(r.A_zero_point);
+
+ constexpr int VLEN = 8;
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ // Scale row_offset with Bq_zero_point
+ int32_t row_offset = 0;
+ if (B_SYMMETRIC) {
+ row_offset = 0;
+ } else if (
+ Q_GRAN == QuantizationGranularity::TENSOR ||
+ Q_GRAN == QuantizationGranularity::GROUP) {
+ row_offset =
+ r.row_offsets[i - block.row_start] * r.B_zero_point[quant_param_idx];
+ } else {
+ assert(
+ Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+ "unknown quantization granularity");
+ }
+ __m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
+ int j = block.col_start;
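+    // Main vectorized loop: process VLEN (8) int32 accumulators per iteration.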
+ for (; j < block.col_start + (block.col_size / VLEN * VLEN); j += VLEN) {
+ __m256i x_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+ inp + (i - block.row_start) * ld_in + (j - block.col_start)));
+
+ if (!A_SYMMETRIC) {
+ __m256i col_off_v = _mm256_mullo_epi32(
+ A_zero_point_v,
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(r.col_offsets + j)));
+ x_v = _mm256_sub_epi32(x_v, col_off_v);
+ }
+
+ if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(r.B_zero_point + j)));
+ }
+ x_v = _mm256_sub_epi32(x_v, row_offset_v);
+ }
+
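+      // Convert the accumulators to float and scale by A_scale * B_scale;
+      // with OUT_CHANNEL granularity, B_scale varies per column, so it is
+      // loaded per lane instead of broadcast.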
+ __m256 x_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v),
+ _mm256_mul_ps(
+ _mm256_set1_ps(r.A_scale), _mm256_loadu_ps(r.B_scale + j)));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ }
+
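+      // bias is a per-column float added after scaling.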
+ if (HAS_BIAS) {
+ x_scaled_v = _mm256_add_ps(x_scaled_v, _mm256_loadu_ps(r.bias + j));
+ }
+ if (FUSE_RELU) {
+ x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+ }
+
+ _mm256_storeu_ps(out + i * ld_out + j, x_scaled_v);
+ } // j loop vectorized
+
+ int remainder = block.col_start + block.col_size - j;
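+    // Tail: handle the last (block.col_size % VLEN) columns with masked
+    // loads/stores; masks[r] sets the first r lanes to -1 (all bits on), so
+    // only the remaining columns are touched.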
+ if (remainder > 0) {
+ alignas(64) const int masks[8][8] = {
+ // NOTE: clang-format wants to use a different formatting but the
+ // current formatting should be easier to read.
+ { 0, 0, 0, 0, 0, 0, 0, 0, },
+ { -1, 0, 0, 0, 0, 0, 0, 0, },
+ { -1, -1, 0, 0, 0, 0, 0, 0, },
+ { -1, -1, -1, 0, 0, 0, 0, 0, },
+ { -1, -1, -1, -1, 0, 0, 0, 0, },
+ { -1, -1, -1, -1, -1, 0, 0, 0, },
+ { -1, -1, -1, -1, -1, -1, 0, 0, },
+ { -1, -1, -1, -1, -1, -1, -1, 0, },
+ };
+ __m256i mask_v = _mm256_load_si256(
+ reinterpret_cast<const __m256i*>(masks[remainder]));
+
+ __m256i x_v = _mm256_maskload_epi32(
+ inp + (i - block.row_start) * ld_in + (j - block.col_start),
+ mask_v);
+
+ if (!A_SYMMETRIC) {
+ __m256i col_off_v = _mm256_mullo_epi32(
+ A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v));
+ x_v = _mm256_sub_epi32(x_v, col_off_v);
+ }
+
+ if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+ _mm256_maskload_epi32(r.B_zero_point + j, mask_v));
+ }
+ x_v = _mm256_sub_epi32(x_v, row_offset_v);
+ }
+
+ __m256 x_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v),
+ _mm256_mul_ps(
+ _mm256_set1_ps(r.A_scale),
+ _mm256_maskload_ps(r.B_scale + j, mask_v)));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ }
+
+ if (HAS_BIAS) {
+ x_scaled_v =
+ _mm256_add_ps(x_scaled_v, _mm256_maskload_ps(r.bias + j, mask_v));
+ }
+ if (FUSE_RELU) {
+ x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+ }
+
+ _mm256_maskstore_ps(out + i * ld_out + j, mask_v, x_scaled_v);
+ } // j loop remainder
+ } // i loop
+}
+
+template <
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ QuantizationGranularity Q_GRAN,
+ bool HAS_BIAS,
bool FUSE_RELU,
int C_PER_G>
void requantizeOutputProcessingGConvAvx2(
@@ -1120,6 +1279,13 @@ void requantizeOutputProcessingGConvAvx2(
int ld_out, \
int ld_in, \
const requantizationParams_t& r); \
+ template void requantizeForFloatAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU>( \
+ float* out, \
+ const int32_t* inp, \
+ const block_type_t& block, \
+ int ld_out, \
+ int ld_in, \
+ const requantizationForFloatParams_t& r); \
template void \
requantizeOutputProcessingGConvAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU, 4>( \
uint8_t * out, \
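
The second hunk above hooks the new function into the existing
explicit-instantiation macro, so every (A_SYM, B_SYM, Q_GRAN, BIAS, RELU)
combination is emitted into this translation unit. A hedged usage sketch
follows; the block geometry, strides, and template arguments are illustrative
only, and out/inp stand for caller-allocated float/int32 buffers:

    // Hypothetical call site: requantize a 4x10 block with TENSOR granularity,
    // bias, and fused ReLU. col_size = 10 exercises both the 8-wide vectorized
    // loop and the masked 2-column remainder path.
    block_type_t block{
        /*row_start=*/0, /*row_size=*/4, /*col_start=*/0, /*col_size=*/10};
    requantizationForFloatParams_t r;
    // ... fill r.A_scale, r.B_scale, r.A_zero_point, r.B_zero_point,
    //     r.row_offsets, r.col_offsets, r.bias, r.ncols, r.groups ...
    requantizeForFloatAvx2<
        /*A_SYMMETRIC=*/false,
        /*B_SYMMETRIC=*/false,
        QuantizationGranularity::TENSOR,
        /*HAS_BIAS=*/true,
        /*FUSE_RELU=*/true>(out, inp, block, /*ld_out=*/10, /*ld_in=*/10, r);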