Diffstat (limited to 'src/QuantUtilsAvx2.cc')
-rw-r--r-- | src/QuantUtilsAvx2.cc | 166
1 file changed, 166 insertions, 0 deletions
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index be12142..875c9e1 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -662,6 +662,165 @@ template <
     bool B_SYMMETRIC,
     QuantizationGranularity Q_GRAN,
     bool HAS_BIAS,
+    bool FUSE_RELU>
+void requantizeForFloatAvx2(
+    float* out,
+    const int32_t* inp,
+    const block_type_t& block,
+    int ld_out,
+    int ld_in,
+    const requantizationForFloatParams_t& r) {
+  // Adaptation of the implementation at QNNPACK/src/requantization/fp32-sse2.c
+  // using AVX2 instructions
+  int quant_param_idx = 0;
+  if (Q_GRAN == QuantizationGranularity::GROUP) {
+    int ncol_per_group = r.ncols / r.groups;
+    int g = block.col_start / ncol_per_group;
+    quant_param_idx = g;
+  }
+  __m256 multiplier_v = _mm256_set1_ps(r.A_scale * r.B_scale[quant_param_idx]);
+
+  assert(
+      (A_SYMMETRIC == (r.A_zero_point == 0)) &&
+      "A_SYMMETRIC == true if and only if A_zero_point == 0");
+  assert(
+      (B_SYMMETRIC ==
+       ((Q_GRAN == QuantizationGranularity::TENSOR && r.B_zero_point[0] == 0) ||
+        r.row_offsets == nullptr)) &&
+      "B_SYMMETRIC == true if and only if B_zero_point == 0 "
+      "or r.row_offsets == nullptr");
+  assert(
+      (HAS_BIAS == (r.bias != nullptr)) &&
+      "HAS_BIAS == true if and only if bias != nullptr");
+
+  __m256i A_zero_point_v = _mm256_set1_epi32(r.A_zero_point);
+
+  constexpr int VLEN = 8;
+  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+    // Scale row_offset with Bq_zero_point
+    int32_t row_offset = 0;
+    if (B_SYMMETRIC) {
+      row_offset = 0;
+    } else if (
+        Q_GRAN == QuantizationGranularity::TENSOR ||
+        Q_GRAN == QuantizationGranularity::GROUP) {
+      row_offset =
+          r.row_offsets[i - block.row_start] * r.B_zero_point[quant_param_idx];
+    } else {
+      assert(
+          Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+          "unknown quantization granularity");
+    }
+    __m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
+    int j = block.col_start;
+    for (; j < block.col_start + (block.col_size / VLEN * VLEN); j += VLEN) {
+      __m256i x_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+          inp + (i - block.row_start) * ld_in + (j - block.col_start)));
+
+      if (!A_SYMMETRIC) {
+        __m256i col_off_v = _mm256_mullo_epi32(
+            A_zero_point_v,
+            _mm256_loadu_si256(
+                reinterpret_cast<const __m256i*>(r.col_offsets + j)));
+        x_v = _mm256_sub_epi32(x_v, col_off_v);
+      }
+
+      if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+              _mm256_loadu_si256(
+                  reinterpret_cast<const __m256i*>(r.B_zero_point + j)));
+        }
+        x_v = _mm256_sub_epi32(x_v, row_offset_v);
+      }
+
+      __m256 x_scaled_v;
+      if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        x_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(x_v),
+            _mm256_mul_ps(
+                _mm256_set1_ps(r.A_scale), _mm256_loadu_ps(r.B_scale + j)));
+      } else {
+        x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+      }
+
+      if (HAS_BIAS) {
+        x_scaled_v = _mm256_add_ps(x_scaled_v, _mm256_loadu_ps(r.bias + j));
+      }
+      if (FUSE_RELU) {
+        x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+      }
+
+      _mm256_storeu_ps(out + i * ld_out + j, x_scaled_v);
+    } // j loop vectorized
+
+    int remainder = block.col_start + block.col_size - j;
+    if (remainder > 0) {
+      alignas(64) const int masks[8][8] = {
+          // NOTE: clang-format wants to use a different formatting but the
+          // current formatting should be easier to read.
+          {  0,  0,  0,  0,  0,  0,  0,  0, },
+          { -1,  0,  0,  0,  0,  0,  0,  0, },
+          { -1, -1,  0,  0,  0,  0,  0,  0, },
+          { -1, -1, -1,  0,  0,  0,  0,  0, },
+          { -1, -1, -1, -1,  0,  0,  0,  0, },
+          { -1, -1, -1, -1, -1,  0,  0,  0, },
+          { -1, -1, -1, -1, -1, -1,  0,  0, },
+          { -1, -1, -1, -1, -1, -1, -1,  0, },
+      };
+      __m256i mask_v = _mm256_load_si256(
+          reinterpret_cast<const __m256i*>(masks[remainder]));
+
+      __m256i x_v = _mm256_maskload_epi32(
+          inp + (i - block.row_start) * ld_in + (j - block.col_start),
+          mask_v);
+
+      if (!A_SYMMETRIC) {
+        __m256i col_off_v = _mm256_mullo_epi32(
+            A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v));
+        x_v = _mm256_sub_epi32(x_v, col_off_v);
+      }
+
+      if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+              _mm256_maskload_epi32(r.B_zero_point + j, mask_v));
+        }
+        x_v = _mm256_sub_epi32(x_v, row_offset_v);
+      }
+
+      __m256 x_scaled_v;
+      if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        x_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(x_v),
+            _mm256_mul_ps(
+                _mm256_set1_ps(r.A_scale),
+                _mm256_maskload_ps(r.B_scale + j, mask_v)));
+      } else {
+        x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+      }
+
+      if (HAS_BIAS) {
+        x_scaled_v =
+            _mm256_add_ps(x_scaled_v, _mm256_maskload_ps(r.bias + j, mask_v));
+      }
+      if (FUSE_RELU) {
+        x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+      }
+
+      _mm256_maskstore_ps(out + i * ld_out + j, mask_v, x_scaled_v);
+    } // j loop remainder
+  } // i loop
+}
+
+template <
+    bool A_SYMMETRIC,
+    bool B_SYMMETRIC,
+    QuantizationGranularity Q_GRAN,
+    bool HAS_BIAS,
     bool FUSE_RELU,
     int C_PER_G>
 void requantizeOutputProcessingGConvAvx2(
@@ -1120,6 +1279,13 @@ void requantizeOutputProcessingGConvAvx2(
       int ld_out,                                                          \
       int ld_in,                                                           \
       const requantizationParams_t& r);                                    \
+  template void requantizeForFloatAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU>(  \
+      float* out,                                                          \
+      const int32_t* inp,                                                  \
+      const block_type_t& block,                                           \
+      int ld_out,                                                          \
+      int ld_in,                                                           \
+      const requantizationForFloatParams_t& r);                            \
   template void                                                            \
   requantizeOutputProcessingGConvAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU, 4>( \
       uint8_t * out,                                                       \
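For reference, each output element produced by the kernel above reduces to the affine dequantization identity

  out[i][j] = A_scale * B_scale * (inp[i][j] - A_zero_point * col_offsets[j] - B_zero_point * row_offsets[i]) + bias[j]

followed by max(0, .) when FUSE_RELU is set, with B_scale and B_zero_point indexed per tensor, per group, or per output channel depending on Q_GRAN. The scalar sketch below shows the same per-element math without intrinsics; it is a hypothetical standalone helper with illustrative names, not part of this patch or of the FBGEMM API:

#include <algorithm>
#include <cstdint>

// Scalar reference for one output element (hypothetical helper; names are
// illustrative). Undo the zero-point contributions to the int32 accumulator,
// scale by A_scale * B_scale, add the float bias, and clamp at zero when
// ReLU is fused.
float requantizeForFloatRef(
    int32_t acc, // int32 GEMM accumulator for this element
    int32_t A_zero_point, // activation zero point
    int32_t B_zero_point, // weight zero point for this column's quant group
    int32_t row_offset, // sum of the A row that produced this element
    int32_t col_offset, // sum of the B column that produced this element
    float A_scale, // activation scale
    float B_scale, // weight scale for this column's quant group
    float bias, // float bias, or 0.0f if there is none
    bool fuse_relu) {
  int32_t corrected =
      acc - A_zero_point * col_offset - B_zero_point * row_offset;
  float x = static_cast<float>(corrected) * (A_scale * B_scale) + bias;
  return fuse_relu ? std::max(0.0f, x) : x;
}

The AVX2 kernel evaluates this for eight columns at a time; the masks[remainder] table lets the final partial vector of each row reuse the same code path, since _mm256_maskload_epi32 and _mm256_maskstore_ps only touch lanes whose mask element has its top bit set (-1 here), leaving out-of-range columns untouched.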