diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-03-13 06:14:32 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-03-13 06:17:49 +0300 |
commit | 6011ce3b0c1fccee549e85b37e475c7a734ad742 (patch) | |
tree | 7089177b6c7da36c2582da1cf9b42eca9dfb2ea7 | |
parent | 50b43162fd1742122d01f2704945c78f13e0d73e (diff) |
optimize requantize for float out processing (#85)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/85
Optimizing performance of output processing when output is dequantized right away.
Reviewed By: protonu
Differential Revision: D14433141
fbshipit-source-id: f99a8d82000c43e554461acf036462a4e8f7e300
-rw-r--r-- | bench/PackedFloatInOutBenchmark.cc | 42 | ||||
-rw-r--r-- | include/fbgemm/OutputProcessing-inl.h | 274 | ||||
-rw-r--r-- | include/fbgemm/QuantUtilsAvx2.h | 14 | ||||
-rw-r--r-- | include/fbgemm/UtilsAvx2.h | 15 | ||||
-rw-r--r-- | src/QuantUtilsAvx2.cc | 166 |
5 files changed, 415 insertions, 96 deletions
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc index 1397125..66ca67e 100644 --- a/bench/PackedFloatInOutBenchmark.cc +++ b/bench/PackedFloatInOutBenchmark.cc @@ -76,8 +76,21 @@ void performance_test() { constexpr int NWARMUP = 4; constexpr int NITER = 10; +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + cout << "WARNING: the timer may be inaccurate when used by multiple threads." + << endl; + cout << "M, " + << "N, " + << "K, " + << "Packing (ms), " + << "Kernel (ms), " + << "Postprocessing (ms), " + << "Total (ms), " + << "GOPs" << endl; +#else cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(18) << "Type, " << setw(5) << "GOPS" << endl; +#endif chrono::time_point<chrono::high_resolution_clock> start, end; for (auto shape : shapes) { @@ -203,7 +216,23 @@ void performance_test() { ttot = 0; type = "FBGEMM_i8_acc32"; +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + double total_packing_time = 0.0; + double total_computing_time = 0.0; + double total_kernel_time = 0.0; + double total_postprocessing_time = 0.0; + double total_run_time = 0.0; +#endif + for (auto i = 0; i < NWARMUP + NITER; ++i) { +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + packing_time = 0.0; + computing_time = 0.0; + kernel_time = 0.0; + postprocessing_time = 0.0; + run_time = 0.0; +#endif + llc_flush(llc); start = chrono::high_resolution_clock::now(); fbgemmPacked( @@ -220,6 +249,13 @@ void performance_test() { if (i >= NWARMUP) { auto dur = chrono::duration_cast<chrono::nanoseconds>(end - start); ttot += dur.count(); +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + total_packing_time += packing_time; + total_computing_time += computing_time; + total_kernel_time += kernel_time; + total_postprocessing_time += postprocessing_time; + total_run_time += run_time; +#endif } } ((volatile char*)(llc.data())); @@ -237,6 +273,12 @@ void performance_test() { // row_offsets.size(), 5); // printMatrix(matrix_op_t::NoTranspose, Cfp32_fb.data(), // m, n, n, "C fb fp32"); +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + cout << total_packing_time / (double)NITER / 1e6 << ", " + << total_kernel_time / (double)NITER / 1e6 << ", " + << total_postprocessing_time / (double)NITER / 1e6 << ", " + << total_run_time / (double)NITER / 1e6 << ", "; +#endif cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k << ", " << setw(16) << type << ", " << setw(5) << fixed << setw(5) << setprecision(1) << nops / ttot << endl; diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h index 9485b18..d984c60 100644 --- a/include/fbgemm/OutputProcessing-inl.h +++ b/include/fbgemm/OutputProcessing-inl.h @@ -77,7 +77,7 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( block.col_size <= ncol_per_group && "ReQuantizeOutput should be called at most 1 group at a time."); int g = block.col_start / ncol_per_group; - if (instSet == inst_set_t::anyarch) { + if (instSet == inst_set_t::anyarch || !std::is_same<outT, uint8_t>::value) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)]; @@ -111,88 +111,84 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( } } } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) { - if (std::is_same<outT, uint8_t>::value) { - bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR && - Bq_zero_point_[0] == 0) || - q_row_offsets_ == nullptr; + bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR && + Bq_zero_point_[0] == 0) || + q_row_offsets_ == nullptr; - requantizationParams_t r = {Aq_zero_point_, - Bq_zero_point_, - C_zero_point_, - C_multiplier_, - q_row_offsets_, - q_col_offsets_, - bias_, - ncols_, - groups_}; + requantizationParams_t r = {Aq_zero_point_, + Bq_zero_point_, + C_zero_point_, + C_multiplier_, + q_row_offsets_, + q_col_offsets_, + bias_, + ncols_, + groups_}; - if (Aq_zero_point_ == 0) { - if (b_symmetric) { - if (bias_ == nullptr) { - requantizeOutputProcessingAvx2< - true, - true, - Q_GRAN, - false, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } else { - requantizeOutputProcessingAvx2<true, true, Q_GRAN, true, FUSE_RELU>( - out, inp, block, ld_out, ld_in, r); - } + if (Aq_zero_point_ == 0) { + if (b_symmetric) { + if (bias_ == nullptr) { + requantizeOutputProcessingAvx2< + true, + true, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); } else { - if (bias_ == nullptr) { - requantizeOutputProcessingAvx2< - true, - false, - Q_GRAN, - false, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } else { - requantizeOutputProcessingAvx2< - true, - false, - Q_GRAN, - true, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } + requantizeOutputProcessingAvx2<true, true, Q_GRAN, true, FUSE_RELU>( + out, inp, block, ld_out, ld_in, r); } } else { - if (b_symmetric) { - if (bias_ == nullptr) { - requantizeOutputProcessingAvx2< - false, - true, - Q_GRAN, - false, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } else { - requantizeOutputProcessingAvx2< - false, - true, - Q_GRAN, - true, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } + if (bias_ == nullptr) { + requantizeOutputProcessingAvx2< + true, + false, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); } else { - if (bias_ == nullptr) { - requantizeOutputProcessingAvx2< - false, - false, - Q_GRAN, - false, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } else { - requantizeOutputProcessingAvx2< - false, - false, - Q_GRAN, - true, - FUSE_RELU>(out, inp, block, ld_out, ld_in, r); - } + requantizeOutputProcessingAvx2< + true, + false, + Q_GRAN, + true, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); } } } else { - assert(0 && "Not supported yet"); + if (b_symmetric) { + if (bias_ == nullptr) { + requantizeOutputProcessingAvx2< + false, + true, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } else { + requantizeOutputProcessingAvx2< + false, + true, + Q_GRAN, + true, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } + } else { + if (bias_ == nullptr) { + requantizeOutputProcessingAvx2< + false, + false, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } else { + requantizeOutputProcessingAvx2< + false, + false, + Q_GRAN, + true, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } + } } } else { assert(0 && "Not supported yet"); @@ -224,33 +220,119 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( block.col_size <= ncol_per_group && "ReQuantizeOutput should be called at most 1 group at a time."); int g = block.col_start / ncol_per_group; - for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start]; - if (Aq_zero_point_) { - raw -= Aq_zero_point_ * q_col_offsets_[j]; + if (instSet == inst_set_t::anyarch || !std::is_same<outT, float>::value) { + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { + inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start]; + if (Aq_zero_point_) { + raw -= Aq_zero_point_ * q_col_offsets_[j]; + } + int Bq_zero_point_idx; + if (Q_GRAN == QuantizationGranularity::TENSOR) { + Bq_zero_point_idx = 0; + } else if (Q_GRAN == QuantizationGranularity::GROUP) { + Bq_zero_point_idx = g; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + Bq_zero_point_idx = j; + } else { + assert(false && "unknown quantization granularity"); + } + if (q_row_offsets_) { + raw -= q_row_offsets_[i - block.row_start] * + Bq_zero_point_[Bq_zero_point_idx]; + } + float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx]; + if (bias_) { + res += bias_[j]; + } + out[i * ld_out + j] = res; + if (FUSE_RELU) { + out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]); + } } - int Bq_zero_point_idx; - if (Q_GRAN == QuantizationGranularity::TENSOR) { - Bq_zero_point_idx = 0; - } else if (Q_GRAN == QuantizationGranularity::GROUP) { - Bq_zero_point_idx = g; - } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { - Bq_zero_point_idx = j; + } + } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) { + bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR && + Bq_zero_point_[0] == 0) || + q_row_offsets_ == nullptr; + + requantizationForFloatParams_t r = {Aq_zero_point_, + Bq_zero_point_, + Aq_scale_, + Bq_scale_, + q_row_offsets_, + q_col_offsets_, + bias_, + ncols_, + groups_}; + + if (Aq_zero_point_ == 0) { + if (b_symmetric) { + if (bias_ == nullptr) { + requantizeForFloatAvx2< + true, + true, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } else { + requantizeForFloatAvx2<true, true, Q_GRAN, true, FUSE_RELU>( + out, inp, block, ld_out, ld_in, r); + } } else { - assert(false && "unknown quantization granularity"); - } - raw -= q_row_offsets_[i - block.row_start] * - Bq_zero_point_[Bq_zero_point_idx]; - float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx]; - if (bias_) { - res += bias_[j]; + if (bias_ == nullptr) { + requantizeForFloatAvx2< + true, + false, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } else { + requantizeForFloatAvx2< + true, + false, + Q_GRAN, + true, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } } - out[i * ld_out + j] = res; - if (FUSE_RELU) { - out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]); + } else { + if (b_symmetric) { + if (bias_ == nullptr) { + requantizeForFloatAvx2< + false, + true, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } else { + requantizeForFloatAvx2< + false, + true, + Q_GRAN, + true, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } + } else { + if (bias_ == nullptr) { + requantizeForFloatAvx2< + false, + false, + Q_GRAN, + false, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } else { + requantizeForFloatAvx2< + false, + false, + Q_GRAN, + true, + FUSE_RELU>(out, inp, block, ld_out, ld_in, r); + } } } + } else { + assert(0 && "Not supported yet"); } return nextop_.template f<instSet>(out, out, block, ld_out, ld_out); diff --git a/include/fbgemm/QuantUtilsAvx2.h b/include/fbgemm/QuantUtilsAvx2.h index 04aeba1..47f33a8 100644 --- a/include/fbgemm/QuantUtilsAvx2.h +++ b/include/fbgemm/QuantUtilsAvx2.h @@ -95,4 +95,18 @@ FBGEMM_API void requantizeOutputProcessingGConvAvx2( int ld_in, const requantizationParams_t& r); +template < + bool A_SYMMETRIC, + bool B_SYMMETRIC, + QuantizationGranularity Q_GRAN, + bool HAS_BIAS, + bool FUSE_RELU> +FBGEMM_API void requantizeForFloatAvx2( + float* out, + const std::int32_t* inp, + const block_type_t& block, + int ld_out, + int ld_in, + const requantizationForFloatParams_t& r); + } // namespace fbgemm diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h index 53fb39d..082edc1 100644 --- a/include/fbgemm/UtilsAvx2.h +++ b/include/fbgemm/UtilsAvx2.h @@ -56,4 +56,19 @@ struct requantizationParams_t { int groups; }; +/** + * @brief A struct to represent all the parameters for requantizing for floats. + */ +struct requantizationForFloatParams_t { + std::int32_t A_zero_point; + const std::int32_t* B_zero_point; + float A_scale; + const float* B_scale; + const std::int32_t* row_offsets; + const std::int32_t* col_offsets; + const float* bias; + std::uint32_t ncols; + int groups; +}; + } // namespace fbgemm diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index be12142..875c9e1 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -662,6 +662,165 @@ template < bool B_SYMMETRIC, QuantizationGranularity Q_GRAN, bool HAS_BIAS, + bool FUSE_RELU> +void requantizeForFloatAvx2( + float* out, + const int32_t* inp, + const block_type_t& block, + int ld_out, + int ld_in, + const requantizationForFloatParams_t& r) { + // Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c + // using AVX2 instructions + int quant_param_idx = 0; + if (Q_GRAN == QuantizationGranularity::GROUP) { + int ncol_per_group = r.ncols / r.groups; + int g = block.col_start / ncol_per_group; + quant_param_idx = g; + } + __m256 multiplier_v = _mm256_set1_ps(r.A_scale * r.B_scale[quant_param_idx]); + + assert( + (A_SYMMETRIC == (r.A_zero_point == 0)) && + "A_SYMMETRIC == true if and only if A_zero_point == 0"); + assert( + (B_SYMMETRIC == + ((Q_GRAN == QuantizationGranularity::TENSOR && r.B_zero_point[0] == 0) || + r.row_offsets == nullptr)) && + "B_SYMMETRIC == true if and only if B_zero_point == 0 " + "or r.row_offsets == nullptr"); + assert( + (HAS_BIAS == (r.bias != nullptr)) && + "HAS_BIAS == true if and only if bias != nullptr"); + + __m256i A_zero_point_v = _mm256_set1_epi32(r.A_zero_point); + + constexpr int VLEN = 8; + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + // Scale row_offset with Bq_zero_point + int32_t row_offset = 0; + if (B_SYMMETRIC) { + row_offset = 0; + } else if ( + Q_GRAN == QuantizationGranularity::TENSOR || + Q_GRAN == QuantizationGranularity::GROUP) { + row_offset = + r.row_offsets[i - block.row_start] * r.B_zero_point[quant_param_idx]; + } else { + assert( + Q_GRAN == QuantizationGranularity::OUT_CHANNEL && + "unknown quantization granularity"); + } + __m256i row_offset_v = _mm256_set1_epi32(row_offset); + + int j = block.col_start; + for (; j < block.col_start + (block.col_size / VLEN * VLEN); j += VLEN) { + __m256i x_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>( + inp + (i - block.row_start) * ld_in + (j - block.col_start))); + + if (!A_SYMMETRIC) { + __m256i col_off_v = _mm256_mullo_epi32( + A_zero_point_v, + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(r.col_offsets + j))); + x_v = _mm256_sub_epi32(x_v, col_off_v); + } + + if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(r.row_offsets[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(r.B_zero_point + j))); + } + x_v = _mm256_sub_epi32(x_v, row_offset_v); + } + + __m256 x_scaled_v; + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + x_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(x_v), + _mm256_mul_ps( + _mm256_set1_ps(r.A_scale), _mm256_loadu_ps(r.B_scale + j))); + } else { + x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + } + + if (HAS_BIAS) { + x_scaled_v = _mm256_add_ps(x_scaled_v, _mm256_loadu_ps(r.bias + j)); + } + if (FUSE_RELU) { + x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v); + } + + _mm256_storeu_ps(out + i * ld_out + j, x_scaled_v); + } // j loop vectorized + + int remainder = block.col_start + block.col_size - j; + if (remainder > 0) { + alignas(64) const int masks[8][8] = { + // NOTE: clang-format wants to use a different formatting but the + // current formatting should be easier to read. + { 0, 0, 0, 0, 0, 0, 0, 0, }, + { -1, 0, 0, 0, 0, 0, 0, 0, }, + { -1, -1, 0, 0, 0, 0, 0, 0, }, + { -1, -1, -1, 0, 0, 0, 0, 0, }, + { -1, -1, -1, -1, 0, 0, 0, 0, }, + { -1, -1, -1, -1, -1, 0, 0, 0, }, + { -1, -1, -1, -1, -1, -1, 0, 0, }, + { -1, -1, -1, -1, -1, -1, -1, 0, }, + }; + __m256i mask_v = _mm256_load_si256( + reinterpret_cast<const __m256i*>(masks[remainder])); + + __m256i x_v = _mm256_maskload_epi32( + inp + (i - block.row_start) * ld_in + (j - block.col_start), + mask_v); + + if (!A_SYMMETRIC) { + __m256i col_off_v = _mm256_mullo_epi32( + A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v)); + x_v = _mm256_sub_epi32(x_v, col_off_v); + } + + if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(r.row_offsets[i - block.row_start]), + _mm256_maskload_epi32(r.B_zero_point + j, mask_v)); + } + x_v = _mm256_sub_epi32(x_v, row_offset_v); + } + + __m256 x_scaled_v; + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + x_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(x_v), + _mm256_mul_ps( + _mm256_set1_ps(r.A_scale), + _mm256_maskload_ps(r.B_scale + j, mask_v))); + } else { + x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + } + + if (HAS_BIAS) { + x_scaled_v = + _mm256_add_ps(x_scaled_v, _mm256_maskload_ps(r.bias + j, mask_v)); + } + if (FUSE_RELU) { + x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v); + } + + _mm256_maskstore_ps(out + i * ld_out + j, mask_v, x_scaled_v); + } // j loop remainder + } // i loop +} + +template < + bool A_SYMMETRIC, + bool B_SYMMETRIC, + QuantizationGranularity Q_GRAN, + bool HAS_BIAS, bool FUSE_RELU, int C_PER_G> void requantizeOutputProcessingGConvAvx2( @@ -1120,6 +1279,13 @@ void requantizeOutputProcessingGConvAvx2( int ld_out, \ int ld_in, \ const requantizationParams_t& r); \ + template void requantizeForFloatAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU>( \ + float* out, \ + const int32_t* inp, \ + const block_type_t& block, \ + int ld_out, \ + int ld_in, \ + const requantizationForFloatParams_t& r); \ template void \ requantizeOutputProcessingGConvAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU, 4>( \ uint8_t * out, \ |