github.com/marian-nmt/FBGEMM.git
author    Jongsoo Park <jongsoo@fb.com>    2019-03-13 06:14:32 +0300
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2019-03-13 06:17:49 +0300
commit    6011ce3b0c1fccee549e85b37e475c7a734ad742 (patch)
tree      7089177b6c7da36c2582da1cf9b42eca9dfb2ea7
parent    50b43162fd1742122d01f2704945c78f13e0d73e (diff)
optimize requantize for float out processing (#85)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/85

Optimizing performance of output processing when the output is dequantized right away.

Reviewed By: protonu

Differential Revision: D14433141

fbshipit-source-id: f99a8d82000c43e554461acf036462a4e8f7e300
-rw-r--r--  bench/PackedFloatInOutBenchmark.cc      42
-rw-r--r--  include/fbgemm/OutputProcessing-inl.h  274
-rw-r--r--  include/fbgemm/QuantUtilsAvx2.h         14
-rw-r--r--  include/fbgemm/UtilsAvx2.h              15
-rw-r--r--  src/QuantUtilsAvx2.cc                  166
5 files changed, 415 insertions(+), 96 deletions(-)
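
The AVX2 kernel added in src/QuantUtilsAvx2.cc vectorizes the scalar requantize-to-float loop already present in ReQuantizeForFloat (see the OutputProcessing-inl.h diff below): subtract the A and B zero-point corrections, scale by A_scale * B_scale, add the bias, and optionally apply ReLU. As a reading aid, here is a minimal scalar sketch of that per-element math, assuming tensor-level quantization granularity; the name dequantize_ref is illustrative and not part of FBGEMM's API.

#include <algorithm>
#include <cstdint>

// Reference (scalar) version of the computation the commit vectorizes:
// remove the zero-point contributions of A and B from the int32 accumulator,
// rescale to float, add the bias, and optionally clamp with ReLU.
void dequantize_ref(
    float* out,
    const std::int32_t* acc,          // int32 accumulators, row-major [rows x cols]
    int rows,
    int cols,
    std::int32_t A_zero_point,
    std::int32_t B_zero_point,
    float A_scale,
    float B_scale,
    const std::int32_t* row_offsets,  // per-row sums over A, length rows
    const std::int32_t* col_offsets,  // per-column sums over B, length cols
    const float* bias,                // may be nullptr
    bool fuse_relu) {
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      std::int32_t raw = acc[i * cols + j];
      raw -= A_zero_point * col_offsets[j];
      raw -= B_zero_point * row_offsets[i];
      float res = raw * A_scale * B_scale;
      if (bias) {
        res += bias[j];
      }
      out[i * cols + j] = fuse_relu ? std::max(0.0f, res) : res;
    }
  }
}
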
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index 1397125..66ca67e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -76,8 +76,21 @@ void performance_test() {
constexpr int NWARMUP = 4;
constexpr int NITER = 10;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+ << endl;
+ cout << "M, "
+ << "N, "
+ << "K, "
+ << "Packing (ms), "
+ << "Kernel (ms), "
+ << "Postprocessing (ms), "
+ << "Total (ms), "
+ << "GOPs" << endl;
+#else
cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(18)
<< "Type, " << setw(5) << "GOPS" << endl;
+#endif
chrono::time_point<chrono::high_resolution_clock> start, end;
for (auto shape : shapes) {
@@ -203,7 +216,23 @@ void performance_test() {
ttot = 0;
type = "FBGEMM_i8_acc32";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ double total_packing_time = 0.0;
+ double total_computing_time = 0.0;
+ double total_kernel_time = 0.0;
+ double total_postprocessing_time = 0.0;
+ double total_run_time = 0.0;
+#endif
+
for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ packing_time = 0.0;
+ computing_time = 0.0;
+ kernel_time = 0.0;
+ postprocessing_time = 0.0;
+ run_time = 0.0;
+#endif
+
llc_flush(llc);
start = chrono::high_resolution_clock::now();
fbgemmPacked(
@@ -220,6 +249,13 @@ void performance_test() {
if (i >= NWARMUP) {
auto dur = chrono::duration_cast<chrono::nanoseconds>(end - start);
ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ total_packing_time += packing_time;
+ total_computing_time += computing_time;
+ total_kernel_time += kernel_time;
+ total_postprocessing_time += postprocessing_time;
+ total_run_time += run_time;
+#endif
}
}
((volatile char*)(llc.data()));
@@ -237,6 +273,12 @@ void performance_test() {
// row_offsets.size(), 5);
// printMatrix(matrix_op_t::NoTranspose, Cfp32_fb.data(),
// m, n, n, "C fb fp32");
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << total_packing_time / (double)NITER / 1e6 << ", "
+ << total_kernel_time / (double)NITER / 1e6 << ", "
+ << total_postprocessing_time / (double)NITER / 1e6 << ", "
+ << total_run_time / (double)NITER / 1e6 << ", ";
+#endif
cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k << ", "
<< setw(16) << type << ", " << setw(5) << fixed << setw(5)
<< setprecision(1) << nops / ttot << endl;
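
For reference, the breakdown columns printed above are per-iteration averages in milliseconds: each component timer is summed only over the NITER measured runs (the NWARMUP warm-up runs are discarded), then divided by NITER and by 1e6 to convert nanoseconds to milliseconds. A stand-alone sketch of that warm-up-excluded averaging pattern, using hypothetical names (average_ms, run_once):

#include <chrono>
#include <iostream>

// Warm-up-excluded average timing, mirroring the benchmark loop above.
// run_once stands in for the measured work (hypothetical name).
template <typename F>
double average_ms(F run_once, int nwarmup, int niter) {
  double total_ns = 0.0;
  for (int i = 0; i < nwarmup + niter; ++i) {
    auto start = std::chrono::high_resolution_clock::now();
    run_once();
    auto end = std::chrono::high_resolution_clock::now();
    if (i >= nwarmup) { // discard warm-up iterations
      total_ns +=
          std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
              .count();
    }
  }
  return total_ns / niter / 1e6; // ns -> ms, averaged over measured runs
}

int main() {
  double sink = 0.0;
  double ms = average_ms(
      [&] {
        for (int k = 0; k < 1000000; ++k) {
          sink += k;
        }
      },
      /*nwarmup=*/4,
      /*niter=*/10);
  std::cout << ms << " ms (sink=" << sink << ")\n";
}
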
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index 9485b18..d984c60 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -77,7 +77,7 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
block.col_size <= ncol_per_group &&
"ReQuantizeOutput should be called at most 1 group at a time.");
int g = block.col_start / ncol_per_group;
- if (instSet == inst_set_t::anyarch) {
+ if (instSet == inst_set_t::anyarch || !std::is_same<outT, uint8_t>::value) {
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
@@ -111,88 +111,84 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
}
}
} else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
- if (std::is_same<outT, uint8_t>::value) {
- bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
- Bq_zero_point_[0] == 0) ||
- q_row_offsets_ == nullptr;
+ bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+ Bq_zero_point_[0] == 0) ||
+ q_row_offsets_ == nullptr;
- requantizationParams_t r = {Aq_zero_point_,
- Bq_zero_point_,
- C_zero_point_,
- C_multiplier_,
- q_row_offsets_,
- q_col_offsets_,
- bias_,
- ncols_,
- groups_};
+ requantizationParams_t r = {Aq_zero_point_,
+ Bq_zero_point_,
+ C_zero_point_,
+ C_multiplier_,
+ q_row_offsets_,
+ q_col_offsets_,
+ bias_,
+ ncols_,
+ groups_};
- if (Aq_zero_point_ == 0) {
- if (b_symmetric) {
- if (bias_ == nullptr) {
- requantizeOutputProcessingAvx2<
- true,
- true,
- Q_GRAN,
- false,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- } else {
- requantizeOutputProcessingAvx2<true, true, Q_GRAN, true, FUSE_RELU>(
- out, inp, block, ld_out, ld_in, r);
- }
+ if (Aq_zero_point_ == 0) {
+ if (b_symmetric) {
+ if (bias_ == nullptr) {
+ requantizeOutputProcessingAvx2<
+ true,
+ true,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
} else {
- if (bias_ == nullptr) {
- requantizeOutputProcessingAvx2<
- true,
- false,
- Q_GRAN,
- false,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- } else {
- requantizeOutputProcessingAvx2<
- true,
- false,
- Q_GRAN,
- true,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- }
+ requantizeOutputProcessingAvx2<true, true, Q_GRAN, true, FUSE_RELU>(
+ out, inp, block, ld_out, ld_in, r);
}
} else {
- if (b_symmetric) {
- if (bias_ == nullptr) {
- requantizeOutputProcessingAvx2<
- false,
- true,
- Q_GRAN,
- false,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- } else {
- requantizeOutputProcessingAvx2<
- false,
- true,
- Q_GRAN,
- true,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- }
+ if (bias_ == nullptr) {
+ requantizeOutputProcessingAvx2<
+ true,
+ false,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
} else {
- if (bias_ == nullptr) {
- requantizeOutputProcessingAvx2<
- false,
- false,
- Q_GRAN,
- false,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- } else {
- requantizeOutputProcessingAvx2<
- false,
- false,
- Q_GRAN,
- true,
- FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
- }
+ requantizeOutputProcessingAvx2<
+ true,
+ false,
+ Q_GRAN,
+ true,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
}
}
} else {
- assert(0 && "Not supported yet");
+ if (b_symmetric) {
+ if (bias_ == nullptr) {
+ requantizeOutputProcessingAvx2<
+ false,
+ true,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ } else {
+ requantizeOutputProcessingAvx2<
+ false,
+ true,
+ Q_GRAN,
+ true,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ }
+ } else {
+ if (bias_ == nullptr) {
+ requantizeOutputProcessingAvx2<
+ false,
+ false,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ } else {
+ requantizeOutputProcessingAvx2<
+ false,
+ false,
+ Q_GRAN,
+ true,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ }
+ }
}
} else {
assert(0 && "Not supported yet");
@@ -224,33 +220,119 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
block.col_size <= ncol_per_group &&
"ReQuantizeOutput should be called at most 1 group at a time.");
int g = block.col_start / ncol_per_group;
- for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
- for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
- inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
- if (Aq_zero_point_) {
- raw -= Aq_zero_point_ * q_col_offsets_[j];
+ if (instSet == inst_set_t::anyarch || !std::is_same<outT, float>::value) {
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+ inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
+ if (Aq_zero_point_) {
+ raw -= Aq_zero_point_ * q_col_offsets_[j];
+ }
+ int Bq_zero_point_idx;
+ if (Q_GRAN == QuantizationGranularity::TENSOR) {
+ Bq_zero_point_idx = 0;
+ } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+ Bq_zero_point_idx = g;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ Bq_zero_point_idx = j;
+ } else {
+ assert(false && "unknown quantization granularity");
+ }
+ if (q_row_offsets_) {
+ raw -= q_row_offsets_[i - block.row_start] *
+ Bq_zero_point_[Bq_zero_point_idx];
+ }
+ float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
+ if (bias_) {
+ res += bias_[j];
+ }
+ out[i * ld_out + j] = res;
+ if (FUSE_RELU) {
+ out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]);
+ }
}
- int Bq_zero_point_idx;
- if (Q_GRAN == QuantizationGranularity::TENSOR) {
- Bq_zero_point_idx = 0;
- } else if (Q_GRAN == QuantizationGranularity::GROUP) {
- Bq_zero_point_idx = g;
- } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
- Bq_zero_point_idx = j;
+ }
+ } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
+ bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+ Bq_zero_point_[0] == 0) ||
+ q_row_offsets_ == nullptr;
+
+ requantizationForFloatParams_t r = {Aq_zero_point_,
+ Bq_zero_point_,
+ Aq_scale_,
+ Bq_scale_,
+ q_row_offsets_,
+ q_col_offsets_,
+ bias_,
+ ncols_,
+ groups_};
+
+ if (Aq_zero_point_ == 0) {
+ if (b_symmetric) {
+ if (bias_ == nullptr) {
+ requantizeForFloatAvx2<
+ true,
+ true,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ } else {
+ requantizeForFloatAvx2<true, true, Q_GRAN, true, FUSE_RELU>(
+ out, inp, block, ld_out, ld_in, r);
+ }
} else {
- assert(false && "unknown quantization granularity");
- }
- raw -= q_row_offsets_[i - block.row_start] *
- Bq_zero_point_[Bq_zero_point_idx];
- float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
- if (bias_) {
- res += bias_[j];
+ if (bias_ == nullptr) {
+ requantizeForFloatAvx2<
+ true,
+ false,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ } else {
+ requantizeForFloatAvx2<
+ true,
+ false,
+ Q_GRAN,
+ true,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ }
}
- out[i * ld_out + j] = res;
- if (FUSE_RELU) {
- out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]);
+ } else {
+ if (b_symmetric) {
+ if (bias_ == nullptr) {
+ requantizeForFloatAvx2<
+ false,
+ true,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ } else {
+ requantizeForFloatAvx2<
+ false,
+ true,
+ Q_GRAN,
+ true,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ }
+ } else {
+ if (bias_ == nullptr) {
+ requantizeForFloatAvx2<
+ false,
+ false,
+ Q_GRAN,
+ false,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ } else {
+ requantizeForFloatAvx2<
+ false,
+ false,
+ Q_GRAN,
+ true,
+ FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+ }
}
}
+ } else {
+ assert(0 && "Not supported yet");
}
return nextop_.template f<instSet>(out, out, block, ld_out, ld_out);
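
The nested if/else ladder above only lifts the runtime conditions (Aq_zero_point_ == 0, b_symmetric, bias_ == nullptr) into template arguments, so that each requantizeForFloatAvx2 instantiation can fold those branches out of its inner loop. A condensed sketch of the same dispatch pattern, with a hypothetical two-parameter kernel:

#include <cstdio>

// Hypothetical kernel: the booleans are compile-time constants, so the
// branches below disappear in each of the four instantiations.
template <bool A_SYMMETRIC, bool HAS_BIAS>
void kernel(const int* in, float* out, int n, float scale, const float* bias) {
  for (int i = 0; i < n; ++i) {
    int v = in[i];
    if (!A_SYMMETRIC) {
      v -= 1; // stand-in for the zero-point correction
    }
    float res = v * scale;
    if (HAS_BIAS) {
      res += bias[i];
    }
    out[i] = res;
  }
}

// Runtime flags are converted into template parameters exactly once, here.
void dispatch(const int* in, float* out, int n, float scale, const float* bias,
              bool a_symmetric) {
  if (a_symmetric) {
    if (bias) {
      kernel<true, true>(in, out, n, scale, bias);
    } else {
      kernel<true, false>(in, out, n, scale, bias);
    }
  } else {
    if (bias) {
      kernel<false, true>(in, out, n, scale, bias);
    } else {
      kernel<false, false>(in, out, n, scale, bias);
    }
  }
}
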
diff --git a/include/fbgemm/QuantUtilsAvx2.h b/include/fbgemm/QuantUtilsAvx2.h
index 04aeba1..47f33a8 100644
--- a/include/fbgemm/QuantUtilsAvx2.h
+++ b/include/fbgemm/QuantUtilsAvx2.h
@@ -95,4 +95,18 @@ FBGEMM_API void requantizeOutputProcessingGConvAvx2(
int ld_in,
const requantizationParams_t& r);
+template <
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ QuantizationGranularity Q_GRAN,
+ bool HAS_BIAS,
+ bool FUSE_RELU>
+FBGEMM_API void requantizeForFloatAvx2(
+ float* out,
+ const std::int32_t* inp,
+ const block_type_t& block,
+ int ld_out,
+ int ld_in,
+ const requantizationForFloatParams_t& r);
+
} // namespace fbgemm
diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h
index 53fb39d..082edc1 100644
--- a/include/fbgemm/UtilsAvx2.h
+++ b/include/fbgemm/UtilsAvx2.h
@@ -56,4 +56,19 @@ struct requantizationParams_t {
int groups;
};
+/**
+ * @brief A struct to represent all the parameters for requantizing for floats.
+ */
+struct requantizationForFloatParams_t {
+ std::int32_t A_zero_point;
+ const std::int32_t* B_zero_point;
+ float A_scale;
+ const float* B_scale;
+ const std::int32_t* row_offsets;
+ const std::int32_t* col_offsets;
+ const float* bias;
+ std::uint32_t ncols;
+ int groups;
+};
+
} // namespace fbgemm
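
Combining the declaration from QuantUtilsAvx2.h with this struct, a call to the new kernel could be wired up roughly as below. This is a sketch, not code from the commit: it assumes block_type_t exposes the row_start/row_size/col_start/col_size fields used throughout the diff, that the fbgemm headers included here provide the needed types, and it picks the tensor-granularity, symmetric, no-bias, no-ReLU instantiation.

#include <cstdint>

#include "fbgemm/QuantUtilsAvx2.h" // declares requantizeForFloatAvx2 (assumed to pull in block_type_t)
#include "fbgemm/UtilsAvx2.h"      // requantizationForFloatParams_t

using namespace fbgemm;

// Requantize one int32 accumulator block straight to float.
// Sketch only: tensor granularity, symmetric A and B, no bias, no ReLU.
void requantize_block_to_float(
    const std::int32_t* acc, // int32 accumulators, ld_in elements per row
    float* out,              // float output, ld_out elements per row
    int rows,
    int cols,
    int ld_in,
    int ld_out,
    float A_scale,
    const float* B_scale,              // length 1 for TENSOR granularity
    const std::int32_t* col_offsets) { // per-column offsets, length cols
  static const std::int32_t B_zero_point = 0; // symmetric B in this sketch

  // Field order follows the struct definition added in this diff.
  requantizationForFloatParams_t r = {/*A_zero_point=*/0,
                                      &B_zero_point,
                                      A_scale,
                                      B_scale,
                                      /*row_offsets=*/nullptr,
                                      col_offsets,
                                      /*bias=*/nullptr,
                                      static_cast<std::uint32_t>(cols),
                                      /*groups=*/1};

  block_type_t block;
  block.row_start = 0;
  block.row_size = rows;
  block.col_start = 0;
  block.col_size = cols;

  // A_SYMMETRIC, B_SYMMETRIC, TENSOR granularity, HAS_BIAS=false, FUSE_RELU=false.
  requantizeForFloatAvx2<
      true,
      true,
      QuantizationGranularity::TENSOR,
      false,
      false>(out, acc, block, ld_out, ld_in, r);
}
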
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index be12142..875c9e1 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -662,6 +662,165 @@ template <
bool B_SYMMETRIC,
QuantizationGranularity Q_GRAN,
bool HAS_BIAS,
+ bool FUSE_RELU>
+void requantizeForFloatAvx2(
+ float* out,
+ const int32_t* inp,
+ const block_type_t& block,
+ int ld_out,
+ int ld_in,
+ const requantizationForFloatParams_t& r) {
+ // Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c
+ // using AVX2 instructions
+ int quant_param_idx = 0;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ int ncol_per_group = r.ncols / r.groups;
+ int g = block.col_start / ncol_per_group;
+ quant_param_idx = g;
+ }
+ __m256 multiplier_v = _mm256_set1_ps(r.A_scale * r.B_scale[quant_param_idx]);
+
+ assert(
+ (A_SYMMETRIC == (r.A_zero_point == 0)) &&
+ "A_SYMMETRIC == true if and only if A_zero_point == 0");
+ assert(
+ (B_SYMMETRIC ==
+ ((Q_GRAN == QuantizationGranularity::TENSOR && r.B_zero_point[0] == 0) ||
+ r.row_offsets == nullptr)) &&
+ "B_SYMMETRIC == true if and only if B_zero_point == 0 "
+ "or r.row_offsets == nullptr");
+ assert(
+ (HAS_BIAS == (r.bias != nullptr)) &&
+ "HAS_BIAS == true if and only if bias != nullptr");
+
+ __m256i A_zero_point_v = _mm256_set1_epi32(r.A_zero_point);
+
+ constexpr int VLEN = 8;
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ // Scale row_offset with Bq_zero_point
+ int32_t row_offset = 0;
+ if (B_SYMMETRIC) {
+ row_offset = 0;
+ } else if (
+ Q_GRAN == QuantizationGranularity::TENSOR ||
+ Q_GRAN == QuantizationGranularity::GROUP) {
+ row_offset =
+ r.row_offsets[i - block.row_start] * r.B_zero_point[quant_param_idx];
+ } else {
+ assert(
+ Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+ "unknown quantization granularity");
+ }
+ __m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
+ int j = block.col_start;
+ for (; j < block.col_start + (block.col_size / VLEN * VLEN); j += VLEN) {
+ __m256i x_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+ inp + (i - block.row_start) * ld_in + (j - block.col_start)));
+
+ if (!A_SYMMETRIC) {
+ __m256i col_off_v = _mm256_mullo_epi32(
+ A_zero_point_v,
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(r.col_offsets + j)));
+ x_v = _mm256_sub_epi32(x_v, col_off_v);
+ }
+
+ if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(r.B_zero_point + j)));
+ }
+ x_v = _mm256_sub_epi32(x_v, row_offset_v);
+ }
+
+ __m256 x_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v),
+ _mm256_mul_ps(
+ _mm256_set1_ps(r.A_scale), _mm256_loadu_ps(r.B_scale + j)));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ }
+
+ if (HAS_BIAS) {
+ x_scaled_v = _mm256_add_ps(x_scaled_v, _mm256_loadu_ps(r.bias + j));
+ }
+ if (FUSE_RELU) {
+ x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+ }
+
+ _mm256_storeu_ps(out + i * ld_out + j, x_scaled_v);
+ } // j loop vectorized
+
+ int remainder = block.col_start + block.col_size - j;
+ if (remainder > 0) {
+ alignas(64) const int masks[8][8] = {
+ // NOTE: clang-format wants to use a different formatting but the
+ // current formatting should be easier to read.
+ { 0, 0, 0, 0, 0, 0, 0, 0, },
+ { -1, 0, 0, 0, 0, 0, 0, 0, },
+ { -1, -1, 0, 0, 0, 0, 0, 0, },
+ { -1, -1, -1, 0, 0, 0, 0, 0, },
+ { -1, -1, -1, -1, 0, 0, 0, 0, },
+ { -1, -1, -1, -1, -1, 0, 0, 0, },
+ { -1, -1, -1, -1, -1, -1, 0, 0, },
+ { -1, -1, -1, -1, -1, -1, -1, 0, },
+ };
+ __m256i mask_v = _mm256_load_si256(
+ reinterpret_cast<const __m256i*>(masks[remainder]));
+
+ __m256i x_v = _mm256_maskload_epi32(
+ inp + (i - block.row_start) * ld_in + (j - block.col_start),
+ mask_v);
+
+ if (!A_SYMMETRIC) {
+ __m256i col_off_v = _mm256_mullo_epi32(
+ A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v));
+ x_v = _mm256_sub_epi32(x_v, col_off_v);
+ }
+
+ if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+ _mm256_maskload_epi32(r.B_zero_point + j, mask_v));
+ }
+ x_v = _mm256_sub_epi32(x_v, row_offset_v);
+ }
+
+ __m256 x_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v),
+ _mm256_mul_ps(
+ _mm256_set1_ps(r.A_scale),
+ _mm256_maskload_ps(r.B_scale + j, mask_v)));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ }
+
+ if (HAS_BIAS) {
+ x_scaled_v =
+ _mm256_add_ps(x_scaled_v, _mm256_maskload_ps(r.bias + j, mask_v));
+ }
+ if (FUSE_RELU) {
+ x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+ }
+
+ _mm256_maskstore_ps(out + i * ld_out + j, mask_v, x_scaled_v);
+ } // j loop remainder
+ } // i loop
+}
+
+template <
+ bool A_SYMMETRIC,
+ bool B_SYMMETRIC,
+ QuantizationGranularity Q_GRAN,
+ bool HAS_BIAS,
bool FUSE_RELU,
int C_PER_G>
void requantizeOutputProcessingGConvAvx2(
@@ -1120,6 +1279,13 @@ void requantizeOutputProcessingGConvAvx2(
int ld_out, \
int ld_in, \
const requantizationParams_t& r); \
+ template void requantizeForFloatAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU>( \
+ float* out, \
+ const int32_t* inp, \
+ const block_type_t& block, \
+ int ld_out, \
+ int ld_in, \
+ const requantizationForFloatParams_t& r); \
template void \
requantizeOutputProcessingGConvAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU, 4>( \
uint8_t * out, \