optimize requantize for float out processing (#85)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/85 Optimizing performance of output processing when output is dequantized right away. Reviewed By: protonu Differential Revision: D14433141 fbshipit-source-id: f99a8d82000c43e554461acf036462a4e8f7e300
author: Jongsoo Park <jongsoo@fb.com> 2019-03-13 06:14:32 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-03-13 06:17:49 +0300
commit: 6011ce3b0c1fccee549e85b37e475c7a734ad742 (patch)
tree: 7089177b6c7da36c2582da1cf9b42eca9dfb2ea7
parent: 50b43162fd1742122d01f2704945c78f13e0d73e (diff)
5 files changed, 415 insertions, 96 deletions
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index 1397125..66ca67e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -76,8 +76,21 @@ void performance_test() {
   constexpr int NWARMUP = 4;
   constexpr int NITER = 10;
 
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+  cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+       << endl;
+  cout << "M, "
+       << "N, "
+       << "K, "
+       << "Packing (ms), "
+       << "Kernel (ms), "
+       << "Postprocessing (ms), "
+       << "Total (ms), "
+       << "GOPs" << endl;
+#else
   cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(18)
        << "Type, " << setw(5) << "GOPS" << endl;
+#endif
 
   chrono::time_point<chrono::high_resolution_clock> start, end;
   for (auto shape : shapes) {
@@ -203,7 +216,23 @@ void performance_test() {
 
     ttot = 0;
     type = "FBGEMM_i8_acc32";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    double total_packing_time = 0.0;
+    double total_computing_time = 0.0;
+    double total_kernel_time = 0.0;
+    double total_postprocessing_time = 0.0;
+    double total_run_time = 0.0;
+#endif
+
     for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+      packing_time = 0.0;
+      computing_time = 0.0;
+      kernel_time = 0.0;
+      postprocessing_time = 0.0;
+      run_time = 0.0;
+#endif
+
       llc_flush(llc);
       start = chrono::high_resolution_clock::now();
       fbgemmPacked(
@@ -220,6 +249,13 @@ void performance_test() {
       if (i >= NWARMUP) {
         auto dur = chrono::duration_cast<chrono::nanoseconds>(end - start);
         ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+        total_packing_time += packing_time;
+        total_computing_time += computing_time;
+        total_kernel_time += kernel_time;
+        total_postprocessing_time += postprocessing_time;
+        total_run_time += run_time;
+#endif
       }
     }
     ((volatile char*)(llc.data()));
@@ -237,6 +273,12 @@ void performance_test() {
     // row_offsets.size(), 5);
     // printMatrix(matrix_op_t::NoTranspose, Cfp32_fb.data(),
     // m, n, n, "C fb fp32");
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    cout << total_packing_time / (double)NITER / 1e6 << ", "
+         << total_kernel_time / (double)NITER / 1e6 << ", "
+         << total_postprocessing_time / (double)NITER / 1e6 << ", "
+         << total_run_time / (double)NITER / 1e6 << ", ";
+#endif
     cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k << ", "
          << setw(16) << type << ", " << setw(5) << fixed << setw(5)
          << setprecision(1) << nops / ttot << endl;
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index 9485b18..d984c60 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -77,7 +77,7 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
       block.col_size <= ncol_per_group &&
       "ReQuantizeOutput should be called at most 1 group at a time.");
   int g = block.col_start / ncol_per_group;
-  if (instSet == inst_set_t::anyarch) {
+  if (instSet == inst_set_t::anyarch || !std::is_same<outT, uint8_t>::value) {
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
       for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
         inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
@@ -111,88 +111,84 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
       }
     }
   } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
-    if (std::is_same<outT, uint8_t>::value) {
-      bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
-                          Bq_zero_point_[0] == 0) ||
-          q_row_offsets_ == nullptr;
+    bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+                        Bq_zero_point_[0] == 0) ||
+        q_row_offsets_ == nullptr;
 
-      requantizationParams_t r = {Aq_zero_point_,
-                                  Bq_zero_point_,
-                                  C_zero_point_,
-                                  C_multiplier_,
-                                  q_row_offsets_,
-                                  q_col_offsets_,
-                                  bias_,
-                                  ncols_,
-                                  groups_};
+    requantizationParams_t r = {Aq_zero_point_,
+                                Bq_zero_point_,
+                                C_zero_point_,
+                                C_multiplier_,
+                                q_row_offsets_,
+                                q_col_offsets_,
+                                bias_,
+                                ncols_,
+                                groups_};
 
-      if (Aq_zero_point_ == 0) {
-        if (b_symmetric) {
-          if (bias_ == nullptr) {
-            requantizeOutputProcessingAvx2<
-                true,
-                true,
-                Q_GRAN,
-                false,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          } else {
-            requantizeOutputProcessingAvx2<true, true, Q_GRAN, true, FUSE_RELU>(
-                out, inp, block, ld_out, ld_in, r);
-          }
+    if (Aq_zero_point_ == 0) {
+      if (b_symmetric) {
+        if (bias_ == nullptr) {
+          requantizeOutputProcessingAvx2<
+              true,
+              true,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
         } else {
-          if (bias_ == nullptr) {
-            requantizeOutputProcessingAvx2<
-                true,
-                false,
-                Q_GRAN,
-                false,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          } else {
-            requantizeOutputProcessingAvx2<
-                true,
-                false,
-                Q_GRAN,
-                true,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          }
+          requantizeOutputProcessingAvx2<true, true, Q_GRAN, true, FUSE_RELU>(
+              out, inp, block, ld_out, ld_in, r);
         }
       } else {
-        if (b_symmetric) {
-          if (bias_ == nullptr) {
-            requantizeOutputProcessingAvx2<
-                false,
-                true,
-                Q_GRAN,
-                false,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          } else {
-            requantizeOutputProcessingAvx2<
-                false,
-                true,
-                Q_GRAN,
-                true,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          }
+        if (bias_ == nullptr) {
+          requantizeOutputProcessingAvx2<
+              true,
+              false,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
         } else {
-          if (bias_ == nullptr) {
-            requantizeOutputProcessingAvx2<
-                false,
-                false,
-                Q_GRAN,
-                false,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          } else {
-            requantizeOutputProcessingAvx2<
-                false,
-                false,
-                Q_GRAN,
-                true,
-                FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
-          }
+          requantizeOutputProcessingAvx2<
+              true,
+              false,
+              Q_GRAN,
+              true,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
         }
       }
     } else {
-      assert(0 && "Not supported yet");
+      if (b_symmetric) {
+        if (bias_ == nullptr) {
+          requantizeOutputProcessingAvx2<
+              false,
+              true,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        } else {
+          requantizeOutputProcessingAvx2<
+              false,
+              true,
+              Q_GRAN,
+              true,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        }
+      } else {
+        if (bias_ == nullptr) {
+          requantizeOutputProcessingAvx2<
+              false,
+              false,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        } else {
+          requantizeOutputProcessingAvx2<
+              false,
+              false,
+              Q_GRAN,
+              true,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        }
+      }
     }
   } else {
     assert(0 && "Not supported yet");
@@ -224,33 +220,119 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
       block.col_size <= ncol_per_group &&
       "ReQuantizeOutput should be called at most 1 group at a time.");
   int g = block.col_start / ncol_per_group;
-  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
-    for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
-      inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
-      if (Aq_zero_point_) {
-        raw -= Aq_zero_point_ * q_col_offsets_[j];
+  if (instSet == inst_set_t::anyarch || !std::is_same<outT, float>::value) {
+    for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+      for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+        inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
+        if (Aq_zero_point_) {
+          raw -= Aq_zero_point_ * q_col_offsets_[j];
+        }
+        int Bq_zero_point_idx;
+        if (Q_GRAN == QuantizationGranularity::TENSOR) {
+          Bq_zero_point_idx = 0;
+        } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+          Bq_zero_point_idx = g;
+        } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          Bq_zero_point_idx = j;
+        } else {
+          assert(false && "unknown quantization granularity");
+        }
+        if (q_row_offsets_) {
+          raw -= q_row_offsets_[i - block.row_start] *
+              Bq_zero_point_[Bq_zero_point_idx];
+        }
+        float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
+        if (bias_) {
+          res += bias_[j];
+        }
+        out[i * ld_out + j] = res;
+        if (FUSE_RELU) {
+          out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]);
+        }
       }
-      int Bq_zero_point_idx;
-      if (Q_GRAN == QuantizationGranularity::TENSOR) {
-        Bq_zero_point_idx = 0;
-      } else if (Q_GRAN == QuantizationGranularity::GROUP) {
-        Bq_zero_point_idx = g;
-      } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
-        Bq_zero_point_idx = j;
+    }
+  } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
+    bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+                        Bq_zero_point_[0] == 0) ||
+        q_row_offsets_ == nullptr;
+
+    requantizationForFloatParams_t r = {Aq_zero_point_,
+                                        Bq_zero_point_,
+                                        Aq_scale_,
+                                        Bq_scale_,
+                                        q_row_offsets_,
+                                        q_col_offsets_,
+                                        bias_,
+                                        ncols_,
+                                        groups_};
+
+    if (Aq_zero_point_ == 0) {
+      if (b_symmetric) {
+        if (bias_ == nullptr) {
+          requantizeForFloatAvx2<
+              true,
+              true,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        } else {
+          requantizeForFloatAvx2<true, true, Q_GRAN, true, FUSE_RELU>(
+              out, inp, block, ld_out, ld_in, r);
+        }
       } else {
-        assert(false && "unknown quantization granularity");
-      }
-      raw -= q_row_offsets_[i - block.row_start] *
-          Bq_zero_point_[Bq_zero_point_idx];
-      float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
-      if (bias_) {
-        res += bias_[j];
+        if (bias_ == nullptr) {
+          requantizeForFloatAvx2<
+              true,
+              false,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        } else {
+          requantizeForFloatAvx2<
+              true,
+              false,
+              Q_GRAN,
+              true,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        }
       }
-      out[i * ld_out + j] = res;
-      if (FUSE_RELU) {
-        out[i * ld_out + j] = std::max<outT>(0.0f, out[i * ld_out + j]);
+    } else {
+      if (b_symmetric) {
+        if (bias_ == nullptr) {
+          requantizeForFloatAvx2<
+              false,
+              true,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        } else {
+          requantizeForFloatAvx2<
+              false,
+              true,
+              Q_GRAN,
+              true,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        }
+      } else {
+        if (bias_ == nullptr) {
+          requantizeForFloatAvx2<
+              false,
+              false,
+              Q_GRAN,
+              false,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        } else {
+          requantizeForFloatAvx2<
+              false,
+              false,
+              Q_GRAN,
+              true,
+              FUSE_RELU>(out, inp, block, ld_out, ld_in, r);
+        }
       }
     }
+  } else {
+    assert(0 && "Not supported yet");
   }
 
   return nextop_.template f<instSet>(out, out, block, ld_out, ld_out);
diff --git a/include/fbgemm/QuantUtilsAvx2.h b/include/fbgemm/QuantUtilsAvx2.h
index 04aeba1..47f33a8 100644
--- a/include/fbgemm/QuantUtilsAvx2.h
+++ b/include/fbgemm/QuantUtilsAvx2.h
@@ -95,4 +95,18 @@ FBGEMM_API void requantizeOutputProcessingGConvAvx2(
     int ld_in,
     const requantizationParams_t& r);
 
+template <
+    bool A_SYMMETRIC,
+    bool B_SYMMETRIC,
+    QuantizationGranularity Q_GRAN,
+    bool HAS_BIAS,
+    bool FUSE_RELU>
+FBGEMM_API void requantizeForFloatAvx2(
+    float* out,
+    const std::int32_t* inp,
+    const block_type_t& block,
+    int ld_out,
+    int ld_in,
+    const requantizationForFloatParams_t& r);
+
 } // namespace fbgemm
diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h
index 53fb39d..082edc1 100644
--- a/include/fbgemm/UtilsAvx2.h
+++ b/include/fbgemm/UtilsAvx2.h
@@ -56,4 +56,19 @@ struct requantizationParams_t {
   int groups;
 };
 
+/**
+ * @brief A struct to represent all the parameters for requantizing for floats.
+ */
+struct requantizationForFloatParams_t {
+  std::int32_t A_zero_point;
+  const std::int32_t* B_zero_point;
+  float A_scale;
+  const float* B_scale;
+  const std::int32_t* row_offsets;
+  const std::int32_t* col_offsets;
+  const float* bias;
+  std::uint32_t ncols;
+  int groups;
+};
+
 } // namespace fbgemm
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index be12142..875c9e1 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -662,6 +662,165 @@ template <
     bool B_SYMMETRIC,
     QuantizationGranularity Q_GRAN,
     bool HAS_BIAS,
+    bool FUSE_RELU>
+void requantizeForFloatAvx2(
+    float* out,
+    const int32_t* inp,
+    const block_type_t& block,
+    int ld_out,
+    int ld_in,
+    const requantizationForFloatParams_t& r) {
+  // Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c
+  // using AVX2 instructions
+  int quant_param_idx = 0;
+  if (Q_GRAN == QuantizationGranularity::GROUP) {
+    int ncol_per_group = r.ncols / r.groups;
+    int g = block.col_start / ncol_per_group;
+    quant_param_idx = g;
+  }
+  __m256 multiplier_v = _mm256_set1_ps(r.A_scale * r.B_scale[quant_param_idx]);
+
+  assert(
+      (A_SYMMETRIC == (r.A_zero_point == 0)) &&
+      "A_SYMMETRIC == true if and only if A_zero_point == 0");
+  assert(
+      (B_SYMMETRIC ==
+       ((Q_GRAN == QuantizationGranularity::TENSOR && r.B_zero_point[0] == 0) ||
+        r.row_offsets == nullptr)) &&
+      "B_SYMMETRIC == true if and only if B_zero_point == 0 "
+      "or r.row_offsets == nullptr");
+  assert(
+      (HAS_BIAS == (r.bias != nullptr)) &&
+      "HAS_BIAS == true if and only if bias != nullptr");
+
+  __m256i A_zero_point_v = _mm256_set1_epi32(r.A_zero_point);
+
+  constexpr int VLEN = 8;
+  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+    // Scale row_offset with Bq_zero_point
+    int32_t row_offset = 0;
+    if (B_SYMMETRIC) {
+      row_offset = 0;
+    } else if (
+        Q_GRAN == QuantizationGranularity::TENSOR ||
+        Q_GRAN == QuantizationGranularity::GROUP) {
+      row_offset =
+          r.row_offsets[i - block.row_start] * r.B_zero_point[quant_param_idx];
+    } else {
+      assert(
+          Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+          "unknown quantization granularity");
+    }
+    __m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
+    int j = block.col_start;
+    for (; j < block.col_start + (block.col_size / VLEN * VLEN); j += VLEN) {
+      __m256i x_v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+          inp + (i - block.row_start) * ld_in + (j - block.col_start)));
+
+      if (!A_SYMMETRIC) {
+        __m256i col_off_v = _mm256_mullo_epi32(
+            A_zero_point_v,
+            _mm256_loadu_si256(
+                reinterpret_cast<const __m256i*>(r.col_offsets + j)));
+        x_v = _mm256_sub_epi32(x_v, col_off_v);
+      }
+
+      if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+              _mm256_loadu_si256(
+                  reinterpret_cast<const __m256i*>(r.B_zero_point + j)));
+        }
+        x_v = _mm256_sub_epi32(x_v, row_offset_v);
+      }
+
+      __m256 x_scaled_v;
+      if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        x_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(x_v),
+            _mm256_mul_ps(
+                _mm256_set1_ps(r.A_scale), _mm256_loadu_ps(r.B_scale + j)));
+      } else {
+        x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+      }
+
+      if (HAS_BIAS) {
+        x_scaled_v = _mm256_add_ps(x_scaled_v, _mm256_loadu_ps(r.bias + j));
+      }
+      if (FUSE_RELU) {
+        x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+      }
+
+      _mm256_storeu_ps(out + i * ld_out + j, x_scaled_v);
+    } // j loop vectorized
+
+    int remainder = block.col_start + block.col_size - j;
+    if (remainder > 0) {
+      alignas(64) const int masks[8][8] = {
+        // NOTE: clang-format wants to use a different formatting but the
+        // current formatting should be easier to read.
+        {  0,  0,  0,  0,  0,  0,  0,  0,  },
+        { -1,  0,  0,  0,  0,  0,  0,  0,  },
+        { -1, -1,  0,  0,  0,  0,  0,  0,  },
+        { -1, -1, -1,  0,  0,  0,  0,  0,  },
+        { -1, -1, -1, -1,  0,  0,  0,  0,  },
+        { -1, -1, -1, -1, -1,  0,  0,  0,  },
+        { -1, -1, -1, -1, -1, -1,  0,  0,  },
+        { -1, -1, -1, -1, -1, -1, -1,  0,  },
+      };
+      __m256i mask_v = _mm256_load_si256(
+          reinterpret_cast<const __m256i*>(masks[remainder]));
+
+      __m256i x_v = _mm256_maskload_epi32(
+          inp + (i - block.row_start) * ld_in + (j - block.col_start),
+          mask_v);
+
+      if (!A_SYMMETRIC) {
+         __m256i col_off_v = _mm256_mullo_epi32(
+            A_zero_point_v, _mm256_maskload_epi32(r.col_offsets + j, mask_v));
+        x_v = _mm256_sub_epi32(x_v, col_off_v);
+      }
+
+      if (!B_SYMMETRIC) {
+        if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+          row_offset_v = _mm256_mullo_epi32(
+              _mm256_set1_epi32(r.row_offsets[i - block.row_start]),
+              _mm256_maskload_epi32(r.B_zero_point + j, mask_v));
+        }
+        x_v = _mm256_sub_epi32(x_v, row_offset_v);
+      }
+
+      __m256 x_scaled_v;
+      if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+        x_scaled_v = _mm256_mul_ps(
+            _mm256_cvtepi32_ps(x_v),
+            _mm256_mul_ps(
+                _mm256_set1_ps(r.A_scale),
+                _mm256_maskload_ps(r.B_scale + j, mask_v)));
+      } else {
+        x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+      }
+
+      if (HAS_BIAS) {
+        x_scaled_v =
+            _mm256_add_ps(x_scaled_v, _mm256_maskload_ps(r.bias + j, mask_v));
+      }
+      if (FUSE_RELU) {
+        x_scaled_v = _mm256_max_ps(_mm256_setzero_ps(), x_scaled_v);
+      }
+
+      _mm256_maskstore_ps(out + i * ld_out + j, mask_v, x_scaled_v);
+    } // j loop remainder
+  } // i loop
+}
+
+template <
+    bool A_SYMMETRIC,
+    bool B_SYMMETRIC,
+    QuantizationGranularity Q_GRAN,
+    bool HAS_BIAS,
     bool FUSE_RELU,
     int C_PER_G>
 void requantizeOutputProcessingGConvAvx2(
@@ -1120,6 +1279,13 @@ void requantizeOutputProcessingGConvAvx2(
       int ld_out,                                                            \
       int ld_in,                                                             \
       const requantizationParams_t& r);                                      \
+  template void requantizeForFloatAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU>(    \
+      float* out,                                                            \
+      const int32_t* inp,                                                    \
+      const block_type_t& block,                                             \
+      int ld_out,                                                            \
+      int ld_in,                                                             \
+      const requantizationForFloatParams_t& r);                              \
   template void                                                              \
   requantizeOutputProcessingGConvAvx2<A_SYM, B_SYM, Q_GRAN, BIAS, RELU, 4>(  \
       uint8_t * out,                                                         \
author	Jongsoo Park <jongsoo@fb.com>	2019-03-13 06:14:32 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-03-13 06:17:49 +0300
commit	6011ce3b0c1fccee549e85b37e475c7a734ad742 (patch)
tree	7089177b6c7da36c2582da1cf9b42eca9dfb2ea7
parent	50b43162fd1742122d01f2704945c78f13e0d73e (diff)