diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-09-11 21:47:58 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-09-11 21:52:06 +0300 |
commit | 415035019ccbca2b11b62f1503fdd61e8bc59b10 (patch) | |
tree | a970118ee111cb6531ddc7527f720fd94c5fa0cf | |
parent | 685b1855d739868839cd8f774d987c1a4599b138 (diff) |
API changes to accept an unquantized bias for depthwise convolution
Summary:
Changing the interface to support on-the-fly bias quantization.
Also adding code to quantize the bias on the fly.
Reviewed By: jianyuh
Differential Revision: D17099709
fbshipit-source-id: 5cca79189c00710e703044350260a9fcaca77bb3
-rw-r--r-- | bench/DepthwiseBenchmark.cc | 1 | ||||
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 116 | ||||
-rw-r--r-- | src/FbgemmConv.cc | 4 | ||||
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2.cc | 957 | ||||
-rw-r--r-- | test/I8DepthwiseTest.cc | 1 |
5 files changed, 923 insertions, 156 deletions
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index 4f36e6c..6b2f8b8 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -267,6 +267,7 @@ int main() { col_offsets.data(), bias.data(), false, /* fuse_relu */ + 1.0f, /* act_scale * w_scale */ tid, num_threads); } diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index 98c4ed7..19946cf 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -50,12 +50,39 @@ using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>; using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; +/** To be removed. Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + /** - * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with - * requantization. + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * This version is fused with requantization. * * @col_offsets nullptr if col_offsets are folded into bias + * @act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is + * unquantized. 
*/ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3_pad_1( int N, int H, @@ -71,8 +98,9 @@ FBGEMM_API void depthwise_3x3_pad_1( std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + float act_times_w_scale = 1.0f, int thread_id = 0, int num_threads = 1); @@ -82,6 +110,31 @@ FBGEMM_API void depthwise_3x3_pad_1( * * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> +FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + const float* act_times_w_scale = nullptr, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + */ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int N, int H, @@ -102,9 +155,35 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int thread_id = 0, int num_threads = 1); +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_pad_1( int N, int T, @@ -122,6 +201,33 @@ FBGEMM_API void depthwise_3x3x3_pad_1( std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + float act_times_w_scale = 1.0f, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, const std::int32_t* bias, bool fuse_relu = false, int thread_id = 0, @@ -130,6 +236,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1( /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( int N, int T, @@ -147,8 +254,9 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + const float* act_times_w_scale = nullptr, int thread_id = 0, int num_threads = 1); diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 164411d..6a1e55b 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -115,6 +115,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu + 1.0f, // act_scale * weight_scale thread_id, num_threads); } else if ( @@ -140,6 +141,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu + nullptr, // act_scale * weight_scale thread_id, num_threads); } else { @@ -167,6 +169,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu + 1.0f, // act_scale * weight_scale thread_id, num_threads); } else if ( @@ -191,6 +194,7 @@ int fbgemmConv( outProcess.getColOffsets(), outProcess.getBias(), outProcess.RELU_FUSED, // fuse_relu + nullptr, // act_scale * 
weight_scale thread_id, num_threads); } else { diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc index 7454ef4..aa7b90e 100644 --- a/src/FbgemmI8DepthwiseAvx2.cc +++ b/src/FbgemmI8DepthwiseAvx2.cc @@ -10,6 +10,7 @@ #include <cassert> #include <cmath> // for lrintf and sqrt #include <tuple> // for tie +#include <type_traits> // for is_same #include <immintrin.h> @@ -578,7 +579,8 @@ template < bool HAS_BIAS, bool PER_CHANNEL_QUANTIZATION, bool A_SYMMETRIC, - bool B_SYMMETRIC> + bool B_SYMMETRIC, + typename BIAS_TYPE> static inline __attribute__((always_inline)) void requantize_( int32_t A_zero_point, const float* C_multiplier, @@ -588,10 +590,16 @@ static inline __attribute__((always_inline)) void requantize_( int n, const int32_t* row_offsets, const int32_t* col_offsets, - const int32_t* bias) { + const BIAS_TYPE* bias, + const float* act_times_w_scale = nullptr) { __m256 multiplier_v = _mm256_setzero_ps(); + // Broadcasted reciprocal of act_times_w_scale + __m256 act_times_w_rcp_v = _mm256_setzero_ps(); if (!PER_CHANNEL_QUANTIZATION) { multiplier_v = _mm256_set1_ps(*C_multiplier); + if (is_same<BIAS_TYPE, float>::value) { + act_times_w_rcp_v = _mm256_set1_ps(1.0f / (*act_times_w_scale)); + } } __m256i min_v = _mm256_set1_epi8(static_cast<uint8_t>(0)); @@ -673,39 +681,95 @@ static inline __attribute__((always_inline)) void requantize_( w_v = _mm256_sub_epi32(w_v, col_off_v); } + // convert to float + __m256 xf_v, yf_v, zf_v, wf_v; if (HAS_BIAS) { // static if - x_v = _mm256_add_epi32( - x_v, _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias + j))); - y_v = _mm256_add_epi32( - y_v, - _mm256_loadu_si256( - reinterpret_cast<const __m256i*>(bias + j + VLEN))); - z_v = _mm256_add_epi32( - z_v, - _mm256_loadu_si256( - reinterpret_cast<const __m256i*>(bias + j + 2 * VLEN))); - w_v = _mm256_add_epi32( - w_v, - _mm256_loadu_si256( - reinterpret_cast<const __m256i*>(bias + j + 3 * VLEN))); + if (is_same<BIAS_TYPE, float>::value) { + __m256 
x_bias_v, y_bias_v, z_bias_v, w_bias_v; + if (PER_CHANNEL_QUANTIZATION) { + x_bias_v = _mm256_div_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 0 * VLEN)), + _mm256_loadu_ps(act_times_w_scale + j + 0 * VLEN)); + y_bias_v = _mm256_div_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 1 * VLEN)), + _mm256_loadu_ps(act_times_w_scale + j + 1 * VLEN)); + z_bias_v = _mm256_div_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 2 * VLEN)), + _mm256_loadu_ps(act_times_w_scale + j + 2 * VLEN)); + w_bias_v = _mm256_div_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 3 * VLEN)), + _mm256_loadu_ps(act_times_w_scale + j + 3 * VLEN)); + } else { + x_bias_v = _mm256_mul_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 0 * VLEN)), + act_times_w_rcp_v); + y_bias_v = _mm256_mul_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 1 * VLEN)), + act_times_w_rcp_v); + z_bias_v = _mm256_mul_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 2 * VLEN)), + act_times_w_rcp_v); + w_bias_v = _mm256_mul_ps( + _mm256_loadu_ps( + reinterpret_cast<const float*>(bias + j + 3 * VLEN)), + act_times_w_rcp_v); + } + xf_v = _mm256_add_ps(_mm256_cvtepi32_ps(x_v), x_bias_v); + yf_v = _mm256_add_ps(_mm256_cvtepi32_ps(y_v), y_bias_v); + zf_v = _mm256_add_ps(_mm256_cvtepi32_ps(z_v), z_bias_v); + wf_v = _mm256_add_ps(_mm256_cvtepi32_ps(w_v), w_bias_v); + } else { + x_v = _mm256_add_epi32( + x_v, + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(bias + j + 0 * VLEN))); + y_v = _mm256_add_epi32( + y_v, + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(bias + j + 1 * VLEN))); + z_v = _mm256_add_epi32( + z_v, + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(bias + j + 2 * VLEN))); + w_v = _mm256_add_epi32( + w_v, + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(bias + j + 3 * VLEN))); + xf_v = _mm256_cvtepi32_ps(x_v); + yf_v = _mm256_cvtepi32_ps(y_v); + zf_v = 
_mm256_cvtepi32_ps(z_v); + wf_v = _mm256_cvtepi32_ps(w_v); + } + } else { + xf_v = _mm256_cvtepi32_ps(x_v); + yf_v = _mm256_cvtepi32_ps(y_v); + zf_v = _mm256_cvtepi32_ps(z_v); + wf_v = _mm256_cvtepi32_ps(w_v); } if (PER_CHANNEL_QUANTIZATION) { - multiplier_v = _mm256_loadu_ps(C_multiplier + j); + multiplier_v = _mm256_loadu_ps(C_multiplier + j + 0 * VLEN); } - __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + __m256 x_scaled_v = _mm256_mul_ps(xf_v, multiplier_v); if (PER_CHANNEL_QUANTIZATION) { - multiplier_v = _mm256_loadu_ps(C_multiplier + j + VLEN); + multiplier_v = _mm256_loadu_ps(C_multiplier + j + 1 * VLEN); } - __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v); + __m256 y_scaled_v = _mm256_mul_ps(yf_v, multiplier_v); if (PER_CHANNEL_QUANTIZATION) { multiplier_v = _mm256_loadu_ps(C_multiplier + j + 2 * VLEN); } - __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v); + __m256 z_scaled_v = _mm256_mul_ps(zf_v, multiplier_v); if (PER_CHANNEL_QUANTIZATION) { multiplier_v = _mm256_loadu_ps(C_multiplier + j + 3 * VLEN); } - __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v); + __m256 w_scaled_v = _mm256_mul_ps(wf_v, multiplier_v); __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v); __m256i y_rounded_v = _mm256_cvtps_epi32(y_scaled_v); @@ -745,15 +809,35 @@ static inline __attribute__((always_inline)) void requantize_( x_v = _mm256_sub_epi32(x_v, col_off_v); } + // Convert to float + __m256 xf_v; if (HAS_BIAS) { // static if - x_v = _mm256_add_epi32( - x_v, _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias + j))); + if (is_same<BIAS_TYPE, float>::value) { + __m256 x_bias_v; + if (PER_CHANNEL_QUANTIZATION) { + x_bias_v = _mm256_div_ps( + _mm256_loadu_ps(reinterpret_cast<const float*>(bias + j)), + _mm256_loadu_ps(act_times_w_scale + j)); + } else { + x_bias_v = _mm256_mul_ps( + _mm256_loadu_ps(reinterpret_cast<const float*>(bias + j)), + act_times_w_rcp_v); + } + xf_v = 
_mm256_add_ps(_mm256_cvtepi32_ps(x_v), x_bias_v); + } else { + x_v = _mm256_add_epi32( + x_v, + _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias + j))); + xf_v = _mm256_cvtepi32_ps(x_v); + } + } else { + xf_v = _mm256_cvtepi32_ps(x_v); } if (PER_CHANNEL_QUANTIZATION) { multiplier_v = _mm256_loadu_ps(C_multiplier + j); } - __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + __m256 x_scaled_v = _mm256_mul_ps(xf_v, multiplier_v); __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v); __m256i x_packed_v = _mm256_adds_epi16( @@ -779,11 +863,20 @@ static inline __attribute__((always_inline)) void requantize_( if (!A_SYMMETRIC) { raw -= A_zero_point * col_offsets[j]; } + float raw_f; if (HAS_BIAS) { // static if - raw += bias[j]; + if (is_same<BIAS_TYPE, float>::value) { + raw_f = raw; + raw_f += bias[j] / act_times_w_scale[PER_CHANNEL_QUANTIZATION ? j : 0]; + } else { + raw += bias[j]; + raw_f = raw; + } + } else { + raw_f = raw; } - float ab = raw * C_multiplier[PER_CHANNEL_QUANTIZATION ? j : 0]; + float ab = raw_f * C_multiplier[PER_CHANNEL_QUANTIZATION ? 
j : 0]; long rounded = lrintf(ab) + C_zero_point; C_uint8[j] = std::max( @@ -1165,7 +1258,12 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( } } -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> +template < + bool FUSE_RELU, + bool HAS_BIAS, + bool A_SYMMETRIC, + bool B_SYMMETRIC, + typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( int H, int W, @@ -1184,7 +1282,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( uint8_t* C_uint8, int32_t* row_offsets, const int32_t* col_offsets, - const int32_t* bias) { + const BIAS_TYPE* bias, + float act_times_w_scale) { constexpr int S = 3; constexpr int PAD_T = 1, PAD_L = 1, PAD_R = 1; int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; @@ -1229,7 +1328,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( HAS_BIAS, false, /*PER_CHAN_QUANT*/ A_SYMMETRIC, - B_SYMMETRIC>( + B_SYMMETRIC, + BIAS_TYPE>( A_zero_point, &C_multiplier, C_zero_point, @@ -1238,10 +1338,16 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( K, row_offsets, col_offsets, - bias); + bias, + &act_times_w_scale); } -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> +template < + bool FUSE_RELU, + bool HAS_BIAS, + bool A_SYMMETRIC, + bool B_SYMMETRIC, + typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( int T, int H, @@ -1263,7 +1369,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( uint8_t* C_uint8, int32_t* row_offsets, const int32_t* col_offsets, - const int32_t* bias) { + const BIAS_TYPE* bias, + float act_times_w_scale) { constexpr int R = 3, S = 3; constexpr int PAD_P = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; @@ -1323,10 +1430,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( K, row_offsets, 
col_offsets, - bias); + bias, + &act_times_w_scale); } -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> +template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3_per_channel_quantization_kernel_( int H, @@ -1346,7 +1454,8 @@ depthwise_3x3_per_channel_quantization_kernel_( uint8_t* C_uint8, int32_t* row_offsets, const int32_t* col_offsets, - const int32_t* bias) { + const BIAS_TYPE* bias, + const float* act_times_w_scale) { constexpr int S = 3; constexpr int PAD_T = 1, PAD_L = 1, PAD_R = 1; int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; @@ -1397,7 +1506,8 @@ depthwise_3x3_per_channel_quantization_kernel_( HAS_BIAS, true, /*PER_CHAN_QUANT*/ A_SYMMETRIC, - false /*B_SYMM*/>( + false, /*B_SYMM*/ + BIAS_TYPE>( A_zero_point, C_multiplier, C_zero_point, @@ -1406,10 +1516,11 @@ depthwise_3x3_per_channel_quantization_kernel_( K, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> +template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3x3_per_channel_quantization_kernel_( int T, @@ -1432,7 +1543,8 @@ depthwise_3x3x3_per_channel_quantization_kernel_( uint8_t* C_uint8, int32_t* row_offsets, const int32_t* col_offsets, - const int32_t* bias) { + const BIAS_TYPE* bias, + const float* act_times_w_scale) { constexpr int R = 3, S = 3; constexpr int PAD_P = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; @@ -1497,7 +1609,8 @@ depthwise_3x3x3_per_channel_quantization_kernel_( K, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } static pair<int, int> closest_factors_(int n) { @@ -1512,7 +1625,12 @@ static pair<int, int> closest_factors_(int n) { // This implemntation should be general enough to handle not just 3x3 but other // filter shapes by parameterizing 
with R and S but restricting it to just 3x3 // for now. -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> +template < + bool FUSE_RELU, + bool HAS_BIAS, + bool A_SYMMETRIC, + bool B_SYMMETRIC, + typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( int N, int H, @@ -1529,7 +1647,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( int32_t* C_int32, uint8_t* C_uint8, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + float act_times_w_scale, int thread_id, int num_threads) { assert(K % 8 == 0); @@ -1588,7 +1707,12 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( if (h_begin == 0) { if (w_begin == 0) { - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1606,11 +1730,17 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1628,12 +1758,18 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } if (w_end == W_OUT) { w = W_OUT - 1; - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1651,14 +1787,20 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } } for (h = std::max(1, h_begin); h < std::min(H - 1, h_end); ++h) { if (w_begin == 0) 
{ w = 0; - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1676,11 +1818,17 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1698,12 +1846,18 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } if (w_end == W_OUT) { w = W_OUT - 1; - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1721,7 +1875,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } } @@ -1729,7 +1884,12 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( h = H_OUT - 1; w = 0; if (w_begin == 0) { - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1747,11 +1907,17 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1769,12 +1935,18 @@ static inline __attribute__((always_inline)) void 
depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } if (w_end == W_OUT) { w = W_OUT - 1; - depthwise_3x3_kernel_<FUSE_RELU, HAS_BIAS, A_SYMMETRIC, B_SYMMETRIC>( + depthwise_3x3_kernel_< + FUSE_RELU, + HAS_BIAS, + A_SYMMETRIC, + B_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -1792,13 +1964,19 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } } } // for each n }; -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> +template < + bool FUSE_RELU, + bool HAS_BIAS, + bool A_SYMMETRIC, + bool B_SYMMETRIC, + typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( int N, int T, @@ -1817,7 +1995,8 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( int32_t* C_int32, uint8_t* C_uint8, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + float act_times_w_scale, int thread_id, int num_threads) { assert(K % 8 == 0); @@ -1901,14 +2080,15 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } // w } // h } // t } // for each n }; -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> +template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3_per_channel_quantization_pad_1_( int N, @@ -1926,7 +2106,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( int32_t* C_int32, uint8_t* C_uint8, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + const float* act_times_w_scale, int thread_id, int num_threads) { assert(K % 8 == 0); @@ -1988,7 +2169,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, 
@@ -2006,14 +2188,16 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2031,7 +2215,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } if (w_end == W_OUT) { @@ -2039,7 +2224,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2057,7 +2243,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } } @@ -2067,7 +2254,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2085,14 +2273,16 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2110,7 +2300,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } if (w_end == W_OUT) { @@ -2118,7 +2309,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2136,7 +2328,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } } @@ -2147,7 +2340,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( 
depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2165,14 +2359,16 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2190,7 +2386,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } if (w_end == W_OUT) { @@ -2198,7 +2395,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( depthwise_3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( H, W, K, @@ -2216,13 +2414,14 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } } } // for each n }; -template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> +template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> static inline __attribute__((always_inline)) void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, @@ -2242,7 +2441,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( int32_t* C_int32, uint8_t* C_uint8, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + const float* act_times_w_scale, int thread_id, int num_threads) { assert(K % 8 == 0); @@ -2304,7 +2504,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( depthwise_3x3x3_per_channel_quantization_kernel_< FUSE_RELU, HAS_BIAS, - A_SYMMETRIC>( + A_SYMMETRIC, + BIAS_TYPE>( T, H, W, @@ -2325,7 +2526,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( C_uint8_base, row_offsets, col_offsets, - bias); + bias, + act_times_w_scale); } // w } // h } // t @@ -2333,7 +2535,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( }; // Dispatch A_SYMMETRIC and 
B_SYMMETRIC -template <bool FUSE_RELU, bool HAS_BIAS> +template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE> static void depthwise_3x3_pad_1_( int N, int H, @@ -2349,7 +2551,8 @@ static void depthwise_3x3_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + float act_times_w_scale, int thread_id, int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; @@ -2359,7 +2562,8 @@ static void depthwise_3x3_pad_1_( FUSE_RELU, HAS_BIAS, true /*A_symmetric*/, - true /*B_symmetric*/>( + true /*B_symmetric*/, + BIAS_TYPE>( N, H, W, @@ -2376,6 +2580,7 @@ static void depthwise_3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { @@ -2383,7 +2588,8 @@ static void depthwise_3x3_pad_1_( FUSE_RELU, HAS_BIAS, true /*A_symmetric*/, - false /*B_symmetric*/>( + false /*B_symmetric*/, + BIAS_TYPE>( N, H, W, @@ -2400,6 +2606,7 @@ static void depthwise_3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -2409,7 +2616,8 @@ static void depthwise_3x3_pad_1_( FUSE_RELU, HAS_BIAS, false /*A_symmetric*/, - true /*B_symmetric*/>( + true /*B_symmetric*/, + BIAS_TYPE>( N, H, W, @@ -2426,6 +2634,7 @@ static void depthwise_3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { @@ -2433,7 +2642,8 @@ static void depthwise_3x3_pad_1_( FUSE_RELU, HAS_BIAS, false /*A_symmetric*/, - false /*B_symmetric*/>( + false /*B_symmetric*/, + BIAS_TYPE>( N, H, W, @@ -2450,6 +2660,7 @@ static void depthwise_3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -2457,7 +2668,7 @@ static void depthwise_3x3_pad_1_( } // Dispatch HAS_BIAS -template <bool FUSE_RELU> +template <bool FUSE_RELU, typename BIAS_TYPE> static void depthwise_3x3_pad_1_( int N, int H, @@ -2473,11 +2684,12 @@ static void depthwise_3x3_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const 
BIAS_TYPE* bias, + float act_times_w_scale, int thread_id, int num_threads) { if (bias) { - depthwise_3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/>( + depthwise_3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/, BIAS_TYPE>( N, H, W, @@ -2493,10 +2705,11 @@ static void depthwise_3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/>( + depthwise_3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/, BIAS_TYPE>( N, H, W, @@ -2512,6 +2725,7 @@ static void depthwise_3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -2519,6 +2733,7 @@ static void depthwise_3x3_pad_1_( // Dispatch input shape and FUSE_RELU // assumption: W > 3 and H > 3 +template <typename BIAS_TYPE> void depthwise_3x3_pad_1( int N, int H, @@ -2534,8 +2749,9 @@ void depthwise_3x3_pad_1( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu, + float act_times_w_scale, int thread_id, int num_threads) { if (stride_h == 0 || stride_w == 0 || num_threads == 0) { @@ -2548,7 +2764,7 @@ void depthwise_3x3_pad_1( } if (fuse_relu) { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { - depthwise_3x3_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2564,10 +2780,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { - depthwise_3x3_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2583,10 +2800,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (1 == stride_h && 1 == stride_w) { - depthwise_3x3_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2602,10 +2820,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, 
thread_id, num_threads); } else if (2 == stride_h && 2 == stride_w) { - depthwise_3x3_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2621,10 +2840,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_pad_1_<true /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2640,12 +2860,13 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } else { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { - depthwise_3x3_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2661,10 +2882,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { - depthwise_3x3_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2680,10 +2902,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (1 == stride_h && 1 == stride_w) { - depthwise_3x3_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2699,10 +2922,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (2 == stride_h && 2 == stride_w) { - depthwise_3x3_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2718,10 +2942,11 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_pad_1_<false /* FUSE_RELU */, BIAS_TYPE>( N, H, W, @@ -2737,6 +2962,7 @@ void depthwise_3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -2744,7 +2970,7 @@ void depthwise_3x3_pad_1( } // Dispatch 
A_SYMMETRIC and B_SYMMETRIC -template <bool FUSE_RELU, bool HAS_BIAS> +template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE> static void depthwise_3x3x3_pad_1_( int N, int T, @@ -2762,7 +2988,8 @@ static void depthwise_3x3x3_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + float act_times_w_scale, int thread_id, int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; @@ -2772,7 +2999,8 @@ static void depthwise_3x3x3_pad_1_( FUSE_RELU, HAS_BIAS, true /*A_symmetric*/, - true /*B_symmetric*/>( + true /*B_symmetric*/, + BIAS_TYPE>( N, T, H, @@ -2791,6 +3019,7 @@ static void depthwise_3x3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { @@ -2798,7 +3027,8 @@ static void depthwise_3x3x3_pad_1_( FUSE_RELU, HAS_BIAS, true /*A_symmetric*/, - false /*B_symmetric*/>( + false /*B_symmetric*/, + BIAS_TYPE>( N, T, H, @@ -2817,6 +3047,7 @@ static void depthwise_3x3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -2826,7 +3057,8 @@ static void depthwise_3x3x3_pad_1_( FUSE_RELU, HAS_BIAS, false /*A_symmetric*/, - true /*B_symmetric*/>( + true /*B_symmetric*/, + BIAS_TYPE>( N, T, H, @@ -2845,6 +3077,7 @@ static void depthwise_3x3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { @@ -2852,7 +3085,8 @@ static void depthwise_3x3x3_pad_1_( FUSE_RELU, HAS_BIAS, false /*A_symmetric*/, - false /*B_symmetric*/>( + false /*B_symmetric*/, + BIAS_TYPE>( N, T, H, @@ -2871,6 +3105,7 @@ static void depthwise_3x3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -2878,7 +3113,7 @@ static void depthwise_3x3x3_pad_1_( } // Dispatch HAS_BIAS -template <bool FUSE_RELU> +template <bool FUSE_RELU, typename BIAS_TYPE> static void depthwise_3x3x3_pad_1_( int N, int T, @@ -2896,11 +3131,12 @@ static void depthwise_3x3x3_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* 
col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + float act_times_w_scale, int thread_id, int num_threads) { if (bias) { - depthwise_3x3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/>( + depthwise_3x3x3_pad_1_<FUSE_RELU, true /*HAS_BIAS*/, BIAS_TYPE>( N, T, H, @@ -2918,10 +3154,11 @@ static void depthwise_3x3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/>( + depthwise_3x3x3_pad_1_<FUSE_RELU, false /*HAS_BIAS*/, BIAS_TYPE>( N, T, H, @@ -2939,12 +3176,14 @@ static void depthwise_3x3x3_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } // Dispatch FUSE_RELU +template <typename BIAS_TYPE> void depthwise_3x3x3_pad_1( int N, int T, @@ -2962,8 +3201,9 @@ void depthwise_3x3x3_pad_1( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu, + float act_times_w_scale, int thread_id, int num_threads) { if (stride_t == 0 || stride_h == 0 || stride_w == 0 || num_threads == 0) { @@ -2977,7 +3217,7 @@ void depthwise_3x3x3_pad_1( return; } if (fuse_relu) { - depthwise_3x3x3_pad_1_<true /*FUSE_RELU*/>( + depthwise_3x3x3_pad_1_<true /*FUSE_RELU*/, BIAS_TYPE>( N, T, H, @@ -2995,10 +3235,11 @@ void depthwise_3x3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3x3_pad_1_<false /*FUSE_RELU*/>( + depthwise_3x3x3_pad_1_<false /*FUSE_RELU*/, BIAS_TYPE>( N, T, H, @@ -3016,13 +3257,14 @@ void depthwise_3x3x3_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } // Dispatch A_SYMMETRIC -template <bool FUSE_RELU, bool HAS_BIAS> +template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE> static void depthwise_3x3_per_channel_quantization_pad_1_( int N, int H, @@ -3038,7 +3280,8 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, 
+ const BIAS_TYPE* bias, + const float* act_times_w_scale, int thread_id, int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; @@ -3046,7 +3289,8 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( depthwise_3x3_per_channel_quantization_pad_1_< FUSE_RELU, HAS_BIAS, - true /*A_SYMM*/>( + true /*A_SYMM*/, + BIAS_TYPE>( N, H, W, @@ -3063,13 +3307,15 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { depthwise_3x3_per_channel_quantization_pad_1_< FUSE_RELU, HAS_BIAS, - false /*A_SYMM*/>( + false /*A_SYMM*/, + BIAS_TYPE>( N, H, W, @@ -3086,13 +3332,14 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } // Dispatch HAS_BIAS -template <bool FUSE_RELU> +template <bool FUSE_RELU, typename BIAS_TYPE> static void depthwise_3x3_per_channel_quantization_pad_1_( int N, int H, @@ -3108,13 +3355,15 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + const float* act_times_w_scale, int thread_id, int num_threads) { if (bias) { depthwise_3x3_per_channel_quantization_pad_1_< FUSE_RELU, - true /* HAS_BIAS */>( + true /* HAS_BIAS */, + BIAS_TYPE>( N, H, W, @@ -3130,12 +3379,14 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { depthwise_3x3_per_channel_quantization_pad_1_< FUSE_RELU, - false /* HAS_BIAS */>( + false /* HAS_BIAS */, + BIAS_TYPE>( N, H, W, @@ -3151,12 +3402,14 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } // Dispatch input shape and FUSE_RELU +template <typename BIAS_TYPE> void depthwise_3x3_per_channel_quantization_pad_1( int N, int H, @@ -3172,8 +3425,9 @@ void 
depthwise_3x3_per_channel_quantization_pad_1( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu, + const float* act_times_w_scale, int thread_id, int num_threads) { if (stride_h == 0 || stride_w == 0 || num_threads == 0) { @@ -3186,7 +3440,9 @@ void depthwise_3x3_per_channel_quantization_pad_1( } if (fuse_relu) { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + true /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3202,10 +3458,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + true /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3221,10 +3480,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (1 == stride_h && 1 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + true /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3240,10 +3502,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (2 == stride_h && 2 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + true /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3259,10 +3524,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + true /* FUSE_RELU */, + 
BIAS_TYPE>( N, H, W, @@ -3278,12 +3546,15 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } else { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + false /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3299,10 +3570,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + false /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3318,10 +3592,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (1 == stride_h && 1 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + false /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3337,10 +3614,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else if (2 == stride_h && 2 == stride_w) { - depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + false /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3356,10 +3636,13 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3_per_channel_quantization_pad_1_< + false /* FUSE_RELU */, + BIAS_TYPE>( N, H, W, @@ -3375,6 +3658,7 @@ void depthwise_3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } @@ -3382,7 +3666,7 @@ void 
depthwise_3x3_per_channel_quantization_pad_1( } // Dispatch A_SYMMETRIC -template <bool FUSE_RELU, bool HAS_BIAS> +template <bool FUSE_RELU, bool HAS_BIAS, typename BIAS_TYPE> static void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, int T, @@ -3400,7 +3684,8 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + const float* act_times_w_scale, int thread_id, int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; @@ -3408,7 +3693,8 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, HAS_BIAS, - true /*A_SYMM*/>( + true /*A_SYMM*/, + BIAS_TYPE>( N, T, H, @@ -3427,13 +3713,15 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, HAS_BIAS, - false /*A_SYMM*/>( + false /*A_SYMM*/, + BIAS_TYPE>( N, T, H, @@ -3452,13 +3740,14 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } // Dispatch HAS_BIAS -template <bool FUSE_RELU> +template <bool FUSE_RELU, typename BIAS_TYPE> static void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, int T, @@ -3476,13 +3765,15 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, + const float* act_times_w_scale, int thread_id, int num_threads) { if (bias) { depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, - true /* HAS_BIAS */>( + true /* HAS_BIAS */, + BIAS_TYPE>( N, T, H, @@ -3500,12 +3791,14 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { 
depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, - false /* HAS_BIAS */>( + false /* HAS_BIAS */, + BIAS_TYPE>( N, T, H, @@ -3523,12 +3816,14 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } // Dispatch FUSE_RELU +template <typename BIAS_TYPE> void depthwise_3x3x3_per_channel_quantization_pad_1( int N, int T, @@ -3546,8 +3841,9 @@ void depthwise_3x3x3_per_channel_quantization_pad_1( int32_t C_zero_point, uint8_t* C, const int32_t* col_offsets, - const int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu, + const float* act_times_w_scale, int thread_id, int num_threads) { if (stride_t == 0 || stride_h == 0 || stride_w == 0 || num_threads == 0) { @@ -3561,7 +3857,9 @@ void depthwise_3x3x3_per_channel_quantization_pad_1( return; } if (fuse_relu) { - depthwise_3x3x3_per_channel_quantization_pad_1_<true /* FUSE_RELU */>( + depthwise_3x3x3_per_channel_quantization_pad_1_< + true /* FUSE_RELU */, + BIAS_TYPE>( N, T, H, @@ -3579,10 +3877,13 @@ void depthwise_3x3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } else { - depthwise_3x3x3_per_channel_quantization_pad_1_<false /* FUSE_RELU */>( + depthwise_3x3x3_per_channel_quantization_pad_1_< + false /* FUSE_RELU */, + BIAS_TYPE>( N, T, H, @@ -3600,9 +3901,361 @@ void depthwise_3x3x3_per_channel_quantization_pad_1( C, col_offsets, bias, + act_times_w_scale, thread_id, num_threads); } } +// To be removed +void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + int thread_id, + int num_threads) { + depthwise_3x3_pad_1<std::int32_t>( + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + 
B_zero_point, + B, + C_multiplier, + C_zero_point, + C, + col_offsets, + bias, + fuse_relu, + 1.0f, + thread_id, + num_threads); +} + +// To be removed +void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + int thread_id, + int num_threads) { + depthwise_3x3_per_channel_quantization_pad_1<std::int32_t>( + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C, + col_offsets, + bias, + fuse_relu, + nullptr, + thread_id, + num_threads); +} + +// To be removed +void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + int thread_id, + int num_threads) { + depthwise_3x3x3_pad_1<int32_t>( + N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C, + col_offsets, + bias, + fuse_relu, + 1.0f, // act_scale * weight_scale + thread_id, + num_threads); +} + +void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& B, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + int thread_id, + int num_threads) { + depthwise_3x3x3_per_channel_quantization_pad_1( + 
N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C, + col_offsets, + bias, + fuse_relu, + nullptr, // act_scale * weight_scale + thread_id, + num_threads); +} + +template void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + float act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const float* bias, + bool fuse_relu, + float act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + const float* act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const float* bias, + bool fuse_relu, + const float* act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3x3_pad_1( + int N, + int T, + 
int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + float act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const float* bias, + bool fuse_relu, + float act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& B, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const int32_t* bias, + bool fuse_relu, + const float* act_times_w_scale, + int thread_id, + int num_threads); + +template void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& B, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, + const int32_t* col_offsets, + const float* bias, + bool fuse_relu, + const float* act_times_w_scale, + int thread_id, + int num_threads); + } // namespace fbgemm diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index 00c39c0..8492acb 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -212,6 +212,7 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { a_symmetric ? 
nullptr : col_offsets.data(), bias.data(), false, /* fuse_relu */ + 1.0f, /* act_scale * w_scale */ 0, 1); |