diff options
Diffstat (limited to 'include/fbgemm/FbgemmI8DepthwiseAvx2.h')
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 116 |
1 file changed, 112 insertions(+), 4 deletions(-)
diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index 98c4ed7..19946cf 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -50,12 +50,39 @@ using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>; using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; +/** To be removed. Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + /** - * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with - * requantization. + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * This version is fused with requantization. * * @col_offsets nullptr if col_offsets are folded into bias + * @act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is + * unquantized. 
*/ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3_pad_1( int N, int H, @@ -71,8 +98,9 @@ FBGEMM_API void depthwise_3x3_pad_1( std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + float act_times_w_scale = 1.0f, int thread_id = 0, int num_threads = 1); @@ -82,6 +110,31 @@ FBGEMM_API void depthwise_3x3_pad_1( * * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> +FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + const float* act_times_w_scale = nullptr, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + */ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int N, int H, @@ -102,9 +155,35 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int thread_id = 0, int num_threads = 1); +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_pad_1( int N, int T, @@ -122,6 +201,33 @@ FBGEMM_API void depthwise_3x3x3_pad_1( std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + float act_times_w_scale = 1.0f, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, const std::int32_t* bias, bool fuse_relu = false, int thread_id = 0, @@ -130,6 +236,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1( /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( int N, int T, @@ -147,8 +254,9 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + const float* act_times_w_scale = nullptr, int thread_id = 0, int num_threads = 1); |