diff options
Diffstat (limited to 'include/fbgemm/FbgemmI8DepthwiseAvx2.h')
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 153 |
1 file changed, 118 insertions, 35 deletions
diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index e7b0ec4..c454b16 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -11,19 +11,26 @@ namespace fbgemm { -// KERNEL_PROD is the product of all kernels. -// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3. -template <int KERNEL_PROD> class FBGEMM_API PackedDepthWiseConvMatrix { public: - // smat in GRS layout - PackedDepthWiseConvMatrix(int K, const std::int8_t* smat); + /** + * @param K the number of channels (same as the number of groups because + * depth-wise convolution has one input/output channel per group) + * @param kernel_prod the product of all kernels. For example, kernel_prod = + * 9 for 3x3 conv, and 27 for 3x3x3 conv. + * @param smat the source unpacked weight in GRS layout + */ + PackedDepthWiseConvMatrix(int K, int kernel_prod, const std::int8_t* smat); virtual ~PackedDepthWiseConvMatrix(); const std::int8_t* PackedMat() const { return pmat_; } + int GetKernelProduct() const { + return kernel_prod_; + } + /** * @brief Unpacks pmat_ into unpack_data. 
* Used for recovering the weight matrix into the original format @@ -36,24 +43,26 @@ class FBGEMM_API PackedDepthWiseConvMatrix { int addr(int r, int c); private: - int K_; - std::int8_t* pmat_; -}; // Packed3x3ConvMatrix - -using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>; -using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>; -using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>; -using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>; -using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>; -using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>; -using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>; -using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; -using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; + const int K_; /**< the number of channels */ + const int kernel_prod_; /**< the product of all kernel dims */ + std::int8_t* pmat_; /**< packed weight */ +}; // PackedDepthWiseConvMatrix -/** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * @params A The input image in NHWK layout - * @params Bp The pre-packed filter +class FBGEMM_API Packed3x3ConvMatrix : public PackedDepthWiseConvMatrix { + public: + Packed3x3ConvMatrix(int K, const std::int8_t* smat) + : PackedDepthWiseConvMatrix(K, 3 * 3, smat) {} +}; + +class FBGEMM_API Packed3x3x3ConvMatrix : public PackedDepthWiseConvMatrix { + public: + Packed3x3x3ConvMatrix(int K, const std::int8_t* smat) + : PackedDepthWiseConvMatrix(K, 3 * 3 * 3, smat) {} +}; + +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * */ FBGEMM_API void depthwise_3x3_pad_1( int N, @@ -64,8 +73,14 @@ FBGEMM_API void depthwise_3x3_pad_1( int stride_w, std::int32_t A_zero_point, const std::uint8_t* A, - const Packed3x3ConvMatrix& Bp, - std::int32_t* C, + std::int32_t B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, int thread_id = 0, int num_threads = 1); @@ -74,7 +89,10 @@ FBGEMM_API void depthwise_3x3_pad_1( * This version is fused with requantization. * * @col_offsets nullptr if col_offsets are folded into bias + * @act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is + * unquantized. */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3_pad_1( int N, int H, @@ -85,22 +103,48 @@ FBGEMM_API void depthwise_3x3_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, std::int32_t B_zero_point, - const Packed3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, float C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + float act_times_w_scale = 1.0f, int thread_id = 0, int num_threads = 1); /** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * This version is fused with requantization and uses per-channel quantization. + * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with + * requantization, and using per-channel quantization. 
* * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> +FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + const float* act_times_w_scale = nullptr, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + */ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int N, int H, @@ -111,7 +155,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, const std::int32_t* B_zero_point, - const Packed3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, const float* C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, @@ -121,6 +165,10 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int thread_id = 0, int num_threads = 1); +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ FBGEMM_API void depthwise_3x3x3_pad_1( int N, int T, @@ -132,14 +180,20 @@ FBGEMM_API void depthwise_3x3x3_pad_1( int stride_w, std::int32_t A_zero_point, const std::uint8_t* A, - const Packed3x3x3ConvMatrix& Bp, - std::int32_t* C, + std::int32_t B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, int thread_id = 0, int num_threads = 1); - /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_pad_1( int N, int T, @@ -152,11 +206,38 @@ FBGEMM_API void depthwise_3x3x3_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, std::int32_t B_zero_point, - const Packed3x3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, float C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + float act_times_w_scale = 1.0f, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, const std::int32_t* bias, bool fuse_relu = false, int thread_id = 0, @@ -165,6 +246,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1( /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( int N, int T, @@ -177,13 +259,14 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, const std::int32_t* B_zero_point, - const Packed3x3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, const float* C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + const float* act_times_w_scale = nullptr, int thread_id = 0, int num_threads = 1); |