diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-06-05 22:44:57 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-06-05 22:50:08 +0300 |
commit | 8197494f3ae280941639c72bc1342a9faa8e2ad6 (patch) | |
tree | e0f7fb9b8aad61c46b7d2a79f019255514c03985 /include | |
parent | 77868418c7963572167690ef069b06cbfe67de1f (diff) |
Unified convolution interface
Summary: We want to combine three different convolution interfaces under one top level function.
Reviewed By: protonu
Differential Revision: D15399811
fbshipit-source-id: 7390616d92783506fc156f0f6017f10b5f7f8e30
Diffstat (limited to 'include')
-rw-r--r-- | include/fbgemm/ConvUtils.h | 2 |
-rw-r--r-- | include/fbgemm/Fbgemm.h | 107 |
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 175 |
-rw-r--r-- | include/fbgemm/Utils.h | 5 |
4 files changed, 288 insertions(+), 1 deletion(-)
diff --git a/include/fbgemm/ConvUtils.h b/include/fbgemm/ConvUtils.h index 1c8251e..11f3dcc 100644 --- a/include/fbgemm/ConvUtils.h +++ b/include/fbgemm/ConvUtils.h @@ -101,7 +101,7 @@ struct conv_param_t { std::to_string(stride[d]) + ", "; } for (int d = 0; d < SPATIAL_DIM * 2; ++d) { - out += "pad_" + dim_string[3 - (SPATIAL_DIM % 3) + d] + ":" + + out += "pad_" + dim_string[3 - SPATIAL_DIM + (d % SPATIAL_DIM)] + ":" + std::to_string(pad[d]); if (d < SPATIAL_DIM * 2 - 1) { out += ", "; diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 720f681..721b12f 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -18,6 +18,7 @@ #include "FbgemmBuild.h" #include "FbgemmI8Spmdm.h" #include "QuantUtilsAvx2.h" +#include "FbgemmI8DepthwiseAvx2.h" #include "Types.h" #include "Utils.h" @@ -524,6 +525,62 @@ class FBGEMM_API PackWeightMatrixForGConv { }; /** + * @brief A container class to keep packed weight tensor for convolution. + * The source tensor should already be quantized. + * + * @tparam SPATIAL_DIM is equal to 2 for 2D convolutions and 3 for 3D + * convolutions. Default value is 2. + * @tparam T is the datatype for source tensor. Default value is int8. + * @tparam accT is the datatype to accumulate into. Default value is int32. 
+ */ +template < + int SPATIAL_DIM = 2, + typename T = std::int8_t, + typename accT = std::int32_t> +class FBGEMM_API PackWeightsForConv { + public: + using This = PackWeightsForConv<SPATIAL_DIM, T, accT>; + using inpType = T; + using accType = accT; + + PackWeightsForConv() = delete; // no default constructor + + PackWeightsForConv( + const conv_param_t<SPATIAL_DIM>& conv_param, + const inpType* sdata, + const BlockingFactors* blocking_params = nullptr); + + std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() { + return W_im2col_packed_; + } + + std::shared_ptr<Packed3x3ConvMatrix> getPackedWFor2DDW() { + return W_dw_2D_packed_; + } + + std::shared_ptr<Packed3x3x3ConvMatrix> getPackedWFor3DDW() { + return W_dw_3D_packed_; + } + + std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> + getPackedWForGroupwise() { + return W_gconv_packed_; + } + + private: + // Packed weights if we use im2col based convolution implementation + std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_; + // Packed weights if we use 2D depthwise convolution implementation + std::shared_ptr<Packed3x3ConvMatrix> W_dw_2D_packed_; + // Packed weights if we use 3D depthwise convolution implementation + std::shared_ptr<Packed3x3x3ConvMatrix> W_dw_3D_packed_; + // Packed weights if we use groupwise (small channels per group) convolution + // implementation + std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> + W_gconv_packed_; +}; + +/** * @brief Matrix packed for the first input matrix in GEMM (usually activation), * and row offsets used for requantization is computed during packing. * Im2col is fused with packing here. 
The source matrix is already @@ -1002,6 +1059,7 @@ template < typename nextOPType = DoNothing<outT, outT>> class FBGEMM_API ReQuantizeOutput { public: + static constexpr int RELU_FUSED = FUSE_RELU; using outType = outT; using inpType = inT; /** @@ -1056,12 +1114,18 @@ class FBGEMM_API ReQuantizeOutput { const float* getCMultiplier() const { return C_multiplier_; } + std::int32_t getAZeroPoint() const { + return Aq_zero_point_; + } std::int32_t getCZeroPoint() const { return C_zero_point_; } const std::int32_t* getBZeroPoint() const { return Bq_zero_point_; } + const std::int32_t* getRowOffsets() const { + return q_row_offsets_; + } const std::int32_t* getColOffsets() const { return q_col_offsets_; } @@ -1072,6 +1136,10 @@ class FBGEMM_API ReQuantizeOutput { return ncols_; } + void setRowOffsets(const std::int32_t* row_offsets) { + q_row_offsets_ = row_offsets; + } + private: nextOPType& nextop_; const float* C_multiplier_; @@ -1273,6 +1341,12 @@ void convDepthwiseSeparable( const processOutputType& output); /** + * @brief Is this depthwise convolution optimized? + */ +template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t> +bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); + +/** * @brief Is this groupwise convolution supported? */ template <int SPATIAL_DIM> @@ -1351,4 +1425,37 @@ static void fbgemmGetRange( } } +/** + * @brief Performs convolution using fastest path available. + * + * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions. 
+ */ +template < + typename processOutputType, + int SPATIAL_DIM = 2, + typename ACC_T = std::int32_t> +FBGEMM_API int fbgemmConv( + const conv_param_t<SPATIAL_DIM>& conv_p, + const std::uint8_t* activations, + PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, + typename processOutputType::outType* out, + std::int32_t* outBuffer, + processOutputType& outProcess, + int thread_id, + int num_threads, + const BlockingFactors* blocking_params = nullptr); + +/** + * @brief Returns which fast path to take + * + * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions. + * + * @return optimized_conv_t::depthwise, optimized_conv_t::groupwise or + * optimized_conv_t::im2col + * + */ +template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t> +FBGEMM_API optimized_conv_t +ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); + } // namespace fbgemm diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h new file mode 100644 index 0000000..432687c --- /dev/null +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef I8DEPTHWISE_H +#define I8DEPTHWISE_H +#include <cstdint> +#include "fbgemm/FbgemmBuild.h" + +namespace fbgemm { + +// KERNEL_PROD is the product of all kernels. +// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3. 
+template <int KERNEL_PROD> +class FBGEMM_API PackedDepthWiseConvMatrix { + public: + // smat in RSG layout + PackedDepthWiseConvMatrix(int K, const std::int8_t* smat); + virtual ~PackedDepthWiseConvMatrix(); + + const std::int8_t* PackedMat() const { + return pmat_; + } + + private: + int K_; + std::int8_t* pmat_; +}; // Packed3x3ConvMatrix + +using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>; +using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>; + +/** + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * @params A The input image in NHWK layout + * @params Bp The pre-packed filter + */ +FBGEMM_API void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const Packed3x3ConvMatrix& Bp, + std::int32_t* C, + int thread_id = 0, + int num_threads = 1); + +/** + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * This version is fused with requantization. + * + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +/** + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * This version is fused with requantization and uses per-channel quantization. 
+ * + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +FBGEMM_API void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const Packed3x3x3ConvMatrix& Bp, + std::int32_t* C, + int thread_id = 0, + int num_threads = 1); + +/** + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +/** + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +} // namespace fbgemm + +#endif diff --git a/include/fbgemm/Utils.h 
b/include/fbgemm/Utils.h index 58232e7..ef1d4ab 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -32,6 +32,11 @@ enum class matrix_op_t { NoTranspose, Transpose }; enum class inst_set_t { anyarch, avx2, avx512 }; /** + * @brief Typed enum for optimized paths for convolutions + */ +enum class optimized_conv_t { depthwise, groupwise, im2col }; + +/** * @brief Typed enum for implementation type. * * ref is reference and opt is optimized. |