github.com/marian-nmt/FBGEMM.git
Diffstat (limited to 'include/fbgemm/FbgemmI8DepthwiseAvx2.h')
-rw-r--r--  include/fbgemm/FbgemmI8DepthwiseAvx2.h  153
1 file changed, 118 insertions, 35 deletions
diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h
index e7b0ec4..c454b16 100644
--- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h
+++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h
@@ -11,19 +11,26 @@
namespace fbgemm {
-// KERNEL_PROD is the product of all kernels.
-// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3.
-template <int KERNEL_PROD>
class FBGEMM_API PackedDepthWiseConvMatrix {
public:
- // smat in GRS layout
- PackedDepthWiseConvMatrix(int K, const std::int8_t* smat);
+ /**
+ * @param K the number of channels (same as the number of groups, because
+ * depth-wise convolution has one input/output channel per group)
+ * @param kernel_prod the product of the kernel dimensions. For example,
+ * kernel_prod = 9 for a 3x3 conv, and 27 for a 3x3x3 conv.
+ * @param smat the source unpacked weight in GRS layout
+ */
+ PackedDepthWiseConvMatrix(int K, int kernel_prod, const std::int8_t* smat);
virtual ~PackedDepthWiseConvMatrix();
const std::int8_t* PackedMat() const {
return pmat_;
}
+ int GetKernelProduct() const {
+ return kernel_prod_;
+ }
+
/**
* @brief Unpacks pmat_ into unpack_data.
* Used for recovering the weight matrix into the original format
@@ -36,24 +43,26 @@ class FBGEMM_API PackedDepthWiseConvMatrix {
int addr(int r, int c);
private:
- int K_;
- std::int8_t* pmat_;
-}; // Packed3x3ConvMatrix
-
-using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>;
-using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>;
-using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>;
-using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>;
-using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>;
-using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>;
-using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>;
-using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>;
-using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>;
+ const int K_; /**< the number of channels */
+ const int kernel_prod_; /**< the product of all kernel dims */
+ std::int8_t* pmat_; /**< packed weight */
+}; // PackedDepthWiseConvMatrix
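(Not part of the diff: a minimal usage sketch of the reworked class. The buffer name "weights" and all sizes are hypothetical; only the constructor and accessors come from the header above.)

#include <cstdint>
#include <vector>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Pack K per-channel 3x3 filters given in GRS layout (kernel_prod = 9).
// "weights" is a hypothetical buffer of K * 9 signed 8-bit values.
void pack_3x3_weights(int K, const std::vector<std::int8_t>& weights) {
  fbgemm::PackedDepthWiseConvMatrix Bp(K, /*kernel_prod=*/3 * 3, weights.data());
  const std::int8_t* packed = Bp.PackedMat(); // layout chosen by the packer
  int kp = Bp.GetKernelProduct();             // 9 for a 3x3 kernel
  (void)packed;
  (void)kp;
}

Moving kernel_prod from a template parameter to a constructor argument means one class covers every kernel size, at the cost of a runtime rather than compile-time constant.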
-/**
- * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
- * @params A The input image in NHWK layout
- * @params Bp The pre-packed filter
+class FBGEMM_API Packed3x3ConvMatrix : public PackedDepthWiseConvMatrix {
+ public:
+ Packed3x3ConvMatrix(int K, const std::int8_t* smat)
+ : PackedDepthWiseConvMatrix(K, 3 * 3, smat) {}
+};
+
+class FBGEMM_API Packed3x3x3ConvMatrix : public PackedDepthWiseConvMatrix {
+ public:
+ Packed3x3x3ConvMatrix(int K, const std::int8_t* smat)
+ : PackedDepthWiseConvMatrix(K, 3 * 3 * 3, smat) {}
+};
+
+/** To be removed. Kept only so that C2 files and fbgemm files are not
+ * changed in a single diff.
+ */
FBGEMM_API void depthwise_3x3_pad_1(
int N,
@@ -64,8 +73,14 @@ FBGEMM_API void depthwise_3x3_pad_1(
int stride_w,
std::int32_t A_zero_point,
const std::uint8_t* A,
- const Packed3x3ConvMatrix& Bp,
- std::int32_t* C,
+ std::int32_t B_zero_point,
+ const PackedDepthWiseConvMatrix& Bp,
+ float C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
+ const std::int32_t* bias,
+ bool fuse_relu = false,
int thread_id = 0,
int num_threads = 1);
@@ -74,7 +89,10 @@ FBGEMM_API void depthwise_3x3_pad_1(
* This version is fused with requantization.
*
* @col_offsets nullptr if col_offsets are folded into bias
+ * @act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is
+ * unquantized.
*/
+template <typename BIAS_TYPE = std::int32_t>
FBGEMM_API void depthwise_3x3_pad_1(
int N,
int H,
@@ -85,22 +103,48 @@ FBGEMM_API void depthwise_3x3_pad_1(
std::int32_t A_zero_point,
const std::uint8_t* A,
std::int32_t B_zero_point,
- const Packed3x3ConvMatrix& Bp,
+ const PackedDepthWiseConvMatrix& Bp,
float C_multiplier,
std::int32_t C_zero_point,
std::uint8_t* C,
const std::int32_t* col_offsets,
- const std::int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu = false,
+ float act_times_w_scale = 1.0f,
int thread_id = 0,
int num_threads = 1);
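(Not part of the diff: a sketch of a call to the new templated overload, passing an unquantized float bias so that BIAS_TYPE deduces to float and act_times_w_scale is meaningful. All quantization values are made up, and the parameters elided between the hunks above are assumed to match the per-channel declaration below.)

#include <cstdint>
#include <vector>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Hypothetical wrapper; every quantization value here is made up.
void run_3x3(int N, int H, int W, int K,
             const std::uint8_t* A, // NHWK uint8 activations
             const fbgemm::PackedDepthWiseConvMatrix& Bp,
             const std::int32_t* col_offsets,
             std::uint8_t* C,
             float a_scale, float w_scale) {
  std::vector<float> bias(K, 0.0f); // unquantized bias => BIAS_TYPE = float
  fbgemm::depthwise_3x3_pad_1(
      N, H, W, K,
      /*stride_h=*/1, /*stride_w=*/1,
      /*A_zero_point=*/128, A,
      /*B_zero_point=*/0, Bp,
      /*C_multiplier=*/0.01f,
      /*C_zero_point=*/128, C,
      col_offsets, bias.data(),
      /*fuse_relu=*/false,
      /*act_times_w_scale=*/a_scale * w_scale); // input scale x weight scale
}

With a float bias only the templated overload is viable, since the legacy overload above still takes an int32 bias.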
/**
- * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8
- * This version is fused with requantization and uses per-channel quantization.
+ * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with
+ * requantization, and using per-channel quantization.
*
* @col_offsets nullptr if col_offsets are folded into bias
*/
+template <typename BIAS_TYPE = std::int32_t>
+FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
+ int N,
+ int H,
+ int W,
+ int K,
+ int stride_h,
+ int stride_w,
+ std::int32_t A_zero_point,
+ const std::uint8_t* A,
+ const std::int32_t* B_zero_point,
+ const PackedDepthWiseConvMatrix& Bp,
+ const float* C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
+ const BIAS_TYPE* bias,
+ bool fuse_relu = false,
+ const float* act_times_w_scale = nullptr,
+ int thread_id = 0,
+ int num_threads = 1);
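(Not part of the diff: in the per-channel variant, B_zero_point, C_multiplier, and, for float bias, act_times_w_scale each hold one entry per channel, i.e. arrays of length K. A sketch with made-up values:)

#include <cstdint>
#include <vector>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Hypothetical wrapper; all quantization values are made up.
void run_3x3_per_channel(int N, int H, int W, int K,
                         const std::uint8_t* A,
                         const fbgemm::PackedDepthWiseConvMatrix& Bp,
                         const std::int32_t* col_offsets,
                         std::uint8_t* C) {
  std::vector<std::int32_t> b_zero_points(K, 0);  // one entry per channel
  std::vector<float> c_multipliers(K, 0.01f);
  std::vector<float> act_times_w_scales(K, 1.0f);
  std::vector<float> bias(K, 0.0f);               // BIAS_TYPE = float
  fbgemm::depthwise_3x3_per_channel_quantization_pad_1(
      N, H, W, K,
      /*stride_h=*/1, /*stride_w=*/1,
      /*A_zero_point=*/128, A,
      b_zero_points.data(), Bp,
      c_multipliers.data(),
      /*C_zero_point=*/128, C,
      col_offsets, bias.data(),
      /*fuse_relu=*/false,
      act_times_w_scales.data());
}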
+
+/** To be removed. Kept only so that C2 files and fbgemm files are not
+ * changed in a single diff.
+ */
FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
int N,
int H,
@@ -111,7 +155,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
std::int32_t A_zero_point,
const std::uint8_t* A,
const std::int32_t* B_zero_point,
- const Packed3x3ConvMatrix& Bp,
+ const PackedDepthWiseConvMatrix& Bp,
const float* C_multiplier,
std::int32_t C_zero_point,
std::uint8_t* C,
@@ -121,6 +165,10 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
int thread_id = 0,
int num_threads = 1);
+/** To be removed. Kept only so that C2 files and fbgemm files are not
+ * changed in a single diff.
+ */
FBGEMM_API void depthwise_3x3x3_pad_1(
int N,
int T,
@@ -132,14 +180,20 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
int stride_w,
std::int32_t A_zero_point,
const std::uint8_t* A,
- const Packed3x3x3ConvMatrix& Bp,
- std::int32_t* C,
+ std::int32_t B_zero_point,
+ const PackedDepthWiseConvMatrix& Bp,
+ float C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
+ const std::int32_t* bias,
+ bool fuse_relu = false,
int thread_id = 0,
int num_threads = 1);
-
/**
* @col_offsets nullptr if col_offsets are folded into bias
*/
+template <typename BIAS_TYPE = std::int32_t>
FBGEMM_API void depthwise_3x3x3_pad_1(
int N,
int T,
@@ -152,11 +206,38 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
std::int32_t A_zero_point,
const std::uint8_t* A,
std::int32_t B_zero_point,
- const Packed3x3x3ConvMatrix& Bp,
+ const PackedDepthWiseConvMatrix& Bp,
float C_multiplier,
std::int32_t C_zero_point,
std::uint8_t* C,
const std::int32_t* col_offsets,
+ const BIAS_TYPE* bias,
+ bool fuse_relu = false,
+ float act_times_w_scale = 1.0f,
+ int thread_id = 0,
+ int num_threads = 1);
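(Not part of the diff: a sketch of sharding the temporal 3x3x3 conv across OpenMP threads via the thread_id / num_threads convention. The activation layout is assumed to be N x T x H x W x K by analogy with the 2-D NHWK layout, and all quantization values are made up.)

#include <cstdint>
#include <omp.h>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Hypothetical wrapper: each OpenMP thread computes its shard of the output.
void run_3x3x3_parallel(int N, int T, int H, int W, int K,
                        const std::uint8_t* A,
                        const fbgemm::PackedDepthWiseConvMatrix& Bp,
                        const std::int32_t* col_offsets,
                        const std::int32_t* bias,
                        std::uint8_t* C) {
#pragma omp parallel
  {
    fbgemm::depthwise_3x3x3_pad_1(
        N, T, H, W, K,
        /*stride_t=*/1, /*stride_h=*/1, /*stride_w=*/1,
        /*A_zero_point=*/128, A,
        /*B_zero_point=*/0, Bp,
        /*C_multiplier=*/0.01f,
        /*C_zero_point=*/128, C,
        col_offsets, bias,
        /*fuse_relu=*/false,
        /*act_times_w_scale=*/1.0f, // passing it selects the templated overload
        omp_get_thread_num(), omp_get_num_threads());
  }
}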
+
+/** To be removed. Kept only so that C2 files and fbgemm files are not
+ * changed in a single diff.
+ */
+FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
+ int N,
+ int T,
+ int H,
+ int W,
+ int K,
+ int stride_t,
+ int stride_h,
+ int stride_w,
+ std::int32_t A_zero_point,
+ const std::uint8_t* A,
+ const std::int32_t* B_zero_point,
+ const PackedDepthWiseConvMatrix& Bp,
+ const float* C_multiplier,
+ std::int32_t C_zero_point,
+ std::uint8_t* C,
+ const std::int32_t* col_offsets,
const std::int32_t* bias,
bool fuse_relu = false,
int thread_id = 0,
@@ -165,6 +246,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
/**
* @col_offsets nullptr if col_offsets are folded into bias
*/
+template <typename BIAS_TYPE = std::int32_t>
FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
int N,
int T,
@@ -177,13 +259,14 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
std::int32_t A_zero_point,
const std::uint8_t* A,
const std::int32_t* B_zero_point,
- const Packed3x3x3ConvMatrix& Bp,
+ const PackedDepthWiseConvMatrix& Bp,
const float* C_multiplier,
std::int32_t C_zero_point,
std::uint8_t* C,
const std::int32_t* col_offsets,
- const std::int32_t* bias,
+ const BIAS_TYPE* bias,
bool fuse_relu = false,
+ const float* act_times_w_scale = nullptr,
int thread_id = 0,
int num_threads = 1);
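(Not part of the diff: a sketch combining both new features for the 3-D case, per-channel quantization arrays of length K plus an unquantized float bias, with act_times_w_scale supplied per channel. All values are made up.)

#include <cstdint>
#include <vector>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Hypothetical wrapper; the float bias deduces BIAS_TYPE = float.
void run_3x3x3_per_channel(int N, int T, int H, int W, int K,
                           const std::uint8_t* A,
                           const fbgemm::PackedDepthWiseConvMatrix& Bp,
                           const std::int32_t* col_offsets,
                           std::uint8_t* C) {
  std::vector<std::int32_t> b_zero_points(K, 0);  // one entry per channel
  std::vector<float> c_multipliers(K, 0.01f);
  std::vector<float> act_times_w_scales(K, 1.0f);
  std::vector<float> bias(K, 0.0f);
  fbgemm::depthwise_3x3x3_per_channel_quantization_pad_1(
      N, T, H, W, K,
      /*stride_t=*/1, /*stride_h=*/1, /*stride_w=*/1,
      /*A_zero_point=*/128, A,
      b_zero_points.data(), Bp,
      c_multipliers.data(),
      /*C_zero_point=*/128, C,
      col_offsets, bias.data(),
      /*fuse_relu=*/false,
      act_times_w_scales.data());
}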