diff options
Diffstat (limited to 'include/fbgemm/FbgemmI8DepthwiseAvx2.h')
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 153 |
1 file changed, 118 insertions, 35 deletions
diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index e7b0ec4..c454b16 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -11,19 +11,26 @@ namespace fbgemm { -// KERNEL_PROD is the product of all kernels. -// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3. -template <int KERNEL_PROD> class FBGEMM_API PackedDepthWiseConvMatrix { public: - // smat in GRS layout - PackedDepthWiseConvMatrix(int K, const std::int8_t* smat); + /** + * @param K the number of channels (same as the number of groups because + * depth-wise convolution has one input/output channel per group) + * @param kernel_prod the product of all kernels. For example, kernel_prod = + * 9 for 3x3 conv, and 27 for 3x3x3 conv. + * @param smat the source unpacked weight in GRS layout + */ + PackedDepthWiseConvMatrix(int K, int kernel_prod, const std::int8_t* smat); virtual ~PackedDepthWiseConvMatrix(); const std::int8_t* PackedMat() const { return pmat_; } + int GetKernelProduct() const { + return kernel_prod_; + } + /** * @brief Unpacks pmat_ into unpack_data. 
* Used for recovering the weight matrix into the original format @@ -36,24 +43,26 @@ class FBGEMM_API PackedDepthWiseConvMatrix { int addr(int r, int c); private: - int K_; - std::int8_t* pmat_; -}; // Packed3x3ConvMatrix - -using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>; -using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>; -using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>; -using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>; -using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>; -using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>; -using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>; -using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; -using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; + const int K_; /**< the number of channels */ + const int kernel_prod_; /**< the product of all kernel dims */ + std::int8_t* pmat_; /**< packed weight */ +}; // PackedDepthWiseConvMatrix -/** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * @params A The input image in NHWK layout - * @params Bp The pre-packed filter +class FBGEMM_API Packed3x3ConvMatrix : public PackedDepthWiseConvMatrix { + public: + Packed3x3ConvMatrix(int K, const std::int8_t* smat) + : PackedDepthWiseConvMatrix(K, 3 * 3, smat) {} +}; + +class FBGEMM_API Packed3x3x3ConvMatrix : public PackedDepthWiseConvMatrix { + public: + Packed3x3x3ConvMatrix(int K, const std::int8_t* smat) + : PackedDepthWiseConvMatrix(K, 3 * 3 * 3, smat) {} +}; + +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * */ FBGEMM_API void depthwise_3x3_pad_1( int N, @@ -64,8 +73,14 @@ FBGEMM_API void depthwise_3x3_pad_1( int stride_w, std::int32_t A_zero_point, const std::uint8_t* A, - const Packed3x3ConvMatrix& Bp, - std::int32_t* C, + std::int32_t B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, int thread_id = 0, int num_threads = 1); @@ -74,7 +89,10 @@ FBGEMM_API void depthwise_3x3_pad_1( * This version is fused with requantization. * * @col_offsets nullptr if col_offsets are folded into bias + * @act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is + * unquantized. */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3_pad_1( int N, int H, @@ -85,22 +103,48 @@ FBGEMM_API void depthwise_3x3_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, std::int32_t B_zero_point, - const Packed3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, float C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + float act_times_w_scale = 1.0f, int thread_id = 0, int num_threads = 1); /** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * This version is fused with requantization and uses per-channel quantization. + * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with + * requantization, and using per-channel quantization. 
* * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> +FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + const float* act_times_w_scale = nullptr, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + */ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int N, int H, @@ -111,7 +155,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, const std::int32_t* B_zero_point, - const Packed3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, const float* C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, @@ -121,6 +165,10 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int thread_id = 0, int num_threads = 1); +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ FBGEMM_API void depthwise_3x3x3_pad_1( int N, int T, @@ -132,14 +180,20 @@ FBGEMM_API void depthwise_3x3x3_pad_1( int stride_w, std::int32_t A_zero_point, const std::uint8_t* A, - const Packed3x3x3ConvMatrix& Bp, - std::int32_t* C, + std::int32_t B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, int thread_id = 0, int num_threads = 1); - /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_pad_1( int N, int T, @@ -152,11 +206,38 @@ FBGEMM_API void depthwise_3x3x3_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, std::int32_t B_zero_point, - const Packed3x3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, float C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, + const BIAS_TYPE* bias, + bool fuse_relu = false, + float act_times_w_scale = 1.0f, + int thread_id = 0, + int num_threads = 1); + +/** To be removed. 
Keeping it just to make sure we don't change C2 files and + * fbgemm files in a single diff + * + */ +FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const PackedDepthWiseConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, const std::int32_t* bias, bool fuse_relu = false, int thread_id = 0, @@ -165,6 +246,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1( /** * @col_offsets nullptr if col_offsets are folded into bias */ +template <typename BIAS_TYPE = std::int32_t> FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( int N, int T, @@ -177,13 +259,14 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( std::int32_t A_zero_point, const std::uint8_t* A, const std::int32_t* B_zero_point, - const Packed3x3x3ConvMatrix& Bp, + const PackedDepthWiseConvMatrix& Bp, const float* C_multiplier, std::int32_t C_zero_point, std::uint8_t* C, const std::int32_t* col_offsets, - const std::int32_t* bias, + const BIAS_TYPE* bias, bool fuse_relu = false, + const float* act_times_w_scale = nullptr, int thread_id = 0, int num_threads = 1); |