diff options
author | Daya Khudia <dskhudia@fb.com> | 2019-06-05 22:44:57 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-06-05 22:50:08 +0300 |
commit | 8197494f3ae280941639c72bc1342a9faa8e2ad6 (patch) | |
tree | e0f7fb9b8aad61c46b7d2a79f019255514c03985 /include | |
parent | 77868418c7963572167690ef069b06cbfe67de1f (diff) |
Unified convolution interface
Summary: We want to combine three different convolution interfaces under one top level function.
Reviewed By: protonu
Differential Revision: D15399811
fbshipit-source-id: 7390616d92783506fc156f0f6017f10b5f7f8e30
Diffstat (limited to 'include')
-rw-r--r-- | include/fbgemm/ConvUtils.h | 2 |
-rw-r--r-- | include/fbgemm/Fbgemm.h | 107 |
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 175 |
-rw-r--r-- | include/fbgemm/Utils.h | 5 |
4 files changed, 288 insertions(+), 1 deletion(-)
diff --git a/include/fbgemm/ConvUtils.h b/include/fbgemm/ConvUtils.h index 1c8251e..11f3dcc 100644 --- a/include/fbgemm/ConvUtils.h +++ b/include/fbgemm/ConvUtils.h @@ -101,7 +101,7 @@ struct conv_param_t { std::to_string(stride[d]) + ", "; } for (int d = 0; d < SPATIAL_DIM * 2; ++d) { - out += "pad_" + dim_string[3 - (SPATIAL_DIM % 3) + d] + ":" + + out += "pad_" + dim_string[3 - SPATIAL_DIM + (d % SPATIAL_DIM)] + ":" + std::to_string(pad[d]); if (d < SPATIAL_DIM * 2 - 1) { out += ", "; diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 720f681..721b12f 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -18,6 +18,7 @@ #include "FbgemmBuild.h" #include "FbgemmI8Spmdm.h" #include "QuantUtilsAvx2.h" +#include "FbgemmI8DepthwiseAvx2.h" #include "Types.h" #include "Utils.h" @@ -524,6 +525,62 @@ class FBGEMM_API PackWeightMatrixForGConv { }; /** + * @brief A container class to keep packed weight tensor for convolution. + * The source tensor should already be quantized. + * + * @tparam SPATIAL_DIM is equal to 2 for 2D convolutions and 3 for 3D + * convolutions. Default value is 2. + * @tparam T is the datatype for source tensor. Default value is int8. + * @tparam accT is the datatype to accumulate into. Default value is int32. 
+ */ +template < + int SPATIAL_DIM = 2, + typename T = std::int8_t, + typename accT = std::int32_t> +class FBGEMM_API PackWeightsForConv { + public: + using This = PackWeightsForConv<SPATIAL_DIM, T, accT>; + using inpType = T; + using accType = accT; + + PackWeightsForConv() = delete; // no default constructor + + PackWeightsForConv( + const conv_param_t<SPATIAL_DIM>& conv_param, + const inpType* sdata, + const BlockingFactors* blocking_params = nullptr); + + std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() { + return W_im2col_packed_; + } + + std::shared_ptr<Packed3x3ConvMatrix> getPackedWFor2DDW() { + return W_dw_2D_packed_; + } + + std::shared_ptr<Packed3x3x3ConvMatrix> getPackedWFor3DDW() { + return W_dw_3D_packed_; + } + + std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> + getPackedWForGroupwise() { + return W_gconv_packed_; + } + + private: + // Packed weights if we use im2col based convolution implementation + std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_; + // Packed weights if we use 2D depthwise convolution implementation + std::shared_ptr<Packed3x3ConvMatrix> W_dw_2D_packed_; + // Packed weights if we use 3D depthwise convolution implementation + std::shared_ptr<Packed3x3x3ConvMatrix> W_dw_3D_packed_; + // Packed weights if we use groupwise (small channels per group) convolution + // implementation + std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> + W_gconv_packed_; +}; + +/** * @brief Matrix packed for the first input matrix in GEMM (usually activation), * and row offsets used for requantization is computed during packing. * Im2col is fused with packing here. 
The source matrix is already @@ -1002,6 +1059,7 @@ template < typename nextOPType = DoNothing<outT, outT>> class FBGEMM_API ReQuantizeOutput { public: + static constexpr int RELU_FUSED = FUSE_RELU; using outType = outT; using inpType = inT; /** @@ -1056,12 +1114,18 @@ class FBGEMM_API ReQuantizeOutput { const float* getCMultiplier() const { return C_multiplier_; } + std::int32_t getAZeroPoint() const { + return Aq_zero_point_; + } std::int32_t getCZeroPoint() const { return C_zero_point_; } const std::int32_t* getBZeroPoint() const { return Bq_zero_point_; } + const std::int32_t* getRowOffsets() const { + return q_row_offsets_; + } const std::int32_t* getColOffsets() const { return q_col_offsets_; } @@ -1072,6 +1136,10 @@ class FBGEMM_API ReQuantizeOutput { return ncols_; } + void setRowOffsets(const std::int32_t* row_offsets) { + q_row_offsets_ = row_offsets; + } + private: nextOPType& nextop_; const float* C_multiplier_; @@ -1273,6 +1341,12 @@ void convDepthwiseSeparable( const processOutputType& output); /** + * @brief Is this depthwise convolution optimized? + */ +template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t> +bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); + +/** * @brief Is this groupwise convolution supported? */ template <int SPATIAL_DIM> @@ -1351,4 +1425,37 @@ static void fbgemmGetRange( } } +/** + * @brief Performs convolution using fastest path available. + * + * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions. 
+ */ +template < + typename processOutputType, + int SPATIAL_DIM = 2, + typename ACC_T = std::int32_t> +FBGEMM_API int fbgemmConv( + const conv_param_t<SPATIAL_DIM>& conv_p, + const std::uint8_t* activations, + PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, + typename processOutputType::outType* out, + std::int32_t* outBuffer, + processOutputType& outProcess, + int thread_id, + int num_threads, + const BlockingFactors* blocking_params = nullptr); + +/** + * @brief Returns which fast path to take + * + * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions. + * + * @return optimized_conv_t::depthwise, optimized_conv_t::groupwise or + * optimized_conv_t::im2col + * + */ +template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t> +FBGEMM_API optimized_conv_t +ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); + } // namespace fbgemm diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h new file mode 100644 index 0000000..432687c --- /dev/null +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef I8DEPTHWISE_H +#define I8DEPTHWISE_H +#include <cstdint> +#include "fbgemm/FbgemmBuild.h" + +namespace fbgemm { + +// KERNEL_PROD is the product of all kernels. +// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3. 
+template <int KERNEL_PROD> +class FBGEMM_API PackedDepthWiseConvMatrix { + public: + // smat in RSG layout + PackedDepthWiseConvMatrix(int K, const std::int8_t* smat); + virtual ~PackedDepthWiseConvMatrix(); + + const std::int8_t* PackedMat() const { + return pmat_; + } + + private: + int K_; + std::int8_t* pmat_; +}; // Packed3x3ConvMatrix + +using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>; +using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>; + +/** + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * @params A The input image in NHWK layout + * @params Bp The pre-packed filter + */ +FBGEMM_API void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const Packed3x3ConvMatrix& Bp, + std::int32_t* C, + int thread_id = 0, + int num_threads = 1); + +/** + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * This version is fused with requantization. + * + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +/** + * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 + * This version is fused with requantization and uses per-channel quantization. 
+ * + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +FBGEMM_API void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const Packed3x3x3ConvMatrix& Bp, + std::int32_t* C, + int thread_id = 0, + int num_threads = 1); + +/** + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3x3_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + std::int32_t B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + float C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +/** + * @col_offsets nullptr if col_offsets are folded into bias + */ +FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + std::int32_t A_zero_point, + const std::uint8_t* A, + const std::int32_t* B_zero_point, + const Packed3x3x3ConvMatrix& Bp, + const float* C_multiplier, + std::int32_t C_zero_point, + std::uint8_t* C, + const std::int32_t* col_offsets, + const std::int32_t* bias, + bool fuse_relu = false, + int thread_id = 0, + int num_threads = 1); + +} // namespace fbgemm + +#endif diff --git a/include/fbgemm/Utils.h 
b/include/fbgemm/Utils.h index 58232e7..ef1d4ab 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -32,6 +32,11 @@ enum class matrix_op_t { NoTranspose, Transpose }; enum class inst_set_t { anyarch, avx2, avx512 }; /** + * @brief Typed enum for optimized paths for convolutions + */ +enum class optimized_conv_t { depthwise, groupwise, im2col }; + +/** * @brief Typed enum for implementation type. * * ref is reference and opt is optimized. |