diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-08-01 22:38:23 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-08-01 22:38:23 +0300 |
commit | eb8fede25bd048da6fd396654936703a474f0504 (patch) | |
tree | 943fd29e7e173fb1075b9886b0309765f5f4b114 | |
parent | e4ed5196cbec0d0a485578996b09912e92927e02 (diff) | |
parent | f712cb2328a2b29424bdaeecb9c0731da2cd997b (diff) |
Merge upstream master
-rw-r--r-- | README.md | 3 | ||||
-rw-r--r-- | bench/ConvUnifiedBenchmark.cc | 39 | ||||
-rw-r--r-- | include/fbgemm/Fbgemm.h | 72 | ||||
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 20 | ||||
-rw-r--r-- | include/fbgemm/Utils.h | 2 | ||||
-rw-r--r-- | src/FbgemmConv.cc | 57 | ||||
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2.cc | 46 | ||||
-rw-r--r-- | src/PackAMatrix.cc | 22 | ||||
-rw-r--r-- | src/PackAWithIm2Col.cc | 21 | ||||
-rw-r--r-- | src/PackAWithQuantRowOffset.cc | 25 | ||||
-rw-r--r-- | src/PackAWithRowOffset.cc | 25 | ||||
-rw-r--r-- | src/PackBMatrix.cc | 139 | ||||
-rw-r--r-- | src/PackMatrix.cc | 40 | ||||
-rw-r--r-- | src/PackWeightMatrixForGConv.cc | 141 | ||||
-rw-r--r-- | src/PackWeightsForConv.cc | 68 | ||||
-rw-r--r-- | src/RefImplementations.cc | 54 | ||||
-rw-r--r-- | test/GConvTest.cc | 57 | ||||
-rw-r--r-- | test/I8DepthwiseTest.cc | 69 | ||||
-rw-r--r-- | test/PackedRequantizeAcc16Test.cc | 72 | ||||
-rw-r--r-- | test/PackedRequantizeTest.cc | 70 | ||||
-rw-r--r-- | test/QuantUtilsTest.cc | 183 | ||||
-rw-r--r-- | test/UniConvPackingTest.cc | 148 | ||||
-rw-r--r-- | test/UniConvTest.cc | 325 |
23 files changed, 1322 insertions, 376 deletions
@@ -64,6 +64,9 @@ General build instructions are as follows: ``` git clone --recursive https://github.com/pytorch/FBGEMM.git cd FBGEMM +# if you are updating an existing checkout +git submodule sync +git submodule update --init --recursive mkdir build && cd build cmake .. make diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc index 6bc2cf4..b450beb 100644 --- a/bench/ConvUnifiedBenchmark.cc +++ b/bench/ConvUnifiedBenchmark.cc @@ -26,16 +26,18 @@ using namespace fbgemm; // 2D conv shapes vector<conv_param_t<2>> shapes_2d = { - // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, - // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right - // 2D convolutions - // regular - conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - // groupwise - conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), - - // DW - conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, + // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right + // 2D convolutions + // regular + conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // groupwise + conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // DW + conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // Pointwise + conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}) + }; // 3D conv shapes @@ -43,9 +45,11 @@ vector<conv_param_t<3>> shapes_3d = { // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, stride_w}, // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right} // Regular - conv_param_t<3>(1, 64, 64, {32, 56, 56}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), + conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), // Depthwise - conv_param_t<3>(1, 64, 64, {32, 56, 56}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}) + conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), + // Pointwise + conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0}) }; template <int SPATIAL_DIM, typename Acc_t> @@ -110,6 +114,9 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) { aligned_vector<int8_t> Bint8( kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G)); + aligned_vector<int8_t> Bint8_tr( + kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G)); + int im_out_dim = accumulate( conv_p.OUT_DIM.begin(), conv_p.OUT_DIM.end(), 1, multiplies<int>()); aligned_vector<int32_t> Cint32_ref(conv_p.MB * im_out_dim * conv_p.OC); @@ -132,14 +139,14 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) { randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2); int32_t C_zero_point = 5; - aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); - // reference implementation + // conv_ref expects weights to be in G (R S C/G) K/G + transposeConvWeights<SPATIAL_DIM>(conv_p, Bint8.data(), Bint8_tr.data()); conv_ref( conv_p, Aint8.data(), Aint8_zero_point, - Bint8.data(), + Bint8_tr.data(), Cint32_ref.data()); // matrix dimensions after im2col @@ -162,7 +169,7 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) { KDimPerGroup, OC_per_G, OC_per_G, - Bint8.data() + g * KDimPerGroup * OC_per_G, + Bint8_tr.data() + g * KDimPerGroup * OC_per_G, Bint8_zero_point.data(), col_offsets.data() + g * OC_per_G, conv_p.OC); diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index bdcf308..7f428ed 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -489,6 +489,15 @@ class FBGEMM_API PackBMatrix final const T* smat_; std::int32_t ld_; std::int32_t row_interleave_; + + /** + * @brief Internal function performing both pack & unpack + */ + void pack_unpack_( + const block_type_t& block, + T* unpack_buf, + T* pack_buf, + bool ispack); }; /** @@ -521,6 +530,11 @@ class FBGEMM_API PackWeightMatrixForGConv { void pack(); /** + * @brief Unpacks a pmat buffer into source matrix. + */ + void unpack(T* origin_buf); + + /** * @brief Return packed data */ inpType* getBuf() { @@ -543,6 +557,22 @@ class FBGEMM_API PackWeightMatrixForGConv { const T* sdata_; T* pdata_; bool bufAllocatedHere_; + + /** + * @brief Internal function performing both pack & unpack + */ + void pack_unpack_(const T* src, T* dst, bool ispack); + + /** + * @brief Get the index of the unpacked data + */ + int unpacked_index_(int r, int s, int k, int g, int c, bool tr); + + /** + * @brief Get the index of the packed data + */ + int packed_index_(int r, int s, int k, int g, int c); + }; /** @@ -588,7 +618,40 @@ class FBGEMM_API PackWeightsForConv { return W_gconv_packed_; } + std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() { + return W_pointwise_packed_; + } + + int inputChannels() { + return conv_param_.IC; + } + + int outputChannels() { + return conv_param_.OC; + } + + std::array<int, SPATIAL_DIM> kernelDims() { + return conv_param_.K; + } + + int groups() { + return conv_param_.G; + } + + /** + * @brief Returns true if the packed weights would work for the given + * convolution parameters, and false otherwise + */ + bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p); + + /** + * @brief Unpack packed matric into origin_buf (Used for the serialization to + * recover weight matrix). + */ + void unpack(T* origin_buf); + private: + const conv_param_t<SPATIAL_DIM> conv_param_; // Packed weights if we use im2col based convolution implementation std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_; // Packed weights if we use 2D depthwise convolution implementation @@ -599,6 +662,8 @@ class FBGEMM_API PackWeightsForConv { // implementation std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>> W_gconv_packed_; + // Packed weights if we use direct gemm for pointwise convolution + std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_; }; /** @@ -1374,6 +1439,13 @@ template <int SPATIAL_DIM> FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p); /** + * @brief Is this convolution a direct matrix-matrix multiplication, i.e., 1x1 + * (aka pointwise) with right paddings etc.? + */ +template <int SPATIAL_DIM> +FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p); + +/** * @brief Allocate __size bytes of uninitialized storage whose alignment is * specified by __align. */ diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index 069ff77..e7b0ec4 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -16,7 +16,7 @@ namespace fbgemm { template <int KERNEL_PROD> class FBGEMM_API PackedDepthWiseConvMatrix { public: - // smat in RSG layout + // smat in GRS layout PackedDepthWiseConvMatrix(int K, const std::int8_t* smat); virtual ~PackedDepthWiseConvMatrix(); @@ -24,6 +24,17 @@ class FBGEMM_API PackedDepthWiseConvMatrix { return pmat_; } + /** + * @brief Unpacks pmat_ into unpack_data. + * Used for recovering the weight matrix into the original format + */ + void unpack(std::int8_t* unpacked_data); + + /** + * @brief returns the index into pmat_ given the row and column for smat + */ + int addr(int r, int c); + private: int K_; std::int8_t* pmat_; @@ -31,6 +42,13 @@ class FBGEMM_API PackedDepthWiseConvMatrix { using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>; using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>; +using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>; +using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>; +using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>; +using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>; +using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>; +using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; +using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; /** * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 68fe177..eac0bcd 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -44,7 +44,7 @@ enum class inst_set_t { anyarch, avx2, avx512 }; /** * @brief Typed enum for optimized paths for convolutions */ -enum class optimized_conv_t { depthwise, groupwise, im2col }; +enum class optimized_conv_t { depthwise, groupwise, pointwise, im2col }; /** * @brief Typed enum for implementation type. diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc index 5db63f6..027e6c5 100644 --- a/src/FbgemmConv.cc +++ b/src/FbgemmConv.cc @@ -6,8 +6,9 @@ */ #include <algorithm> -#include <iostream> +#include <numeric> #include <vector> +#include <functional> #include "fbgemm/Fbgemm.h" namespace fbgemm { @@ -33,12 +34,24 @@ bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { }); } +template <int SPATIAL_DIM> +bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { + return std::accumulate(conv_p.K.begin(), conv_p.K.end(), 0) == SPATIAL_DIM && + std::accumulate(conv_p.stride.begin(), conv_p.stride.end(), 0) == + SPATIAL_DIM && + std::accumulate(conv_p.dilation.begin(), conv_p.dilation.end(), 0) == + SPATIAL_DIM && + std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0; +} + template <int SPATIAL_DIM, typename ACC_T> optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) { if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) { return optimized_conv_t::depthwise; } else if (fbgemmOptimizedGConv<SPATIAL_DIM>(conv_p)) { return optimized_conv_t::groupwise; + } else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) { + return optimized_conv_t::pointwise; } else { return optimized_conv_t::im2col; } @@ -58,6 +71,13 @@ int fbgemmConv( static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "Only 2D and 3D convolutions are supported"); + + if (!packed_weights.isPackingCompliant(conv_p)) { + throw std::logic_error( + "[FBGEMM_CONV_ERROR] Prepacked weights can't be used" + " with these convolution parameters!"); + } + switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) { case optimized_conv_t::depthwise: { // 2D and 3D depthwise fast path @@ -134,11 +154,44 @@ int fbgemmConv( num_threads); break; } + case optimized_conv_t::pointwise: { + std::vector<int32_t> row_offset_buf( + PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(blocking_params)); + int image_dim = std::accumulate( + conv_p.IN_DIM.begin(), + conv_p.IN_DIM.end(), + 1, + std::multiplies<int>()); + PackAWithRowOffset<uint8_t, ACC_T> packA( + matrix_op_t::NoTranspose, + conv_p.MB * image_dim, + conv_p.IC, + activations, + conv_p.IC, + nullptr, + conv_p.G, + row_offset_buf.data(), + blocking_params); + + outProcess.setRowOffsets(row_offset_buf.data()); + fbgemmPacked( + packA, + *(packed_weights.getPackedWForPointwise()), + out, + outBuffer, + conv_p.OC, + outProcess, + thread_id, + num_threads, + blocking_params); + break; + } case optimized_conv_t::im2col: { // All other convolutions go through im2col-based implementation // std::cout << "Im2col path" << std::endl; std::vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize()); + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> + ::rowOffsetBufferSize(blocking_params)); const std::int32_t* b_zero_point = outProcess.getBZeroPoint(); bool b_symmetric = b_zero_point[0] == 0; diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc index f96d1d2..183a8a9 100644 --- a/src/FbgemmI8DepthwiseAvx2.cc +++ b/src/FbgemmI8DepthwiseAvx2.cc @@ -170,6 +170,45 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( } template <int KERNEL_PROD> +int PackedDepthWiseConvMatrix<KERNEL_PROD>::addr(int r, int c) { + constexpr int KERNEL_PROD_ALIGNED = (KERNEL_PROD + 1) / 2 * 2; + if (c >= KERNEL_PROD / 4 * 4 && + (KERNEL_PROD % 4 == 1 || KERNEL_PROD % 4 == 2)) { + int kBlock = r / 32; + int reg_idx = (r % 16) / 8 + c / 4 * 4; + + int blk_idx = kBlock * KERNEL_PROD_ALIGNED + reg_idx; + + int r_ = r % 8; + int c_ = c % 4; + + int in_blk_idx = (r % 32) / 16 * 16 + 2 * r_ + c_; + return blk_idx * 32 + in_blk_idx; + + } else { + int kBlock = r / 32; + int reg_idx = (r % 16) / 4 + c / 4 * 4; + + int blk_idx = kBlock * KERNEL_PROD_ALIGNED + reg_idx; + + int r_ = r % 4; + int c_ = c % 4; + + int in_blk_idx = (r % 32) / 16 * 16 + 4 * r_ + c_; + return blk_idx * 32 + in_blk_idx; + } +} + +template <int KERNEL_PROD> +void PackedDepthWiseConvMatrix<KERNEL_PROD>::unpack(int8_t* unpacked_data) { + for (int r = 0; r < K_; ++r) { + for (int c = 0; c < KERNEL_PROD; ++c) { + unpacked_data[r * KERNEL_PROD + c] = pmat_[addr(r, c)]; + } + } +} + +template <int KERNEL_PROD> PackedDepthWiseConvMatrix<KERNEL_PROD>::~PackedDepthWiseConvMatrix() { #ifdef _MSC_VER _aligned_free(pmat_); @@ -180,6 +219,13 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::~PackedDepthWiseConvMatrix() { template class PackedDepthWiseConvMatrix<3 * 3>; template class PackedDepthWiseConvMatrix<3 * 3 * 3>; +template class PackedDepthWiseConvMatrix<1>; +template class PackedDepthWiseConvMatrix<2>; +template class PackedDepthWiseConvMatrix<3>; +template class PackedDepthWiseConvMatrix<4>; +template class PackedDepthWiseConvMatrix<5>; +template class PackedDepthWiseConvMatrix<5 * 2>; +template class PackedDepthWiseConvMatrix<11 * 1>; // c = a0 * b0 + a1 * b1 + a2 * b2 + a3 * b3 // A is in uint8_t diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index 87adaba..143e11d 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -34,31 +34,29 @@ PackAMatrix<T, accT>::PackAMatrix( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } + if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + assert(0 && "unknown architecure"); + } + if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - BaseType::brow_ = params->MCB; - BaseType::bcol_ = params->KCB; - row_interleave_B_ = params->ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - } + BaseType::brow_ = params->MCB; + BaseType::bcol_ = params->KCB; + row_interleave_B_ = params->ROW_INTERLEAVE; } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (fbgemmHasAvx2Support()) { + } else { + // AVX2 BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); } } + if (BaseType::numCols() % groups != 0) { throw std::runtime_error( "groups = " + std::to_string(groups) + diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index e55dd4e..d731654 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -49,32 +49,29 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } + if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + assert(0 && "unknown architecure"); + } if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - BaseType::brow_ = params->MCB; - BaseType::bcol_ = params->KCB; - row_interleave_B_ = params->ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - } + BaseType::brow_ = params->MCB; + BaseType::bcol_ = params->KCB; + row_interleave_B_ = params->ROW_INTERLEAVE; } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (fbgemmHasAvx2Support()) { + } else { + // AVX2 BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); } } + if (BaseType::numCols() % conv_p.G != 0) { throw std::runtime_error( "groups = " + std::to_string(conv_p.G) + diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index 305a298..52caed4 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -45,32 +45,31 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } - rowOffsetAllocatedHere = false; + if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + assert(0 && "unknown architecure"); + } + if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - BaseType::brow_ = params->MCB; - BaseType::bcol_ = params->KCB; - row_interleave_B_ = params->ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - } + BaseType::brow_ = params->MCB; + BaseType::bcol_ = params->KCB; + row_interleave_B_ = params->ROW_INTERLEAVE; } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (fbgemmHasAvx2Support()) { + } else { + // AVX2 BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unknown architecure"); } } + + rowOffsetAllocatedHere = false; + if (BaseType::numCols() % groups != 0) { throw std::runtime_error( "groups = " + std::to_string(groups) + diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc index b791817..733bf5c 100644 --- a/src/PackAWithRowOffset.cc +++ b/src/PackAWithRowOffset.cc @@ -39,32 +39,31 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } - rowOffsetAllocatedHere = false; + if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + assert(0 && "unknown architecure"); + } + if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - BaseType::brow_ = params->MCB; - BaseType::bcol_ = params->KCB; - row_interleave_B_ = params->ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - } + BaseType::brow_ = params->MCB; + BaseType::bcol_ = params->KCB; + row_interleave_B_ = params->ROW_INTERLEAVE; } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (fbgemmHasAvx2Support()) { + } else { + // AVX2 BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; row_interleave_B_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unknown architecure"); } } + + rowOffsetAllocatedHere = false; + if (BaseType::numCols() % groups != 0) { throw std::runtime_error( "groups = " + std::to_string(groups) + diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 4d86d45..b19b5d4 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -188,31 +188,29 @@ PackBMatrix<T, accT>::PackBMatrix( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } + if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + assert(0 && "unknown architecure"); + } + if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - BaseType::brow_ = params->KCB; - BaseType::bcol_ = params->NCB; - row_interleave_ = params->ROW_INTERLEAVE; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - } + BaseType::brow_ = params->KCB; + BaseType::bcol_ = params->NCB; + row_interleave_ = params->ROW_INTERLEAVE; } else { if (fbgemmHasAvx512Support()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB; row_interleave_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; - } else if (fbgemmHasAvx2Support()) { + } else { + // AVX2 BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB; row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; - } else { - // Error - assert(0 && "unknown architecure"); } } + if (BaseType::numRows() % groups != 0) { throw std::runtime_error( "groups = " + std::to_string(groups) + @@ -292,7 +290,11 @@ PackBMatrix<T, accT>::PackBMatrix( } template <typename T, typename accT> -void PackBMatrix<T, accT>::pack(const block_type_t& block) { +void PackBMatrix<T, accT>::pack_unpack_( + const block_type_t& block, + T* unpack_buf, + T* pack_buf, + bool ispack) { assert((BaseType::blockRowSize() % row_interleave_) == 0); assert((block.row_start % BaseType::blockRowSize()) == 0); assert((block.col_start % BaseType::blockColSize()) == 0); @@ -300,7 +302,7 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { BaseType::packedBlock(block); bool tr = (trans_ == matrix_op_t::Transpose); for (int g = 0; g < BaseType::numGroups(); ++g) { - T* out = BaseType::getBuf() + + T* pack_buf_cur = pack_buf + g * BaseType::packedBufferSize(block.row_size, block.col_size); for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * @@ -326,10 +328,16 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() + c_idx_offset * row_interleave_; - int out_idx = r_offset + c_offset; - T val = tr ? smat_[i + (g * block.col_size + j) * ld_] - : smat_[(g * block.row_size + i) * ld_ + j]; - out[out_idx] = val; + if (ispack) { + pack_buf_cur[r_offset + c_offset] = tr + ? unpack_buf[i + (g * block.col_size + j) * ld_] + : unpack_buf[(g * block.row_size + i) * ld_ + j]; + } else { + T* unpack_buf_cur = tr + ? &(unpack_buf[i + (g * block.col_size + j) * ld_]) + : &(unpack_buf[(g * block.row_size + i) * ld_ + j]); + *unpack_buf_cur = pack_buf_cur[r_offset + c_offset]; + } c_idx_offset++; if (c_idx_offset == BaseType::blockColSize()) { @@ -338,78 +346,45 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { } } } - // fill the remaining with zero. - // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. - for (int i = block.row_start + block.row_size; - i < (block.row_start + block.row_size + row_interleave_ - 1) / - row_interleave_ * row_interleave_; - ++i) { - int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * - (BaseType::blockRowSize() * BaseType::blockColSize()) + - (i % BaseType::blockRowSize() / row_interleave_) * - BaseType::blockColSize() * row_interleave_ + - i % row_interleave_; - for (int j = block.col_start; j < block.col_start + block.col_size; j++) { - int c_offset = (j / BaseType::blockColSize()) * - BaseType::blockRowSize() * BaseType::blockColSize() + - (j % BaseType::blockColSize()) * row_interleave_; + if (ispack) { + // fill the remaining with zero. + // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. + for (int i = block.row_start + block.row_size; + i < (block.row_start + block.row_size + row_interleave_ - 1) / + row_interleave_ * row_interleave_; + ++i) { + int r_offset = + ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * + (BaseType::blockRowSize() * BaseType::blockColSize()) + + (i % BaseType::blockRowSize() / row_interleave_) * + BaseType::blockColSize() * row_interleave_ + + i % row_interleave_; + for (int j = block.col_start; j < block.col_start + block.col_size; + j++) { + int c_offset = (j / BaseType::blockColSize()) * + BaseType::blockRowSize() * BaseType::blockColSize() + + (j % BaseType::blockColSize()) * row_interleave_; - int out_idx = r_offset + c_offset; - out[out_idx] = 0; + int out_idx = r_offset + c_offset; + pack_buf_cur[out_idx] = 0; + } } } } // for each group } template <typename T, typename accT> -void PackBMatrix<T, accT>::unpack(T* origin_buf) { - bool tr = (trans_ == matrix_op_t::Transpose); - for (int g = 0; g < this->numGroups(); ++g) { - T* out = BaseType::getBuf() + - g * - BaseType::packedBufferSize( - BaseType::numPackedRows(), BaseType::numPackedCols()); - for (int i = BaseType::packedRowStart(); - i < BaseType::packedRowStart() + BaseType::numPackedRows(); - ++i) { - int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * - (BaseType::blockRowSize() * BaseType::blockColSize()) + - (i % BaseType::blockRowSize() / row_interleave_) * - BaseType::blockColSize() * row_interleave_ + - i % row_interleave_; - - int c_start_offset = - (BaseType::packedColStart() / BaseType::blockColSize()) * - BaseType::blockRowSize() * BaseType::blockColSize() + - (BaseType::packedColStart() % BaseType::blockColSize()) * - row_interleave_; - - int c_idx_offset = 0; - int c_blk_offset = 0; - for (int j = BaseType::packedColStart(); - j < BaseType::packedColStart() + BaseType::numPackedCols(); - ++j) { - int c_offset = c_start_offset + - c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() + - c_idx_offset * row_interleave_; - - int out_idx = r_offset + c_offset; - - T val = out[out_idx]; - if (tr) { - origin_buf[i + (g * BaseType::numPackedCols() + j) * ld_] = val; - } else { - origin_buf[(g * BaseType::numPackedRows() + i) * ld_ + j] = val; - } +void PackBMatrix<T, accT>::pack(const block_type_t& block) { + pack_unpack_(block, const_cast<T*>(smat_), BaseType::getBuf(), true); +} - c_idx_offset++; - if (c_idx_offset == BaseType::blockColSize()) { - c_idx_offset = 0; - c_blk_offset++; - } - } - } - } // for each group +template <typename T, typename accT> +void PackBMatrix<T, accT>::unpack(T* origin_buf) { + block_type_t blockB{BaseType::packedRowStart(), + BaseType::numPackedRows(), + BaseType::packedColStart(), + BaseType::numPackedCols()}; + pack_unpack_(blockB, origin_buf, BaseType::getBuf(), false); } template <typename T, typename accT> diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index 33227fb..c7503dd 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -36,45 +36,37 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } + if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + assert(0 && "unknown architecure"); + } + int MCB, KCB, NCB; if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - MCB = params->MCB; - NCB = params->NCB; - KCB = params->KCB; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - } + MCB = params->MCB; + NCB = params->NCB; + KCB = params->KCB; } else { if (fbgemmHasAvx512Support()) { MCB = PackingTraits<inpType, accType, inst_set_t::avx512>::MCB; NCB = PackingTraits<inpType, accType, inst_set_t::avx512>::NCB; KCB = PackingTraits<inpType, accType, inst_set_t::avx512>::KCB; - } else if (fbgemmHasAvx2Support()) { + } else { + // AVX2 MCB = PackingTraits<inpType, accType, inst_set_t::avx2>::MCB; NCB = PackingTraits<inpType, accType, inst_set_t::avx2>::NCB; KCB = PackingTraits<inpType, accType, inst_set_t::avx2>::KCB; - } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); - return -1; } } - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { - if (isA()) { - return MCB * KCB; - } else { - int rowBlock = KCB; - int colBlock = NCB; - return (((rows + rowBlock - 1) / rowBlock) * rowBlock) * - (((cols + colBlock - 1) / colBlock) * colBlock); - } + if (isA()) { + return MCB * KCB; } else { - // TODO: Have default slower path - assert(0 && "unsupported architecure"); + int rowBlock = KCB; + int colBlock = NCB; + return (((rows + rowBlock - 1) / rowBlock) * rowBlock) * + (((cols + colBlock - 1) / colBlock) * colBlock); } + return -1; } diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc index 0fb0e2c..ba6adf3 100644 --- a/src/PackWeightMatrixForGConv.cc +++ b/src/PackWeightMatrixForGConv.cc @@ -36,8 +36,61 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv( } /** - * @brief Pack weight tensor in a suitable format required for the optimized - * kernel. + * @brief Get the index of the unpacked data for a given <r, s, k, g, c, tr> + * + * Non-transposed: G (R S C/G) K/G + * Transposed: G K/G (R S C/G) + * Using inline as this will be called frequently + */ +template <typename T, typename accT, int SPATIAL_DIM> +inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpacked_index_( + int r, int s, int k, int g, int c, bool tr) { + // Get the full dimensions + int R = conv_param_.K[0]; + int S = conv_param_.K[1]; + int G = conv_param_.G; + int IC_per_G = conv_param_.IC / G; + int OC_per_G = conv_param_.OC / G; + + int idx; + if (tr) { + idx = (((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c; + } else { + idx = (((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k; + } + return idx; +} + +/** + * @brief Get the index of the packed data for a given <r, s, k, g, c> + * + * The index may differ depending on IC_per_G. + * Using inline as this will be called frequently + */ +template <typename T, typename accT, int SPATIAL_DIM> +inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::packed_index_( + int r, int s, int k, int g, int c) { + // Get the full dimensions + int R = conv_param_.K[0]; + int S = conv_param_.K[1]; + int G = conv_param_.G; + int IC_per_G = conv_param_.IC / G; + int OC_per_G = conv_param_.OC / G; + + int idx; + // For IC_per_G == 4, we need to work on 2 groups at a time + if (IC_per_G == 4) { + idx = (((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 + (g % 2)) + * IC_per_G + c; + } else { + idx = ((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) * OC_per_G + k) + * 4 + (c % 4); + } + return idx; +} + +/** + * @ brief Pack or unpack matrix * * Let IC_per_G be number of input channels per group and OC_per_G be number of * output channels per group. @@ -53,15 +106,17 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv( * on 2 groups at a time and full SIMD width can be efficiently utilized even * while working on 1 group at a time. * In this case, the layout is G (C/4) R S K 4 - */ +*/ + template <typename T, typename accT, int SPATIAL_DIM> -void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() { +void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack_unpack_( + const T* src, T* dst, bool ispack) { // filters are assumed to be in G RS C/G K/G format int R = conv_param_.K[0]; int S = conv_param_.K[1]; int G = conv_param_.G; - int IC_per_G = conv_param_.IC / conv_param_.G; - int OC_per_G = conv_param_.OC / conv_param_.G; + int IC_per_G = conv_param_.IC / G; + int OC_per_G = conv_param_.OC / G; // If transpose option is set, the weight matrix is in layout G K/G (R S C/G) // instead of G (R S C/G) K/G @@ -73,25 +128,13 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() { for (int k = 0; k < OC_per_G; ++k) { for (int g = 0; g < G; ++g) { for (int c = 0; c < IC_per_G; ++c) { - inpType b = tr - ? sdata_ - [(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c] - : sdata_ - [(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k]; - if (IC_per_G == 4) { - // For IC_per_G == 4, we need to work on 2 groups at a time - pdata_ - [(((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 + - (g % 2)) * - IC_per_G + - c] = b; + int p_idx = packed_index_(r, s, k, g, c); + int up_idx = unpacked_index_(r, s, k, g, c, tr); + // Pack: src (unpacked) -> dst (packed) + if (ispack) { + dst[p_idx] = src[up_idx]; } else { - pdata_ - [((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) * - OC_per_G + - k) * - 4 + - (c % 4)] = b; + dst[up_idx] = src[p_idx]; } } } @@ -99,14 +142,54 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() { } } } else { + // For pack & transposed, call transposeConvWeights() + // G K/G (R S C/G) => G (R S C/G) K/G if (tr) { - // conv_ref expects weights to be in G (R S C/G) K/G format - transposeConvWeights(conv_param_, sdata_, pdata_); + if (ispack) { + transposeConvWeights(conv_param_, src, dst); + } else { + // TODO: Wrap this as a inverseTransposeConvWeights()? + // For unpack & transposed, call transposeConvWeights() + // G (R S C/G) K/G => G K/G (R S C/G) + for (int r = 0; r < R; ++r) { + for (int s = 0; s < S; ++s) { + for (int k = 0; k < OC_per_G; ++k) { + for (int g = 0; g < G; ++g) { + for (int c = 0; c < IC_per_G; ++c) { + dst[(((g * OC_per_G + k) * R + r) * S + s) + * IC_per_G + c] = + src[(((g * R + r) * S + s) * IC_per_G + c) + * OC_per_G + k]; + } + } + } + } + } + } // end if(ispack) } else { // just copy the data for not supported cases - memcpy(pdata_, sdata_, G * R * S * OC_per_G * IC_per_G * sizeof(inpType)); - } - } + memcpy(dst, src, + G * R * S * OC_per_G * IC_per_G * sizeof(inpType)); + } //end if(tr) + } // end if(fbgemmOptimizedGConv(conv_param_) +} + +/** + * @brief Pack weight tensor in a suitable format required for the optimized + * kernel. + */ +template <typename T, typename accT, int SPATIAL_DIM> +void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() { + pack_unpack_(sdata_, pdata_, true); +} + +/** + * @brief Unpack the packed weight tensor (for the optimized kernel) + * to the original form. + */ +template <typename T, typename accT, int SPATIAL_DIM> +void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpack(T* origin_buf) { + pack_unpack_(const_cast<const T*>(pdata_), origin_buf, false); } template class PackWeightMatrixForGConv<int8_t, int32_t, 2>; diff --git a/src/PackWeightsForConv.cc b/src/PackWeightsForConv.cc index c811144..25b04af 100644 --- a/src/PackWeightsForConv.cc +++ b/src/PackWeightsForConv.cc @@ -4,6 +4,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include <algorithm> #include <memory> #include "fbgemm/Fbgemm.h" @@ -13,7 +14,8 @@ template <int SPATIAL_DIM, typename T, typename accT> PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv( const conv_param_t<SPATIAL_DIM>& conv_p, const T* sdata, - const BlockingFactors* blocking_params) { + const BlockingFactors* blocking_params) + : conv_param_(conv_p) { static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "Only 2D and 3D convolutions are supported"); @@ -42,18 +44,36 @@ PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv( W_dw_3D_packed_ = nullptr; W_gconv_packed_ = std::make_shared<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>( - matrix_op_t::NoTranspose, conv_p, sdata, nullptr); + matrix_op_t::Transpose, conv_p, sdata, nullptr); + break; + } + case optimized_conv_t::pointwise: { + W_im2col_packed_ = nullptr; + W_dw_2D_packed_ = nullptr; + W_dw_3D_packed_ = nullptr; + W_gconv_packed_ = nullptr; + int NDim = conv_p.OC / conv_p.G; + int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; + W_pointwise_packed_ = std::make_shared<PackBMatrix<T, accT>>( + matrix_op_t::Transpose, + KDim, + NDim, + sdata, + KDim / conv_p.G, + nullptr, + conv_p.G, + blocking_params); break; } case optimized_conv_t::im2col: { int NDim = conv_p.OC / conv_p.G; int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; W_im2col_packed_ = std::make_shared<PackBMatrix<T, accT>>( - matrix_op_t::NoTranspose, + matrix_op_t::Transpose, KDim, NDim, sdata, - NDim, + KDim / conv_p.G, nullptr, conv_p.G, blocking_params); @@ -65,6 +85,46 @@ PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv( } // switch } +template <int SPATIAL_DIM, typename T, typename accT> +void PackWeightsForConv<SPATIAL_DIM, T, accT>::unpack(T* origin_buf) { + if (W_dw_2D_packed_) { + W_dw_2D_packed_->unpack(origin_buf); + } else if (W_dw_3D_packed_) { + W_dw_3D_packed_->unpack(origin_buf); + } else if (W_gconv_packed_) { + W_gconv_packed_->unpack(origin_buf); + } else if (W_im2col_packed_) { + W_im2col_packed_->unpack(origin_buf); + } else if (W_pointwise_packed_) { + W_pointwise_packed_->unpack(origin_buf); + } else { + assert(false && "At least one packed weights object should exist"); + } +} + +template <int SPATIAL_DIM, typename T, typename accT> +bool PackWeightsForConv<SPATIAL_DIM, T, accT>::isPackingCompliant( + const conv_param_t<SPATIAL_DIM>& test_conv_p) { + return conv_param_.IC == test_conv_p.IC && conv_param_.OC == test_conv_p.OC && + conv_param_.G == test_conv_p.G && + std::equal( + conv_param_.K.begin(), + conv_param_.K.end(), + test_conv_p.K.begin()) && + std::equal( + conv_param_.stride.begin(), + conv_param_.stride.end(), + test_conv_p.stride.begin()) && + std::equal( + conv_param_.pad.begin(), + conv_param_.pad.end(), + test_conv_p.pad.begin()) && + std::equal( + conv_param_.dilation.begin(), + conv_param_.dilation.end(), + test_conv_p.dilation.begin()); +} + template class PackWeightsForConv<2, int8_t, int32_t>; template class PackWeightsForConv<3, int8_t, int32_t>; diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index b4b0c2b..e3c0eac 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -181,8 +181,7 @@ void cblas_sgemm_ref( int ldb, float beta, float* Cfp32, - int ldc - ) { + int ldc) { for (int i = 0; i < m; ++i) { for (int j = 0; j < n; ++j) { float sum = 0; @@ -204,7 +203,6 @@ void cblas_sgemm_ref( } } - void row_offsets_u8acc32_ref( int M, int K, @@ -542,21 +540,49 @@ void transposeConvWeights( const conv_param_t<SPATIAL_DIM>& conv_p, const std::int8_t* src, std::int8_t* dest) { - assert(SPATIAL_DIM == 2 && "Only 2D supported currently"); - int R = conv_p.K[0]; - int S = conv_p.K[1]; int G = conv_p.G; int IC_per_G = conv_p.IC / conv_p.G; int OC_per_G = conv_p.OC / conv_p.G; - // Transforms weights from G K/G (R S C/G) to G (R S C/G) K/G format. - for (int r = 0; r < R; ++r) { - for (int s = 0; s < S; ++s) { - for (int k = 0; k < OC_per_G; ++k) { - for (int g = 0; g < G; ++g) { - for (int c = 0; c < IC_per_G; ++c) { - dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] = - src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c]; + assert( + (SPATIAL_DIM == 3 || SPATIAL_DIM == 2) && + "Only 2D and 3D convolutions are supported"); + if (SPATIAL_DIM == 2) { + int R = conv_p.K[0]; + int S = conv_p.K[1]; + // Transforms weights from G K/G (R S C/G) to G (R S C/G) K/G format. + for (int r = 0; r < R; ++r) { + for (int s = 0; s < S; ++s) { + for (int k = 0; k < OC_per_G; ++k) { + for (int g = 0; g < G; ++g) { + for (int c = 0; c < IC_per_G; ++c) { + dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] = + src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c]; + } + } + } + } + } + } else { + // Transforms weights from G K/G (T R S C/G) to G (T R S C/G) K/G format. + int T = conv_p.K[0]; + int R = conv_p.K[1]; + int S = conv_p.K[2]; + for (int t = 0; t < T; ++t) { + for (int r = 0; r < R; ++r) { + for (int s = 0; s < S; ++s) { + for (int k = 0; k < OC_per_G; ++k) { + for (int g = 0; g < G; ++g) { + for (int c = 0; c < IC_per_G; ++c) { + dest + [((((g * T + t) * R + r) * S + s) * IC_per_G + c) * + OC_per_G + + k] = + src[((((g * OC_per_G + k) * T + t) * R + r) * S + s) * + IC_per_G + + c]; + } + } } } } diff --git a/test/GConvTest.cc b/test/GConvTest.cc index 84f0d52..0074535 100644 --- a/test/GConvTest.cc +++ b/test/GConvTest.cc @@ -43,6 +43,8 @@ class fbgemmGConvAcc32WithQuantGranularityTest QuantizationGranularity, bool, bool>> {}; +class fbgemmGConvPackTest + : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t>> {}; }; // namespace INSTANTIATE_TEST_CASE_P( @@ -61,6 +63,13 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn(qGranularityVals), ::testing::Bool(), // A symmetric ::testing::Bool())); // B symmetric + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmGConvPackTest, + ::testing::Combine( + ::testing::Values(matrix_op_t::NoTranspose), + ::testing::ValuesIn(transposeVals))); /** * @brief Shapes for unit test. */ @@ -413,3 +422,51 @@ TEST_P(fbgemmGConvAcc32Test, NoRequantizeTest) { static_cast<int32_t>(0)); } // for each shape } + +/** + * @brief Unit test for packing and unpacking the weight tensor + */ +TEST_P(fbgemmGConvPackTest, PackUnpackTest) { + vector<conv_param_t<>> shapes(GetShapes_()); + matrix_op_t atrans, btrans; + tie(atrans, btrans) = GetParam(); + + for (auto conv_p : shapes) { + int R = conv_p.K[0]; + int S = conv_p.K[1]; + int IC_per_G = conv_p.IC / conv_p.G; + int OC_per_G = conv_p.OC / conv_p.G; + + // Weights -- test the packing/unpacking of only the weights + // when btrans == Transpose, the weight matrix is in layout G K/G (R S C/G) + // instead of G (R S C/G) K/G + int weight_len = R * S * conv_p.G * IC_per_G * OC_per_G; + aligned_vector<int8_t> Bint8(weight_len, 0); + + // Random fill the weights + randFill<int8_t>(Bint8, -4, 4); + + // Instantiate the object + PackWeightMatrixForGConv<int8_t> packedWeights( + btrans, conv_p, Bint8.data(), nullptr); + + // Setup a buffer to get pack -> unpacked results + aligned_vector<int8_t> unpack_buf(weight_len, 0); + + // START Actual pack-unpack operations + // Perform packing first. This should populate pdata_ of packedWeights + packedWeights.pack(); + + // Next perform unpacking + packedWeights.unpack(unpack_buf.data()); + // END actual pack-unpack operations + + // Sanity check + for (int i = 0; i < weight_len; ++i) { + EXPECT_EQ(Bint8.data()[i], unpack_buf.data()[i]) + << "Pack/Unpack results differ at index " << i + << ", Reference: " << static_cast<int> (Bint8.data()[i]) + << ", Pack-Unpacked: " << static_cast<int> (unpack_buf.data()[i]); + } + } // for each shape +} diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index 11bd625..0604879 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -69,8 +69,16 @@ static vector<vector<int>> shapes = { }; namespace { -class FBGemmDepthWiseTest - : public testing::TestWithParam<tuple<bool, bool>> {}; + +class FBGemmDepthWiseTest : public testing::TestWithParam<tuple<bool, bool>> {}; + +// Two parameters are K (or Groups) and kernel_prod, i.e., +// (output_channels)(kernel_prod) +// output_channels == Groups. +// For example, kernel_prod for 3x3 convolution is 9 +class FBGemmDepthWisePackUnpackTest + : public testing::TestWithParam<tuple<int, int>> {}; + } // namespace INSTANTIATE_TEST_CASE_P( @@ -78,6 +86,13 @@ INSTANTIATE_TEST_CASE_P( FBGemmDepthWiseTest, ::testing::Combine(::testing::Bool(), ::testing::Bool())); +INSTANTIATE_TEST_CASE_P( + InstantiationName, + FBGemmDepthWisePackUnpackTest, + ::testing::Combine( + ::testing::ValuesIn({8, 16, 24, 32, 40, 64, 72}), + ::testing::ValuesIn({1, 2, 3, 4, 5, 9, 10, 11, 27}))); + TEST_P(FBGemmDepthWiseTest, Test3x3) { bool a_symmetric, b_symmetric; tie(a_symmetric, b_symmetric) = GetParam(); @@ -297,8 +312,8 @@ TEST_P(FBGemmDepthWiseTest, Test3x3x3) { for (int k = 0; k < K; ++k) { int32_t expected = C_uint8_ref [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k]; - int32_t actual = C_uint8 - [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k]; + int32_t actual = + C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k]; EXPECT_EQ(expected, actual) << "Depthwise 3x3 results differ at (" << n << ", " << t << ", " << h << ", " << w << ", " << k << ")."; @@ -561,4 +576,50 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { } // for each shape } // Test3x3PerChannelQuantization +TEST_P(FBGemmDepthWisePackUnpackTest, TestPackUnpack) { + int K, kernel_prod; + tie(K, kernel_prod) = GetParam(); + + ASSERT_EQ(K % 8, 0) + << "output channels (== groups) should be a multiple of 8"; + aligned_vector<int8_t> B(K * kernel_prod); + randFill<int8_t>(B, -16, 16); + + aligned_vector<int8_t> BUnpacked(K * kernel_prod); + + if (kernel_prod == 1) { + Packed1ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 2) { + Packed2ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 3) { + Packed3ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 4) { + Packed4ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 5) { + Packed5ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 9) { + Packed3x3ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 10) { + Packed10ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 11) { + Packed11ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else if (kernel_prod == 27) { + Packed3x3x3ConvMatrix BPacked(K, B.data()); + BPacked.unpack(BUnpacked.data()); + } else { + ASSERT_TRUE(false); + } + + ASSERT_EQ(B, BUnpacked) + << "Original and unpacked data elements are not the same"; +} // TestPackUnpack + } // namespace fbgemm diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 20f860e..23af3eb 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -27,7 +27,7 @@ using namespace std; using namespace fbgemm; vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; + matrix_op_t::Transpose}; vector<QuantizationGranularity> qGranularityVals{ QuantizationGranularity::TENSOR, @@ -40,6 +40,8 @@ class fbgemmu8s8acc16WithQuantGranularityTest tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {}; class fbgemmu8s8acc16Test : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {}; +class fbgemmPackUnpackAcc16Test + : public testing::TestWithParam<tuple<matrix_op_t, bool>> {}; }; // namespace INSTANTIATE_TEST_CASE_P( @@ -59,6 +61,11 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn(transposeVals), ::testing::Bool())); +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmPackUnpackAcc16Test, + ::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool())); + /** * @brief Shapes for unit test. */ @@ -810,3 +817,66 @@ TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) { } // for each groups } // for each shape } + +/** + * @brief Unit test for packing and unpacking the weight tensor. + */ +TEST_P(fbgemmPackUnpackAcc16Test, TestPackUnpack) { + vector<vector<int>> shapes(GetShapes_()); + matrix_op_t btrans; + bool test_ld; + tie(btrans, test_ld) = GetParam(); + + for (auto shape : shapes) { + for (int groups : {1, 3, 4}) { + int n = shape[1]; + int k = shape[2]; + + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + // kxn matrix + aligned_vector<int8_t> Bint8(k * n); + randFill<int8_t>(Bint8, -128, 127); + + // To test lda != k , we just reduce k by half and use the original k + // as lda. + int n_adjusted = n; + if (test_ld) { + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + // Note that packing for weight is performed during the constructor + // stage. + PackBMatrix<int8_t, int16_t> packedWeights( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k_per_group : n, + nullptr, + groups); + + // Setup a buffer to get pack -> unpacked results + aligned_vector<int8_t> unpack_buf(k * n, 0); + + // Perform unpacking + packedWeights.unpack(unpack_buf.data()); + + // Sanity check + for (int i = 0; i < k; i++) { + for (int j = 0; j < n_adjusted; j++) { + EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j]) + << "Pack/Unpack results differ at index (" << i << ", " << j + << ", Reference: " << static_cast<int>(Bint8.data()[i * n + j]) + << ", Pack-Unpacked: " + << static_cast<int>(unpack_buf.data()[i * n + j]); + } + } + } + } +} diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index fd827b0..11ef6ff 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -39,6 +39,8 @@ class fbgemmu8s8acc32WithQuantGranularityTest tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {}; class fbgemmu8s8acc32Test : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {}; +class fbgemmPackUnpackAcc32Test + : public testing::TestWithParam<tuple<matrix_op_t, bool>> {}; }; // namespace INSTANTIATE_TEST_CASE_P( @@ -58,6 +60,11 @@ INSTANTIATE_TEST_CASE_P( ::testing::ValuesIn(transposeVals), ::testing::Bool())); +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmPackUnpackAcc32Test, + ::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool())); + /** * @brief Shapes for unit test. */ @@ -749,3 +756,66 @@ TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) { } // for each groups } // for each shape } + +/** + * @brief Unit test for packing and unpacking the weight tensor. + */ +TEST_P(fbgemmPackUnpackAcc32Test, TestPackUnpack) { + vector<vector<int>> shapes(GetShapes_()); + matrix_op_t btrans; + bool test_ld; + tie(btrans, test_ld) = GetParam(); + + for (auto shape : shapes) { + for (int groups : {1, 3, 4}) { + int n = shape[1]; + int k = shape[2]; + + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + // kxn matrix + aligned_vector<int8_t> Bint8(k * n); + randFill<int8_t>(Bint8, -128, 127); + + // To test lda != k , we just reduce k by half and use the original k + // as lda. + int n_adjusted = n; + if (test_ld) { + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + // Note that packing for weight is performed during the constructor + // stage. + PackBMatrix<int8_t> packedWeights( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k_per_group : n, + nullptr, + groups); + + // Setup a buffer to get pack -> unpacked results + aligned_vector<int8_t> unpack_buf(k * n, 0); + + // Perform unpacking + packedWeights.unpack(unpack_buf.data()); + + // Sanity check + for (int i = 0; i < k; i++) { + for (int j = 0; j < n_adjusted; j++) { + EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j]) + << "Pack/Unpack results differ at index (" << i << ", " << j + << ", Reference: " << static_cast<int>(Bint8.data()[i * n + j]) + << ", Pack-Unpacked: " + << static_cast<int>(unpack_buf.data()[i * n + j]); + } + } + } + } +} diff --git a/test/QuantUtilsTest.cc b/test/QuantUtilsTest.cc new file mode 100644 index 0000000..ddb1f91 --- /dev/null +++ b/test/QuantUtilsTest.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <algorithm> +#include <limits> +#include <random> + +#include <gtest/gtest.h> + +#include "fbgemm/QuantUtils.h" +#include "fbgemm/Utils.h" + +using namespace std; +using namespace fbgemm; + +// tuple represents K, C, X, G, layout_t +// layout_t can be KCX or KXC +class QuantizeGroupwiseTest + : public testing::TestWithParam<tuple<int, int, int, int, layout_t>> {}; + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + QuantizeGroupwiseTest, + ::testing::Combine( + ::testing::ValuesIn({4, 12, 64}), // K + ::testing::ValuesIn({12, 16, 32}), // C + ::testing::ValuesIn({1, 10, 15, 30}), // X + ::testing::ValuesIn({1, 4}), // G + ::testing::ValuesIn({layout_t::KCX, layout_t::KXC}))); + +template <typename T, layout_t LT> +void ref_impl( + const vector<float>& src, + int K, + int C, + int X, + int G, + const vector<float>& scales, + const vector<int>& zero_points, + vector<T>& dst) { + int C_per_G = C / G; + for (int i = 0; i < K; ++i) { + for (int g = 0; g < G; ++g) { + for (int c = 0; c < C / G; ++c) { + for (int x = 0; x < X; ++x) { + float num; + if (LT == layout_t::KCX) { + num = src[(i * C + g * C_per_G + c) * X + x]; + } else { + num = src[(i * X + x) * C + g * C_per_G + c]; + } + int res = nearbyint(zero_points[g] + num / scales[g]); + T final_res = min<T>( + max<T>(res, numeric_limits<T>::min()), numeric_limits<T>::max()); + if (LT == layout_t::KCX) { + dst[(i * C + g * C_per_G + c) * X + x] = final_res; + } else { + dst[(i * X + x) * C + g * C_per_G + c] = final_res; + } + } + } + } + } +} + +template <typename T, layout_t LT> +void runTests( + const vector<float>& src, + int K, + int C, + int X, + int G, + const vector<float>& scales, + const vector<int>& zero_points, + vector<T>& dst, + vector<T>& dst_ref) { + QuantizeGroupwise<T, LT>( + src.data(), K, C, X, G, scales.data(), zero_points.data(), dst.data()); + + ref_impl<T, LT>(src, K, C, X, G, scales, zero_points, dst_ref); +} + +/** + * There can be off-by-one error in quantized values due to how the mid-point + * cases are rounded-off in vectorized vs scalar codes and due to adding of + * zero_point before rounding vs after rounding. We ignore such differences + * while comparing results. + */ +template <typename T> +::testing::AssertionResult isNear( + const vector<T>& res, + const vector<T>& res_ref) { + bool match = true; + if (res.size() == res_ref.size()) { + for (int i = 0; i < res.size(); ++i) { + if (!(res[i] == res_ref[i] || res[i] == res_ref[i] + 1 || + res[i] == res_ref[i] - 1)) { + match = false; + break; + } + } + } + if (match) + return ::testing::AssertionSuccess(); + else + return ::testing::AssertionFailure() << " Quantized results do not match"; +} + +/** + * Test for QuantizeGroupwise + */ +TEST_P(QuantizeGroupwiseTest, quantizeTest) { + int K, C, X, G; + layout_t layout; + tie(K, C, X, G, layout) = GetParam(); + + random_device rd; + mt19937 gen(rd()); + + uniform_real_distribution<float> disFP(0.1, 1.1); + + vector<float> inp(K * C * X); + generate(inp.begin(), inp.end(), [&, disFP]() mutable { return disFP(gen); }); + + vector<float> scales(G); + generate(scales.begin(), scales.end(), [&, disFP]() mutable { + return disFP(gen); + }); + + uniform_int_distribution<> disUInt8(0, 8); + vector<int> zero_points_uint8(G); + generate( + zero_points_uint8.begin(), + zero_points_uint8.end(), + [&, disUInt8]() mutable { return disUInt8(gen); }); + + uniform_int_distribution<> disInt8(-64, 63); + vector<int> zero_points_int8(G); + generate( + zero_points_int8.begin(), zero_points_int8.end(), [&, disInt8]() mutable { + return disInt8(gen); + }); + + uniform_int_distribution<> disInt32(-512, 512); + vector<int> zero_points_int32(G); + generate( + zero_points_int32.begin(), + zero_points_int32.end(), + [&, disInt32]() mutable { return disInt32(gen); }); + + vector<uint8_t> dstuint8(K * C * X); + vector<uint8_t> dstuint8_ref(K * C * X); + + vector<int8_t> dstint8(K * C * X); + vector<int8_t> dstint8_ref(K * C * X); + + vector<int32_t> dstint32(K * C * X); + vector<int32_t> dstint32_ref(K * C * X); + + if (layout == layout_t::KCX) { + runTests<uint8_t, layout_t::KCX>( + inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref); + runTests<int8_t, layout_t::KCX>( + inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref); + runTests<int32_t, layout_t::KCX>( + inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref); + } else { + runTests<uint8_t, layout_t::KXC>( + inp, K, C, X, G, scales, zero_points_uint8, dstuint8, dstuint8_ref); + runTests<int8_t, layout_t::KXC>( + inp, K, C, X, G, scales, zero_points_int8, dstint8, dstint8_ref); + runTests<int32_t, layout_t::KXC>( + inp, K, C, X, G, scales, zero_points_int32, dstint32, dstint32_ref); + } + + EXPECT_TRUE(isNear(dstuint8, dstuint8_ref)); + EXPECT_TRUE(isNear(dstint8, dstint8_ref)); + EXPECT_TRUE(isNear(dstint32, dstint32_ref)); +} diff --git a/test/UniConvPackingTest.cc b/test/UniConvPackingTest.cc deleted file mode 100644 index 77552af..0000000 --- a/test/UniConvPackingTest.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -#include <algorithm> -#include <random> -#include <iostream> - - -#include <gtest/gtest.h> - -#include "QuantizationHelpers.h" -#include "TestUtils.h" -#include "bench/BenchUtils.h" -#include "fbgemm/Fbgemm.h" -#include "src/RefImplementations.h" - -using namespace std; -using namespace fbgemm; - -namespace { - -// tuple represents MB, IC, OC, IT, IH, IW, KH/KW, stride, pad -class convPackingTest - : public testing::TestWithParam< - tuple<int, int, int, int, int, int, int, int, int, int>> {}; - -}; // namespace - -INSTANTIATE_TEST_CASE_P( - InstantiationName, - convPackingTest, - ::testing::Combine( - ::testing::ValuesIn({1, 2}), // MB - ::testing::ValuesIn({16, 32}), // IC - ::testing::ValuesIn({16, 32}), // OC - ::testing::ValuesIn({17}), // IT - ::testing::ValuesIn({10, 30, 55}), // IH - ::testing::ValuesIn({10, 30, 55}), // IW - ::testing::ValuesIn({1, 4, 16}), // G - ::testing::ValuesIn({3, 7}), // kernel - ::testing::ValuesIn({1, 2}), // stride - ::testing::ValuesIn({1, 2}))); // pad - -/** - * Test for conv packing - */ -TEST_P(convPackingTest, packingTest) { - int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad; - tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam(); - - conv_param_t<2> conv_p_2d( - MB, - IC, - OC, - {IH, IW}, - G, - {kernel, kernel}, - {stride, stride}, - {pad, pad, pad, pad}); - - int kernel_dim_2d = kernel * kernel; - aligned_vector<int8_t> Bint8_2d( - kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G)); - PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data()); - - switch (ConvFastPath<2, int32_t>(conv_p_2d)) { - case optimized_conv_t::depthwise: { - ASSERT_NE(packedB_2D.getPackedWFor2DDW(), nullptr) - << "2D depthwise packed matrix is null"; - ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr) - << "im2col packed matrix should be null"; - ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) - << "3D depthwise packed matrix should be null"; - ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr) - << "groupwise packed matrix should be null"; - break; - } - case optimized_conv_t::groupwise: { - ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr) - << "im2col packed matrix should be null"; - ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr) - << "2D depthwise packed matrix is null"; - ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) - << "3D depthwise packed matrix should be null"; - ASSERT_NE(packedB_2D.getPackedWForGroupwise(), nullptr) - << "Groupwise packed matrix is null"; - break; - } - case optimized_conv_t::im2col: { - ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr) - << "2D depthwise packed matrix is null"; - ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) - << "3D depthwise packed matrix should be null"; - ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr) - << "groupwise packed matrix should be null"; - ASSERT_NE(packedB_2D.getPackedWForIm2col(), nullptr) - << "im2col packed matrix is null"; - break; - } - } - - conv_param_t<3> conv_p_3d( - MB, - IC, - OC, - {IT, IH, IW}, - G, - {kernel, kernel, kernel}, - {stride, stride, stride}, - {pad, pad, pad, pad, pad, pad}); - - int kernel_dim_3d = kernel * kernel * kernel; - aligned_vector<int8_t> Bint8_3d( - kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G)); - PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data()); - - switch (ConvFastPath<3, int32_t>(conv_p_3d)) { - case optimized_conv_t::depthwise: { - ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr) - << "2D depthwise packed matrix is null"; - ASSERT_EQ(packedB_3D.getPackedWForIm2col(), nullptr) - << "im2col packed matrix should be null"; - ASSERT_NE(packedB_3D.getPackedWFor3DDW(), nullptr) - << "3D depthwise packed matrix should be null"; - ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr) - << "groupwise packed matrix should be null"; - break; - } - case optimized_conv_t::groupwise: { - ASSERT_TRUE(false) << "groupwise are not supported for 3D"; - break; - } - case optimized_conv_t::im2col: { - ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr) - << "2D depthwise packed matrix is null"; - ASSERT_EQ(packedB_3D.getPackedWFor3DDW(), nullptr) - << "3D depthwise packed matrix should be null"; - ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr) - << "groupwise packed matrix should be null"; - ASSERT_NE(packedB_3D.getPackedWForIm2col(), nullptr) - << "im2col packed matrix is null"; - break; - } - } -} diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc new file mode 100644 index 0000000..91bf578 --- /dev/null +++ b/test/UniConvTest.cc @@ -0,0 +1,325 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include <algorithm> +#include <random> +#include <iostream> +#include <stdexcept> + + +#include <gtest/gtest.h> + +#include "QuantizationHelpers.h" +#include "TestUtils.h" +#include "bench/BenchUtils.h" +#include "fbgemm/Fbgemm.h" +#include "src/RefImplementations.h" + +using namespace std; +using namespace fbgemm; + +namespace { + +// tuple represents MB, IC, OC, IT, IH, IW, KH/KW, stride, pad +class uniConvTest + : public testing::TestWithParam< + tuple<int, int, int, int, int, int, int, int, int, int>> {}; + +}; // namespace + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + uniConvTest, + ::testing::Combine( + ::testing::ValuesIn({1, 2}), // MB + ::testing::ValuesIn({16, 32}), // IC + ::testing::ValuesIn({16, 32}), // OC + ::testing::ValuesIn({17}), // IT + ::testing::ValuesIn({10, 30, 55}), // IH + ::testing::ValuesIn({10, 30, 55}), // IW + ::testing::ValuesIn({1, 4, 16}), // G + ::testing::ValuesIn({1, 3, 7}), // kernel + ::testing::ValuesIn({1, 2}), // stride + ::testing::ValuesIn({0, 1, 2}))); // pad + +/** + * Test for conv packing + */ +TEST_P(uniConvTest, packingTest) { + int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad; + tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam(); + + conv_param_t<2> conv_p_2d( + MB, + IC, + OC, + {IH, IW}, + G, + {kernel, kernel}, + {stride, stride}, + {pad, pad, pad, pad}); + + int kernel_dim_2d = kernel * kernel; + aligned_vector<int8_t> Bint8_2d( + kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G)); + PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data()); + + switch (ConvFastPath<2, int32_t>(conv_p_2d)) { + case optimized_conv_t::depthwise: { + ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_2D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + break; + } + case optimized_conv_t::groupwise: { + ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_2D.getPackedWForGroupwise(), nullptr) + << "Groupwise packed matrix is null"; + break; + } + case optimized_conv_t::pointwise: { + ASSERT_EQ(packedB_2D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr) + << "Groupwise packed matrix should be null"; + ASSERT_NE(packedB_2D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix is null"; + break; + } + case optimized_conv_t::im2col: { + ASSERT_EQ(packedB_2D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + ASSERT_EQ(packedB_2D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_2D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_2D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix is null"; + break; + } + } + + conv_param_t<3> conv_p_3d( + MB, + IC, + OC, + {IT, IH, IW}, + G, + {kernel, kernel, kernel}, + {stride, stride, stride}, + {pad, pad, pad, pad, pad, pad}); + + int kernel_dim_3d = kernel * kernel * kernel; + aligned_vector<int8_t> Bint8_3d( + kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G)); + PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data()); + + switch (ConvFastPath<3, int32_t>(conv_p_3d)) { + case optimized_conv_t::depthwise: { + ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + ASSERT_EQ(packedB_3D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_3D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_3D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + break; + } + case optimized_conv_t::groupwise: { + ASSERT_TRUE(false) << "groupwise are not supported for 3D"; + break; + } + case optimized_conv_t::pointwise: { + ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + ASSERT_EQ(packedB_3D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_3D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix should be null"; + ASSERT_NE(packedB_3D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix is null"; + break; + } + case optimized_conv_t::im2col: { + ASSERT_EQ(packedB_3D.getPackedWFor2DDW(), nullptr) + << "2D depthwise packed matrix is null"; + ASSERT_EQ(packedB_3D.getPackedWFor3DDW(), nullptr) + << "3D depthwise packed matrix should be null"; + ASSERT_EQ(packedB_3D.getPackedWForGroupwise(), nullptr) + << "groupwise packed matrix should be null"; + ASSERT_EQ(packedB_3D.getPackedWForPointwise(), nullptr) + << "pointwise packed matrix should be null"; + ASSERT_NE(packedB_3D.getPackedWForIm2col(), nullptr) + << "im2col packed matrix is null"; + break; + } + } +} + +/** + * Test for packing/unpacking + */ +TEST_P(uniConvTest, packUnpackTest) { + int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad; + tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam(); + + conv_param_t<2> conv_p_2d( + MB, + IC, + OC, + {IH, IW}, + G, + {kernel, kernel}, + {stride, stride}, + {pad, pad, pad, pad}); + + int kernel_dim_2d = kernel * kernel; + + aligned_vector<int8_t> Bint8_2d( + kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G)); + aligned_vector<int8_t> Bint8_2d_unpacked( + kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G)); + + PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data()); + + packedB_2D.unpack(Bint8_2d_unpacked.data()); + + ASSERT_EQ(Bint8_2d, Bint8_2d_unpacked) + << "Original and unpacked data elements are not the same [2D]"; + + conv_param_t<3> conv_p_3d( + MB, + IC, + OC, + {IT, IH, IW}, + G, + {kernel, kernel, kernel}, + {stride, stride, stride}, + {pad, pad, pad, pad, pad, pad}); + + int kernel_dim_3d = kernel * kernel * kernel; + + aligned_vector<int8_t> Bint8_3d( + kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G)); + + aligned_vector<int8_t> Bint8_3d_unpacked( + kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G)); + + PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data()); + + packedB_3D.unpack(Bint8_3d_unpacked.data()); + + ASSERT_EQ(Bint8_3d, Bint8_3d_unpacked) + << "Original and unpacked data elements are not the same [3D]"; +} + +TEST(uniConvTest, cornerCases) { + int stride = 1; + conv_param_t<2> conv_p_2d( + 1, // mini-batch + 16, // input channels + 32, // output channels + {28, 28}, // input height/width + 4, // groups + {3, 3}, // kernel height/width + {stride, stride}, // strides + {1, 1, 1, 1}); // padding + + int kernel_dim_2d = conv_p_2d.K[0] * conv_p_2d.K[1]; + + aligned_vector<uint8_t> Aint8( + conv_p_2d.MB * conv_p_2d.IN_DIM[0] * conv_p_2d.IN_DIM[1] * conv_p_2d.IC); + aligned_vector<int8_t> Bint8_2d( + kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G)); + aligned_vector<int32_t> Cint32_fb( + conv_p_2d.MB * conv_p_2d.OUT_DIM[0] * conv_p_2d.OUT_DIM[1] * + conv_p_2d.OC); + aligned_vector<uint8_t> Cint8_fb(Cint32_fb.size(), 0); + + // A matrix (input activations) + randFill<uint8_t>(Aint8, 0, 5); + int32_t Aint8_zero_point = 4; + + // B matrix (weights) + randFill<int8_t>(Bint8_2d, -4, 4); + aligned_vector<int32_t> Bint8_zero_point(1); + randFill(Bint8_zero_point, -3, -1); + + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2); + int32_t C_zero_point = 5; + + PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data()); + + vector<int32_t> col_offsets(conv_p_2d.OC); + + DoNothing<> doNothingObj{}; + ReQuantizeOutput<false, QuantizationGranularity::TENSOR> outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_point, + Aint8_zero_point, + Bint8_zero_point.data(), + nullptr, // row offsets + col_offsets.data(), + nullptr, // bias + conv_p_2d.OC, + conv_p_2d.G); + + try { + conv_p_2d.stride[0] = 2; + fbgemmConv( + conv_p_2d, + Aint8.data(), + packedB_2D, + Cint8_fb.data(), + Cint32_fb.data(), + outputProcObj, + 0, + 1); + } catch (std::logic_error const& err) { + std::string s(err.what()); + EXPECT_TRUE(s.rfind("[FBGEMM_CONV_ERROR]", 0) == 0); + } + + // reset + conv_p_2d.stride[0] = stride; + // this should run fine + fbgemmConv( + conv_p_2d, + Aint8.data(), + packedB_2D, + Cint8_fb.data(), + Cint32_fb.data(), + outputProcObj, + 0, + 1); +} |