diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-02-23 23:22:19 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-23 23:29:18 +0300 |
commit | 7b99ce43df15ab1fb5c6be383e3bc2f651cde44c (patch) | |
tree | 61f53d072af92701d175100c6c81951c87a8feb3 | |
parent | 5720ee73bf2665a566e596eb8a0999629a91b213 (diff) |
specialization for first conv (#80)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/80
Specialize PackAWithIm2Col for common shapes of strided convolution.
TODO: will also add specialization for resnext101
I see that PackAWithIm2Col could also be a good target for JIT.
Reviewed By: protonu
Differential Revision: D14197118
fbshipit-source-id: 77201ce17d0e4e2e33a80b4c99b757c378a61018
-rw-r--r-- | src/PackAWithIm2Col.cc | 155 | ||||
-rw-r--r-- | test/Im2ColFusedRequantizeTest.cc | 2 |
2 files changed, 157 insertions, 0 deletions
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index b9e310f..f75092e 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -83,6 +83,122 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
   }
 }
 
+// Packs rows of the im2col matrix for one fixed convolution shape, with all
+// shape parameters supplied as template arguments.
+//
+// For each row i in [block.row_start, block.row_start + block.row_size), i is
+// split into n = i / (OUT_DIM_H * OUT_DIM_W) and an output position (h, w).
+// For every kernel row r, K_W * IC bytes are written to
+// out + (i - block.row_start) * BCOL: bytes copied from sdata where the window
+// overlaps the input, and bytes set to a_zero_pt where it falls into padding.
+// sdata is indexed as ((n * IN_DIM_H + h_in) * IN_DIM_W + w_in) * IC.
+//
+// Columns [COL_SIZE, COL_P_SIZE) of each row are then filled with 0. If
+// row_offset_buf is non-null, the reduceAvx2 result over the first COL_SIZE
+// bytes of the row is stored in row_offset_buf[i - block.row_start], added to
+// the previous value when row_offset_acc is true.
+//
+// NOTE: PAD_W precedes PAD_H in the template parameter list, unlike the other
+// H/W pairs. NOTE(review): the code never checks that K_H * K_W * IC equals
+// COL_SIZE or that mid_len is non-negative; callers presumably guarantee
+// both - confirm before adding new instantiations.
+template <
+    int SPATIAL_DIM,
+    int IC,
+    int IN_DIM_H,
+    int IN_DIM_W,
+    int K_H,
+    int K_W,
+    int STRIDE_H,
+    int STRIDE_W,
+    int PAD_W,
+    int PAD_H,
+    int OUT_DIM_H,
+    int OUT_DIM_W,
+    int BCOL,
+    int COL_SIZE,
+    int COL_P_SIZE>
+void pack_a_with_im2col_opt(
+    const block_type_t& block,
+    const uint8_t* sdata,
+    uint8_t* out,
+    int32_t a_zero_pt,
+    int32_t* row_offset_buf,
+    bool row_offset_acc) {
+  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+    int n = i / (OUT_DIM_H * OUT_DIM_W);
+    int hw = i % (OUT_DIM_H * OUT_DIM_W);
+    int w = hw % OUT_DIM_W;
+    int h = hw / OUT_DIM_W;
+
+    // j refers to column index within block
+    int j = 0;
+    // r and s iterate over K_H and K_W, respectively
+    for (int r = 0; r < K_H; ++r) {
+      int h_in = -PAD_H + h * STRIDE_H + r;
+      if (h_in < 0 || h_in >= IN_DIM_H) {
+        // Short-circuit if h_in is in padding.
+        std::memset(
+            out + (i - block.row_start) * BCOL + j,
+            a_zero_pt,
+            sizeof(uint8_t) * K_W * IC);
+        j += K_W * IC;
+        continue;
+      }
+
+      int s = 0;
+      // left_pad_len : the number of spatial pixels we need to pad at the
+      // beginning
+      int left_pad_len = PAD_W - w * STRIDE_W;
+      if (left_pad_len > 0) {
+        std::memset(
+            out + (i - block.row_start) * BCOL + j,
+            a_zero_pt,
+            sizeof(uint8_t) * left_pad_len * IC);
+        s += left_pad_len;
+      }
+
+      // mid_len : the number of spatial pixels that we handle normally
+      // (no padding)
+      int mid_len = std::min(IN_DIM_W + PAD_W - w * STRIDE_W, K_W) - s;
+      std::memcpy(
+          out + (i - block.row_start) * BCOL + j + s * IC,
+          sdata +
+              ((n * IN_DIM_H + h_in) * IN_DIM_W + -PAD_W + w * STRIDE_W +
+               s) *
+                  IC,
+          sizeof(uint8_t) * mid_len * IC);
+      s += mid_len;
+
+      // right_pad_len : the number of spatial pixels we need to pad at the end
+      int right_pad_len = K_W - s;
+      if (right_pad_len > 0) {
+        std::memset(
+            out + (i - block.row_start) * BCOL + j + s * IC,
+            a_zero_pt,
+            sizeof(uint8_t) * right_pad_len * IC);
+      }
+      j += K_W * IC;
+    } // r loop
+
+    // zero fill
+    // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
+    // Parenthesized so the length is (COL_P_SIZE - COL_SIZE) elements.
+    if (COL_P_SIZE - COL_SIZE > 0) {
+      std::memset(
+          &out[(i - block.row_start) * BCOL + COL_SIZE],
+          0,
+          sizeof(uint8_t) * (COL_P_SIZE - COL_SIZE));
+    }
+
+    if (row_offset_buf) {
+      int32_t row_sum =
+          row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+      row_sum += reduceAvx2(out + (i - block.row_start) * BCOL, COL_SIZE);
+      row_offset_buf[i - block.row_start] = row_sum;
+    }
+  }
+}
+
 template <typename T, typename accT, int SPATIAL_DIM>
 void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
   block_type_t block_p = {block.row_start,
@@ -153,6 +269,64 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
   }
 
   int ic_per_group = conv_p_.IC / conv_p_.G;
+
+  if (SPATIAL_DIM == 2 && conv_p_.IC == 3 && conv_p_.IN_DIM[0] == 224 &&
+      conv_p_.IN_DIM[1] == 224 && conv_p_.G == 1 && conv_p_.K[0] == 7 &&
+      conv_p_.K[1] == 7 && conv_p_.stride[0] == 2 && conv_p_.stride[1] == 2 &&
+      conv_p_.pad[0] == 3 && conv_p_.pad[1] == 3 && block.col_size == 147 &&
+      block_p.col_size == 148 && block.col_start == 0 &&
+      std::is_same<T, uint8_t>::value) {
+    if (BaseType::blockColSize() == 256) {
+      pack_a_with_im2col_opt<
+          SPATIAL_DIM,
+          3,
+          224,
+          224,
+          7,
+          7,
+          2,
+          2,
+          3,
+          3,
+          112,
+          112,
+          256,
+          147,
+          148>(
+          block,
+          reinterpret_cast<const uint8_t*>(sdata_),
+          reinterpret_cast<uint8_t*>(out),
+          a_zero_pt_,
+          row_offset_buf,
+          row_offset_acc);
+      return;
+    } else if (BaseType::blockColSize() == 512) {
+      pack_a_with_im2col_opt<
+          SPATIAL_DIM,
+          3,
+          224,
+          224,
+          7,
+          7,
+          2,
+          2,
+          3,
+          3,
+          112,
+          112,
+          512,
+          147,
+          148>(
+          block,
+          reinterpret_cast<const uint8_t*>(sdata_),
+          reinterpret_cast<uint8_t*>(out),
+          a_zero_pt_,
+          row_offset_buf,
+          row_offset_acc);
+      return;
+    }
+  }
+
   for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
     if (SPATIAL_DIM == 2) { // static if
       int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index 1dfc756..1ec2a06 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -51,6 +51,8 @@ static vector<conv_param_t<>> shapes = {
     conv_param_t<>(2, 32, 16, {16, 14}, 4, {3, 3}, {1, 1}, {0,
0, 0, 0}), conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}), + // first layer of resnet50 + conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}), }; template <typename ACC_T, QuantizationGranularity Q_GRAN> |