Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJongsoo Park <jongsoo@fb.com>2019-02-23 23:22:19 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2019-02-23 23:29:18 +0300
commit7b99ce43df15ab1fb5c6be383e3bc2f651cde44c (patch)
tree61f53d072af92701d175100c6c81951c87a8feb3
parent5720ee73bf2665a566e596eb8a0999629a91b213 (diff)
specialization for first conv (#80)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/80 Specialize PackAWithIm2Col for common shapes of strided convolution. TODO: will also add a specialization for resnext101. I see PackAWithIm2Col can also be a good target for JIT. Reviewed By: protonu Differential Revision: D14197118 fbshipit-source-id: 77201ce17d0e4e2e33a80b4c99b757c378a61018
-rw-r--r--src/PackAWithIm2Col.cc155
-rw-r--r--test/Im2ColFusedRequantizeTest.cc2
2 files changed, 157 insertions, 0 deletions
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index b9e310f..f75092e 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -83,6 +83,103 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
}
}
+/**
+ * Packs uint8 activations with im2col for a 2D convolution whose shape is
+ * fully fixed at compile time, so that every per-row copy/fill length is a
+ * compile-time constant or a cheap function of the output column.
+ *
+ * For every output pixel i in [block.row_start, block.row_start +
+ * block.row_size) one packed row of BCOL bytes is produced at
+ * out + (i - block.row_start) * BCOL:
+ *   - columns [0, K_H * K_W * IC) hold the receptive field, with positions
+ *     that fall into the padding filled with a_zero_pt,
+ *   - columns [COL_SIZE, COL_P_SIZE) are filled with 0.
+ *
+ * The input is read at offset ((n * IN_DIM_H + h_in) * IN_DIM_W + w_in) * IC
+ * from sdata.
+ *
+ * @tparam SPATIAL_DIM not used by this function.
+ * @tparam IC number of input channels copied per spatial pixel.
+ * @tparam IN_DIM_H,IN_DIM_W input spatial extent.
+ * @tparam K_H,K_W kernel extent.
+ * @tparam STRIDE_H,STRIDE_W convolution strides.
+ * @tparam PAD_W,PAD_H padding; NOTE the W value comes BEFORE the H value in
+ *         the template argument list, unlike the other H/W pairs.
+ * @tparam OUT_DIM_H,OUT_DIM_W output spatial extent, used to decompose i.
+ * @tparam BCOL stride (in bytes) between consecutive packed rows in out.
+ * @tparam COL_SIZE number of bytes per row that feed the row offset.
+ * @tparam COL_P_SIZE padded column count; [COL_SIZE, COL_P_SIZE) is zeroed.
+ *
+ * @param block rows of the im2col matrix to pack (row_start, row_size).
+ * @param sdata source activations.
+ * @param out destination buffer for the packed rows.
+ * @param a_zero_pt value written into padded positions (passed to memset,
+ *        so only its low byte is used).
+ * @param row_offset_buf if non-null, receives one value per packed row
+ *        computed by reduceAvx2 over the first COL_SIZE bytes of the row
+ *        (presumably their sum -- confirm against reduceAvx2).
+ * @param row_offset_acc if true, the value is added to what is already in
+ *        row_offset_buf instead of overwriting it.
+ */
+template <
+    int SPATIAL_DIM,
+    int IC,
+    int IN_DIM_H,
+    int IN_DIM_W,
+    int K_H,
+    int K_W,
+    int STRIDE_H,
+    int STRIDE_W,
+    int PAD_W,
+    int PAD_H,
+    int OUT_DIM_H,
+    int OUT_DIM_W,
+    int BCOL,
+    int COL_SIZE,
+    int COL_P_SIZE>
+void pack_a_with_im2col_opt(
+    const block_type_t& block,
+    const uint8_t* sdata,
+    uint8_t* out,
+    int32_t a_zero_pt,
+    int32_t* row_offset_buf,
+    bool row_offset_acc) {
+  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+    // Decompose the flat output-pixel index into (n, h, w).
+    int n = i / (OUT_DIM_H * OUT_DIM_W);
+    int hw = i % (OUT_DIM_H * OUT_DIM_W);
+    int w = hw % OUT_DIM_W;
+    int h = hw / OUT_DIM_W;
+
+    // Start of the packed row that belongs to output pixel i.
+    uint8_t* out_row = out + (i - block.row_start) * BCOL;
+
+    // j refers to column index within block
+    int j = 0;
+    // r and s iterate over K_H and K_W, respectively
+    for (int r = 0; r < K_H; ++r) {
+      int h_in = -PAD_H + h * STRIDE_H + r;
+      if (h_in < 0 || h_in >= IN_DIM_H) {
+        // Short-circuit if h_in is in padding: the whole kernel row is
+        // filled with the zero point.
+        std::memset(out_row + j, a_zero_pt, sizeof(uint8_t) * K_W * IC);
+        j += K_W * IC;
+        continue;
+      }
+
+      int s = 0;
+      // left_pad_len : the number of spatial pixels we need to pad at the
+      // beginning
+      int left_pad_len = PAD_W - w * STRIDE_W;
+      if (left_pad_len > 0) {
+        std::memset(
+            out_row + j, a_zero_pt, sizeof(uint8_t) * left_pad_len * IC);
+        s += left_pad_len;
+      }
+
+      // mid_len : the number of spatial pixels that we handle normally
+      // (no padding)
+      int mid_len = std::min(IN_DIM_W + PAD_W - w * STRIDE_W, K_W) - s;
+      std::memcpy(
+          out_row + j + s * IC,
+          sdata +
+              ((n * IN_DIM_H + h_in) * IN_DIM_W + -PAD_W + w * STRIDE_W +
+               s) *
+                  IC,
+          sizeof(uint8_t) * mid_len * IC);
+      s += mid_len;
+
+      // right_pad_len : the number of spatial pixels we need to pad at the end
+      int right_pad_len = K_W - s;
+      if (right_pad_len > 0) {
+        std::memset(
+            out_row + j + s * IC,
+            a_zero_pt,
+            sizeof(uint8_t) * right_pad_len * IC);
+      }
+      j += K_W * IC;
+    } // r loop
+
+    // zero fill
+    // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
+    if (COL_P_SIZE - COL_SIZE > 0) {
+      // The difference must be parenthesised: without the parentheses the
+      // length parses as (sizeof(uint8_t) * COL_P_SIZE) - COL_SIZE, which is
+      // only correct because sizeof(uint8_t) happens to be 1.
+      std::memset(
+          out_row + COL_SIZE, 0, sizeof(uint8_t) * (COL_P_SIZE - COL_SIZE));
+    }
+
+    if (row_offset_buf) {
+      // Either accumulate onto the existing entry or start from zero.
+      int32_t row_sum =
+          row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+      row_sum += reduceAvx2(out_row, COL_SIZE);
+      row_offset_buf[i - block.row_start] = row_sum;
+    }
+  }
+}
+
template <typename T, typename accT, int SPATIAL_DIM>
void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
block_type_t block_p = {block.row_start,
@@ -153,6 +250,64 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
}
int ic_per_group = conv_p_.IC / conv_p_.G;
+
+ if (SPATIAL_DIM == 2 && conv_p_.IC == 3 && conv_p_.IN_DIM[0] == 224 &&
+ conv_p_.IN_DIM[1] == 224 && conv_p_.G == 1 && conv_p_.K[0] == 7 &&
+ conv_p_.K[1] == 7 && conv_p_.stride[0] == 2 && conv_p_.stride[1] == 2 &&
+ conv_p_.pad[0] == 3 && conv_p_.pad[1] == 3 && block.col_size == 147 &&
+ block_p.col_size == 148 && block.col_start == 0 &&
+ std::is_same<T, uint8_t>::value) {
+ if (BaseType::blockColSize() == 256) {
+ pack_a_with_im2col_opt<
+ SPATIAL_DIM,
+ 3,
+ 224,
+ 224,
+ 7,
+ 7,
+ 2,
+ 2,
+ 3,
+ 3,
+ 112,
+ 112,
+ 256,
+ 147,
+ 148>(
+ block,
+ reinterpret_cast<const uint8_t*>(sdata_),
+ reinterpret_cast<uint8_t*>(out),
+ a_zero_pt_,
+ row_offset_buf,
+ row_offset_acc);
+ return;
+ } else if (BaseType::blockColSize() == 512) {
+ pack_a_with_im2col_opt<
+ SPATIAL_DIM,
+ 3,
+ 224,
+ 224,
+ 7,
+ 7,
+ 2,
+ 2,
+ 3,
+ 3,
+ 112,
+ 112,
+ 512,
+ 147,
+ 148>(
+ block,
+ reinterpret_cast<const uint8_t*>(sdata_),
+ reinterpret_cast<uint8_t*>(out),
+ a_zero_pt_,
+ row_offset_buf,
+ row_offset_acc);
+ return;
+ }
+ }
+
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
if (SPATIAL_DIM == 2) { // static if
int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index 1dfc756..1ec2a06 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -51,6 +51,8 @@ static vector<conv_param_t<>> shapes = {
conv_param_t<>(2, 32, 16, {16, 14}, 4, {3, 3}, {1, 1}, {0, 0, 0, 0}),
conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
+ // first layer of resnet50
+ conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
};
template <typename ACC_T, QuantizationGranularity Q_GRAN>