specialization for first conv (#80)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/80 Specialize PackAWithIm2Col for common shapes of strided convolution. TODO: will also add specialization for resnext101. I see PackAWithIm2Col can also be a good target for JIT. Reviewed By: protonu Differential Revision: D14197118 fbshipit-source-id: 77201ce17d0e4e2e33a80b4c99b757c378a61018
author: Jongsoo Park <jongsoo@fb.com> 2019-02-23 23:22:19 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-23 23:29:18 +0300
commit: 7b99ce43df15ab1fb5c6be383e3bc2f651cde44c (patch)
tree: 61f53d072af92701d175100c6c81951c87a8feb3
parent: 5720ee73bf2665a566e596eb8a0999629a91b213 (diff)
2 files changed, 157 insertions, 0 deletions
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index b9e310f..f75092e 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -83,6 +83,103 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
   }
 }
 
+template <
+    int SPATIAL_DIM,
+    int IC,
+    int IN_DIM_H,
+    int IN_DIM_W,
+    int K_H,
+    int K_W,
+    int STRIDE_H,
+    int STRIDE_W,
+    int PAD_W,
+    int PAD_H,
+    int OUT_DIM_H,
+    int OUT_DIM_W,
+    int BCOL,
+    int COL_SIZE,
+    int COL_P_SIZE>
+void pack_a_with_im2col_opt(
+    const block_type_t& block,
+    const uint8_t* sdata,
+    uint8_t* out,
+    int32_t a_zero_pt,
+    int32_t* row_offset_buf,
+    bool row_offset_acc) {
+  for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+    int n = i / (OUT_DIM_H * OUT_DIM_W);
+    int hw = i % (OUT_DIM_H * OUT_DIM_W);
+    int w = hw % OUT_DIM_W;
+    int h = hw / OUT_DIM_W;
+
+    // j is the column offset within the current packed row of `out`
+    int j = 0;
+    // r iterates over K_H; s (below) counts the K_W taps already written
+    for (int r = 0; r < K_H; ++r) {
+      int h_in = -PAD_H + h * STRIDE_H + r;
+      if (h_in < 0 || h_in >= IN_DIM_H) {
+        // h_in is in padding: fill the whole K_W * IC run with a_zero_pt.
+        std::memset(
+            out + (i - block.row_start) * BCOL + j,
+            a_zero_pt,
+            sizeof(uint8_t) * K_W * IC);
+        j += K_W * IC;
+        continue;
+      }
+
+      int s = 0;
+      // left_pad_len : the number of spatial pixels we need to pad at the
+      // beginning of this kernel row (filled with a_zero_pt)
+      int left_pad_len = PAD_W - w * STRIDE_W;
+      if (left_pad_len > 0) {
+        std::memset(
+            out + (i - block.row_start) * BCOL + j,
+            a_zero_pt,
+            sizeof(uint8_t) * left_pad_len * IC);
+        s += left_pad_len;
+      }
+
+      // mid_len : the number of spatial pixels that we handle normally
+      // (no padding); they are copied from sdata in one contiguous memcpy
+      int mid_len = std::min(IN_DIM_W + PAD_W - w * STRIDE_W, K_W) - s;
+      std::memcpy(
+          out + (i - block.row_start) * BCOL + j + s * IC,
+          sdata +
+              ((n * IN_DIM_H + h_in) * IN_DIM_W + -PAD_W + w * STRIDE_W +
+               s) *
+                  IC,
+          sizeof(uint8_t) * mid_len * IC);
+      s += mid_len;
+
+      // right_pad_len : the number of spatial pixels we need to pad at the end
+      int right_pad_len = K_W - s;
+      if (right_pad_len > 0) {
+        std::memset(
+            out + (i - block.row_start) * BCOL + j + s * IC,
+            a_zero_pt,
+            sizeof(uint8_t) * right_pad_len * IC);
+      }
+      j += K_W * IC;
+    } // r loop
+
+    // Zero-fill the tail columns [COL_SIZE, COL_P_SIZE) of this packed row.
+    // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
+    if (COL_P_SIZE - COL_SIZE > 0) {
+      std::memset(
+          &out[(i - block.row_start) * BCOL + COL_SIZE],
+          0,
+          sizeof(uint8_t) * (COL_P_SIZE - COL_SIZE));
+    }
+
+    if (row_offset_buf) {
+      int32_t row_sum =
+          row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+      row_sum += reduceAvx2(out + (i - block.row_start) * BCOL, COL_SIZE);
+      row_offset_buf[i - block.row_start] = row_sum;
+    }
+  }
+}
+
 template <typename T, typename accT, int SPATIAL_DIM>
 void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
   block_type_t block_p = {block.row_start,
@@ -153,6 +250,64 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
   }
 
   int ic_per_group = conv_p_.IC / conv_p_.G;
+
+  if (SPATIAL_DIM == 2 && conv_p_.IC == 3 && conv_p_.IN_DIM[0] == 224 &&
+      conv_p_.IN_DIM[1] == 224 && conv_p_.G == 1 && conv_p_.K[0] == 7 &&
+      conv_p_.K[1] == 7 && conv_p_.stride[0] == 2 && conv_p_.stride[1] == 2 &&
+      conv_p_.pad[0] == 3 && conv_p_.pad[1] == 3 && block.col_size == 147 &&
+      block_p.col_size == 148 && block.col_start == 0 &&
+      std::is_same<T, uint8_t>::value) {
+    if (BaseType::blockColSize() == 256) {
+      pack_a_with_im2col_opt<
+          SPATIAL_DIM,
+          3,
+          224,
+          224,
+          7,
+          7,
+          2,
+          2,
+          3,
+          3,
+          112,
+          112,
+          256,
+          147,
+          148>(
+          block,
+          reinterpret_cast<const uint8_t*>(sdata_),
+          reinterpret_cast<uint8_t*>(out),
+          a_zero_pt_,
+          row_offset_buf,
+          row_offset_acc);
+      return;
+    } else if (BaseType::blockColSize() == 512) {
+      pack_a_with_im2col_opt<
+          SPATIAL_DIM,
+          3,
+          224,
+          224,
+          7,
+          7,
+          2,
+          2,
+          3,
+          3,
+          112,
+          112,
+          512,
+          147,
+          148>(
+          block,
+          reinterpret_cast<const uint8_t*>(sdata_),
+          reinterpret_cast<uint8_t*>(out),
+          a_zero_pt_,
+          row_offset_buf,
+          row_offset_acc);
+      return;
+    }
+  }
+
   for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
     if (SPATIAL_DIM == 2) { // static if
       int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index 1dfc756..1ec2a06 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -51,6 +51,8 @@ static vector<conv_param_t<>> shapes = {
     conv_param_t<>(2, 32, 16, {16, 14}, 4, {3, 3}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
     conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
+    // first layer of resnet50
+    conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
 };
 
 template <typename ACC_T, QuantizationGranularity Q_GRAN>
author	Jongsoo Park <jongsoo@fb.com>	2019-02-23 23:22:19 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-02-23 23:29:18 +0300
commit	7b99ce43df15ab1fb5c6be383e3bc2f651cde44c (patch)
tree	61f53d072af92701d175100c6c81951c87a8feb3
parent	5720ee73bf2665a566e596eb8a0999629a91b213 (diff)