Manually syncing with internal copy

author: dskhudia <dskhudia@fb.com> 2018-11-03 21:05:52 +0300
committer: dskhudia <dskhudia@fb.com> 2018-11-03 21:05:52 +0300
commit: 505eb847185c9255526813dd39edadcd4e61d8e0 (patch)
tree: 480e3b4f3125d8b0f39047464ae7a7b233585f49 /src
parent: e85b5a12254fa47ca6b56236489253a68fd32104 (diff)
2 files changed, 73 insertions, 29 deletions
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index 7012289..a007685 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -10,6 +10,8 @@
 #include <iostream>
 #include "fbgemm/Fbgemm.h"
 
+#include <algorithm>
+
 namespace fbgemm2 {
 
 template <typename T, typename accT>
@@ -50,6 +52,7 @@ PackAWithIm2Col<T, accT>::PackAWithIm2Col(
         aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)));
   }
   if (row_offset) {
+    rowOffsetAllocatedHere = false;
     row_offset_ = row_offset;
   } else {
     rowOffsetAllocatedHere = true;
@@ -65,39 +68,66 @@ void PackAWithIm2Col<T, accT>::pack(const block_type_t& block) {
                           block.col_start,
                           (block.col_size + row_interleave_B_ - 1) /
                               row_interleave_B_ * row_interleave_B_};
-
   BaseType::packedBlock(block_p);
   T* out = BaseType::getBuf();
+
   for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
     int n = i / (conv_p_.OH * conv_p_.OW);
     int hw = i % (conv_p_.OH * conv_p_.OW);
     int w = hw % conv_p_.OW;
     int h = hw / conv_p_.OW;
-    for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
-      int c = j % conv_p_.IC;
+    for (int j = block.col_start;
+         j < block.col_start + block.col_size + conv_p_.IC - 1;
+         j += conv_p_.IC) {
+      int j_blk_id = j / conv_p_.IC;
+      // Columns of channel block j_blk_id clipped to [START, END) of block
+      int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start);
+      int j_blk_end = std::min(
+          (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size);
+      if (j_blk_start >= j_blk_end) {
+        break;
+      }
+
       int rs = j / conv_p_.IC;
       int s = rs % conv_p_.KW;
       int r = rs / conv_p_.KW;
 
       int w_in = -conv_p_.pad_w + w * conv_p_.stride_w + s;
       int h_in = -conv_p_.pad_h + h * conv_p_.stride_h + r;
-      // Please note that padding for convolution should be filled with zero_pt
+
       if (h_in < 0 || h_in >= conv_p_.IH || w_in < 0 || w_in >= conv_p_.IW) {
-        out[(i - block.row_start) * BaseType::blockColSize() +
-            (j - block.col_start)] = BaseType::zeroPoint();
+        // Please note that padding for convolution should be filled with
+        // zero_pt
+        std::memset(
+            &out
+                [(i - block.row_start) * BaseType::blockColSize() +
+                 (j_blk_start - block.col_start)],
+            BaseType::zeroPoint(),
+            sizeof(T) * (j_blk_end - j_blk_start));
       } else {
-        out[(i - block.row_start) * BaseType::blockColSize() +
-            (j - block.col_start)] = sdata_
-            [((n * conv_p_.IH + h_in) * conv_p_.IW + w_in) * conv_p_.IC + c];
+        std::memcpy(
+            &out
+                [(i - block.row_start) * BaseType::blockColSize() +
+                 j_blk_start - block.col_start],
+            &sdata_
+                [((n * conv_p_.IH + h_in) * conv_p_.IW + w_in) * conv_p_.IC +
+                 (j_blk_start % conv_p_.IC)],
+            sizeof(T) * (j_blk_end - j_blk_start));
       }
     }
     // zero fill
     // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
-    for (int j = block.col_start + block.col_size;
-         j < block_p.col_start + block_p.col_size;
-         ++j) {
-      out[(i - block.row_start) * BaseType::blockColSize() +
-          (j - block.col_start)] = 0;
+    if ((block_p.col_start + block_p.col_size) -
+            (block.col_start + block.col_size) >
+        0) {
+      std::memset(
+          &out
+              [(i - block.row_start) * BaseType::blockColSize() +
+               (block.col_size)],
+          0,
+          sizeof(T) *
+              ((block_p.col_start + block_p.col_size) -
+               (block.col_start + block.col_size)));
     }
   }
 }
@@ -111,7 +141,7 @@ void PackAWithIm2Col<T, accT>::printPackedMatrix(std::string name) {
   T* out = BaseType::getBuf();
   for (auto r = 0; r < BaseType::numPackedRows(); ++r) {
     for (auto c = 0; c < BaseType::numPackedCols(); ++c) {
-      T val = out[ r * BaseType::blockColSize() + c ];
+      T val = out[r * BaseType::blockColSize() + c];
       if (std::is_integral<T>::value) {
         // cast to int64 because cout doesn't print int8_t type directly
         std::cout << std::setw(5) << static_cast<int64_t>(val) << " ";
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 9aedc88..10e581f 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -8,6 +8,7 @@
 
 #include <cassert>
 #include <cmath>
+#include <cstring>
 
 using namespace std;
 
@@ -226,6 +227,12 @@ int32_t clip_16bit(int32_t x) {
   }
 }
 
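+/* Imitate the Im2Col<float, CPUContext, StorageOrder::NHWC> function
+ * from caffe2/utils/math_cpu.cc, here for uint8 data in NHWC layout.
+ * A  (input):  (N * IH * IW) rows x IC columns
+ * Ao (output): (N * OH * OW) rows x (KH * KW * IC) columns; taps that
+ *              fall outside the input image are set to A_zero_point
+ */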
 void im2col_ref(
     const conv_param_t& conv_p,
     const std::uint8_t* A,
@@ -238,20 +245,27 @@ void im2col_ref(
           int h_in = -conv_p.pad_h + h * conv_p.stride_h + r;
           for (int s = 0; s < conv_p.KW; ++s) {
             int w_in = -conv_p.pad_w + w * conv_p.stride_w + s;
-            for (int c = 0; c < conv_p.IC; ++c) {
-              // Ai: NHWC: NH_0W_0 x C_0
-              std::uint8_t val =
-                  h_in < 0 || h_in >= conv_p.IH || w_in < 0 || w_in >= conv_p.IW
-                  ? A_zero_point
-                  : A[((n * conv_p.IH + h_in) * conv_p.IW + w_in) * conv_p.IC +
-                      c];
-              // Ao: NHWC: NH_1W_1 x RSC_0
-              Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) *
-                      conv_p.KW +
-                  s) *
-                     conv_p.IC +
-                 c] = val;
-            } // for each c
+            if (h_in < 0 || h_in >= conv_p.IH || w_in < 0 ||
+                w_in >= conv_p.IW) {
+              std::memset(
+                  &Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) *
+                           conv_p.KW +
+                       s) *
+                          conv_p.IC +
+                      0],
+                  A_zero_point,
+                  sizeof(uint8_t) * conv_p.IC);
+            } else {
+              std::memcpy(
+                  &Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) *
+                           conv_p.KW +
+                       s) *
+                          conv_p.IC +
+                      0],
+                  &A[((n * conv_p.IH + h_in) * conv_p.IW + w_in) * conv_p.IC +
+                     0],
+                  sizeof(uint8_t) * conv_p.IC);
+            }
           } // for each s
         } // for each r
       } // for each w
author	dskhudia <dskhudia@fb.com>	2018-11-03 21:05:52 +0300
committer	dskhudia <dskhudia@fb.com>	2018-11-03 21:05:52 +0300
commit	505eb847185c9255526813dd39edadcd4e61d8e0 (patch)
tree	480e3b4f3125d8b0f39047464ae7a7b233585f49 /src
parent	e85b5a12254fa47ca6b56236489253a68fd32104 (diff)