Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: dskhudia <dskhudia@fb.com> 2018-11-03 21:05:52 +0300
committer: dskhudia <dskhudia@fb.com> 2018-11-03 21:05:52 +0300
commit: 505eb847185c9255526813dd39edadcd4e61d8e0 (patch)
tree: 480e3b4f3125d8b0f39047464ae7a7b233585f49 /src
parent: e85b5a12254fa47ca6b56236489253a68fd32104 (diff)
Manually syncing with internal copy
Diffstat (limited to 'src')
-rw-r--r-- src/PackAWithIm2Col.cc 60
-rw-r--r-- src/RefImplementations.cc 42
2 files changed, 73 insertions, 29 deletions
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index 7012289..a007685 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -10,6 +10,8 @@
#include <iostream>
#include "fbgemm/Fbgemm.h"
+#include <algorithm>
+
namespace fbgemm2 {
template <typename T, typename accT>
@@ -50,6 +52,7 @@ PackAWithIm2Col<T, accT>::PackAWithIm2Col(
aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)));
}
if (row_offset) {
+ rowOffsetAllocatedHere = false;
row_offset_ = row_offset;
} else {
rowOffsetAllocatedHere = true;
@@ -65,39 +68,66 @@ void PackAWithIm2Col<T, accT>::pack(const block_type_t& block) {
block.col_start,
(block.col_size + row_interleave_B_ - 1) /
row_interleave_B_ * row_interleave_B_};
-
BaseType::packedBlock(block_p);
T* out = BaseType::getBuf();
+
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
int n = i / (conv_p_.OH * conv_p_.OW);
int hw = i % (conv_p_.OH * conv_p_.OW);
int w = hw % conv_p_.OW;
int h = hw / conv_p_.OW;
- for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
- int c = j % conv_p_.IC;
+ for (int j = block.col_start;
+ j < block.col_start + block.col_size + conv_p_.IC - 1;
+ j += conv_p_.IC) {
+ int j_blk_id = j / conv_p_.IC;
+ // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC )
+ int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start);
+ int j_blk_end = std::min(
+ (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size);
+ if (j_blk_start >= j_blk_end) {
+ break;
+ }
+
int rs = j / conv_p_.IC;
int s = rs % conv_p_.KW;
int r = rs / conv_p_.KW;
int w_in = -conv_p_.pad_w + w * conv_p_.stride_w + s;
int h_in = -conv_p_.pad_h + h * conv_p_.stride_h + r;
- // Please note that padding for convolution should be filled with zero_pt
+
if (h_in < 0 || h_in >= conv_p_.IH || w_in < 0 || w_in >= conv_p_.IW) {
- out[(i - block.row_start) * BaseType::blockColSize() +
- (j - block.col_start)] = BaseType::zeroPoint();
+ // Please note that padding for convolution should be filled with
+ // zero_pt
+ std::memset(
+ &out
+ [(i - block.row_start) * BaseType::blockColSize() +
+ (j_blk_start - block.col_start)],
+ BaseType::zeroPoint(),
+ sizeof(T) * (j_blk_end - j_blk_start));
} else {
- out[(i - block.row_start) * BaseType::blockColSize() +
- (j - block.col_start)] = sdata_
- [((n * conv_p_.IH + h_in) * conv_p_.IW + w_in) * conv_p_.IC + c];
+ std::memcpy(
+ &out
+ [(i - block.row_start) * BaseType::blockColSize() +
+ j_blk_start - block.col_start],
+ &sdata_
+ [((n * conv_p_.IH + h_in) * conv_p_.IW + w_in) * conv_p_.IC +
+ (j_blk_start % conv_p_.IC)],
+ sizeof(T) * (j_blk_end - j_blk_start));
}
}
// zero fill
// Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
- for (int j = block.col_start + block.col_size;
- j < block_p.col_start + block_p.col_size;
- ++j) {
- out[(i - block.row_start) * BaseType::blockColSize() +
- (j - block.col_start)] = 0;
+ if ((block_p.col_start + block_p.col_size) -
+ (block.col_start + block.col_size) >
+ 0) {
+ std::memset(
+ &out
+ [(i - block.row_start) * BaseType::blockColSize() +
+ (block.col_size)],
+ 0,
+ sizeof(T) *
+ ((block_p.col_start + block_p.col_size) -
+ (block.col_start + block.col_size)));
}
}
}
@@ -111,7 +141,7 @@ void PackAWithIm2Col<T, accT>::printPackedMatrix(std::string name) {
T* out = BaseType::getBuf();
for (auto r = 0; r < BaseType::numPackedRows(); ++r) {
for (auto c = 0; c < BaseType::numPackedCols(); ++c) {
- T val = out[ r * BaseType::blockColSize() + c ];
+ T val = out[r * BaseType::blockColSize() + c];
if (std::is_integral<T>::value) {
// cast to int64 because cout doesn't print int8_t type directly
std::cout << std::setw(5) << static_cast<int64_t>(val) << " ";
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 9aedc88..10e581f 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -8,6 +8,7 @@
#include <cassert>
#include <cmath>
+#include <cstring>
using namespace std;
@@ -226,6 +227,12 @@ int32_t clip_16bit(int32_t x) {
}
}
+/* Imitate the Im2Col<float, CPUContext, StorageOrder::NHWC> function
+ * from caffe2/utils/math_cpu.cc
+ * NHWC StorageOrder/Layout
+ * A: NHWC: NH_0W_0 x C_0
+ * Ao: NHWC: NH_1W_1 x RSC_0
+ */
void im2col_ref(
const conv_param_t& conv_p,
const std::uint8_t* A,
@@ -238,20 +245,27 @@ void im2col_ref(
int h_in = -conv_p.pad_h + h * conv_p.stride_h + r;
for (int s = 0; s < conv_p.KW; ++s) {
int w_in = -conv_p.pad_w + w * conv_p.stride_w + s;
- for (int c = 0; c < conv_p.IC; ++c) {
- // Ai: NHWC: NH_0W_0 x C_0
- std::uint8_t val =
- h_in < 0 || h_in >= conv_p.IH || w_in < 0 || w_in >= conv_p.IW
- ? A_zero_point
- : A[((n * conv_p.IH + h_in) * conv_p.IW + w_in) * conv_p.IC +
- c];
- // Ao: NHWC: NH_1W_1 x RSC_0
- Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) *
- conv_p.KW +
- s) *
- conv_p.IC +
- c] = val;
- } // for each c
+ if (h_in < 0 || h_in >= conv_p.IH || w_in < 0 ||
+ w_in >= conv_p.IW) {
+ std::memset(
+ &Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) *
+ conv_p.KW +
+ s) *
+ conv_p.IC +
+ 0],
+ A_zero_point,
+ sizeof(uint8_t) * conv_p.IC);
+ } else {
+ std::memcpy(
+ &Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) *
+ conv_p.KW +
+ s) *
+ conv_p.IC +
+ 0],
+ &A[((n * conv_p.IH + h_in) * conv_p.IW + w_in) * conv_p.IC +
+ 0],
+ sizeof(uint8_t) * conv_p.IC);
+ }
} // for each s
} // for each r
} // for each w