diff options
author | dskhudia <dskhudia@fb.com> | 2018-11-03 21:05:52 +0300 |
---|---|---|
committer | dskhudia <dskhudia@fb.com> | 2018-11-03 21:05:52 +0300 |
commit | 505eb847185c9255526813dd39edadcd4e61d8e0 (patch) | |
tree | 480e3b4f3125d8b0f39047464ae7a7b233585f49 /src | |
parent | e85b5a12254fa47ca6b56236489253a68fd32104 (diff) |
Manually syncing with internal copy
Diffstat (limited to 'src')
-rw-r--r-- | src/PackAWithIm2Col.cc | 60 | ||||
-rw-r--r-- | src/RefImplementations.cc | 42 |
2 files changed, 73 insertions, 29 deletions
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index 7012289..a007685 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -10,6 +10,8 @@ #include <iostream> #include "fbgemm/Fbgemm.h" +#include <algorithm> + namespace fbgemm2 { template <typename T, typename accT> @@ -50,6 +52,7 @@ PackAWithIm2Col<T, accT>::PackAWithIm2Col( aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); } if (row_offset) { + rowOffsetAllocatedHere = false; row_offset_ = row_offset; } else { rowOffsetAllocatedHere = true; @@ -65,39 +68,66 @@ void PackAWithIm2Col<T, accT>::pack(const block_type_t& block) { block.col_start, (block.col_size + row_interleave_B_ - 1) / row_interleave_B_ * row_interleave_B_}; - BaseType::packedBlock(block_p); T* out = BaseType::getBuf(); + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { int n = i / (conv_p_.OH * conv_p_.OW); int hw = i % (conv_p_.OH * conv_p_.OW); int w = hw % conv_p_.OW; int h = hw / conv_p_.OW; - for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - int c = j % conv_p_.IC; + for (int j = block.col_start; + j < block.col_start + block.col_size + conv_p_.IC - 1; + j += conv_p_.IC) { + int j_blk_id = j / conv_p_.IC; + // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC ) + int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start); + int j_blk_end = std::min( + (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size); + if (j_blk_start >= j_blk_end) { + break; + } + int rs = j / conv_p_.IC; int s = rs % conv_p_.KW; int r = rs / conv_p_.KW; int w_in = -conv_p_.pad_w + w * conv_p_.stride_w + s; int h_in = -conv_p_.pad_h + h * conv_p_.stride_h + r; - // Please note that padding for convolution should be filled with zero_pt + if (h_in < 0 || h_in >= conv_p_.IH || w_in < 0 || w_in >= conv_p_.IW) { - out[(i - block.row_start) * BaseType::blockColSize() + - (j - block.col_start)] = BaseType::zeroPoint(); + // Please note that padding for convolution should be filled with + // zero_pt + std::memset( + &out + [(i - block.row_start) * BaseType::blockColSize() + + (j_blk_start - block.col_start)], + BaseType::zeroPoint(), + sizeof(T) * (j_blk_end - j_blk_start)); } else { - out[(i - block.row_start) * BaseType::blockColSize() + - (j - block.col_start)] = sdata_ - [((n * conv_p_.IH + h_in) * conv_p_.IW + w_in) * conv_p_.IC + c]; + std::memcpy( + &out + [(i - block.row_start) * BaseType::blockColSize() + + j_blk_start - block.col_start], + &sdata_ + [((n * conv_p_.IH + h_in) * conv_p_.IW + w_in) * conv_p_.IC + + (j_blk_start % conv_p_.IC)], + sizeof(T) * (j_blk_end - j_blk_start)); } } // zero fill // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill. - for (int j = block.col_start + block.col_size; - j < block_p.col_start + block_p.col_size; - ++j) { - out[(i - block.row_start) * BaseType::blockColSize() + - (j - block.col_start)] = 0; + if ((block_p.col_start + block_p.col_size) - + (block.col_start + block.col_size) > + 0) { + std::memset( + &out + [(i - block.row_start) * BaseType::blockColSize() + + (block.col_size)], + 0, + sizeof(T) * + ((block_p.col_start + block_p.col_size) - + (block.col_start + block.col_size))); } } } @@ -111,7 +141,7 @@ void PackAWithIm2Col<T, accT>::printPackedMatrix(std::string name) { T* out = BaseType::getBuf(); for (auto r = 0; r < BaseType::numPackedRows(); ++r) { for (auto c = 0; c < BaseType::numPackedCols(); ++c) { - T val = out[ r * BaseType::blockColSize() + c ]; + T val = out[r * BaseType::blockColSize() + c]; if (std::is_integral<T>::value) { // cast to int64 because cout doesn't print int8_t type directly std::cout << std::setw(5) << static_cast<int64_t>(val) << " "; diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 9aedc88..10e581f 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -8,6 +8,7 @@ #include <cassert> #include <cmath> +#include <cstring> using namespace std; @@ -226,6 +227,12 @@ int32_t clip_16bit(int32_t x) { } } +/* Imitate the Im2Col<float, CPUContext, StorageOrder::NHWC> function + * from caffe2/utils/math_cpu.cc + * NHWC StorageOrder/Layout + * A: NHWC: NH_0W_0 x C_0 + * Ao: NHWC: NH_1W_1 x RSC_0 + */ void im2col_ref( const conv_param_t& conv_p, const std::uint8_t* A, @@ -238,20 +245,27 @@ void im2col_ref( int h_in = -conv_p.pad_h + h * conv_p.stride_h + r; for (int s = 0; s < conv_p.KW; ++s) { int w_in = -conv_p.pad_w + w * conv_p.stride_w + s; - for (int c = 0; c < conv_p.IC; ++c) { - // Ai: NHWC: NH_0W_0 x C_0 - std::uint8_t val = - h_in < 0 || h_in >= conv_p.IH || w_in < 0 || w_in >= conv_p.IW - ? A_zero_point - : A[((n * conv_p.IH + h_in) * conv_p.IW + w_in) * conv_p.IC + - c]; - // Ao: NHWC: NH_1W_1 x RSC_0 - Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) * - conv_p.KW + - s) * - conv_p.IC + - c] = val; - } // for each c + if (h_in < 0 || h_in >= conv_p.IH || w_in < 0 || + w_in >= conv_p.IW) { + std::memset( + &Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) * + conv_p.KW + + s) * + conv_p.IC + + 0], + A_zero_point, + sizeof(uint8_t) * conv_p.IC); + } else { + std::memcpy( + &Ao[((((n * conv_p.OH + h) * conv_p.OW + w) * conv_p.KH + r) * + conv_p.KW + + s) * + conv_p.IC + + 0], + &A[((n * conv_p.IH + h_in) * conv_p.IW + w_in) * conv_p.IC + + 0], + sizeof(uint8_t) * conv_p.IC); + } } // for each s } // for each r } // for each w |