Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Jianyu Huang <jianyuhuang@fb.com> 2018-11-08 22:09:04 +0300
committer: Jianyu Huang <jianyuhuang@fb.com> 2018-11-08 22:09:04 +0300
commit: 428a0b6cede232eb5c4e9c3bbd8e9d74d8e34500 (patch)
tree: 8597c3316e00e00fa5d0fc8939c9ae5c518f767a /src
parent: d90e2e1659f9f991319d05bfc58640aeafa733aa (diff)
Sync with internal copy: Asymmetric padding; fbgemm2 -> fbgemm
Diffstat (limited to 'src')
-rw-r--r-- src/ExecuteKernel.cc | 2
-rw-r--r-- src/ExecuteKernelGeneric.h | 4
-rw-r--r-- src/ExecuteKernelU8S8.cc | 28
-rw-r--r-- src/ExecuteKernelU8S8.h | 4
-rw-r--r-- src/Fbgemm.cc | 46
-rw-r--r-- src/FbgemmFP16.cc | 2
-rw-r--r-- src/FbgemmFP16UKernels.cc | 4
-rw-r--r-- src/FbgemmFP16UKernels.h | 4
-rw-r--r-- src/FbgemmI8Depthwise.cc | 5
-rw-r--r-- src/FbgemmI8Depthwise.h | 4
-rw-r--r-- src/FbgemmI8Spmdm.cc | 4
-rw-r--r-- src/GenerateKernel.h | 4
-rw-r--r-- src/GenerateKernelU8S8S32ACC16.cc | 4
-rw-r--r-- src/GenerateKernelU8S8S32ACC16_avx512.cc | 4
-rw-r--r-- src/GenerateKernelU8S8S32ACC32.cc | 4
-rw-r--r-- src/GenerateKernelU8S8S32ACC32_avx512.cc | 4
-rw-r--r-- src/PackAMatrix.cc | 4
-rw-r--r-- src/PackAWithIm2Col.cc | 211
-rw-r--r-- src/PackBMatrix.cc | 18
-rw-r--r-- src/PackMatrix.cc | 4
-rw-r--r-- src/PackWithQuantRowOffset.cc | 4
-rw-r--r-- src/PackWithRowOffset.cc | 4
-rw-r--r-- src/RefImplementations.cc | 6
-rw-r--r-- src/RefImplementations.h | 4
-rw-r--r-- src/Utils.cc | 4
-rw-r--r-- src/Utils_avx512.cc | 4
-rw-r--r-- src/codegen_fp16fp32.cc | 2
27 files changed, 268 insertions, 124 deletions
diff --git a/src/ExecuteKernel.cc b/src/ExecuteKernel.cc
index 0e3d122..3bc7e36 100644
--- a/src/ExecuteKernel.cc
+++ b/src/ExecuteKernel.cc
@@ -9,4 +9,4 @@
#include "fbgemm/Fbgemm.h"
#include "fbgemm/Utils.h"
-namespace fbgemm2 {} // namespace fbgemm2
+namespace fbgemm {} // namespace fbgemm
diff --git a/src/ExecuteKernelGeneric.h b/src/ExecuteKernelGeneric.h
index e83e943..4649912 100644
--- a/src/ExecuteKernelGeneric.h
+++ b/src/ExecuteKernelGeneric.h
@@ -9,7 +9,7 @@
#include "fbgemm/Fbgemm.h"
#include "GenerateKernel.h"
-namespace fbgemm2 {
+namespace fbgemm {
/**
* @brief Execute Engine for the macro-kernel and output processing.
@@ -61,4 +61,4 @@ class ExecuteKernel : public CodeGenBase<
///< the C tile in the macro-kernel.
};
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index e091a87..b3f8c15 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -14,7 +14,7 @@ double kernel_time = 0.0;
double postprocessing_time = 0.0;
#endif
-namespace fbgemm2 {
+namespace fbgemm {
template <typename packingAMatrix, typename cT, typename processOutputType>
ExecuteKernel<
@@ -328,6 +328,18 @@ template class ExecuteKernel<
memCopy<>>;
template class ExecuteKernel<
+ PackAWithIm2Col<uint8_t, int16_t>,
+ PackBMatrix<int8_t, int16_t>,
+ uint8_t,
+ ReQuantizeOutput<false>>;
+
+template class ExecuteKernel<
+ PackAWithIm2Col<uint8_t, int16_t, 3>,
+ PackBMatrix<int8_t, int16_t>,
+ uint8_t,
+ ReQuantizeOutput<false>>;
+
+template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int32_t>,
PackBMatrix<int8_t, int32_t>,
int32_t,
@@ -346,6 +358,18 @@ template class ExecuteKernel<
memCopy<>>;
template class ExecuteKernel<
+ PackAWithIm2Col<uint8_t, int32_t>,
+ PackBMatrix<int8_t, int32_t>,
+ uint8_t,
+ ReQuantizeOutput<false>>;
+
+template class ExecuteKernel<
+ PackAWithIm2Col<uint8_t, int32_t, 3>,
+ PackBMatrix<int8_t, int32_t>,
+ uint8_t,
+ ReQuantizeOutput<false>>;
+
+template class ExecuteKernel<
PackAWithQuantRowOffset<uint8_t, int32_t>,
PackBMatrix<int8_t, int32_t>,
int32_t,
@@ -363,4 +387,4 @@ template class ExecuteKernel<
int32_t,
DoNothing<int32_t, int32_t>>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/ExecuteKernelU8S8.h b/src/ExecuteKernelU8S8.h
index 0bd7fc5..dfa6577 100644
--- a/src/ExecuteKernelU8S8.h
+++ b/src/ExecuteKernelU8S8.h
@@ -7,7 +7,7 @@
#pragma once
#include "ExecuteKernel.h"
-namespace fbgemm2 {
+namespace fbgemm {
/**
* @brief Execute Engine of uint 8 and int8 matrix
@@ -70,4 +70,4 @@ class ExecuteKernel<
int nbSize_; ///< block size in the n dimension.
};
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 9195a05..f8f0d34 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -15,9 +15,9 @@ double computing_time = 0.0;
double run_time = 0.0;
#endif
-using namespace fbgemm2;
+using namespace fbgemm;
-namespace fbgemm2 {
+namespace fbgemm {
template <
typename packingAMatrix,
@@ -246,6 +246,26 @@ template void fbgemmPacked(
int num_threads);
template void fbgemmPacked(
+ PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
packA,
PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
@@ -361,6 +381,26 @@ template void fbgemmPacked(
int num_threads);
template void fbgemmPacked(
+ PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
int32_t* C,
@@ -380,4 +420,4 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmFP16.cc b/src/FbgemmFP16.cc
index eff173f..f54feb8 100644
--- a/src/FbgemmFP16.cc
+++ b/src/FbgemmFP16.cc
@@ -14,7 +14,7 @@
using namespace std;
-namespace fbgemm2 {
+namespace fbgemm {
/// class that performs packing of matrix in
/// row-major or col-major format into
diff --git a/src/FbgemmFP16UKernels.cc b/src/FbgemmFP16UKernels.cc
index ec1b297..cc1273e 100644
--- a/src/FbgemmFP16UKernels.cc
+++ b/src/FbgemmFP16UKernels.cc
@@ -6,7 +6,7 @@
*/
#include "FbgemmFP16UKernels.h"
-namespace fbgemm2 {
+namespace fbgemm {
void __attribute__ ((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams *gp)
{
@@ -2200,4 +2200,4 @@ asm volatile
);
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmFP16UKernels.h b/src/FbgemmFP16UKernels.h
index bf7f247..88a136b 100644
--- a/src/FbgemmFP16UKernels.h
+++ b/src/FbgemmFP16UKernels.h
@@ -11,7 +11,7 @@
#include <vector>
#include "fbgemm/Types.h"
-namespace fbgemm2 {
+namespace fbgemm {
using fp16 = float16;
using fp32 = float;
@@ -35,6 +35,6 @@ void __attribute__ ((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams *gp);
typedef void (* funcptr_fp16) (GemmParams *gp);
;
-} // namespace fbgemm2
+} // namespace fbgemm
#endif
diff --git a/src/FbgemmI8Depthwise.cc b/src/FbgemmI8Depthwise.cc
index 551e98e..ed64859 100644
--- a/src/FbgemmI8Depthwise.cc
+++ b/src/FbgemmI8Depthwise.cc
@@ -18,7 +18,8 @@
using namespace std;
-namespace fbgemm2 {
+namespace fbgemm
+{
static array<array<int, 8>, 8> masks = {{
{ 0, 0, 0, 0, 0, 0, 0, 0, },
@@ -2767,4 +2768,4 @@ void depthwise_3x3_per_channel_quantization_pad_1(
}
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmI8Depthwise.h b/src/FbgemmI8Depthwise.h
index bc62c84..a60cb58 100644
--- a/src/FbgemmI8Depthwise.h
+++ b/src/FbgemmI8Depthwise.h
@@ -8,7 +8,7 @@
#include <cstdint>
-namespace fbgemm2
+namespace fbgemm
{
// KERNEL_PROD is the product of all kernels.
@@ -102,4 +102,4 @@ void depthwise_3x3x3_pad_1(
const std::int32_t* col_offsets, const std::int32_t* bias,
bool fuse_relu = false, int thread_id = 0, int num_threads = 1);
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc
index 723a467..12e1cb2 100644
--- a/src/FbgemmI8Spmdm.cc
+++ b/src/FbgemmI8Spmdm.cc
@@ -25,7 +25,7 @@ double spmdm_run_time = 0.0;
using namespace std;
-namespace fbgemm2 {
+namespace fbgemm {
CompressedSparseColumn::CompressedSparseColumn(int num_of_rows, int num_of_cols)
: num_rows_(num_of_rows),
@@ -505,4 +505,4 @@ void CompressedSparseColumn::SpMDM(
#endif
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h
index 30160d1..5a75c33 100644
--- a/src/GenerateKernel.h
+++ b/src/GenerateKernel.h
@@ -11,7 +11,7 @@
#include <tuple>
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
namespace x86 = asmjit::x86;
@@ -151,4 +151,4 @@ thread_local std::map<
typename CodeGenBase<TA, TB, TC, accT>::jit_micro_kernel_fp>
CodeGenBase<TA, TB, TC, accT>::codeCache_;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc
index 451592a..b9ab727 100644
--- a/src/GenerateKernelU8S8S32ACC16.cc
+++ b/src/GenerateKernelU8S8S32ACC16.cc
@@ -7,7 +7,7 @@
#include <iostream>
#include "GenerateKernel.h"
-namespace fbgemm2 {
+namespace fbgemm {
namespace x86 = asmjit::x86;
@@ -295,4 +295,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx2>(
return fn;
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC16_avx512.cc b/src/GenerateKernelU8S8S32ACC16_avx512.cc
index cab43ed..eeeaea0 100644
--- a/src/GenerateKernelU8S8S32ACC16_avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC16_avx512.cc
@@ -7,7 +7,7 @@
#include <iostream>
#include "GenerateKernel.h"
-namespace fbgemm2 {
+namespace fbgemm {
namespace x86 = asmjit::x86;
@@ -298,4 +298,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx512>(
return fn;
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc
index 9529f5d..31c9996 100644
--- a/src/GenerateKernelU8S8S32ACC32.cc
+++ b/src/GenerateKernelU8S8S32ACC32.cc
@@ -7,7 +7,7 @@
#include <iostream>
#include "GenerateKernel.h"
-namespace fbgemm2 {
+namespace fbgemm {
namespace x86 = asmjit::x86;
@@ -312,4 +312,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx2>(
return fn;
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/GenerateKernelU8S8S32ACC32_avx512.cc b/src/GenerateKernelU8S8S32ACC32_avx512.cc
index 251a8b8..0621bb0 100644
--- a/src/GenerateKernelU8S8S32ACC32_avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC32_avx512.cc
@@ -7,7 +7,7 @@
#include <iostream>
#include "GenerateKernel.h"
-namespace fbgemm2 {
+namespace fbgemm {
namespace x86 = asmjit::x86;
@@ -314,4 +314,4 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>(
return fn;
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index 8f260ba..cd991ca 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -10,7 +10,7 @@
#include <iostream>
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
template <typename T, typename accT>
PackAMatrix<T, accT>::PackAMatrix(
@@ -162,4 +162,4 @@ void PackAMatrix<T, accT>::printPackedMatrix(std::string name) {
template class PackAMatrix<uint8_t, int32_t>;
template class PackAMatrix<uint8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index 8dde696..71efced 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -13,7 +13,7 @@
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
template <typename T, typename accT, int SPATIAL_DIM>
PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
@@ -82,9 +82,122 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
row_interleave_B_ * row_interleave_B_};
BaseType::packedBlock(block_p);
T* out = BaseType::getBuf();
+ // accumulate into row offset?
+ bool row_offset_acc = (block.col_start != 0);
+ int32_t* row_offset_buf = getRowOffsetBuffer();
- if (SPATIAL_DIM == 3) { // static if
+ bool point_wise = true;
+ for (int d = 0; d < SPATIAL_DIM; ++d) {
+ if (conv_p_.K[d] != 1 || conv_p_.pad[d] != 0 || conv_p_.stride[d] != 1 ||
+ conv_p_.dilation[d] != 1) {
+ point_wise = false;
+ break;
+ }
+ }
+ for (int d = SPATIAL_DIM; d < SPATIAL_DIM * 2; ++d) {
+ if (conv_p_.pad[d] != 0) {
+ point_wise = false;
+ break;
+ }
+ }
+
+ if (point_wise) {
+ int32_t ld = this->numCols();
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ int buf_idx = i - block.row_start;
+ memcpy(
+ out + buf_idx * BaseType::blockColSize(),
+ sdata_ + i * ld + block.col_start,
+ block.col_size * sizeof(T));
+ // zero fill
+ for (int j = block.col_size; j < block_p.col_size; ++j) {
+ out[buf_idx * BaseType::blockColSize() + j] = 0;
+ }
+ int32_t row_sum =
+ row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+ __m256i sum_v = _mm256_setzero_si256();
+ __m256i one_epi16_v = _mm256_set1_epi16(1);
+ __m256i one_epi8_v = _mm256_set1_epi8(1);
+ for (int j = block.col_start;
+ j < block.col_start + block.col_size / 32 * 32;
+ j += 32) {
+ __m256i src_v = _mm256_loadu_si256(
+ reinterpret_cast<__m256i const*>(sdata_ + i * ld + j));
+ sum_v = _mm256_add_epi32(
+ sum_v,
+ _mm256_madd_epi16(
+ _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+ }
+ for (int j = block.col_start + block.col_size / 32 * 32;
+ j < block.col_start + block.col_size;
+ ++j) {
+ row_sum += sdata_[i * ld + j];
+ }
+ // alignas(64) std::array<int32_t, 8> temp;
+ alignas(64) std::int32_t temp[8];
+ //_mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v);
+ _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
+ for (int k = 0; k < 8; ++k) {
+ row_sum += temp[k];
+ }
+ row_offset_buf[i - block.row_start] = row_sum;
+ }
+
+ return;
+ }
+
+ if (SPATIAL_DIM != 2 && SPATIAL_DIM != 3) {
+ assert(false && "unsupported conv dimension");
+ }
+
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ if (SPATIAL_DIM == 2) { // static if
+ int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
+ int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
+ int w = hw % conv_p_.OUT_DIM[1];
+ int h = hw / conv_p_.OUT_DIM[1];
+ for (int j = block.col_start;
+ j < block.col_start + block.col_size + conv_p_.IC - 1;
+ j += conv_p_.IC) {
+ int j_blk_id = j / conv_p_.IC;
+ // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC )
+ int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start);
+ int j_blk_end = std::min(
+ (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size);
+ if (j_blk_start >= j_blk_end) {
+ break;
+ }
+
+ int rs = j / conv_p_.IC;
+ int s = rs % conv_p_.K[1];
+ int r = rs / conv_p_.K[1];
+
+ int h_in = -conv_p_.pad[0] + h * conv_p_.stride[0] + r;
+ int w_in = -conv_p_.pad[1] + w * conv_p_.stride[1] + s;
+
+ if (h_in < 0 || h_in >= conv_p_.IN_DIM[0] || w_in < 0 ||
+ w_in >= conv_p_.IN_DIM[1]) {
+ // Please note that padding for convolution should be filled with
+ // zero_pt
+ std::memset(
+ &out
+ [(i - block.row_start) * BaseType::blockColSize() +
+ (j_blk_start - block.col_start)],
+ BaseType::zeroPoint(),
+ sizeof(T) * (j_blk_end - j_blk_start));
+ } else {
+ std::memcpy(
+ &out
+ [(i - block.row_start) * BaseType::blockColSize() +
+ j_blk_start - block.col_start],
+ &sdata_
+ [((n * conv_p_.IN_DIM[0] + h_in) * conv_p_.IN_DIM[1] + w_in) *
+ conv_p_.IC +
+ (j_blk_start % conv_p_.IC)],
+ sizeof(T) * (j_blk_end - j_blk_start));
+ }
+ }
+ } else if (SPATIAL_DIM == 3) { // static if
int n =
i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1] * conv_p_.OUT_DIM[2]);
int thw =
@@ -139,72 +252,8 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
sizeof(T) * (j_blk_end - j_blk_start));
}
}
- // zero fill
- // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
- if ((block_p.col_start + block_p.col_size) -
- (block.col_start + block.col_size) >
- 0) {
- std::memset(
- &out
- [(i - block.row_start) * BaseType::blockColSize() +
- (block.col_size)],
- 0,
- sizeof(T) *
- ((block_p.col_start + block_p.col_size) -
- (block.col_start + block.col_size)));
- }
}
- return;
- }
-
- assert(SPATIAL_DIM == 2 && "unsupported conv dimension");
- for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
- int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
- int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]);
- int w = hw % conv_p_.OUT_DIM[1];
- int h = hw / conv_p_.OUT_DIM[1];
- for (int j = block.col_start;
- j < block.col_start + block.col_size + conv_p_.IC - 1;
- j += conv_p_.IC) {
- int j_blk_id = j / conv_p_.IC;
- // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC )
- int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start);
- int j_blk_end = std::min(
- (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size);
- if (j_blk_start >= j_blk_end) {
- break;
- }
-
- int rs = j / conv_p_.IC;
- int s = rs % conv_p_.K[1];
- int r = rs / conv_p_.K[1];
-
- int h_in = -conv_p_.pad[0] + h * conv_p_.stride[0] + r;
- int w_in = -conv_p_.pad[1] + w * conv_p_.stride[1] + s;
-
- if (h_in < 0 || h_in >= conv_p_.IN_DIM[0] || w_in < 0 ||
- w_in >= conv_p_.IN_DIM[1]) {
- // Please note that padding for convolution should be filled with
- // zero_pt
- std::memset(
- &out
- [(i - block.row_start) * BaseType::blockColSize() +
- (j_blk_start - block.col_start)],
- BaseType::zeroPoint(),
- sizeof(T) * (j_blk_end - j_blk_start));
- } else {
- std::memcpy(
- &out
- [(i - block.row_start) * BaseType::blockColSize() +
- j_blk_start - block.col_start],
- &sdata_
- [((n * conv_p_.IN_DIM[0] + h_in) * conv_p_.IN_DIM[1] + w_in) *
- conv_p_.IC +
- (j_blk_start % conv_p_.IC)],
- sizeof(T) * (j_blk_end - j_blk_start));
- }
- }
// zero fill
// Please see the comment in PackAMatrix.cc for zero vs zero_pt fill.
if ((block_p.col_start + block_p.col_size) -
@@ -219,7 +268,33 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
((block_p.col_start + block_p.col_size) -
(block.col_start + block.col_size)));
}
- }
+
+ // TODO: skip row_offset computation when B_zero_point is 0
+ int32_t row_sum =
+ row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+
+ __m256i sum_v = _mm256_setzero_si256();
+ __m256i one_epi16_v = _mm256_set1_epi16(1);
+ __m256i one_epi8_v = _mm256_set1_epi8(1);
+ for (int j = 0; j < block.col_size / 32 * 32; j += 32) {
+ __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(
+ out + (i - block.row_start) * this->blockColSize() + j));
+ sum_v = _mm256_add_epi32(
+ sum_v,
+ _mm256_madd_epi16(
+ _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+ }
+ for (int j = block.col_size / 32 * 32; j < block.col_size; ++j) {
+ row_sum += out[(i - block.row_start) * this->blockColSize() + j];
+ }
+ alignas(64) int32_t temp[8];
+ _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
+ for (int k = 0; k < 8; ++k) {
+ row_sum += temp[k];
+ }
+
+ row_offset_buf[i - block.row_start] = row_sum;
+ } // for each i
}
template <typename T, typename accT, int SPATIAL_DIM>
@@ -267,4 +342,4 @@ template class PackAWithIm2Col<uint8_t, int16_t>;
template class PackAWithIm2Col<uint8_t, int32_t, 3>;
template class PackAWithIm2Col<uint8_t, int16_t, 3>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 878503f..485afb1 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -10,7 +10,7 @@
#include <iostream>
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
template <typename T, typename accT>
PackBMatrix<T, accT>::PackBMatrix(
@@ -163,13 +163,17 @@ bool PackBMatrix<T, accT>::equals(const PackBMatrix<T, accT>& that) const {
return false;
}
- return memcmp(
- BaseType::buf_,
- that.buf_,
- BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() *
- BaseType::bcol_ * sizeof(T)) == 0;
+ for (int i = 0; i < this->numRows(); ++i) {
+ for (int j = 0; j < this->numCols(); ++j) {
+ if (this->buf_[addr(i, j)] != that.buf_[that.addr(i, j)]) {
+ return false;
+ }
+ }
+ }
+
+ return true;
}
template class PackBMatrix<int8_t, int32_t>;
template class PackBMatrix<int8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index 37b4e88..fd4c766 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -11,7 +11,7 @@
#include "fbgemm/ConvUtils.h"
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
template <typename PT, typename inpType, typename accType>
PackMatrix<PT, inpType, accType>::PackMatrix(
@@ -91,4 +91,4 @@ template class PackMatrix<
template class PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>;
template class PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc
index 5f60faa..15cd737 100644
--- a/src/PackWithQuantRowOffset.cc
+++ b/src/PackWithQuantRowOffset.cc
@@ -13,7 +13,7 @@
#include <stdexcept>
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
template <typename T, typename accT>
PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
@@ -255,4 +255,4 @@ int PackAWithQuantRowOffset<T, accT>::rowOffsetBufferSize() {
template class PackAWithQuantRowOffset<uint8_t, int32_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc
index fa1f2b0..dec3f70 100644
--- a/src/PackWithRowOffset.cc
+++ b/src/PackWithRowOffset.cc
@@ -12,7 +12,7 @@
#include <stdexcept>
#include "fbgemm/Fbgemm.h"
-namespace fbgemm2 {
+namespace fbgemm {
template <typename T, typename accT>
PackAWithRowOffset<T, accT>::PackAWithRowOffset(
@@ -211,4 +211,4 @@ int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() {
template class PackAWithRowOffset<uint8_t, int32_t>;
template class PackAWithRowOffset<uint8_t, int16_t>;
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 4b919c1..dc41c27 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -13,7 +13,7 @@
using namespace std;
-namespace fbgemm2 {
+namespace fbgemm {
void requantize_u8acc32_ref(
int M,
@@ -195,7 +195,7 @@ void spmdm_ref(
int M,
const uint8_t* A,
int lda,
- fbgemm2::CompressedSparseColumn& B,
+ fbgemm::CompressedSparseColumn& B,
bool accumulation,
int32_t* C,
int ldc) {
@@ -746,4 +746,4 @@ void depthwise_3x3x3_pad_1_ref(
}
};
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/RefImplementations.h b/src/RefImplementations.h
index 69d060a..9e81ce1 100644
--- a/src/RefImplementations.h
+++ b/src/RefImplementations.h
@@ -12,7 +12,7 @@
#include "fbgemm/ConvUtils.h"
#include "fbgemm/FbgemmI8Spmdm.h"
-namespace fbgemm2 {
+namespace fbgemm {
/**
* @brief Reference implementation of requantization step.
@@ -283,4 +283,4 @@ void depthwise_3x3x3_pad_1_ref(
const std::int32_t* col_offsets,
const std::int32_t* bias);
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/Utils.cc b/src/Utils.cc
index 10ab469..45aafd3 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -15,7 +15,7 @@
#include <limits>
#include <stdexcept>
-namespace fbgemm2 {
+namespace fbgemm {
/**
* @brief Compare the reference and test result matrix to check the correctness.
@@ -354,4 +354,4 @@ void transpose_simd(
}
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/Utils_avx512.cc b/src/Utils_avx512.cc
index b6bf413..62a99ba 100644
--- a/src/Utils_avx512.cc
+++ b/src/Utils_avx512.cc
@@ -9,7 +9,7 @@
#include <immintrin.h>
-namespace fbgemm2 {
+namespace fbgemm {
inline void transpose_kernel_16x16_avx512(
const float* src,
@@ -240,4 +240,4 @@ void transpose_16x16(
transpose_8x8(M - ib, N, &src[ib * ld_src], ld_src, &dst[ib], ld_dst);
}
-} // namespace fbgemm2
+} // namespace fbgemm
diff --git a/src/codegen_fp16fp32.cc b/src/codegen_fp16fp32.cc
index 8e36c85..2b2b022 100644
--- a/src/codegen_fp16fp32.cc
+++ b/src/codegen_fp16fp32.cc
@@ -79,7 +79,7 @@ int main() {
hdrfile << "#include <tuple>\n";
hdrfile << "#include <vector>\n";
hdrfile << "#include \"fbgemm/Types.h\"\n";
- hdrfile << "using fp16 = fbgemm2::float16;\n";
+ hdrfile << "using fp16 = fbgemm::float16;\n";
hdrfile << "using fp32 = float;\n";
hdrfile << "struct GemmParams {uint64_t k; float *A; const fp16 *B;\n"
"float *beta; uint64_t accum; float *C; uint64_t ldc;\n"