Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jongsoo Park <jongsoo@fb.com> 2019-02-13 02:50:07 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-13 02:55:31 +0300
commit: c4e50e0ed5db7c9aadb6df47faa4227f5286293c (patch)
tree: 963e416174ca1a3d486633c9e61f72760ce025ca
parent: 66df1a0ccd762e525e319cb579810deade551152 (diff)
no need to subtract col offset if a_zp is 0 (#69)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/69 -- This diff prepares for D14013931, which folds column offsets into bias. In depthwise convolution, we allow passing column_offsets == nullptr, which means the column offsets are already folded into bias. We bypass subtracting column_offset * A_zero_point if either column_offset == nullptr or A_zero_point == 0. Reviewed By: jianyuh. Differential Revision: D14017772. fbshipit-source-id: ad4a79402f43cbf78dbad68e1bff6d07c19dded0
-rw-r--r-- include/fbgemm/OutputProcessing-inl.h | 8
-rw-r--r-- src/ExecuteKernelU8S8.cc | 286
-rw-r--r-- src/Fbgemm.cc | 42
-rw-r--r-- src/RefImplementations.cc | 12
4 files changed, 189 insertions, 159 deletions
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index c250942..9485b18 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -81,7 +81,9 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
- raw -= Aq_zero_point_ * q_col_offsets_[j];
+ if (Aq_zero_point_) {
+ raw -= Aq_zero_point_ * q_col_offsets_[j];
+ }
int Bq_zero_point_idx;
if (Q_GRAN == QuantizationGranularity::TENSOR) {
Bq_zero_point_idx = 0;
@@ -225,7 +227,9 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
- raw -= Aq_zero_point_ * q_col_offsets_[j];
+ if (Aq_zero_point_) {
+ raw -= Aq_zero_point_ * q_col_offsets_[j];
+ }
int Bq_zero_point_idx;
if (Q_GRAN == QuantizationGranularity::TENSOR) {
Bq_zero_point_idx = 0;
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index e7f7c70..cdceb63 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -243,116 +243,127 @@ void ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// ReQuantizeOutput
-#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, ACC_T>, \
- PackBMatrix<int8_t, ACC_T>, \
- uint8_t, \
+#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
ReQuantizeOutput<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
-
-#define INSTANTIATE_RELU(PACK_A, ACC_T) \
- INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \
- INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true);
-
-#define INSTANTIATE_ACC_T(PACK_A) \
- INSTANTIATE_RELU(PACK_A, int32_t); \
- INSTANTIATE_RELU(PACK_A, int16_t);
-
-INSTANTIATE_ACC_T(PackAMatrix);
-INSTANTIATE_ACC_T(PackAWithRowOffset);
-
-#undef INSTANTIATE_ACC_T
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
-
-#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- uint8_t, \
+#define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \
+ INSTANTIATE_REQUANT_BASE( \
+ PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_REQUANT_BASE( \
+ PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_REQUANT_BASE( \
+ PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \
+ INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, false); \
+ INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, true);
+
+#define INSTANTIATE_REQUANT_ACC_T(PACK_A) \
+ INSTANTIATE_REQUANT_RELU(PACK_A, int32_t); \
+ INSTANTIATE_REQUANT_RELU(PACK_A, int16_t);
+
+INSTANTIATE_REQUANT_ACC_T(PackAMatrix);
+INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset);
+
+#undef INSTANTIATE_REQUANT_ACC_T
+#undef INSTANTIATE_REQUANT_RELU
+#undef INSTANTIATE_REQUANT_Q_GRANS
+#undef INSTANTIATE_REQUANT_BASE
+
+#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
ReQuantizeOutput<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE( \
+#define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_IM2COL_REQUANT_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_IM2COL_REQUANT_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_IM2COL_REQUANT_BASE( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
-#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3);
-#define INSTANTIATE_RELU(ACC_T) \
- INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
- INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+#define INSTANTIATE_IM2COL_REQUANT_RELU(ACC_T) \
+ INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, true);
-INSTANTIATE_RELU(int32_t);
-INSTANTIATE_RELU(int16_t);
+INSTANTIATE_IM2COL_REQUANT_RELU(int32_t);
+INSTANTIATE_IM2COL_REQUANT_RELU(int16_t);
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_SPATIAL_DIM
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_IM2COL_REQUANT_RELU
+#undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM
+#undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS
+#undef INSTANTIATE_IM2COL_REQUANT_BASE
////////////////////////////////////////////////////////////////////////////////
// ReQuantizeForFloat
-#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, int32_t>, \
- PackBMatrix<int8_t, int32_t>, \
- float, \
+#define INSTANTIATE_REQUANT_FLOAT_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int32_t>, \
+ PackBMatrix<int8_t, int32_t>, \
+ float, \
ReQuantizeForFloat<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
- INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_REQUANT_FLOAT_BASE( \
+ PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_REQUANT_FLOAT_BASE( \
+ PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_REQUANT_FLOAT_BASE( \
+ PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
-#define INSTANTIATE_RELU(PACK_A) \
- INSTANTIATE_Q_GRANS(PACK_A, false); \
- INSTANTIATE_Q_GRANS(PACK_A, true);
+#define INSTANTIATE_REQUANT_FLOAT_RELU(PACK_A) \
+ INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, true);
-INSTANTIATE_RELU(PackAWithRowOffset);
-INSTANTIATE_RELU(PackAWithQuantRowOffset);
+INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithRowOffset);
+INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset);
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_REQUANT_FLOAT_RELU
+#undef INSTANTIATE_REQUANT_FLOAT_Q_GRANS
+#undef INSTANTIATE_REQUANT_FLOAT_BASE
-#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- float, \
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ float, \
ReQuantizeForFloat<RELU, Q_GRAN>>;
-#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE( \
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \
ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
-#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
- INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3);
-#define INSTANTIATE_RELU(ACC_T) \
- INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
- INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+#define INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(ACC_T) \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, true);
-INSTANTIATE_RELU(int32_t);
-INSTANTIATE_RELU(int16_t);
+INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int32_t);
+INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int16_t);
-#undef INSTANTIATE_RELU
-#undef INSTANTIATE_SPATIAL_DIM
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
@@ -362,41 +373,46 @@ template class ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// DoSpmdmOnInpBuffer
-#define INSTANTIATE_BASE(RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithRowOffset<uint8_t, int16_t>, \
- PackBMatrix<int8_t, int16_t>, \
- uint8_t, \
+#define INSTANTIATE_SPMDM_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-#define INSTANTIATE_Q_GRANS(RELU) \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_SPMDM_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
-INSTANTIATE_Q_GRANS(false);
-INSTANTIATE_Q_GRANS(true);
+#define INSTANTIATE_SPMDM_RELU(PACK_A) \
+ INSTANTIATE_SPMDM_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_SPMDM_Q_GRANS(PACK_A, true);
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+INSTANTIATE_SPMDM_RELU(PackAMatrix);
+INSTANTIATE_SPMDM_RELU(PackAWithRowOffset);
-#define INSTANTIATE_BASE(RELU, Q_GRAN) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, int16_t>, \
- PackBMatrix<int8_t, int16_t>, \
- uint8_t, \
+#undef INSTANTIATE_SPMDM_RELU
+#undef INSTANTIATE_SPMDM_Q_GRANS
+#undef INSTANTIATE_SPMDM_BASE
+
+#define INSTANTIATE_SCONV_BASE(RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
DoSConvOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-#define INSTANTIATE_Q_GRANS(RELU) \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_SCONV_Q_GRANS(RELU) \
+ INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
-INSTANTIATE_Q_GRANS(false);
-INSTANTIATE_Q_GRANS(true);
+INSTANTIATE_SCONV_Q_GRANS(false);
+INSTANTIATE_SCONV_Q_GRANS(true);
-#undef INSTANTIATE_Q_GRANS
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_SCONV_Q_GRANS
+#undef INSTANTIATE_SCONV_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
@@ -406,39 +422,39 @@ template class ExecuteKernel<
////////////////////////////////////////////////////////////////////////////////
// memCopy
-#define INSTANTIATE_BASE(PACK_A, ACC_T) \
- template class ExecuteKernel< \
- PACK_A<uint8_t, ACC_T>, \
- PackBMatrix<int8_t, ACC_T>, \
- int32_t, \
+#define INSTANTIATE_MEMCPY_BASE(PACK_A, ACC_T) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
memCopy<>>;
-#define INSTANTIATE_ACC_T(PACK_A) \
- INSTANTIATE_BASE(PACK_A, int32_t) \
- INSTANTIATE_BASE(PACK_A, int16_t)
+#define INSTANTIATE_MEMCPY_ACC_T(PACK_A) \
+ INSTANTIATE_MEMCPY_BASE(PACK_A, int32_t) \
+ INSTANTIATE_MEMCPY_BASE(PACK_A, int16_t)
-INSTANTIATE_ACC_T(PackAMatrix);
-INSTANTIATE_ACC_T(PackAWithRowOffset);
+INSTANTIATE_MEMCPY_ACC_T(PackAMatrix);
+INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset);
-#undef INSTANTIATE_ACC_T
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_MEMCPY_ACC_T
+#undef INSTANTIATE_MEMCPY_BASE
-#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
- template class ExecuteKernel< \
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
- PackBMatrix<int8_t, ACC_T>, \
- int32_t, \
+#define INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, SPATIAL_DIM) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
memCopy<>>;
-#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
- INSTANTIATE_BASE(ACC_T, 2); \
- INSTANTIATE_BASE(ACC_T, 3);
+#define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \
+ INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3);
-INSTANTIATE_SPATIAL_DIM(int32_t);
-INSTANTIATE_SPATIAL_DIM(int16_t);
+INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int32_t);
+INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int16_t);
-#undef INSTANTIATE_SPATIAL_DIM
-#undef INSTANTIATE_BASE
+#undef INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM
+#undef INSTANTIATE_MEMCPY_IM2COL_BASE
template class ExecuteKernel<
PackAWithQuantRowOffset<uint8_t, int32_t>,
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index ab0693a..f258604 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -360,31 +360,35 @@ template void fbgemmPacked(
////////////////////////////////////////////////////////////////////////////////
// DoSpmdmOnInpBuffer
-#define INSTANTIATE_BASE(RELU, Q_GRAN) \
- template void fbgemmPacked( \
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& \
- packA, \
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \
- uint8_t* C, \
- int32_t* C_buffer, \
- uint32_t ldc, \
- const DoSpmdmOnInpBuffer< \
- uint8_t, \
- int32_t, \
- ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \
- int thread_id, \
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PACK_A<uint8_t, int16_t>, uint8_t, int16_t>& packA, \
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const DoSpmdmOnInpBuffer< \
+ uint8_t, \
+ int32_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \
+ int thread_id, \
int num_threads);
-#define INSTANTIATE_Q_GRANS(RELU) \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
- INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
-INSTANTIATE_Q_GRANS(false);
-INSTANTIATE_Q_GRANS(true);
+#define INSTANTIATE_RELU(PACK_A) \
+ INSTANTIATE_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAMatrix);
+INSTANTIATE_RELU(PackAWithRowOffset);
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_BASE
+#undef INSTANTIATE_RELU
#define INSTANTIATE_BASE(RELU, Q_GRAN) \
template void fbgemmPacked( \
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 5c6cf1b..5f1277f 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -34,8 +34,12 @@ void requantize_u8acc32_ref(
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
int32_t raw = inp[i * ld + j];
- raw -= A_zero_point * col_offsets[j];
- raw -= B_zero_point * row_offsets[i];
+ if (A_zero_point) {
+ raw -= A_zero_point * col_offsets[j];
+ }
+ if (B_zero_point) {
+ raw -= B_zero_point * row_offsets[i];
+ }
if (bias) {
raw += bias[j];
}
@@ -69,7 +73,9 @@ void requantize_u8acc32_ref(
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
int32_t raw = inp[i * ld + j];
- raw -= A_zero_point * col_offsets[j];
+ if (A_zero_point) {
+ raw -= A_zero_point * col_offsets[j];
+ }
raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i];
if (bias) {
raw += bias[j];