diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-02-13 02:50:07 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-13 02:55:31 +0300 |
commit | c4e50e0ed5db7c9aadb6df47faa4227f5286293c (patch) | |
tree | 963e416174ca1a3d486633c9e61f72760ce025ca | |
parent | 66df1a0ccd762e525e319cb579810deade551152 (diff) |
no need to subtract col offset if a_zp is 0 (#69)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/69
This diff prepares for D14013931, which folds column offsets into bias.
In depthwise convolution, we allow passing column_offsets == nullptr, which means column_offsets are folded into bias. We skip the column_offset * A_zero_point correction if either column_offset == nullptr or A_zero_point == 0.
Reviewed By: jianyuh
Differential Revision: D14017772
fbshipit-source-id: ad4a79402f43cbf78dbad68e1bff6d07c19dded0
-rw-r--r-- | include/fbgemm/OutputProcessing-inl.h | 8 | ||||
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 286 | ||||
-rw-r--r-- | src/Fbgemm.cc | 42 | ||||
-rw-r--r-- | src/RefImplementations.cc | 12 |
4 files changed, 189 insertions, 159 deletions
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h index c250942..9485b18 100644 --- a/include/fbgemm/OutputProcessing-inl.h +++ b/include/fbgemm/OutputProcessing-inl.h @@ -81,7 +81,9 @@ inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)]; - raw -= Aq_zero_point_ * q_col_offsets_[j]; + if (Aq_zero_point_) { + raw -= Aq_zero_point_ * q_col_offsets_[j]; + } int Bq_zero_point_idx; if (Q_GRAN == QuantizationGranularity::TENSOR) { Bq_zero_point_idx = 0; @@ -225,7 +227,9 @@ inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start]; - raw -= Aq_zero_point_ * q_col_offsets_[j]; + if (Aq_zero_point_) { + raw -= Aq_zero_point_ * q_col_offsets_[j]; + } int Bq_zero_point_idx; if (Q_GRAN == QuantizationGranularity::TENSOR) { Bq_zero_point_idx = 0; diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index e7f7c70..cdceb63 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -243,116 +243,127 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ +#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ ReQuantizeOutput<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ - 
INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); - -#define INSTANTIATE_RELU(PACK_A, ACC_T) \ - INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ - INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true); - -#define INSTANTIATE_ACC_T(PACK_A) \ - INSTANTIATE_RELU(PACK_A, int32_t); \ - INSTANTIATE_RELU(PACK_A, int16_t); - -INSTANTIATE_ACC_T(PackAMatrix); -INSTANTIATE_ACC_T(PackAWithRowOffset); - -#undef INSTANTIATE_ACC_T -#undef INSTANTIATE_RELU -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE - -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ +#define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_REQUANT_BASE( \ + PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_REQUANT_BASE( \ + PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_REQUANT_BASE( \ + PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \ + INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, false); \ + INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, true); + +#define INSTANTIATE_REQUANT_ACC_T(PACK_A) \ + INSTANTIATE_REQUANT_RELU(PACK_A, int32_t); \ + INSTANTIATE_REQUANT_RELU(PACK_A, int16_t); + +INSTANTIATE_REQUANT_ACC_T(PackAMatrix); +INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset); + +#undef INSTANTIATE_REQUANT_ACC_T +#undef INSTANTIATE_REQUANT_RELU +#undef INSTANTIATE_REQUANT_Q_GRANS +#undef INSTANTIATE_REQUANT_BASE + +#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ ReQuantizeOutput<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU, 
SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_IM2COL_REQUANT_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_IM2COL_REQUANT_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_IM2COL_REQUANT_BASE( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); +#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ - INSTANTIATE_SPATIAL_DIM(ACC_T, true); +#define INSTANTIATE_IM2COL_REQUANT_RELU(ACC_T) \ + INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +INSTANTIATE_IM2COL_REQUANT_RELU(int32_t); +INSTANTIATE_IM2COL_REQUANT_RELU(int16_t); -#undef INSTANTIATE_RELU -#undef INSTANTIATE_SPATIAL_DIM -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_IM2COL_REQUANT_RELU +#undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM +#undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS +#undef INSTANTIATE_IM2COL_REQUANT_BASE //////////////////////////////////////////////////////////////////////////////// // ReQuantizeForFloat -#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, int32_t>, \ - PackBMatrix<int8_t, int32_t>, \ - float, \ +#define INSTANTIATE_REQUANT_FLOAT_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, int32_t>, \ + PackBMatrix<int8_t, int32_t>, 
\ + float, \ ReQuantizeForFloat<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ - INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_REQUANT_FLOAT_BASE( \ + PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_REQUANT_FLOAT_BASE( \ + PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_REQUANT_FLOAT_BASE( \ + PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_RELU(PACK_A) \ - INSTANTIATE_Q_GRANS(PACK_A, false); \ - INSTANTIATE_Q_GRANS(PACK_A, true); +#define INSTANTIATE_REQUANT_FLOAT_RELU(PACK_A) \ + INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, false); \ + INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, true); -INSTANTIATE_RELU(PackAWithRowOffset); -INSTANTIATE_RELU(PackAWithQuantRowOffset); +INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithRowOffset); +INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset); -#undef INSTANTIATE_RELU -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_REQUANT_FLOAT_RELU +#undef INSTANTIATE_REQUANT_FLOAT_Q_GRANS +#undef INSTANTIATE_REQUANT_FLOAT_BASE -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - float, \ +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ + ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + float, \ ReQuantizeForFloat<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define 
INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ - INSTANTIATE_SPATIAL_DIM(ACC_T, true); +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(ACC_T) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int32_t); +INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int16_t); -#undef INSTANTIATE_RELU -#undef INSTANTIATE_SPATIAL_DIM -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, @@ -362,41 +373,46 @@ template class ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // DoSpmdmOnInpBuffer -#define INSTANTIATE_BASE(RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithRowOffset<uint8_t, int16_t>, \ - PackBMatrix<int8_t, int16_t>, \ - uint8_t, \ +#define INSTANTIATE_SPMDM_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, int16_t>, \ + PackBMatrix<int8_t, 
int16_t>, \ + uint8_t, \ DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; -#define INSTANTIATE_Q_GRANS(RELU) \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_SPMDM_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); -INSTANTIATE_Q_GRANS(false); -INSTANTIATE_Q_GRANS(true); +#define INSTANTIATE_SPMDM_RELU(PACK_A) \ + INSTANTIATE_SPMDM_Q_GRANS(PACK_A, false); \ + INSTANTIATE_SPMDM_Q_GRANS(PACK_A, true); -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +INSTANTIATE_SPMDM_RELU(PackAMatrix); +INSTANTIATE_SPMDM_RELU(PackAWithRowOffset); -#define INSTANTIATE_BASE(RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, int16_t>, \ - PackBMatrix<int8_t, int16_t>, \ - uint8_t, \ +#undef INSTANTIATE_SPMDM_RELU +#undef INSTANTIATE_SPMDM_Q_GRANS +#undef INSTANTIATE_SPMDM_BASE + +#define INSTANTIATE_SCONV_BASE(RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, int16_t>, \ + PackBMatrix<int8_t, int16_t>, \ + uint8_t, \ DoSConvOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; -#define INSTANTIATE_Q_GRANS(RELU) \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_SCONV_Q_GRANS(RELU) \ + INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); -INSTANTIATE_Q_GRANS(false); -INSTANTIATE_Q_GRANS(true); +INSTANTIATE_SCONV_Q_GRANS(false); 
+INSTANTIATE_SCONV_Q_GRANS(true); -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_SCONV_Q_GRANS +#undef INSTANTIATE_SCONV_BASE template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, @@ -406,39 +422,39 @@ template class ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // memCopy -#define INSTANTIATE_BASE(PACK_A, ACC_T) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - int32_t, \ +#define INSTANTIATE_MEMCPY_BASE(PACK_A, ACC_T) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + int32_t, \ memCopy<>>; -#define INSTANTIATE_ACC_T(PACK_A) \ - INSTANTIATE_BASE(PACK_A, int32_t) \ - INSTANTIATE_BASE(PACK_A, int16_t) +#define INSTANTIATE_MEMCPY_ACC_T(PACK_A) \ + INSTANTIATE_MEMCPY_BASE(PACK_A, int32_t) \ + INSTANTIATE_MEMCPY_BASE(PACK_A, int16_t) -INSTANTIATE_ACC_T(PackAMatrix); -INSTANTIATE_ACC_T(PackAWithRowOffset); +INSTANTIATE_MEMCPY_ACC_T(PackAMatrix); +INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset); -#undef INSTANTIATE_ACC_T -#undef INSTANTIATE_BASE +#undef INSTANTIATE_MEMCPY_ACC_T +#undef INSTANTIATE_MEMCPY_BASE -#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - int32_t, \ +#define INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, SPATIAL_DIM) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + int32_t, \ memCopy<>>; -#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ - INSTANTIATE_BASE(ACC_T, 2); \ - INSTANTIATE_BASE(ACC_T, 3); +#define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \ + INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3); -INSTANTIATE_SPATIAL_DIM(int32_t); -INSTANTIATE_SPATIAL_DIM(int16_t); +INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int32_t); +INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int16_t); 
-#undef INSTANTIATE_SPATIAL_DIM -#undef INSTANTIATE_BASE +#undef INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM +#undef INSTANTIATE_MEMCPY_IM2COL_BASE template class ExecuteKernel< PackAWithQuantRowOffset<uint8_t, int32_t>, diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index ab0693a..f258604 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -360,31 +360,35 @@ template void fbgemmPacked( //////////////////////////////////////////////////////////////////////////////// // DoSpmdmOnInpBuffer -#define INSTANTIATE_BASE(RELU, Q_GRAN) \ - template void fbgemmPacked( \ - PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& \ - packA, \ - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \ - uint8_t* C, \ - int32_t* C_buffer, \ - uint32_t ldc, \ - const DoSpmdmOnInpBuffer< \ - uint8_t, \ - int32_t, \ - ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \ - int thread_id, \ +#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix<PACK_A<uint8_t, int16_t>, uint8_t, int16_t>& packA, \ + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const DoSpmdmOnInpBuffer< \ + uint8_t, \ + int32_t, \ + ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \ + int thread_id, \ int num_threads); -#define INSTANTIATE_Q_GRANS(RELU) \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); -INSTANTIATE_Q_GRANS(false); -INSTANTIATE_Q_GRANS(true); +#define INSTANTIATE_RELU(PACK_A) \ + INSTANTIATE_Q_GRANS(PACK_A, false); \ + INSTANTIATE_Q_GRANS(PACK_A, true); + +INSTANTIATE_RELU(PackAMatrix); 
+INSTANTIATE_RELU(PackAWithRowOffset); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE +#undef INSTANTIATE_RELU #define INSTANTIATE_BASE(RELU, Q_GRAN) \ template void fbgemmPacked( \ diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 5c6cf1b..5f1277f 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -34,8 +34,12 @@ void requantize_u8acc32_ref( for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { int32_t raw = inp[i * ld + j]; - raw -= A_zero_point * col_offsets[j]; - raw -= B_zero_point * row_offsets[i]; + if (A_zero_point) { + raw -= A_zero_point * col_offsets[j]; + } + if (B_zero_point) { + raw -= B_zero_point * row_offsets[i]; + } if (bias) { raw += bias[j]; } @@ -69,7 +73,9 @@ void requantize_u8acc32_ref( for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { int32_t raw = inp[i * ld + j]; - raw -= A_zero_point * col_offsets[j]; + if (A_zero_point) { + raw -= A_zero_point * col_offsets[j]; + } raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i]; if (bias) { raw += bias[j]; |