diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-02-13 02:50:07 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-13 02:55:31 +0300 |
commit | c4e50e0ed5db7c9aadb6df47faa4227f5286293c (patch) | |
tree | 963e416174ca1a3d486633c9e61f72760ce025ca /src/ExecuteKernelU8S8.cc | |
parent | 66df1a0ccd762e525e319cb579810deade551152 (diff) |
no need to subtract col offset if a_zp is 0 (#69)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/69
This diff prepares for D14013931, which folds column offsets into bias.
In depthwise convolution, we allow passing column_offsets == nullptr, which means column_offsets are folded into bias. We bypass adding column_offsets * A_zero_point if either column_offsets == nullptr or A_zero_point == 0.
Reviewed By: jianyuh
Differential Revision: D14017772
fbshipit-source-id: ad4a79402f43cbf78dbad68e1bff6d07c19dded0
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 286 |
1 files changed, 151 insertions, 135 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index e7f7c70..cdceb63 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -243,116 +243,127 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ +#define INSTANTIATE_REQUANT_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ ReQuantizeOutput<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); - -#define INSTANTIATE_RELU(PACK_A, ACC_T) \ - INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ - INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true); - -#define INSTANTIATE_ACC_T(PACK_A) \ - INSTANTIATE_RELU(PACK_A, int32_t); \ - INSTANTIATE_RELU(PACK_A, int16_t); - -INSTANTIATE_ACC_T(PackAMatrix); -INSTANTIATE_ACC_T(PackAWithRowOffset); - -#undef INSTANTIATE_ACC_T -#undef INSTANTIATE_RELU -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE - -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ +#define INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_REQUANT_BASE( \ + PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_REQUANT_BASE( \ + PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_REQUANT_BASE( \ + PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_REQUANT_RELU(PACK_A, ACC_T) \ + INSTANTIATE_REQUANT_Q_GRANS(PACK_A, 
ACC_T, false); \ + INSTANTIATE_REQUANT_Q_GRANS(PACK_A, ACC_T, true); + +#define INSTANTIATE_REQUANT_ACC_T(PACK_A) \ + INSTANTIATE_REQUANT_RELU(PACK_A, int32_t); \ + INSTANTIATE_REQUANT_RELU(PACK_A, int16_t); + +INSTANTIATE_REQUANT_ACC_T(PackAMatrix); +INSTANTIATE_REQUANT_ACC_T(PackAWithRowOffset); + +#undef INSTANTIATE_REQUANT_ACC_T +#undef INSTANTIATE_REQUANT_RELU +#undef INSTANTIATE_REQUANT_Q_GRANS +#undef INSTANTIATE_REQUANT_BASE + +#define INSTANTIATE_IM2COL_REQUANT_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ ReQuantizeOutput<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_IM2COL_REQUANT_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_IM2COL_REQUANT_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_IM2COL_REQUANT_BASE( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); +#define INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_IM2COL_REQUANT_Q_GRANS(ACC_T, RELU, 3); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ - INSTANTIATE_SPATIAL_DIM(ACC_T, true); +#define INSTANTIATE_IM2COL_REQUANT_RELU(ACC_T) \ + INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM(ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +INSTANTIATE_IM2COL_REQUANT_RELU(int32_t); 
+INSTANTIATE_IM2COL_REQUANT_RELU(int16_t); -#undef INSTANTIATE_RELU -#undef INSTANTIATE_SPATIAL_DIM -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_IM2COL_REQUANT_RELU +#undef INSTANTIATE_IM2COL_REQUANT_SPATIAL_DIM +#undef INSTANTIATE_IM2COL_REQUANT_Q_GRANS +#undef INSTANTIATE_IM2COL_REQUANT_BASE //////////////////////////////////////////////////////////////////////////////// // ReQuantizeForFloat -#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, int32_t>, \ - PackBMatrix<int8_t, int32_t>, \ - float, \ +#define INSTANTIATE_REQUANT_FLOAT_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, int32_t>, \ + PackBMatrix<int8_t, int32_t>, \ + float, \ ReQuantizeForFloat<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ - INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_REQUANT_FLOAT_BASE( \ + PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_REQUANT_FLOAT_BASE( \ + PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_REQUANT_FLOAT_BASE( \ + PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_RELU(PACK_A) \ - INSTANTIATE_Q_GRANS(PACK_A, false); \ - INSTANTIATE_Q_GRANS(PACK_A, true); +#define INSTANTIATE_REQUANT_FLOAT_RELU(PACK_A) \ + INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, false); \ + INSTANTIATE_REQUANT_FLOAT_Q_GRANS(PACK_A, true); -INSTANTIATE_RELU(PackAWithRowOffset); -INSTANTIATE_RELU(PackAWithQuantRowOffset); +INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithRowOffset); +INSTANTIATE_REQUANT_FLOAT_RELU(PackAWithQuantRowOffset); -#undef INSTANTIATE_RELU -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_REQUANT_FLOAT_RELU +#undef INSTANTIATE_REQUANT_FLOAT_Q_GRANS 
+#undef INSTANTIATE_REQUANT_FLOAT_BASE -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - float, \ +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ + ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + float, \ ReQuantizeForFloat<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ - INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS(ACC_T, RELU, 3); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ - INSTANTIATE_SPATIAL_DIM(ACC_T, true); +#define INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(ACC_T) \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM(ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int32_t); +INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU(int16_t); -#undef INSTANTIATE_RELU -#undef INSTANTIATE_SPATIAL_DIM -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE 
+#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_RELU +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_SPATIAL_DIM +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_Q_GRANS +#undef INSTANTIATE_REQUANT_FLOAT_IM2COL_BASE template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, @@ -362,41 +373,46 @@ template class ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // DoSpmdmOnInpBuffer -#define INSTANTIATE_BASE(RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithRowOffset<uint8_t, int16_t>, \ - PackBMatrix<int8_t, int16_t>, \ - uint8_t, \ +#define INSTANTIATE_SPMDM_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, int16_t>, \ + PackBMatrix<int8_t, int16_t>, \ + uint8_t, \ DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; -#define INSTANTIATE_Q_GRANS(RELU) \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_SPMDM_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_SPMDM_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); -INSTANTIATE_Q_GRANS(false); -INSTANTIATE_Q_GRANS(true); +#define INSTANTIATE_SPMDM_RELU(PACK_A) \ + INSTANTIATE_SPMDM_Q_GRANS(PACK_A, false); \ + INSTANTIATE_SPMDM_Q_GRANS(PACK_A, true); -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +INSTANTIATE_SPMDM_RELU(PackAMatrix); +INSTANTIATE_SPMDM_RELU(PackAWithRowOffset); -#define INSTANTIATE_BASE(RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithIm2Col<uint8_t, int16_t>, \ - PackBMatrix<int8_t, int16_t>, \ - uint8_t, \ +#undef INSTANTIATE_SPMDM_RELU +#undef INSTANTIATE_SPMDM_Q_GRANS +#undef INSTANTIATE_SPMDM_BASE + +#define INSTANTIATE_SCONV_BASE(RELU, Q_GRAN) \ + template class ExecuteKernel< \ + 
PackAWithIm2Col<uint8_t, int16_t>, \ + PackBMatrix<int8_t, int16_t>, \ + uint8_t, \ DoSConvOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; -#define INSTANTIATE_Q_GRANS(RELU) \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_SCONV_Q_GRANS(RELU) \ + INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_SCONV_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); -INSTANTIATE_Q_GRANS(false); -INSTANTIATE_Q_GRANS(true); +INSTANTIATE_SCONV_Q_GRANS(false); +INSTANTIATE_SCONV_Q_GRANS(true); -#undef INSTANTIATE_Q_GRANS -#undef INSTANTIATE_BASE +#undef INSTANTIATE_SCONV_Q_GRANS +#undef INSTANTIATE_SCONV_BASE template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, @@ -406,39 +422,39 @@ template class ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // memCopy -#define INSTANTIATE_BASE(PACK_A, ACC_T) \ - template class ExecuteKernel< \ - PACK_A<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - int32_t, \ +#define INSTANTIATE_MEMCPY_BASE(PACK_A, ACC_T) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + int32_t, \ memCopy<>>; -#define INSTANTIATE_ACC_T(PACK_A) \ - INSTANTIATE_BASE(PACK_A, int32_t) \ - INSTANTIATE_BASE(PACK_A, int16_t) +#define INSTANTIATE_MEMCPY_ACC_T(PACK_A) \ + INSTANTIATE_MEMCPY_BASE(PACK_A, int32_t) \ + INSTANTIATE_MEMCPY_BASE(PACK_A, int16_t) -INSTANTIATE_ACC_T(PackAMatrix); -INSTANTIATE_ACC_T(PackAWithRowOffset); +INSTANTIATE_MEMCPY_ACC_T(PackAMatrix); +INSTANTIATE_MEMCPY_ACC_T(PackAWithRowOffset); -#undef INSTANTIATE_ACC_T -#undef INSTANTIATE_BASE +#undef INSTANTIATE_MEMCPY_ACC_T +#undef INSTANTIATE_MEMCPY_BASE -#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ - template class ExecuteKernel< \ 
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - PackBMatrix<int8_t, ACC_T>, \ - int32_t, \ +#define INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, SPATIAL_DIM) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + int32_t, \ memCopy<>>; -#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ - INSTANTIATE_BASE(ACC_T, 2); \ - INSTANTIATE_BASE(ACC_T, 3); +#define INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 2); \ + INSTANTIATE_MEMCPY_IM2COL_BASE(ACC_T, 3); -INSTANTIATE_SPATIAL_DIM(int32_t); -INSTANTIATE_SPATIAL_DIM(int16_t); +INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int32_t); +INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM(int16_t); -#undef INSTANTIATE_SPATIAL_DIM -#undef INSTANTIATE_BASE +#undef INSTANTIATE_MEMCPY_IM2COL_SPATIAL_DIM +#undef INSTANTIATE_MEMCPY_IM2COL_BASE template class ExecuteKernel< PackAWithQuantRowOffset<uint8_t, int32_t>, |