diff options
author | Jongsoo Park <jongsoo@fb.com> | 2018-12-11 09:59:28 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-12-11 10:01:30 +0300 |
commit | ebbe4f4fca119e9787f47b769591643ddfc8c4a7 (patch) | |
tree | 6c5c19671f00aa00266b9bbf17e969f9fd42ffad /src/ExecuteKernelU8S8.cc | |
parent | 895646cfe2b68e42a506c49217a635270d42bd09 (diff) |
instantiate more kernels for PackAmatrix (#47)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/47
PackAMatrix (compared to PackAWithRowOffset) can be a faster alternative when B_zero_point = 0
Reviewed By: jianyuh
Differential Revision: D13413605
fbshipit-source-id: 2cac4560e8f166d19c58c65ae25400d1b0795b19
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 39 |
1 file changed, 19 insertions, 20 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 152d7f1..3b6e059 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -243,25 +243,30 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithRowOffset<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ ReQuantizeOutput<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_Q_GRANS(ACC_T, false); \ - INSTANTIATE_Q_GRANS(ACC_T, true); +#define INSTANTIATE_RELU(PACK_A, ACC_T) \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_RELU(PACK_A, int32_t); \ + INSTANTIATE_RELU(PACK_A, int16_t); + +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); +#undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE @@ -295,12 +300,6 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE -template class ExecuteKernel< - PackAMatrix<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - 
ReQuantizeOutput<false>>; - //////////////////////////////////////////////////////////////////////////////// // ReQuantizeForFloat #define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ |