From ebbe4f4fca119e9787f47b769591643ddfc8c4a7 Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Mon, 10 Dec 2018 22:59:28 -0800 Subject: instantiate more kernels for PackAMatrix (#47) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/47 PackAMatrix (compared to PackAWithRowOffset) can be a faster alternative when B_zero_point = 0. Reviewed By: jianyuh Differential Revision: D13413605 fbshipit-source-id: 2cac4560e8f166d19c58c65ae25400d1b0795b19 --- src/ExecuteKernelU8S8.cc | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'src/ExecuteKernelU8S8.cc') diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 152d7f1..3b6e059 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -243,25 +243,30 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithRowOffset, \ - PackBMatrix, \ - uint8_t, \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A, \ + PackBMatrix, \ + uint8_t, \ ReQuantizeOutput>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_Q_GRANS(ACC_T, false); \ - INSTANTIATE_Q_GRANS(ACC_T, true); +#define INSTANTIATE_RELU(PACK_A, ACC_T) \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true); 
-INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_RELU(PACK_A, int32_t); \ + INSTANTIATE_RELU(PACK_A, int16_t); + +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); +#undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE @@ -295,12 +300,6 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE -template class ExecuteKernel< - PackAMatrix, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; - //////////////////////////////////////////////////////////////////////////////// // ReQuantizeForFloat #define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ -- cgit v1.2.3