diff options
author | Jongsoo Park <jongsoo@fb.com> | 2018-12-11 09:59:28 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-12-11 10:01:30 +0300 |
commit | ebbe4f4fca119e9787f47b769591643ddfc8c4a7 (patch) | |
tree | 6c5c19671f00aa00266b9bbf17e969f9fd42ffad | |
parent | 895646cfe2b68e42a506c49217a635270d42bd09 (diff) |
instantiate more kernels for PackAMatrix (#47)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/47
PackAMatrix can be a faster alternative to PackAWithRowOffset when B_zero_point = 0.
Reviewed By: jianyuh
Differential Revision: D13413605
fbshipit-source-id: 2cac4560e8f166d19c58c65ae25400d1b0795b19
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 39 | ||||
-rw-r--r-- | src/Fbgemm.cc | 53 |
2 files changed, 42 insertions, 50 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 152d7f1..3b6e059 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -243,25 +243,30 @@ void ExecuteKernel< //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ - template class ExecuteKernel< \ - PackAWithRowOffset<uint8_t, ACC_T>, \ - PackBMatrix<int8_t, ACC_T>, \ - uint8_t, \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ ReQuantizeOutput<RELU, Q_GRAN>>; -#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_Q_GRANS(ACC_T, false); \ - INSTANTIATE_Q_GRANS(ACC_T, true); +#define INSTANTIATE_RELU(PACK_A, ACC_T) \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_RELU(PACK_A, int32_t); \ + INSTANTIATE_RELU(PACK_A, int16_t); + +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); +#undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE @@ -295,12 +300,6 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE -template class ExecuteKernel< - PackAMatrix<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - 
ReQuantizeOutput<false>>; - //////////////////////////////////////////////////////////////////////////////// // ReQuantizeForFloat #define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 6623fe7..b7a99c6 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -15,8 +15,6 @@ double computing_time = 0.0; double run_time = 0.0; #endif -using namespace fbgemm; - namespace fbgemm { template < @@ -200,29 +198,34 @@ bool fbgemmSupportedCPU() { //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ - template void fbgemmPacked( \ - PackMatrix<PackAWithRowOffset<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ - PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ - uint8_t* C, \ - int32_t* C_buffer, \ - uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ - int thread_id, \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + int thread_id, \ int num_threads); -#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); -#define INSTANTIATE_RELU(ACC_T) \ - INSTANTIATE_Q_GRANS(ACC_T, false); \ - INSTANTIATE_Q_GRANS(ACC_T, true); +#define INSTANTIATE_RELU(PACK_A, ACC_T) \ + 
INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ + INSTANTIATE_Q_GRANS(PACK_A, ACC_T, true); -INSTANTIATE_RELU(int32_t); -INSTANTIATE_RELU(int16_t); +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_RELU(PACK_A, int32_t); \ + INSTANTIATE_RELU(PACK_A, int16_t); +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); + +#undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE @@ -263,16 +266,6 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE -template void fbgemmPacked( - PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - //////////////////////////////////////////////////////////////////////////////// // ReQuantizeForFloat #define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ |