From d4ee77f5a851879f4a778f122656158663b766b5 Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Tue, 27 Nov 2018 10:05:28 -0800 Subject: per-group and per-channel quantization (#14340) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/14340 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/25 Per-group and per-channel quantization in fbgemm This diff also cleans up explicit template instantiation using macro expansion This diff also changes randFill interface which was easy to make mistakes of generating integer random numbers for floating point vectors. Using this in DNNLOWP operators will be done in a separate diff. Reviewed By: dskhudia Differential Revision: D13176386 fbshipit-source-id: e46c53e31e21520bded71b8ed86e8b19e010e2dd --- src/ExecuteKernelU8S8.cc | 284 +++++++++++++++++++++++++---------------------- 1 file changed, 154 insertions(+), 130 deletions(-) (limited to 'src/ExecuteKernelU8S8.cc') diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 2e2035c..f1ec882 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -240,47 +240,60 @@ void ExecuteKernel< } // for each j block } -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; - -template class ExecuteKernel< - PackAWithQuantRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAWithQuantRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAMatrix, - PackBMatrix, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeOutput +#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithRowOffset, \ + PackBMatrix, \ + uint8_t, \ + ReQuantizeOutput>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_Q_GRANS(ACC_T, false); \ + INSTANTIATE_Q_GRANS(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col, \ + PackBMatrix, \ + uint8_t, \ + ReQuantizeOutput>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAMatrix, @@ -288,110 +301,127 @@ template class ExecuteKernel< uint8_t, ReQuantizeOutput>; -template class ExecuteKernel< - PackAMatrix, - PackBMatrix, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeForFloat +#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A, \ + PackBMatrix, \ + float, \ + ReQuantizeForFloat>; + +#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(PACK_A) \ + INSTANTIATE_Q_GRANS(PACK_A, false); \ + INSTANTIATE_Q_GRANS(PACK_A, true); + +INSTANTIATE_RELU(PackAWithRowOffset); +INSTANTIATE_RELU(PackAWithQuantRowOffset); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col, \ + PackBMatrix, \ + float, \ + ReQuantizeForFloat>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithRowOffset, PackBMatrix, - uint8_t, - DoSpmdmOnInpBuffer< - ReQuantizeOutput::outType, - int32_t, - ReQuantizeOutput>>; + float, + ReQuantizeForFloat>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - DoSpmdmOnInpBuffer< - ReQuantizeOutput::outType, - int32_t, - ReQuantizeOutput>>; +//////////////////////////////////////////////////////////////////////////////// +// DoSpmdmOnInpBuffer +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithRowOffset, \ + PackBMatrix, \ + uint8_t, \ + DoSpmdmOnInpBuffer>>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - DoSpmdmOnInpBuffer< - ReQuantizeForFloat::outType, - int32_t, - ReQuantizeForFloat>>; +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithRowOffset, PackBMatrix, - int32_t, - memCopy<>>; + float, + DoSpmdmOnInpBuffer>>; -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// memCopy +#define INSTANTIATE_BASE(PACK_A, ACC_T) \ + template class ExecuteKernel< \ + PACK_A, \ + PackBMatrix, \ + int32_t, \ + memCopy<>>; -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_BASE(PACK_A, int32_t) \ + INSTANTIATE_BASE(PACK_A, int16_t) -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +#undef INSTANTIATE_ACC_T +#undef INSTANTIATE_BASE -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - int32_t, - memCopy<>>; - -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ + template class ExecuteKernel< \ + PackAWithIm2Col, \ + PackBMatrix, \ + int32_t, \ + memCopy<>>; -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_BASE(ACC_T, 2); \ + INSTANTIATE_BASE(ACC_T, 3); -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +INSTANTIATE_SPATIAL_DIM(int32_t); +INSTANTIATE_SPATIAL_DIM(int16_t); -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithQuantRowOffset, @@ -399,12 +429,6 @@ template class ExecuteKernel< int32_t, memCopy<>>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - template class ExecuteKernel< PackAMatrix, PackBMatrix, -- cgit v1.2.3