diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-12-03 22:53:14 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-12-03 22:53:14 +0300 |
commit | 84e66a976046180187724aff60a236c5378fde7c (patch) | |
tree | f2c4e39fe4d46df1b7a23602d244d21c9f9ee35b /src/Fbgemm.cc | |
parent | f0b354327aaf2330c65340725b1981040c8bec9e (diff) | |
parent | e6e9b167426c12cd048c3d7d76651492f818daec (diff) |
Merge pull request #1 from marian-nmt/youki/win-jit-debug-int8
Youki/win jit debug int8
Diffstat (limited to 'src/Fbgemm.cc')
-rw-r--r-- | src/Fbgemm.cc | 76 |
1 file changed, 51 insertions, 25 deletions
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 2f641ee..b691b88 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -48,7 +48,8 @@ void fbgemmPacked( if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); } - if ((!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) { + if ((!fbgemmHasAvx512VnniSupport() && !fbgemmHasAvx512Support() && + !fbgemmHasAvx2Support())) { assert(0 && "unknown architecure"); } @@ -62,7 +63,20 @@ void fbgemmPacked( MR = blocking_params->MR; } else { - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512VnniSupport()) { + MCB = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx512_vnni>::MCB; + KCB = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx512_vnni>::KCB; + MR = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx512_vnni>::MR; + } else if (fbgemmHasAvx512Support()) { MCB = PackingTraits< typename packingAMatrix::inpType, typename packingAMatrix::accType, @@ -223,22 +237,26 @@ bool fbgemmSupportedCPU() { //////////////////////////////////////////////////////////////////////////////// // ReQuantizeOutput -#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN) \ +#define INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, BIAS_TYPE) \ template void fbgemmPacked( \ PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ uint8_t* C, \ int32_t* C_buffer, \ uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ int thread_id, \ int num_threads, \ const BlockingFactors* blocking_params); -#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ - 
INSTANTIATE_BASE(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); +#define INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, Q_GRAN) \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, float); \ + INSTANTIATE_BASE(PACK_A, ACC_T, RELU, Q_GRAN, int32_t); + +#define INSTANTIATE_Q_GRANS(PACK_A, ACC_T, RELU) \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BIAS_T(PACK_A, ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_RELU(PACK_A, ACC_T) \ INSTANTIATE_Q_GRANS(PACK_A, ACC_T, false); \ @@ -254,27 +272,34 @@ INSTANTIATE_ACC_T(PackAWithRowOffset); #undef INSTANTIATE_ACC_T #undef INSTANTIATE_RELU #undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE -#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ - template void fbgemmPacked( \ - PackMatrix< \ - PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ - uint8_t, \ - ACC_T>& packA, \ - PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ - uint8_t* C, \ - int32_t* C_buffer, \ - uint32_t ldc, \ - const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ - int thread_id, \ - int num_threads, \ +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, BIAS_TYPE) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \ + int thread_id, \ + int num_threads, \ const BlockingFactors* blocking_params); -#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ - INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ - INSTANTIATE_BASE( \ +#define INSTANTIATE_BIAS_T(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) 
\ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, float); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN, int32_t); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BIAS_T( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BIAS_T( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BIAS_T( \ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); #define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ @@ -291,6 +316,7 @@ INSTANTIATE_RELU(int16_t); #undef INSTANTIATE_RELU #undef INSTANTIATE_SPATIAL_DIM #undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BIAS_T #undef INSTANTIATE_BASE //////////////////////////////////////////////////////////////////////////////// |