diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2019-08-09 21:23:22 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-08-09 21:33:13 +0300 |
commit | 7b156071d8912dcf6711c88578c30f0f0d05d3a6 (patch) | |
tree | b95540b1acbe2e17982f8a1c48fbe5c75a016d12 /src/ExecuteKernelU8S8.cc | |
parent | 122135c29b68de5176bd56de6ced936cdc63cb36 (diff) |
Integrate VNNI into FBGEMM master branch (#114)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/114
Add AVX512 VNNI support to FBGEMM.
Previously, we had an issue with the CMake version: the PyTorch and FBGEMM OSS tests run with CMake 3.5, while ASMJIT required CMake 3.8+. This caused build failures on some platforms. The CMake version issue is now resolved by a PR to ASMJIT that lowers its minimum CMake requirement: https://github.com/asmjit/asmjit/pull/252.
Reviewed By: dskhudia
Differential Revision: D16720839
fbshipit-source-id: e5e5f2d26f924df8d9fb955f4a3758561fa73288
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 47 |
1 files changed, 41 insertions, 6 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index f7292fd..0a4ff55 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -49,7 +49,8 @@ ExecuteKernel< throw std::runtime_error("Failed to initialize cpuinfo!"); } if (params) { - if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { + if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support() || + fbgemmHasAvx2Support()) { mbSize_ = params->MCB; nbSize_ = params->NCB; nrMinSize_ = params->NR_MIN; @@ -59,7 +60,20 @@ ExecuteKernel< assert(0 && "unsupported architecure"); } } else { - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512VnniSupport()) { + mbSize_ = PackingTraits< + int8_t, + typename packingAMatrix::accType, + inst_set_t::avx512_vnni>::MCB; + nbSize_ = PackingTraits< + int8_t, + typename packingAMatrix::accType, + inst_set_t::avx512_vnni>::NCB; + nrMinSize_ = PackingTraits< + int8_t, + typename packingAMatrix::accType, + inst_set_t::avx512_vnni>::NR_MIN; + } else if (fbgemmHasAvx512Support()) { mbSize_ = PackingTraits< int8_t, typename packingAMatrix::accType, @@ -118,7 +132,25 @@ void ExecuteKernel< typename BaseType::jit_micro_kernel_fp fn; - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512VnniSupport()) { + if (std::is_same<typename packingAMatrix::accType, std::int16_t>::value) { + // For AVX512VNNI, we redirect int16_t to int32_t accumulation. 
+ CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj; + fn = codeObj.getOrCreate<inst_set_t::avx512_vnni>( + accum, + packed_rows_A, + packedB_.blockColSize(), + packedA_.numPackedCols(), + nbSize_); + } else { + fn = BaseType::template getOrCreate<inst_set_t::avx512_vnni>( + accum, + packed_rows_A, + packedB_.blockColSize(), + packedA_.numPackedCols(), + nbSize_); + } + } else if (fbgemmHasAvx512Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx512>( accum, packed_rows_A, @@ -148,7 +180,10 @@ void ExecuteKernel< if (jb == bColBlocks - 1) { int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_; if (nc != nbSize_) { - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512VnniSupport()) { + fn = BaseType::template getOrCreate<inst_set_t::avx512_vnni>( + accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); + } else if (fbgemmHasAvx512Support()) { fn = BaseType::template getOrCreate<inst_set_t::avx512>( accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); } else if (fbgemmHasAvx2Support()) { @@ -213,7 +248,7 @@ void ExecuteKernel< int32_t nSize = C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols(); if (nSize) { - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( @@ -238,7 +273,7 @@ void ExecuteKernel< if (C_buffer_start == C_tile_) { // When C_tile_ scratchpad was used to avoid accessing memory past // C_buffer_ . - if (fbgemmHasAvx512Support()) { + if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( |