github.com/marian-nmt/FBGEMM.git
author     Jianyu Huang <jianyuhuang@fb.com>  2019-08-09 21:23:22 +0300
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2019-08-09 21:33:13 +0300
commit     7b156071d8912dcf6711c88578c30f0f0d05d3a6 (patch)
tree       b95540b1acbe2e17982f8a1c48fbe5c75a016d12 /src/ExecuteKernelU8S8.cc
parent     122135c29b68de5176bd56de6ced936cdc63cb36 (diff)
Integrate VNNI into FBGEMM master branch (#114)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/114

Adds VNNI support to FBGEMM. Previously there was a CMake version conflict: PyTorch and the FBGEMM OSS builds are tested with CMake 3.5, while ASMJIT required CMake 3.8+, which caused build failures on some platforms. The conflict is now resolved by a PR to ASMJIT that lowers its CMake requirement: https://github.com/asmjit/asmjit/pull/252.

Reviewed By: dskhudia
Differential Revision: D16720839
fbshipit-source-id: e5e5f2d26f924df8d9fb955f4a3758561fa73288
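For illustration only, here is a minimal, self-contained C++ sketch of the dispatch ordering this commit introduces: probe AVX512-VNNI first, then AVX-512, then AVX2. The fbgemmHas*Support names come from the diff below, but the stub bodies, the BlockingParams struct, and the numeric values are placeholders, not FBGEMM's real PackingTraits values.

#include <iostream>
#include <stdexcept>

// Stubs standing in for FBGEMM's cpuinfo-backed detection helpers.
bool fbgemmHasAvx512VnniSupport() { return false; }
bool fbgemmHasAvx512Support() { return true; }
bool fbgemmHasAvx2Support() { return true; }

struct BlockingParams {
  int MCB;     // rows of A kept resident per cache block
  int NCB;     // columns of B kept resident per cache block
  int NR_MIN;  // minimum register-tile width
};

BlockingParams chooseBlocking() {
  // Widest ISA wins; the else-if chain preserves the old AVX-512/AVX2 behavior.
  if (fbgemmHasAvx512VnniSupport()) {
    return {256, 64, 16};  // placeholder values
  } else if (fbgemmHasAvx512Support()) {
    return {256, 64, 16};  // placeholder values
  } else if (fbgemmHasAvx2Support()) {
    return {120, 8, 8};    // placeholder values
  }
  throw std::runtime_error("unsupported architecture");
}

int main() {
  BlockingParams p = chooseBlocking();
  std::cout << "MCB=" << p.MCB << " NCB=" << p.NCB
            << " NR_MIN=" << p.NR_MIN << "\n";
}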
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r--  src/ExecuteKernelU8S8.cc | 47
1 file changed, 41 insertions, 6 deletions
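One of the hunks below also redirects int16_t accumulation to int32_t when the VNNI path is taken. The following is a hedged sketch of that compile-time branch; KernelChoice and pickKernel are hypothetical stand-ins (FBGEMM instead instantiates CodeGenBase and getOrCreate<inst_set_t::avx512_vnni>), and only the std::is_same check and the int16-to-int32 redirect mirror the diff.

#include <cstdint>
#include <iostream>
#include <type_traits>

// Hypothetical kernel descriptor for illustration.
struct KernelChoice {
  const char* isa;
  const char* accType;
};

template <typename AccT>
KernelChoice pickKernel(bool hasVnni) {
  if (hasVnni && std::is_same<AccT, std::int16_t>::value) {
    // VNNI dot-product instructions accumulate into 32-bit lanes, so a
    // 16-bit accumulation request is redirected to an int32_t kernel.
    return {"avx512_vnni", "int32_t (redirected from int16_t)"};
  }
  if (hasVnni) {
    return {"avx512_vnni", "as requested"};
  }
  return {"avx512 or avx2", "as requested"};
}

int main() {
  KernelChoice a = pickKernel<std::int16_t>(/*hasVnni=*/true);
  KernelChoice b = pickKernel<std::int32_t>(/*hasVnni=*/true);
  std::cout << a.isa << " / " << a.accType << "\n";
  std::cout << b.isa << " / " << b.accType << "\n";
}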
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index f7292fd..0a4ff55 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -49,7 +49,8 @@ ExecuteKernel<
throw std::runtime_error("Failed to initialize cpuinfo!");
}
if (params) {
- if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
+ if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support() ||
+ fbgemmHasAvx2Support()) {
mbSize_ = params->MCB;
nbSize_ = params->NCB;
nrMinSize_ = params->NR_MIN;
@@ -59,7 +60,20 @@ ExecuteKernel<
assert(0 && "unsupported architecure");
}
} else {
- if (fbgemmHasAvx512Support()) {
+ if (fbgemmHasAvx512VnniSupport()) {
+ mbSize_ = PackingTraits<
+ int8_t,
+ typename packingAMatrix::accType,
+ inst_set_t::avx512_vnni>::MCB;
+ nbSize_ = PackingTraits<
+ int8_t,
+ typename packingAMatrix::accType,
+ inst_set_t::avx512_vnni>::NCB;
+ nrMinSize_ = PackingTraits<
+ int8_t,
+ typename packingAMatrix::accType,
+ inst_set_t::avx512_vnni>::NR_MIN;
+ } else if (fbgemmHasAvx512Support()) {
mbSize_ = PackingTraits<
int8_t,
typename packingAMatrix::accType,
@@ -118,7 +132,25 @@ void ExecuteKernel<
typename BaseType::jit_micro_kernel_fp fn;
- if (fbgemmHasAvx512Support()) {
+ if (fbgemmHasAvx512VnniSupport()) {
+ if (std::is_same<typename packingAMatrix::accType, std::int16_t>::value) {
+ // For AVX512VNNI, we redirect int16_t to int32_t accumulation.
+ CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
+ fn = codeObj.getOrCreate<inst_set_t::avx512_vnni>(
+ accum,
+ packed_rows_A,
+ packedB_.blockColSize(),
+ packedA_.numPackedCols(),
+ nbSize_);
+ } else {
+ fn = BaseType::template getOrCreate<inst_set_t::avx512_vnni>(
+ accum,
+ packed_rows_A,
+ packedB_.blockColSize(),
+ packedA_.numPackedCols(),
+ nbSize_);
+ }
+ } else if (fbgemmHasAvx512Support()) {
fn = BaseType::template getOrCreate<inst_set_t::avx512>(
accum,
packed_rows_A,
@@ -148,7 +180,10 @@ void ExecuteKernel<
if (jb == bColBlocks - 1) {
int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
if (nc != nbSize_) {
- if (fbgemmHasAvx512Support()) {
+ if (fbgemmHasAvx512VnniSupport()) {
+ fn = BaseType::template getOrCreate<inst_set_t::avx512_vnni>(
+ accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
+ } else if (fbgemmHasAvx512Support()) {
fn = BaseType::template getOrCreate<inst_set_t::avx512>(
accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
} else if (fbgemmHasAvx2Support()) {
@@ -213,7 +248,7 @@ void ExecuteKernel<
int32_t nSize =
C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols();
if (nSize) {
- if (fbgemmHasAvx512Support()) {
+ if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support()) {
// TODO: avx512 path
// Currently use avx2 code
outputProcess_.template f<inst_set_t::avx2>(
@@ -238,7 +273,7 @@ void ExecuteKernel<
if (C_buffer_start == C_tile_) {
// When C_tile_ scratchpad was used to avoid accessing memory past
// C_buffer_ .
- if (fbgemmHasAvx512Support()) {
+ if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support()) {
// TODO: avx512 path
// Currently use avx2 code
outputProcess_.template f<inst_set_t::avx2>(