| author | Jianyu Huang <jianyuhuang@fb.com> | 2019-08-06 19:35:42 +0300 |
|---|---|---|
| committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-08-06 19:50:51 +0300 |
| commit | d8b3323668fdd15dc70e9cb43ab16e96f4846eeb (patch) | |
| tree | d48a6818c14575d92e68bf1ffb621d646a6c893e /src/ExecuteKernelU8S8.cc | |
| parent | 0d5d057ca941ebb511bdc6178fc26c23e6c4a953 (diff) | |
Integrate VNNI into FBGEMM master branch (#113)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/113
Adding VNNI support in FBGEMM; a brief sketch of what VNNI provides follows the commit trailers below.
Reviewed By: dskhudia
Differential Revision: D16276574
fbshipit-source-id: 832ccdb27339489ebc138f3b2678e53d107c1b79
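
For readers unfamiliar with the instruction set: AVX512-VNNI adds `vpdpbusd`, which multiplies four unsigned 8-bit values by four signed 8-bit values and accumulates the four products into one 32-bit lane in a single instruction, replacing the multiply-add sequence (`vpmaddubsw`/`vpmaddwd`/`vpaddd`) used on the non-VNNI paths. The following is a minimal standalone sketch, not FBGEMM code, assuming a VNNI-capable CPU and a compiler invoked with something like `-mavx512f -mavx512vnni`:

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  alignas(64) uint8_t a[64]; // activations: unsigned 8-bit, as in FBGEMM's u8s8 GEMM
  alignas(64) int8_t b[64];  // weights: signed 8-bit
  for (int i = 0; i < 64; ++i) {
    a[i] = static_cast<uint8_t>(i);
    b[i] = static_cast<int8_t>(i - 32);
  }
  __m512i va = _mm512_load_si512(a);
  __m512i vb = _mm512_load_si512(b);
  // vpdpbusd: each of the 16 int32 lanes accumulates a dot product of
  // 4 uint8 values with 4 int8 values -- int32 accumulation is built in.
  __m512i acc = _mm512_dpbusd_epi32(_mm512_setzero_si512(), va, vb);
  alignas(64) int32_t out[16];
  _mm512_store_si512(out, acc);
  for (int i = 0; i < 16; ++i) {
    printf("%d ", out[i]);
  }
  printf("\n");
  return 0;
}
```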
Diffstat (limited to 'src/ExecuteKernelU8S8.cc')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 47 |
1 file changed, 41 insertions, 6 deletions
```diff
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index f7292fd..0a4ff55 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -49,7 +49,8 @@ ExecuteKernel<
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
   if (params) {
-    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
+    if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support() ||
+        fbgemmHasAvx2Support()) {
       mbSize_ = params->MCB;
       nbSize_ = params->NCB;
       nrMinSize_ = params->NR_MIN;
@@ -59,7 +60,20 @@ ExecuteKernel<
       assert(0 && "unsupported architecure");
     }
   } else {
-    if (fbgemmHasAvx512Support()) {
+    if (fbgemmHasAvx512VnniSupport()) {
+      mbSize_ = PackingTraits<
+          int8_t,
+          typename packingAMatrix::accType,
+          inst_set_t::avx512_vnni>::MCB;
+      nbSize_ = PackingTraits<
+          int8_t,
+          typename packingAMatrix::accType,
+          inst_set_t::avx512_vnni>::NCB;
+      nrMinSize_ = PackingTraits<
+          int8_t,
+          typename packingAMatrix::accType,
+          inst_set_t::avx512_vnni>::NR_MIN;
+    } else if (fbgemmHasAvx512Support()) {
       mbSize_ = PackingTraits<
           int8_t,
           typename packingAMatrix::accType,
@@ -118,7 +132,25 @@ void ExecuteKernel<
 
   typename BaseType::jit_micro_kernel_fp fn;
 
-  if (fbgemmHasAvx512Support()) {
+  if (fbgemmHasAvx512VnniSupport()) {
+    if (std::is_same<typename packingAMatrix::accType, std::int16_t>::value) {
+      // For AVX512VNNI, we redirect int16_t to int32_t accumulation.
+      CodeGenBase<uint8_t, int8_t, int32_t, int32_t> codeObj;
+      fn = codeObj.getOrCreate<inst_set_t::avx512_vnni>(
+          accum,
+          packed_rows_A,
+          packedB_.blockColSize(),
+          packedA_.numPackedCols(),
+          nbSize_);
+    } else {
+      fn = BaseType::template getOrCreate<inst_set_t::avx512_vnni>(
+          accum,
+          packed_rows_A,
+          packedB_.blockColSize(),
+          packedA_.numPackedCols(),
+          nbSize_);
+    }
+  } else if (fbgemmHasAvx512Support()) {
     fn = BaseType::template getOrCreate<inst_set_t::avx512>(
         accum,
         packed_rows_A,
@@ -148,7 +180,10 @@ void ExecuteKernel<
     if (jb == bColBlocks - 1) {
       int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
       if (nc != nbSize_) {
-        if (fbgemmHasAvx512Support()) {
+        if (fbgemmHasAvx512VnniSupport()) {
+          fn = BaseType::template getOrCreate<inst_set_t::avx512_vnni>(
+              accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
+        } else if (fbgemmHasAvx512Support()) {
           fn = BaseType::template getOrCreate<inst_set_t::avx512>(
               accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
         } else if (fbgemmHasAvx2Support()) {
@@ -213,7 +248,7 @@ void ExecuteKernel<
       int32_t nSize =
           C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols();
       if (nSize) {
-        if (fbgemmHasAvx512Support()) {
+        if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support()) {
           // TODO: avx512 path
           // Currently use avx2 code
           outputProcess_.template f<inst_set_t::avx2>(
@@ -238,7 +273,7 @@ void ExecuteKernel<
     if (C_buffer_start == C_tile_) {
       // When C_tile_ scratchpad was used to avoid accessing memory past
       // C_buffer_ .
-      if (fbgemmHasAvx512Support()) {
+      if (fbgemmHasAvx512VnniSupport() || fbgemmHasAvx512Support()) {
         // TODO: avx512 path
         // Currently use avx2 code
         outputProcess_.template f<inst_set_t::avx2>(
```
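
The recurring pattern in this diff is runtime ISA dispatch: every call site now probes the most capable instruction set first and falls through to the older ones. Below is a hypothetical, self-contained reduction of that pattern; the probe names mirror FBGEMM's real `fbgemmHasAvx512VnniSupport()`/`fbgemmHasAvx512Support()`/`fbgemmHasAvx2Support()`, but the stubs, `chooseInstSet()`, and `main()` are illustrative only:

```cpp
#include <cstdio>
#include <stdexcept>

// Mirrors FBGEMM's inst_set_t levels relevant to this diff.
enum class inst_set_t { avx2, avx512, avx512_vnni };

// Hypothetical stand-ins for cpuinfo-backed feature probes, hardwired here
// only so the sketch compiles and runs on its own.
static bool hasAvx512Vnni() { return false; }
static bool hasAvx512() { return true; }
static bool hasAvx2() { return true; }

inst_set_t chooseInstSet() {
  // Order matters: a VNNI machine also passes the AVX512 and AVX2 probes,
  // so the most capable path must be tested first, as the diff does.
  if (hasAvx512Vnni()) {
    return inst_set_t::avx512_vnni;
  } else if (hasAvx512()) {
    return inst_set_t::avx512;
  } else if (hasAvx2()) {
    return inst_set_t::avx2;
  }
  throw std::runtime_error("unsupported architecture");
}

int main() {
  printf("selected inst set: %d\n", static_cast<int>(chooseInstSet()));
  return 0;
}
```

One design point in the diff is worth calling out: on the VNNI path, a requested `int16_t` accumulation is redirected to `int32_t` via a separate `CodeGenBase<uint8_t, int8_t, int32_t, int32_t>` instance. Since `vpdpbusd` accumulates into 32-bit lanes natively, the 16-bit accumulation used on the AVX2/AVX512 paths to trade overflow range for speed offers no advantage on VNNI hardware.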