From 5ec2641b8a4bb74827f9c46357c69c4af1f6cfe0 Mon Sep 17 00:00:00 2001
From: Jianyu Huang <jianyuhuang@fb.com>
Date: Wed, 13 Feb 2019 15:46:39 -0800
Subject: JIT kernel should only handle a small portion of NCB for the last
 block: multiple of NR

Summary:
Before this Diff:
we pass nc = NCB (packedB_.blockColSize()) into the JIT kernel instead of nc = leftover size (packedB_.lastBcol()) for the last block of B (diffusion/FBS/browse/master/fbcode/deeplearning/fbgemm/src/ExecuteKernelU8S8.cc;1adfe7977ef7ea2a1aee0ed785bd3fed5b7c4a20$102), which causes additional computation when n is small.

After this Diff:
we pass only a small portion of NCB (still a multiple of NR) into the JIT kernel for the last block of B.

The main performance gain is for Acc16, because NCB = 4 * NR for Acc16 and NCB = NR for Acc32 in our current settings (AVX2 and AVX512).

Reviewed By: jspark1105

Differential Revision: D14063628

fbshipit-source-id: 5829d06553daf617e2fefa7d26cb2d761af402c1
---
 src/ExecuteKernelU8S8.cc | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'src/ExecuteKernelU8S8.cc')

diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index cdceb63..64bac90 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -53,6 +53,10 @@ ExecuteKernel<
         int8_t,
         typename packingAMatrix::accType,
         inst_set_t::avx512>::NCB;
+    nrSize_ = PackingTraits<
+        int8_t,
+        typename packingAMatrix::accType,
+        inst_set_t::avx512>::NR;
   } else if (cpuinfo_has_x86_avx2()) {
     mbSize_ = PackingTraits<
         int8_t,
@@ -62,6 +66,10 @@ ExecuteKernel<
         int8_t,
         typename packingAMatrix::accType,
         inst_set_t::avx2>::NCB;
+    nrSize_ = PackingTraits<
+        int8_t,
+        typename packingAMatrix::accType,
+        inst_set_t::avx2>::NR;
   } else {
     assert(0 && "unsupported architecure");
   }
@@ -125,6 +133,27 @@ void ExecuteKernel<
 #endif
 
   for (int jb = 0; jb < bColBlocks; ++jb) {
+    if (jb == bColBlocks - 1) {
+      int nc = ((packedB_.lastBcol() - 1) / nrSize_ + 1) * nrSize_;
+      if (nc != nbSize_) {
+        if (cpuinfo_initialize()) {
+          if (cpuinfo_has_x86_avx512f()) {
+            fn = BaseType::template getOrCreate<inst_set_t::avx512>(
+                accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
+          } else if (cpuinfo_has_x86_avx2()) {
+            fn = BaseType::template getOrCreate<inst_set_t::avx2>(
+                accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
+          } else {
+            // TODO: Have default slower path
+            assert(0 && "unsupported architecture");
+            return;
+          }
+        } else {
+          throw std::runtime_error("Failed to initialize cpuinfo!");
+        }
+      }
+    }
+
     bBuf = packedB_.getBuf(jb, kBlock);
     // prefetch addr of the next packed block of B matrix
     bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock);
-- 
cgit v1.2.3