From 5ec2641b8a4bb74827f9c46357c69c4af1f6cfe0 Mon Sep 17 00:00:00 2001 From: Jianyu Huang Date: Wed, 13 Feb 2019 15:46:39 -0800 Subject: JIT kernel should only handle a small portion of NCB for the last block: multiple of NR Summary: Before this Diff: we pass nc = NCB (packedB_.blockColSize()) into the JIT kernel instead of nc = leftover size (packedB_.lastBcol()) for the last block of B (diffusion/FBS/browse/master/fbcode/deeplearning/fbgemm/src/ExecuteKernelU8S8.cc;1adfe7977ef7ea2a1aee0ed785bd3fed5b7c4a20$102), which causes additional computation when n is small. After this Diff: we pass the JIT kernel only a small portion of NCB (still a multiple of NR) for the last block of B. The main performance gain is for Acc16, because NCB = 4 * NR for Acc16 and NCB = NR for Acc32 in our current settings (AVX2 and AVX512). Reviewed By: jspark1105 Differential Revision: D14063628 fbshipit-source-id: 5829d06553daf617e2fefa7d26cb2d761af402c1 --- src/ExecuteKernelU8S8.cc | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'src/ExecuteKernelU8S8.cc') diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index cdceb63..64bac90 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -53,6 +53,10 @@ ExecuteKernel< int8_t, typename packingAMatrix::accType, inst_set_t::avx512>::NCB; + nrSize_ = PackingTraits< + int8_t, + typename packingAMatrix::accType, + inst_set_t::avx512>::NR; } else if (cpuinfo_has_x86_avx2()) { mbSize_ = PackingTraits< int8_t, @@ -62,6 +66,10 @@ ExecuteKernel< int8_t, typename packingAMatrix::accType, inst_set_t::avx2>::NCB; + nrSize_ = PackingTraits< + int8_t, + typename packingAMatrix::accType, + inst_set_t::avx2>::NR; } else { assert(0 && "unsupported architecure"); } @@ -125,6 +133,27 @@ void ExecuteKernel< #endif for (int jb = 0; jb < bColBlocks; ++jb) { + if (jb == bColBlocks - 1) { + int nc = ((packedB_.lastBcol() - 1) / nrSize_ + 1) * nrSize_; + if (nc != nbSize_) { + if 
(cpuinfo_initialize()) { + if (cpuinfo_has_x86_avx512f()) { + fn = BaseType::template getOrCreate( + accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); + } else if (cpuinfo_has_x86_avx2()) { + fn = BaseType::template getOrCreate( + accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_); + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecture"); + return; + } + } else { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } + } + } + bBuf = packedB_.getBuf(jb, kBlock); // prefetch addr of the next packed block of B matrix bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock); -- cgit v1.2.3