Further optimize the acc32 kernel; the cache blocking dimension for the B matrix is now free to be autotuned (#89)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/89 The generated assembly kernel now has one more loop (over the NR tiles in an NCB block). This change also frees NCB as an independent dimension that can be auto-tuned. ~~TODO: Similar changes for acc16 kernel.~~ D14516232 Reviewed By: jspark1105 Differential Revision: D14507536 fbshipit-source-id: 6843fffdd0bcf9bb7cd0231163fbefd6e52d5bf7
author: Daya S Khudia <dskhudia@fb.com> 2019-03-21 20:03:36 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-03-21 20:07:54 +0300
commit: fe1c3d91772703a9c4f00fa04a9acaeeeffcf83c (patch)
tree: ce3fc31c3aaf57288b6b954a8606170400a9e6b6 /include/fbgemm/PackingTraits-inl.h
parent: 0d8f88c12dce0f9167a5ee93aa2a851aa99ff864 (diff)
1 files changed, 10 insertions, 9 deletions
diff --git a/include/fbgemm/PackingTraits-inl.h b/include/fbgemm/PackingTraits-inl.h
index 598ca28..c32c127 100644
--- a/include/fbgemm/PackingTraits-inl.h
+++ b/include/fbgemm/PackingTraits-inl.h
@@ -152,23 +152,24 @@ struct PackingTraits<
     std::int32_t,
     inst_set_t::avx512,
     typename std::enable_if<is_8bit<T>::value>::type> {
-  static constexpr int MR{28}; ///< Register block for M dimension.
+  static constexpr int MR{14}; ///< Register block for M dimension.
   static constexpr int NR{
-      16}; ///< Register block for N dimension.
-           ///< NR = VLEN/8/ROW_INTERLEAVE = 512 / 8 / 4 = 16.
-           ///< Total registers used for N dimension: NCB/NR.
-           ///< Here we use 28 x 1 zmm register blocking for
-           ///< the registers used for accumulation C.
+      32}; ///< Register block for N dimension.
+           ///< Must be a multiple of 16 because 16*ROW_INTERLEAVE int8 elements
+           ///< completely fill a 512-bit wide vector. Total registers used for
+           ///< N dimension: NR*ROW_INTERLEAVE*8/VLEN. We use MR x
+           ///< NR*ROW_INTERLEAVE*8/VLEN zmm registers
+           ///< for C accumulations.
 
   static constexpr int ROW_INTERLEAVE{
       4}; ///< 4 rows are interleaved to use vpmaddubsw instruction for packing
           ///< B matrix.
 
   static constexpr int MCB{
-      140}; ///< Cache block for M dimension (multiple of MR).
+      56}; ///< Cache block for M dimension (multiple of MR).
   static constexpr int NCB{
-      16}; ///< Cache block for N dimension (multiple of NR).
-  static constexpr int KCB{512}; ///< Cache block for K dimension.
+      32}; ///< Cache block for N dimension (multiple of NR).
+  static constexpr int KCB{256}; ///< Cache block for K dimension.
 };
 
 /**
author	Daya S Khudia <dskhudia@fb.com>	2019-03-21 20:03:36 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-03-21 20:07:54 +0300
commit	fe1c3d91772703a9c4f00fa04a9acaeeeffcf83c (patch)
tree	ce3fc31c3aaf57288b6b954a8606170400a9e6b6 /include/fbgemm/PackingTraits-inl.h
parent	0d8f88c12dce0f9167a5ee93aa2a851aa99ff864 (diff)