Add documentations for the cache/register blocking parameters (#81)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/81 Add the documentations for choosing the current blocking parameters. Reviewed By: dskhudia Differential Revision: D14256809 fbshipit-source-id: e9a355e4611d6cb22791f2585313edc0d1b30ad2
author: Jianyu Huang <jianyuhuang@fb.com> 2019-03-01 12:19:01 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-03-01 12:21:48 +0300
commit: 2eb84b8912f8340d8ffc54a3ef7653291f64f6f8 (patch)
tree: 32f70f2b07c5d3dfdf79cd578868de3ea2478a52
parent: 426d7be717a3d2f5cef5346ef10d81bb636e625a (diff)
1 files changed, 79 insertions, 25 deletions
diff --git a/include/fbgemm/PackingTraits-inl.h b/include/fbgemm/PackingTraits-inl.h
index 35cd50a..598ca28 100644
--- a/include/fbgemm/PackingTraits-inl.h
+++ b/include/fbgemm/PackingTraits-inl.h
@@ -6,6 +6,42 @@
  */
 #pragma once
 
+/*
+ * This file configures the important cache blocking parameters and register
+ * blocking parameters for the matrix multiplication loops inside FBGEMM.
+ *
+ * ROW_INTERLEAVE: the number of interleaved rows to use vpmaddubsw instructions
+ * for packing B matrix. For 32-bit accumulation, ROW_INTERLEAVE = 4; For 16-bit
+ * accumulation, ROW_INTERLEAVE = 2.
+ *
+ * VLEN: the vector length of one SIMD register in bits. For avx2, VLEN = 256;
+ * for avx512, VLEN = 512.
+ *
+ * NR: the register blocking parameters for N dimension. NR columns of
+ * interleaved rows of int8 (signed or unsigned) should fit into one SIMD
+ * register. Basically, NR = VLEN / 8 / ROW_INTERLEAVE (8 is the bit length of
+ * int8, signed or unsigned).
+ *
+ * MR: the register blocking parameters for M dimension. MR is the number of
+ * SIMD registers used along the M dimension of the register block that
+ * accumulates C. This indicates the number of vpbroadcastw instructions for A.
+ *
+ * NCB: the cache blocking parameters for N dimension. NCB needs to be a
+ * multiple of NR. The number of SIMD registers used along the N dimension for
+ * accumulating C is NCB/NR.
+ *
+ * (MR) * (NCB/NR): the number of registers used for accumulation C. (MR) *
+ * (NCB/NR) should be less than the total number of SIMD registers (avx2 has 16
+ * ymm registers; avx512 has 32 zmm registers). (MR) * (NCB/NR) should be as
+ * large as possible to increase the register utilization.
+ *
+ * KCB: the cache blocking parameters for K dimension.
+ *
+ * MCB: the cache blocking parameters for M dimension. MCB needs to be a
+ * multiple of MR.
+ *
+ */
+
 /**
  * @brief Packing parameter specialization for accumulation into 32-bit
  * integers.
@@ -19,17 +55,22 @@ struct PackingTraits<
     std::int32_t,
     inst_set_t::avx2,
     typename std::enable_if<is_8bit<T>::value>::type> {
-  static constexpr int MR{12}; ///< Register block for M dimension
-  static constexpr int NR{8}; ///< Register block for N dimension
+  static constexpr int MR{12}; ///< Register block for M dimension.
+  static constexpr int NR{8}; ///< Register block for N dimension.
+                              ///< NR = VLEN/8/ROW_INTERLEAVE = 256 / 8 / 4 = 8.
+                              ///< Total registers used for N dimension: NCB/NR.
+                              ///< Here we use 12 x 1 ymm register blocking for
+                              ///< the registers used for accumulation C.
 
   static constexpr int ROW_INTERLEAVE{
       4}; ///< 4 rows are interleaved to use vpmaddubsw instruction for packing
           ///< B matrix.
 
   static constexpr int MCB{
-      120}; ///< cache block for M dimension (multiple of MR)
-  static constexpr int NCB{8}; ///< cache block for N dimension (multiple of NR)
-  static constexpr int KCB{512}; ///< cache block for K dimension
+      120}; ///< Cache block for M dimension (multiple of MR).
+  static constexpr int NCB{
+      8}; ///< Cache block for N dimension (multiple of NR).
+  static constexpr int KCB{512}; ///< Cache block for K dimension.
 };
 
 /**
@@ -45,19 +86,23 @@ struct PackingTraits<
     std::int16_t,
     inst_set_t::avx2,
     typename std::enable_if<is_8bit<T>::value>::type> {
-  static constexpr int MR{3}; ///< Register block for M dimension
-  static constexpr int NR{16}; ///< Register block for N dimension; Total
-                               ///< register used for N dimension: NCB/NR
+  static constexpr int MR{3}; ///< Register block for M dimension.
+  static constexpr int NR{
+      16}; ///< Register block for N dimension;
+           ///< NR = VLEN/8/ROW_INTERLEAVE = 256 / 8 / 2 = 16.
+           ///< Total registers used for N dimension: NCB/NR.
+           ///< Here we use 3 x 4 ymm register blocking for the
+           ///< registers used for accumulation C.
 
   static constexpr int ROW_INTERLEAVE{
       2}; ///< 2 rows are interleaved to use vpmaddubsw instruction for packing
           ///< B matrix.
 
   static constexpr int MCB{
-      60}; ///< cache block for M dimension (multiple of MR)
+      60}; ///< Cache block for M dimension (multiple of MR).
   static constexpr int NCB{
-      64}; ///< cache block for N dimension (multiple of NR)
-  static constexpr int KCB{256}; ///< cache block for K dimension
+      64}; ///< Cache block for N dimension (multiple of NR).
+  static constexpr int KCB{256}; ///< Cache block for K dimension.
 };
 
 /**
@@ -75,10 +120,10 @@ struct PackingTraits<float, float, inst_set_t::avx2> {
   static constexpr int ROW_INTERLEAVE{1}; ///< No Row interleave.
 
   static constexpr int MCB{
-      24}; ///< cache block for M dimension (multiple of MR)
+      24}; ///< Cache block for M dimension (multiple of MR)
   static constexpr int NCB{
-      64}; ///< cache block for N dimension (multiple of NR)
-  static constexpr int KCB{256}; ///< cache block for K dimension
+      64}; ///< Cache block for N dimension (multiple of NR)
+  static constexpr int KCB{256}; ///< Cache block for K dimension
 };
 
 /**
@@ -107,18 +152,23 @@ struct PackingTraits<
     std::int32_t,
     inst_set_t::avx512,
     typename std::enable_if<is_8bit<T>::value>::type> {
-  static constexpr int MR{28}; ///< Register block for M dimension
-  static constexpr int NR{16}; ///< Register block for N dimension
+  static constexpr int MR{28}; ///< Register block for M dimension.
+  static constexpr int NR{
+      16}; ///< Register block for N dimension.
+           ///< NR = VLEN/8/ROW_INTERLEAVE = 512 / 8 / 4 = 16.
+           ///< Total registers used for N dimension: NCB/NR.
+           ///< Here we use 28 x 1 zmm register blocking for
+           ///< the registers used for accumulation C.
 
   static constexpr int ROW_INTERLEAVE{
       4}; ///< 4 rows are interleaved to use vpmaddubsw instruction for packing
-          ///< B matrix
+          ///< B matrix.
 
   static constexpr int MCB{
-      140}; ///< cache block for M dimension (multiple of MR)
+      140}; ///< Cache block for M dimension (multiple of MR).
   static constexpr int NCB{
-      16}; ///< cache block for N dimension (multiple of NR)
-  static constexpr int KCB{512}; ///< cache block for K dimension
+      16}; ///< Cache block for N dimension (multiple of NR).
+  static constexpr int KCB{512}; ///< Cache block for K dimension.
 };
 
 /**
@@ -135,16 +185,20 @@ struct PackingTraits<
     inst_set_t::avx512,
     typename std::enable_if<is_8bit<T>::value>::type> {
   static constexpr int MR{6}; ///< Register block for M dimension
-  static constexpr int NR{32}; ///< Register block for N dimension; Total
-                               ///< register used for N dimension: NCB/NR
+  static constexpr int NR{
+      32}; ///< Register block for N dimension;
+           ///< NR = VLEN/8/ROW_INTERLEAVE = 512 / 8 / 2 = 32.
+           ///< Total registers used for N dimension: NCB/NR.
+           ///< Here we use 6 x 4 zmm register blocking for
+           ///< the registers used for accumulation C.
 
   static constexpr int ROW_INTERLEAVE{
       2}; ///< 2 rows are interleaved to use vpmaddubsw instruction for packing
           ///< B matrix.
 
   static constexpr int MCB{
-      60}; ///< cache block for M dimension (multiple of MR)
+      60}; ///< Cache block for M dimension (multiple of MR).
   static constexpr int NCB{
-      128}; ///< cache block for N dimension (multiple of NR)
-  static constexpr int KCB{256}; ///< cache block for K dimension
+      128}; ///< Cache block for N dimension (multiple of NR).
+  static constexpr int KCB{256}; ///< Cache block for K dimension.
 };
author	Jianyu Huang <jianyuhuang@fb.com>	2019-03-01 12:19:01 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-03-01 12:21:48 +0300
commit	2eb84b8912f8340d8ffc54a3ef7653291f64f6f8 (patch)
tree	32f70f2b07c5d3dfdf79cd578868de3ea2478a52
parent	426d7be717a3d2f5cef5346ef10d81bb636e625a (diff)