Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJianyu Huang <jianyuhuang@fb.com>2019-03-01 12:19:01 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2019-03-01 12:21:48 +0300
commit2eb84b8912f8340d8ffc54a3ef7653291f64f6f8 (patch)
tree32f70f2b07c5d3dfdf79cd578868de3ea2478a52
parent426d7be717a3d2f5cef5346ef10d81bb636e625a (diff)
Add documentations for the cache/register blocking parameters (#81)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/81 Add the documentations for choosing the current blocking parameters. Reviewed By: dskhudia Differential Revision: D14256809 fbshipit-source-id: e9a355e4611d6cb22791f2585313edc0d1b30ad2
-rw-r--r--include/fbgemm/PackingTraits-inl.h104
1 files changed, 79 insertions, 25 deletions
diff --git a/include/fbgemm/PackingTraits-inl.h b/include/fbgemm/PackingTraits-inl.h
index 35cd50a..598ca28 100644
--- a/include/fbgemm/PackingTraits-inl.h
+++ b/include/fbgemm/PackingTraits-inl.h
@@ -6,6 +6,42 @@
*/
#pragma once
+/*
+ * This file configures the important cache blocking parameters and register
+ * blocking parameters for the matrix multiplication loops inside FBGEMM.
+ *
+ * ROW_INTERLEAVE: the number of interleaved rows to use vpmaddubsw instructions
+ * for packing B matrix. For 32-bit accumulation, ROW_INTERLEAVE = 4; for 16-bit
+ * accumulation, ROW_INTERLEAVE = 2.
+ *
+ * VLEN: the vector length of one SIMD register, in bits. For avx2,
+ * VLEN = 256; for avx512, VLEN = 512.
+ *
+ * NR: the register blocking parameters for N dimension. NR columns of
+ * interleaved rows of int8 (signed or unsigned) should fit into one SIMD
+ * register. Basically, NR = VLEN / 8 / ROW_INTERLEAVE (8 is the bit length of
+ * int8, signed or unsigned).
+ *
+ * MR: the register blocking parameters for M dimension. MR is the number of
+ * SIMD registers along the M dimension among the registers used for
+ * accumulation C. This indicates the number of vpbroadcastw instructions for A.
+ *
+ * NCB: the cache blocking parameters for N dimension. NCB needs to be a
+ * multiple of NR. The number of registers along the N dimension among the
+ * registers used for accumulation C is NCB/NR.
+ *
+ * (MR) * (NCB/NR): the number of registers used for accumulation C. (MR) *
+ * (NCB/NR) should be less than the total register number (avx2 has 16 ymm
+ * registers; avx512 has 32 zmm registers). (MR) * (NCB/NR) should be as large
+ * as possible to increase the register utilization.
+ *
+ * KCB: the cache blocking parameters for K dimension.
+ *
+ * MCB: the cache blocking parameters for M dimension. MCB needs to be a
+ * multiple of MR.
+ *
+ */
+
/**
* @brief Packing parameter specialization for accumulation into 32-bit
* integers.
@@ -19,17 +55,22 @@ struct PackingTraits<
std::int32_t,
inst_set_t::avx2,
typename std::enable_if<is_8bit<T>::value>::type> {
- static constexpr int MR{12}; ///< Register block for M dimension
- static constexpr int NR{8}; ///< Register block for N dimension
+ static constexpr int MR{12}; ///< Register block for M dimension.
+ static constexpr int NR{8}; ///< Register block for N dimension.
+ ///< NR = VLEN/8/ROW_INTERLEAVE = 256 / 8 / 4 = 8.
+ ///< Total registers used for N dimension: NCB/NR.
+ ///< Here we use 12 x 1 ymm register blocking for
+ ///< the registers used for accumulation C.
static constexpr int ROW_INTERLEAVE{
4}; ///< 4 rows are interleaved to use vpmaddubsw instruction for packing
///< B matrix.
static constexpr int MCB{
- 120}; ///< cache block for M dimension (multiple of MR)
- static constexpr int NCB{8}; ///< cache block for N dimension (multiple of NR)
- static constexpr int KCB{512}; ///< cache block for K dimension
+ 120}; ///< Cache block for M dimension (multiple of MR).
+ static constexpr int NCB{
+ 8}; ///< Cache block for N dimension (multiple of NR).
+ static constexpr int KCB{512}; ///< Cache block for K dimension.
};
/**
@@ -45,19 +86,23 @@ struct PackingTraits<
std::int16_t,
inst_set_t::avx2,
typename std::enable_if<is_8bit<T>::value>::type> {
- static constexpr int MR{3}; ///< Register block for M dimension
- static constexpr int NR{16}; ///< Register block for N dimension; Total
- ///< register used for N dimension: NCB/NR
+ static constexpr int MR{3}; ///< Register block for M dimension.
+ static constexpr int NR{
+ 16}; ///< Register block for N dimension;
+ ///< NR = VLEN/8/ROW_INTERLEAVE = 256 / 8 / 2 = 16.
+ ///< Total registers used for N dimension: NCB/NR.
+ ///< Here we use 3 x 4 ymm register blocking for the
+ ///< registers used for accumulation C.
static constexpr int ROW_INTERLEAVE{
2}; ///< 2 rows are interleaved to use vpmaddubsw instruction for packing
///< B matrix.
static constexpr int MCB{
- 60}; ///< cache block for M dimension (multiple of MR)
+ 60}; ///< Cache block for M dimension (multiple of MR).
static constexpr int NCB{
- 64}; ///< cache block for N dimension (multiple of NR)
- static constexpr int KCB{256}; ///< cache block for K dimension
+ 64}; ///< Cache block for N dimension (multiple of NR).
+ static constexpr int KCB{256}; ///< Cache block for K dimension.
};
/**
@@ -75,10 +120,10 @@ struct PackingTraits<float, float, inst_set_t::avx2> {
static constexpr int ROW_INTERLEAVE{1}; ///< No Row interleave.
static constexpr int MCB{
- 24}; ///< cache block for M dimension (multiple of MR)
+ 24}; ///< Cache block for M dimension (multiple of MR)
static constexpr int NCB{
- 64}; ///< cache block for N dimension (multiple of NR)
- static constexpr int KCB{256}; ///< cache block for K dimension
+ 64}; ///< Cache block for N dimension (multiple of NR)
+ static constexpr int KCB{256}; ///< Cache block for K dimension
};
/**
@@ -107,18 +152,23 @@ struct PackingTraits<
std::int32_t,
inst_set_t::avx512,
typename std::enable_if<is_8bit<T>::value>::type> {
- static constexpr int MR{28}; ///< Register block for M dimension
- static constexpr int NR{16}; ///< Register block for N dimension
+ static constexpr int MR{28}; ///< Register block for M dimension.
+ static constexpr int NR{
+ 16}; ///< Register block for N dimension.
+ ///< NR = VLEN/8/ROW_INTERLEAVE = 512 / 8 / 4 = 16.
+ ///< Total registers used for N dimension: NCB/NR.
+ ///< Here we use 28 x 1 zmm register blocking for
+ ///< the registers used for accumulation C.
static constexpr int ROW_INTERLEAVE{
4}; ///< 4 rows are interleaved to use vpmaddubsw instruction for packing
- ///< B matrix
+ ///< B matrix.
static constexpr int MCB{
- 140}; ///< cache block for M dimension (multiple of MR)
+ 140}; ///< Cache block for M dimension (multiple of MR).
static constexpr int NCB{
- 16}; ///< cache block for N dimension (multiple of NR)
- static constexpr int KCB{512}; ///< cache block for K dimension
+ 16}; ///< Cache block for N dimension (multiple of NR).
+ static constexpr int KCB{512}; ///< Cache block for K dimension.
};
/**
@@ -135,16 +185,20 @@ struct PackingTraits<
inst_set_t::avx512,
typename std::enable_if<is_8bit<T>::value>::type> {
static constexpr int MR{6}; ///< Register block for M dimension
- static constexpr int NR{32}; ///< Register block for N dimension; Total
- ///< register used for N dimension: NCB/NR
+ static constexpr int NR{
+ 32}; ///< Register block for N dimension;
+ ///< NR = VLEN/8/ROW_INTERLEAVE = 512 / 8 / 2 = 32.
+ ///< Total registers used for N dimension: NCB/NR.
+ ///< Here we use 6 x 4 zmm register blocking for
+ ///< the registers used for accumulation C.
static constexpr int ROW_INTERLEAVE{
2}; ///< 2 rows are interleaved to use vpmaddubsw instruction for packing
///< B matrix.
static constexpr int MCB{
- 60}; ///< cache block for M dimension (multiple of MR)
+ 60}; ///< Cache block for M dimension (multiple of MR).
static constexpr int NCB{
- 128}; ///< cache block for N dimension (multiple of NR)
- static constexpr int KCB{256}; ///< cache block for K dimension
+ 128}; ///< Cache block for N dimension (multiple of NR).
+ static constexpr int KCB{256}; ///< Cache block for K dimension.
};