diff options
author | Mike Tsai <miketsai@fb.com> | 2019-08-02 02:03:29 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-08-02 02:20:54 +0300 |
commit | 0d5d057ca941ebb511bdc6178fc26c23e6c4a953 (patch) | |
tree | 1ceaddaf942edb9debcafad7491b750fc3a5f066 | |
parent | f712cb2328a2b29424bdaeecb9c0731da2cd997b (diff) |
Pass blocking param pointer into packedBufferSize() in PackBMatrix.cc
Summary:
Pass blocking params in to compute correct buffer size for each group.
Fix the bug for this CONV shape:
`conv_param_t<2>(1, 32, 16, {12, 14}, 4, {3, 3}, {1, 1}, {0, 0, 0, 0})`
Corresponding M, N, K = 120, 4, 288
with these params:
BlockingFactors params;
params.MCB = 48;
params.NCB = 16;
params.KCB = 256;
params.MR = 1;
params.NR = 16;
params.ROW_INTERLEAVE = 4;
params.NR_MIN = 16;
Reviewed By: jianyuh
Differential Revision: D16571367
fbshipit-source-id: 27c9b003d37c4d3d13767227e8343d44668823d6
-rw-r--r-- | include/fbgemm/Fbgemm.h | 14 | ||||
-rw-r--r-- | src/PackBMatrix.cc | 22 | ||||
-rw-r--r-- | test/PackedRequantizeAcc16Test.cc | 83 | ||||
-rw-r--r-- | test/PackedRequantizeTest.cc | 83 |
4 files changed, 120 insertions, 82 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index bdec036..68963fa 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -441,14 +441,17 @@ class FBGEMM_API PackBMatrix final std::int32_t addr(std::int32_t i, std::int32_t j) const; /** - * @brief Packs a block of source matrix into pmat buffer. + * @brief Packs a block of source matrix into pmat buffer. The blocking + * parameters are needed to compute the buffer size of each group. + * It will use default blocking parameters if params is not provided. */ - void pack(const block_type_t& block); + void pack(const block_type_t& block, const BlockingFactors* params = nullptr); /** * @brief Print the packed block. */ - void printPackedMatrix(std::string name); + void printPackedMatrix(std::string name, + const BlockingFactors* params = nullptr); /** * @return true if meta information like matrix shape is the same. @@ -463,7 +466,7 @@ class FBGEMM_API PackBMatrix final * @brief Unpack pmat buffer to the origin_buf (Used for the serialization to * recover weight matrix). 
*/ - void unpack(T* origin_buf); + void unpack(T* origin_buf, const BlockingFactors* params = nullptr); ~PackBMatrix() {} @@ -480,7 +483,8 @@ class FBGEMM_API PackBMatrix final const block_type_t& block, T* unpack_buf, T* pack_buf, - bool ispack); + bool ispack, + const BlockingFactors* params = nullptr); }; /** diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index a2805b1..0990edb 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -228,7 +228,7 @@ PackBMatrix<T, accT>::PackBMatrix( BaseType::numGroups() * BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() * BaseType::bcol_ * sizeof(T)); } - pack(block); + pack(block, params); } template <typename T, typename accT> @@ -236,7 +236,8 @@ void PackBMatrix<T, accT>::pack_unpack_( const block_type_t& block, T* unpack_buf, T* pack_buf, - bool ispack) { + bool ispack, + const BlockingFactors* params) { assert((BaseType::blockRowSize() % row_interleave_) == 0); assert((block.row_start % BaseType::blockRowSize()) == 0); assert((block.col_start % BaseType::blockColSize()) == 0); @@ -245,7 +246,7 @@ void PackBMatrix<T, accT>::pack_unpack_( bool tr = (trans_ == matrix_op_t::Transpose); for (int g = 0; g < BaseType::numGroups(); ++g) { T* pack_buf_cur = pack_buf + - g * BaseType::packedBufferSize(block.row_size, block.col_size); + g * BaseType::packedBufferSize(block.row_size, block.col_size, params); for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * (BaseType::blockRowSize() * BaseType::blockColSize()) + @@ -316,17 +317,19 @@ void PackBMatrix<T, accT>::pack_unpack_( } template <typename T, typename accT> -void PackBMatrix<T, accT>::pack(const block_type_t& block) { - pack_unpack_(block, const_cast<T*>(smat_), BaseType::getBuf(), true); +void PackBMatrix<T, accT>::pack(const block_type_t& block, + const BlockingFactors* params) { + pack_unpack_(block, const_cast<T*>(smat_), BaseType::getBuf(), true, 
params); } template <typename T, typename accT> -void PackBMatrix<T, accT>::unpack(T* origin_buf) { +void PackBMatrix<T, accT>::unpack(T* origin_buf, + const BlockingFactors* params) { block_type_t blockB{BaseType::packedRowStart(), BaseType::numPackedRows(), BaseType::packedColStart(), BaseType::numPackedCols()}; - pack_unpack_(blockB, origin_buf, BaseType::getBuf(), false); + pack_unpack_(blockB, origin_buf, BaseType::getBuf(), false, params); } template <typename T, typename accT> @@ -349,7 +352,8 @@ int32_t PackBMatrix<T, accT>::addr(int32_t r, int32_t c) const { } template <typename T, typename accT> -void PackBMatrix<T, accT>::printPackedMatrix(std::string name) { +void PackBMatrix<T, accT>::printPackedMatrix(std::string name, + const BlockingFactors* params) { std::cout << name << ":" << "[" << BaseType::numPackedRows() << ", " << BaseType::numPackedCols() << "]" << std::endl; @@ -361,7 +365,7 @@ void PackBMatrix<T, accT>::printPackedMatrix(std::string name) { T* out = BaseType::getBuf() + g * BaseType::packedBufferSize( - BaseType::numPackedRows(), BaseType::numPackedCols()); + BaseType::numPackedRows(), BaseType::numPackedCols(), params); std::cout << "group: " << g << std::endl; for (auto nr = 0; nr < BaseType::blockRows(); ++nr) { auto rows = (nr == BaseType::blockRows() - 1) ? 
BaseType::lastBrow() diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 40254cb..93e7566 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -93,6 +93,8 @@ static vector<vector<int>> GetShapes_() { {102, 512, 258}, {1024, 512, 258}, + + {120, 4, 288}, }; return shapes; } @@ -826,54 +828,67 @@ TEST_P(fbgemmPackUnpackAcc16Test, TestPackUnpack) { bool test_ld; tie(btrans, test_ld) = GetParam(); + BlockingFactors params; + params.MCB = 48; + params.NCB = 16; + params.KCB = 256; + params.MR = 1; + params.NR = 16; + params.ROW_INTERLEAVE = 4; + params.NR_MIN = 16; + vector<BlockingFactors*> vec_params_ptr = {&params, nullptr}; + for (auto shape : shapes) { for (int groups : {1, 3, 4}) { - int n = shape[1]; - int k = shape[2]; + for (auto params_ptr : vec_params_ptr) { + int n = shape[1]; + int k = shape[2]; - if (k % groups != 0) { - continue; - } - int k_per_group = k / groups; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; - // kxn matrix - aligned_vector<int8_t> Bint8(k * n); - randFill<int8_t>(Bint8, -128, 127); + // kxn matrix + aligned_vector<int8_t> Bint8(k * n); + randFill<int8_t>(Bint8, -128, 127); - // To test lda != k , we just reduce k by half and use the original k - // as lda. - int n_adjusted = n; - if (test_ld) { - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); + // To test lda != k , we just reduce k by half and use the original k + // as lda. + int n_adjusted = n; + if (test_ld) { + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } } - } - // Note that packing for weight is performed during the constructor - // stage. - PackBMatrix<int8_t, int16_t> packedWeights( - btrans, - k, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k_per_group : n, - nullptr, - groups); + // Note that packing for weight is performed during the constructor + // stage. 
+ PackBMatrix<int8_t, int16_t> packedWeights( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k_per_group : n, + nullptr, + groups, + params_ptr); - // Setup a buffer to get pack -> unpacked results - aligned_vector<int8_t> unpack_buf(k * n, 0); + // Setup a buffer to get pack -> unpacked results + aligned_vector<int8_t> unpack_buf(k * n, 0); - // Perform unpacking - packedWeights.unpack(unpack_buf.data()); + // Perform unpacking + packedWeights.unpack(unpack_buf.data(), params_ptr); - // Sanity check - for (int i = 0; i < k; i++) { - for (int j = 0; j < n_adjusted; j++) { - EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j]) + // Sanity check + for (int i = 0; i < k; i++) { + for (int j = 0; j < n_adjusted; j++) { + EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j]) << "Pack/Unpack results differ at index (" << i << ", " << j << ", Reference: " << static_cast<int>(Bint8.data()[i * n + j]) << ", Pack-Unpacked: " << static_cast<int>(unpack_buf.data()[i * n + j]); + } } } } diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index 11ef6ff..5338243 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -93,6 +93,8 @@ static vector<vector<int>> GetShapes_() { {102, 512, 258}, {1024, 512, 258}, + + {120, 4, 288}, }; return shapes; } @@ -766,54 +768,67 @@ TEST_P(fbgemmPackUnpackAcc32Test, TestPackUnpack) { bool test_ld; tie(btrans, test_ld) = GetParam(); + BlockingFactors params; + params.MCB = 48; + params.NCB = 16; + params.KCB = 256; + params.MR = 1; + params.NR = 16; + params.ROW_INTERLEAVE = 4; + params.NR_MIN = 16; + vector<BlockingFactors*> vec_params_ptr = {&params, nullptr}; + for (auto shape : shapes) { for (int groups : {1, 3, 4}) { - int n = shape[1]; - int k = shape[2]; + for (auto params_ptr : vec_params_ptr) { + int n = shape[1]; + int k = shape[2]; - if (k % groups != 0) { - continue; - } - int k_per_group = k / groups; + if (k % groups != 0) { + 
continue; + } + int k_per_group = k / groups; - // kxn matrix - aligned_vector<int8_t> Bint8(k * n); - randFill<int8_t>(Bint8, -128, 127); + // kxn matrix + aligned_vector<int8_t> Bint8(k * n); + randFill<int8_t>(Bint8, -128, 127); - // To test lda != k , we just reduce k by half and use the original k - // as lda. - int n_adjusted = n; - if (test_ld) { - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); + // To test lda != k , we just reduce k by half and use the original k + // as lda. + int n_adjusted = n; + if (test_ld) { + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } } - } - // Note that packing for weight is performed during the constructor - // stage. - PackBMatrix<int8_t> packedWeights( - btrans, - k, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k_per_group : n, - nullptr, - groups); + // Note that packing for weight is performed during the constructor + // stage. + PackBMatrix<int8_t> packedWeights( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? 
k_per_group : n, + nullptr, + groups, + params_ptr); - // Setup a buffer to get pack -> unpacked results - aligned_vector<int8_t> unpack_buf(k * n, 0); + // Setup a buffer to get pack -> unpacked results + aligned_vector<int8_t> unpack_buf(k * n, 0); - // Perform unpacking - packedWeights.unpack(unpack_buf.data()); + // Perform unpacking + packedWeights.unpack(unpack_buf.data(), params_ptr); - // Sanity check - for (int i = 0; i < k; i++) { - for (int j = 0; j < n_adjusted; j++) { - EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j]) + // Sanity check + for (int i = 0; i < k; i++) { + for (int j = 0; j < n_adjusted; j++) { + EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j]) << "Pack/Unpack results differ at index (" << i << ", " << j << ", Reference: " << static_cast<int>(Bint8.data()[i * n + j]) << ", Pack-Unpacked: " << static_cast<int>(unpack_buf.data()[i * n + j]); + } } } } |