Optimize PackB routine by removing addr function

Summary:
James Reed had a use case where the B matrix must be packed online, and proposed the Diff here: https://github.com/pytorch/FBGEMM/issues/79

We previously had a Hackmonth task for this: T35337506. The benchmark routine is here: D13828191.

Before this Diff: P60866503
M, N, K, Packing A (ms), Packing B (ms), Kernel (ms), Postprocessing (ms), Total (ms), GOPs
1, 4096, 1024, 0.0, 11.7, 0.5, 0.0, 0.6, FBGEMM_i8_acc32, 13.0
For this case, 11.7 ms is spent on PackBMatrix, while only 0.5 ms is spent on the kernels.

After this Diff: P60975064
M, N, K, Packing A (ms), Packing B (ms), Kernel (ms), Postprocessing (ms), Total (ms), GOPs
1, 4096, 1024, 0.0, 2.3, 0.6, 0.0, 0.7, FBGEMM_i8_acc32, 10.9
For this case, only 2.3 ms is spent on PackBMatrix, while 0.6 ms is spent on the kernels.

Note that only the "Packing B (ms)" time is relevant here. The final reported GOPs figure does not take PackB into account because, for traditional inference, the B matrix can always be prepacked (a one-time overhead).

Reviewed By: jamesr66a
Differential Revision: D14159749
fbshipit-source-id: e61c68380fc3db729c300bc5b65ac9cec99adc8c
author: Jianyu Huang <jianyuhuang@fb.com> 2019-02-22 09:02:28 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-22 09:06:49 +0300
commit: 5720ee73bf2665a566e596eb8a0999629a91b213 (patch)
tree: d5ef0e9bdbe0494b80238741150f3c1d5163b9db /src/PackBMatrix.cc
parent: 9ebc998c2bf0da81ab7afa1478df395d79429cf8 (diff)
1 files changed, 43 insertions, 2 deletions
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index ee9be84..8d76e29 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -61,6 +61,8 @@ PackBMatrix<T, accT>::PackBMatrix(
 template <typename T, typename accT>
 void PackBMatrix<T, accT>::pack(const block_type_t& block) {
   assert((BaseType::blockRowSize() % row_interleave_) == 0);
+  assert((block.row_start % BaseType::blockRowSize()) == 0);
+  assert((block.col_start % BaseType::blockColSize()) == 0);
 
   BaseType::packedBlock(block);
   bool tr = (trans_ == matrix_op_t::Transpose);
@@ -68,10 +70,39 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
     T* out = BaseType::getBuf() +
         g * this->packedBufferSize(block.row_size, block.col_size);
     for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+      int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
+              (BaseType::blockRowSize() * BaseType::blockColSize()) +
+          (i % BaseType::blockRowSize() / row_interleave_) *
+              BaseType::blockColSize() * row_interleave_ +
+          i % row_interleave_;
+
+      int c_start_offset = (block.col_start / BaseType::blockColSize()) *
+              BaseType::blockRowSize() * BaseType::blockColSize() +
+          (block.col_start % BaseType::blockColSize()) * row_interleave_;
+
+      int c_idx_offset = 0;
+      int c_blk_offset = 0;
       for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+        // int c_offset = (j / BaseType::blockColSize()) *
+        //         BaseType::blockRowSize() * BaseType::blockColSize() +
+        //     (j % BaseType::blockColSize()) * row_interleave_;
+        // 1. Loop invariant hoisting (move block offset calculation out of
+        // inner loop); 2. Strength reduction (change modulus in inner loop to
+        // an increment + rollover).
+        int c_offset = c_start_offset +
+            c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() +
+            c_idx_offset * row_interleave_;
+
+        int out_idx = r_offset + c_offset;
         T val = tr ? smat_[i + (g * block.col_size + j) * ld_]
                    : smat_[(g * block.row_size + i) * ld_ + j];
-        out[addr(i, j)] = tconv(val, out[addr(i, j)]);
+        out[out_idx] = val;
+
+        c_idx_offset++;
+        if (c_idx_offset == BaseType::blockColSize()) {
+          c_idx_offset = 0;
+          c_blk_offset++;
+        }
       }
     }
     // fill the remaining with zero.
@@ -80,8 +111,18 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
          i < (block.row_start + block.row_size + row_interleave_ - 1) /
              row_interleave_ * row_interleave_;
          ++i) {
+      int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
+              (BaseType::blockRowSize() * BaseType::blockColSize()) +
+          (i % BaseType::blockRowSize() / row_interleave_) *
+              BaseType::blockColSize() * row_interleave_ +
+          i % row_interleave_;
       for (int j = block.col_start; j < block.col_start + block.col_size; j++) {
-        out[addr(i, j)] = tconv(0, out[addr(i, j)]);
+        int c_offset = (j / BaseType::blockColSize()) *
+                BaseType::blockRowSize() * BaseType::blockColSize() +
+            (j % BaseType::blockColSize()) * row_interleave_;
+
+        int out_idx = r_offset + c_offset;
+        out[out_idx] = 0;
       }
     }
   } // for each group
author	Jianyu Huang <jianyuhuang@fb.com>	2019-02-22 09:02:28 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-02-22 09:06:49 +0300
commit	5720ee73bf2665a566e596eb8a0999629a91b213 (patch)
tree	d5ef0e9bdbe0494b80238741150f3c1d5163b9db /src/PackBMatrix.cc
parent	9ebc998c2bf0da81ab7afa1478df395d79429cf8 (diff)