diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2019-02-22 09:02:28 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-22 09:06:49 +0300 |
commit | 5720ee73bf2665a566e596eb8a0999629a91b213 (patch) | |
tree | d5ef0e9bdbe0494b80238741150f3c1d5163b9db /src/PackBMatrix.cc | |
parent | 9ebc998c2bf0da81ab7afa1478df395d79429cf8 (diff) |
Optimize PackB routine by removing addr function
Summary:
James Reed had a use case where the B matrix must be packed online and proposed the Diff here:
https://github.com/pytorch/FBGEMM/issues/79
We previously had a Hackmonth task: T35337506.
The benchmark routine is here: D13828191.
Before this Diff:
P60866503
M, N, K, Packing A (ms), Packing B (ms), Kernel (ms), Postprocessing (ms), Total (ms), GOPs
1, 4096, 1024, 0.0, 11.7, 0.5, 0.0, 0.6, FBGEMM_i8_acc32, 13.0
For this case, 11.7 ms is spent on PackBMatrix, while only 0.5 ms is spent on the kernels.
After this Diff:
P60975064
M, N, K, Packing A (ms), Packing B (ms), Kernel (ms), Postprocessing (ms), Total (ms), GOPs
1, 4096, 1024, 0.0, 2.3, 0.6, 0.0, 0.7, FBGEMM_i8_acc32, 10.9
For this case, only 2.3 ms is spent on PackBMatrix, while 0.6 ms is spent on the kernels.
Note that only the "Packing B (ms)" column matters here. The final reported GOPs does not take PackB into account because, for traditional inference, the B matrix can always be prepacked (a one-time overhead).
Reviewed By: jamesr66a
Differential Revision: D14159749
fbshipit-source-id: e61c68380fc3db729c300bc5b65ac9cec99adc8c
Diffstat (limited to 'src/PackBMatrix.cc')
-rw-r--r-- | src/PackBMatrix.cc | 45 |
1 files changed, 43 insertions, 2 deletions
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index ee9be84..8d76e29 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -61,6 +61,8 @@ PackBMatrix<T, accT>::PackBMatrix( template <typename T, typename accT> void PackBMatrix<T, accT>::pack(const block_type_t& block) { assert((BaseType::blockRowSize() % row_interleave_) == 0); + assert((block.row_start % BaseType::blockRowSize()) == 0); + assert((block.col_start % BaseType::blockColSize()) == 0); BaseType::packedBlock(block); bool tr = (trans_ == matrix_op_t::Transpose); @@ -68,10 +70,39 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { T* out = BaseType::getBuf() + g * this->packedBufferSize(block.row_size, block.col_size); for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * + (BaseType::blockRowSize() * BaseType::blockColSize()) + + (i % BaseType::blockRowSize() / row_interleave_) * + BaseType::blockColSize() * row_interleave_ + + i % row_interleave_; + + int c_start_offset = (block.col_start / BaseType::blockColSize()) * + BaseType::blockRowSize() * BaseType::blockColSize() + + (block.col_start % BaseType::blockColSize()) * row_interleave_; + + int c_idx_offset = 0; + int c_blk_offset = 0; for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { + // int c_offset = (j / BaseType::blockColSize()) * + // BaseType::blockRowSize() * BaseType::blockColSize() + + // (j % BaseType::blockColSize()) * row_interleave_; + // 1. Loop invariant hoisting (move block offset calculation out of + // inner loop); 2. Strength reduction (change modulus in inner loop to + // an increment + rollover). + int c_offset = c_start_offset + + c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() + + c_idx_offset * row_interleave_; + + int out_idx = r_offset + c_offset; T val = tr ? 
smat_[i + (g * block.col_size + j) * ld_] : smat_[(g * block.row_size + i) * ld_ + j]; - out[addr(i, j)] = tconv(val, out[addr(i, j)]); + out[out_idx] = val; + + c_idx_offset++; + if (c_idx_offset == BaseType::blockColSize()) { + c_idx_offset = 0; + c_blk_offset++; + } } } // fill the remaining with zero. @@ -80,8 +111,18 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { i < (block.row_start + block.row_size + row_interleave_ - 1) / row_interleave_ * row_interleave_; ++i) { + int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) * + (BaseType::blockRowSize() * BaseType::blockColSize()) + + (i % BaseType::blockRowSize() / row_interleave_) * + BaseType::blockColSize() * row_interleave_ + + i % row_interleave_; for (int j = block.col_start; j < block.col_start + block.col_size; j++) { - out[addr(i, j)] = tconv(0, out[addr(i, j)]); + int c_offset = (j / BaseType::blockColSize()) * + BaseType::blockRowSize() * BaseType::blockColSize() + + (j % BaseType::blockColSize()) * row_interleave_; + + int out_idx = r_offset + c_offset; + out[out_idx] = 0; } } } // for each group |