Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jianyu Huang <jianyuhuang@fb.com> 2019-02-22 09:02:28 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-22 09:06:49 +0300
commit: 5720ee73bf2665a566e596eb8a0999629a91b213 (patch)
tree: d5ef0e9bdbe0494b80238741150f3c1d5163b9db /src/PackBMatrix.cc
parent: 9ebc998c2bf0da81ab7afa1478df395d79429cf8 (diff)
Optimize PackB routine by removing addr function
Summary: James Reed had a use case where the B matrix must be packed online, and proposed the Diff here: https://github.com/pytorch/FBGEMM/issues/79. We previously had a Hackmonth task for this: T35337506. The benchmark routine is here: D13828191.

Before this Diff: P60866503
M, N, K, Packing A (ms), Packing B (ms), Kernel (ms), Postprocessing (ms), Total (ms), GOPs
1, 4096, 1024, 0.0, 11.7, 0.5, 0.0, 0.6, FBGEMM_i8_acc32, 13.0
For this case, 11.7 ms is spent on PackBMatrix, while only 0.5 ms is spent on the kernels.

After this Diff: P60975064
M, N, K, Packing A (ms), Packing B (ms), Kernel (ms), Postprocessing (ms), Total (ms), GOPs
1, 4096, 1024, 0.0, 2.3, 0.6, 0.0, 0.7, FBGEMM_i8_acc32, 10.9
For this case, only 2.3 ms is spent on PackBMatrix, while 0.6 ms is spent on the kernels.

Note that only the "Packing B (ms)" column is relevant here. The final reported GOPs does not take PackB into account because, for traditional inference, PackB can always be prepacked (a one-time overhead).

Reviewed By: jamesr66a
Differential Revision: D14159749
fbshipit-source-id: e61c68380fc3db729c300bc5b65ac9cec99adc8c
Diffstat (limited to 'src/PackBMatrix.cc')
-rw-r--r-- src/PackBMatrix.cc 45
1 files changed, 43 insertions, 2 deletions
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index ee9be84..8d76e29 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -61,6 +61,8 @@ PackBMatrix<T, accT>::PackBMatrix(
template <typename T, typename accT>
void PackBMatrix<T, accT>::pack(const block_type_t& block) {
assert((BaseType::blockRowSize() % row_interleave_) == 0);
+ assert((block.row_start % BaseType::blockRowSize()) == 0);
+ assert((block.col_start % BaseType::blockColSize()) == 0);
BaseType::packedBlock(block);
bool tr = (trans_ == matrix_op_t::Transpose);
@@ -68,10 +70,39 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
T* out = BaseType::getBuf() +
g * this->packedBufferSize(block.row_size, block.col_size);
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
+ (BaseType::blockRowSize() * BaseType::blockColSize()) +
+ (i % BaseType::blockRowSize() / row_interleave_) *
+ BaseType::blockColSize() * row_interleave_ +
+ i % row_interleave_;
+
+ int c_start_offset = (block.col_start / BaseType::blockColSize()) *
+ BaseType::blockRowSize() * BaseType::blockColSize() +
+ (block.col_start % BaseType::blockColSize()) * row_interleave_;
+
+ int c_idx_offset = 0;
+ int c_blk_offset = 0;
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+ // int c_offset = (j / BaseType::blockColSize()) *
+ // BaseType::blockRowSize() * BaseType::blockColSize() +
+ // (j % BaseType::blockColSize()) * row_interleave_;
+ // 1. Loop invariant hoisting (move block offset calculation out of
+ // inner loop); 2. Strength reduction (change modulus in inner loop to
+ // an increment + rollover).
+ int c_offset = c_start_offset +
+ c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() +
+ c_idx_offset * row_interleave_;
+
+ int out_idx = r_offset + c_offset;
T val = tr ? smat_[i + (g * block.col_size + j) * ld_]
: smat_[(g * block.row_size + i) * ld_ + j];
- out[addr(i, j)] = tconv(val, out[addr(i, j)]);
+ out[out_idx] = val;
+
+ c_idx_offset++;
+ if (c_idx_offset == BaseType::blockColSize()) {
+ c_idx_offset = 0;
+ c_blk_offset++;
+ }
}
}
// fill the remaining with zero.
@@ -80,8 +111,18 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
i < (block.row_start + block.row_size + row_interleave_ - 1) /
row_interleave_ * row_interleave_;
++i) {
+ int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
+ (BaseType::blockRowSize() * BaseType::blockColSize()) +
+ (i % BaseType::blockRowSize() / row_interleave_) *
+ BaseType::blockColSize() * row_interleave_ +
+ i % row_interleave_;
for (int j = block.col_start; j < block.col_start + block.col_size; j++) {
- out[addr(i, j)] = tconv(0, out[addr(i, j)]);
+ int c_offset = (j / BaseType::blockColSize()) *
+ BaseType::blockRowSize() * BaseType::blockColSize() +
+ (j % BaseType::blockColSize()) * row_interleave_;
+
+ int out_idx = r_offset + c_offset;
+ out[out_idx] = 0;
}
}
} // for each group