diff options
author | Yinghai Lu <yinghai@fb.com> | 2019-08-09 02:12:20 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-08-09 02:17:04 +0300 |
commit | 122135c29b68de5176bd56de6ced936cdc63cb36 (patch) | |
tree | de97891c2a33b3141ec72a5d648b20d32a13a67b | |
parent | cf34b9a26b609109b18d6498f0608faddb7a911b (diff) |
Add unpack to PackedGemmMatrixFP16 (#112)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/112
We need to be able to unpack the packed matrix layout back into a plain (unpacked) layout in order to support non-CPU architectures.
Reviewed By: jianyuh
Differential Revision: D16584449
fbshipit-source-id: 309acaf8f2406e39d6975c0e9fef3e849a6d3950
-rw-r--r-- | include/fbgemm/FbgemmFP16.h | 32 | ||||
-rw-r--r-- | test/FP16Test.cc | 116 |
2 files changed, 135 insertions, 13 deletions
```diff
diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h
index 2b4c42b..e026a42 100644
--- a/include/fbgemm/FbgemmFP16.h
+++ b/include/fbgemm/FbgemmFP16.h
@@ -104,6 +104,14 @@ class PackedGemmMatrixFP16 {
     }
   }
 
+  void setPacked(bool p) {
+    packed_ = p;
+  }
+
+  bool packed() const {
+    return packed_;
+  }
+
   void initializeMemory() {
     // allocate and initialize packed memory
     const int padding = 1024; // required by sw pipelined kernels
@@ -120,6 +128,16 @@ class PackedGemmMatrixFP16 {
     free(pmat_);
   }
 
+  void unpackFromSrc(const matrix_op_t trans, float16* src_mat) {
+    bool tr = (trans == matrix_op_t::Transpose);
+    for (int i = 0; i < numRows(); i++) {
+      for (int j = 0; j < numCols(); j++) {
+        pmat_[tr ? i + numRows() * j : i * numCols() + j] = src_mat[addr(i, j)];
+      }
+    }
+    packed_ = false;
+  }
+
   // protected:
   // blocked row-major format address arithmetic
   uint64_t addr(const int r_, const int c_) const {
@@ -155,6 +173,19 @@ class PackedGemmMatrixFP16 {
             pmat_[addr(i, j)]);
       }
     }
+    packed_ = true;
+  }
+
+  // This function takes in an unpacked float16 matrix of the same size and
+  // packs it. There is no floating type conversion.
+  void packFromSrc(const matrix_op_t trans, const float16* smat) {
+    bool tr = (trans == matrix_op_t::Transpose);
+    for (int i = 0; i < numRows(); ++i) {
+      for (int j = 0; j < numCols(); ++j) {
+        pmat_[addr(i, j)] = smat[tr ? i + numRows() * j : i * numCols() + j];
+      }
+    }
+    packed_ = true;
   }
 
   const float16& operator()(const int r, const int c) const {
@@ -202,6 +233,7 @@ class PackedGemmMatrixFP16 {
   uint64_t size_;
   int kernel_ncol_blocks_;
   float16* pmat_;
+  bool packed_{false};
 
   friend void cblas_gemm_compute(
       const matrix_op_t transa,
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index eb49086..3267655 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -27,7 +27,26 @@ using namespace fbgemm;
 namespace {
 // The template parameter is transpose of A and B
 class FBGemmFP16Test
-    : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {};
+    : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {
+ protected:
+  vector<vector<int>> GenShapes() const {
+    vector<vector<int>> shapes;
+    random_device r;
+    default_random_engine generator(r());
+    uniform_int_distribution<int> dm(1, 256);
+    uniform_int_distribution<int> dnk(1, 1024);
+    for (int i = 0; i < 10; i++) {
+      int m = dm(generator);
+      int n = dnk(generator);
+      int k = dnk(generator);
+      shapes.push_back({m, n, k});
+      if (m > 10) {
+        shapes.push_back({(m / 10) * 10, n, k});
+      }
+    }
+    return shapes;
+  }
+};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -44,21 +63,75 @@ INSTANTIATE_TEST_CASE_P(
             matrix_op_t::Transpose, matrix_op_t::Transpose)*/));
 
 TEST_P(FBGemmFP16Test, Test) {
-  vector<vector<int>> shapes;
-  random_device r;
-  default_random_engine generator(r());
-  uniform_int_distribution<int> dm(1, 256);
-  uniform_int_distribution<int> dnk(1, 1024);
-  for (int i = 0; i < 10; i++) {
-    int m = dm(generator);
-    int n = dnk(generator);
-    int k = dnk(generator);
-    shapes.push_back({m, n, k});
-    if (m > 10) {
-      shapes.push_back({(m / 10) * 10, n, k});
+  auto shapes = GenShapes();
+  float alpha = 1.f, beta = 0.f;
+  matrix_op_t atrans, btrans;
+  tie(atrans, btrans) = GetParam();
+
+  for (auto s : shapes) {
+    int m = s[0];
+    int n = s[1];
+    int k = s[2];
+
+    cerr << "m = " << m << " n = " << n << " k = " << k;
+    if (atrans == matrix_op_t::Transpose) {
+      cerr << " A_transposed";
+    }
+    if (btrans == matrix_op_t::Transpose) {
+      cerr << " B_transposed";
+    }
+    cerr << endl;
+
+    // initialize with small numbers
+    aligned_vector<int> Aint(m * k);
+    aligned_vector<int> Bint(k * n);
+    randFill(Aint, 0, 4);
+    randFill(Bint, 0, 4);
+    aligned_vector<float> A(Aint.begin(), Aint.end());
+    aligned_vector<float> B(Bint.begin(), Bint.end());
+
+    aligned_vector<float> C(m * n, NAN);
+
+    aligned_vector<float> A_ref(A), B_ref(B), C_ref(C);
+
+    if (atrans == matrix_op_t::Transpose) {
+      transpose_matrix(A_ref.data(), k, m);
+    }
+    if (btrans == matrix_op_t::Transpose) {
+      transpose_matrix(B_ref.data(), n, k);
+    }
+
+    // Gold via reference sgemm
+    matmul_fp_ref(m, n, k, k, n, n, A_ref.data(), B_ref.data(), C_ref.data());
+
+    // fbgemm fp16
+    PackedGemmMatrixFP16 Bp(btrans, k, n, alpha, B.data());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    {
+      int num_threads = fbgemm_get_num_threads();
+      int tid = fbgemm_get_thread_num();
+
+      cblas_gemm_compute(
+          atrans, m, A.data(), Bp, beta, C.data(), tid, num_threads);
+    }
+
+    // correctness check
+    for (int i = 0; i < m; ++i) {
+      for (int j = 0; j < n; ++j) {
+        float expected = C_ref[i * n + j];
+        float actual = C[i * n + j];
+        EXPECT_EQ(expected, actual)
+            << "GEMM results differ at (" << i << ", " << j << "). ref "
+            << expected << " FBGemm " << actual;
+      }
     }
   }
+}
 
+TEST_P(FBGemmFP16Test, Unpack) {
+  auto shapes = GenShapes();
   float alpha = 1.f, beta = 0.f;
   matrix_op_t atrans, btrans;
   tie(atrans, btrans) = GetParam();
@@ -101,6 +174,23 @@ TEST_P(FBGemmFP16Test, Test) {
 
     // fbgemm fp16
     PackedGemmMatrixFP16 Bp(btrans, k, n, alpha, B.data());
+    EXPECT_TRUE(Bp.packed());
+
+    // Test unpack
+    aligned_vector<float16> tmp(Bp.matSize());
+    memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
+    Bp.unpackFromSrc(btrans, tmp.data());
+    EXPECT_FALSE(Bp.packed());
+    memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
+    for (int i = 0; i < k; ++i) {
+      for (int j = 0; j < n; ++j) {
+        EXPECT_EQ(B[i * n + j], cpu_half2float(tmp[i * n + j]));
+      }
+    }
+
+    // Pack it back
+    Bp.packFromSrc(btrans, tmp.data());
+    EXPECT_TRUE(Bp.packed());
 #ifdef _OPENMP
 #pragma omp parallel
```