github.com/marian-nmt/FBGEMM.git
author     Yinghai Lu <yinghai@fb.com>  2019-08-09 02:12:20 +0300
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2019-08-09 02:17:04 +0300
commit     122135c29b68de5176bd56de6ced936cdc63cb36 (patch)
tree       de97891c2a33b3141ec72a5d648b20d32a13a67b
parent     cf34b9a26b609109b18d6498f0608faddb7a911b (diff)
Add unpack to PackedGemmMatrixFP16 (#112)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/112

We need to unpack the layout to support non-CPU architectures.

Reviewed By: jianyuh

Differential Revision: D16584449

fbshipit-source-id: 309acaf8f2406e39d6975c0e9fef3e849a6d3950
-rw-r--r--  include/fbgemm/FbgemmFP16.h   32
-rw-r--r--  test/FP16Test.cc             116
2 files changed, 135 insertions(+), 13 deletions(-)
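Before the diff itself, a minimal usage sketch of the new API (not part of the patch): it assumes only the constructor and the accessors exercised by the updated test below (matSize(), pmat(), packed(), unpackFromSrc(), packFromSrc()); exact include paths may differ by FBGEMM version.

#include <cstring>
#include <vector>
#include "fbgemm/FbgemmFP16.h"

using namespace fbgemm;

// Pack a k x n row-major float matrix B, then round-trip the packed fp16
// buffer through the new unpack/pack calls.
void roundTripSketch(const std::vector<float>& B, int k, int n) {
  PackedGemmMatrixFP16 Bp(
      matrix_op_t::NoTranspose, k, n, /*alpha=*/1.f, B.data());
  // packed() is true right after the packing constructor (see the test below).

  // Copy the packed storage aside, then rewrite Bp's buffer as a plain
  // row-major fp16 matrix of the same size; packed() becomes false.
  std::vector<float16> tmp(Bp.matSize());
  std::memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
  Bp.unpackFromSrc(matrix_op_t::NoTranspose, tmp.data());

  // The unpacked buffer can now be handed to a non-CPU backend. To use
  // cblas_gemm_compute again, restore the blocked layout; packed() is true.
  std::memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
  Bp.packFromSrc(matrix_op_t::NoTranspose, tmp.data());
}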
diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h
index 2b4c42b..e026a42 100644
--- a/include/fbgemm/FbgemmFP16.h
+++ b/include/fbgemm/FbgemmFP16.h
@@ -104,6 +104,14 @@ class PackedGemmMatrixFP16 {
}
}
+ void setPacked(bool p) {
+ packed_ = p;
+ }
+
+ bool packed() const {
+ return packed_;
+ }
+
void initializeMemory() {
// allocate and initialize packed memory
const int padding = 1024; // required by sw pipelined kernels
@@ -120,6 +128,16 @@ class PackedGemmMatrixFP16 {
free(pmat_);
}
+ void unpackFromSrc(const matrix_op_t trans, float16* src_mat) {
+ bool tr = (trans == matrix_op_t::Transpose);
+ for (int i = 0; i < numRows(); i++) {
+ for (int j = 0; j < numCols(); j++) {
+ pmat_[tr ? i + numRows() * j : i * numCols() + j] = src_mat[addr(i, j)];
+ }
+ }
+ packed_ = false;
+ }
+
// protected:
// blocked row-major format address arithmetic
uint64_t addr(const int r_, const int c_) const {
@@ -155,6 +173,19 @@ class PackedGemmMatrixFP16 {
pmat_[addr(i, j)]);
}
}
+ packed_ = true;
+ }
+
+ // This function takes in an unpacked float16 matrix of the same size and
+ // packs it. There is no floating type conversion.
+ void packFromSrc(const matrix_op_t trans, const float16* smat) {
+ bool tr = (trans == matrix_op_t::Transpose);
+ for (int i = 0; i < numRows(); ++i) {
+ for (int j = 0; j < numCols(); ++j) {
+ pmat_[addr(i, j)] = smat[tr ? i + numRows() * j : i * numCols() + j];
+ }
+ }
+ packed_ = true;
}
const float16& operator()(const int r, const int c) const {
@@ -202,6 +233,7 @@ class PackedGemmMatrixFP16 {
uint64_t size_;
int kernel_ncol_blocks_;
float16* pmat_;
+ bool packed_{false};
friend void cblas_gemm_compute(
const matrix_op_t transa,
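For reference, the source-side indexing used by packFromSrc/unpackFromSrc above, as a standalone sketch (hypothetical helper name, not part of the patch): the source matrix is addressed column-major when it is transposed and row-major otherwise, while the packed side always goes through the blocked address addr(i, j).

#include <cstdint>

// Source-side offset of logical element (i, j) for an nrow x ncol matrix:
// column-major when transposed, row-major otherwise. This mirrors the
// expression `tr ? i + numRows() * j : i * numCols() + j` in the header.
inline std::uint64_t srcOffset(bool transposed, int i, int j, int nrow, int ncol) {
  return transposed
      ? static_cast<std::uint64_t>(i) + static_cast<std::uint64_t>(nrow) * j
      : static_cast<std::uint64_t>(i) * ncol + j;
}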
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index eb49086..3267655 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -27,7 +27,26 @@ using namespace fbgemm;
namespace {
// The template parameter is transpose of A and B
class FBGemmFP16Test
- : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {};
+ : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {
+ protected:
+ vector<vector<int>> GenShapes() const {
+ vector<vector<int>> shapes;
+ random_device r;
+ default_random_engine generator(r());
+ uniform_int_distribution<int> dm(1, 256);
+ uniform_int_distribution<int> dnk(1, 1024);
+ for (int i = 0; i < 10; i++) {
+ int m = dm(generator);
+ int n = dnk(generator);
+ int k = dnk(generator);
+ shapes.push_back({m, n, k});
+ if (m > 10) {
+ shapes.push_back({(m / 10) * 10, n, k});
+ }
+ }
+ return shapes;
+ }
+};
}; // namespace
INSTANTIATE_TEST_CASE_P(
@@ -44,21 +63,75 @@ INSTANTIATE_TEST_CASE_P(
matrix_op_t::Transpose, matrix_op_t::Transpose)*/));
TEST_P(FBGemmFP16Test, Test) {
- vector<vector<int>> shapes;
- random_device r;
- default_random_engine generator(r());
- uniform_int_distribution<int> dm(1, 256);
- uniform_int_distribution<int> dnk(1, 1024);
- for (int i = 0; i < 10; i++) {
- int m = dm(generator);
- int n = dnk(generator);
- int k = dnk(generator);
- shapes.push_back({m, n, k});
- if (m > 10) {
- shapes.push_back({(m / 10) * 10, n, k});
+ auto shapes = GenShapes();
+ float alpha = 1.f, beta = 0.f;
+ matrix_op_t atrans, btrans;
+ tie(atrans, btrans) = GetParam();
+
+ for (auto s : shapes) {
+ int m = s[0];
+ int n = s[1];
+ int k = s[2];
+
+ cerr << "m = " << m << " n = " << n << " k = " << k;
+ if (atrans == matrix_op_t::Transpose) {
+ cerr << " A_transposed";
+ }
+ if (btrans == matrix_op_t::Transpose) {
+ cerr << " B_transposed";
+ }
+ cerr << endl;
+
+ // initialize with small numbers
+ aligned_vector<int> Aint(m * k);
+ aligned_vector<int> Bint(k * n);
+ randFill(Aint, 0, 4);
+ randFill(Bint, 0, 4);
+ aligned_vector<float> A(Aint.begin(), Aint.end());
+ aligned_vector<float> B(Bint.begin(), Bint.end());
+
+ aligned_vector<float> C(m * n, NAN);
+
+ aligned_vector<float> A_ref(A), B_ref(B), C_ref(C);
+
+ if (atrans == matrix_op_t::Transpose) {
+ transpose_matrix(A_ref.data(), k, m);
+ }
+ if (btrans == matrix_op_t::Transpose) {
+ transpose_matrix(B_ref.data(), n, k);
+ }
+
+ // Gold via reference sgemm
+ matmul_fp_ref(m, n, k, k, n, n, A_ref.data(), B_ref.data(), C_ref.data());
+
+ // fbgemm fp16
+ PackedGemmMatrixFP16 Bp(btrans, k, n, alpha, B.data());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+ {
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
+ cblas_gemm_compute(
+ atrans, m, A.data(), Bp, beta, C.data(), tid, num_threads);
+ }
+
+ // correctness check
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < n; ++j) {
+ float expected = C_ref[i * n + j];
+ float actual = C[i * n + j];
+ EXPECT_EQ(expected, actual)
+ << "GEMM results differ at (" << i << ", " << j << "). ref "
+ << expected << " FBGemm " << actual;
+ }
}
}
+}
+TEST_P(FBGemmFP16Test, Unpack) {
+ auto shapes = GenShapes();
float alpha = 1.f, beta = 0.f;
matrix_op_t atrans, btrans;
tie(atrans, btrans) = GetParam();
@@ -101,6 +174,23 @@ TEST_P(FBGemmFP16Test, Test) {
// fbgemm fp16
PackedGemmMatrixFP16 Bp(btrans, k, n, alpha, B.data());
+ EXPECT_TRUE(Bp.packed());
+
+ // Test unpack
+ aligned_vector<float16> tmp(Bp.matSize());
+ memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
+ Bp.unpackFromSrc(btrans, tmp.data());
+ EXPECT_FALSE(Bp.packed());
+ memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
+ for (int i = 0; i < k; ++i) {
+ for (int j = 0; j < n; ++j) {
+ EXPECT_EQ(B[i * n + j], cpu_half2float(tmp[i * n + j]));
+ }
+ }
+
+ // Pack it back
+ Bp.packFromSrc(btrans, tmp.data());
+ EXPECT_TRUE(Bp.packed());
#ifdef _OPENMP
#pragma omp parallel