Add unpack to PackedGemmMatrixFP16 (#112)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/112. We need to unpack the packed matrix layout to support non-CPU architectures. Reviewed By: jianyuh. Differential Revision: D16584449. fbshipit-source-id: 309acaf8f2406e39d6975c0e9fef3e849a6d3950
author: Yinghai Lu <yinghai@fb.com> 2019-08-09 02:12:20 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-08-09 02:17:04 +0300
commit: 122135c29b68de5176bd56de6ced936cdc63cb36 (patch)
tree: de97891c2a33b3141ec72a5d648b20d32a13a67b
parent: cf34b9a26b609109b18d6498f0608faddb7a911b (diff)
2 files changed, 135 insertions, 13 deletions
diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h
index 2b4c42b..e026a42 100644
--- a/include/fbgemm/FbgemmFP16.h
+++ b/include/fbgemm/FbgemmFP16.h
@@ -104,6 +104,14 @@ class PackedGemmMatrixFP16 {
     }
   }
 
+  void setPacked(bool p) { // overrides the packed_ flag only; pmat_ is not touched
+    packed_ = p;
+  }
+
+  bool packed() const { // reports the packed_ flag (set by packFromSrc, cleared by unpackFromSrc)
+    return packed_;
+  }
+
   void initializeMemory() {
     // allocate and initialize packed memory
     const int padding = 1024; // required by sw pipelined kernels
@@ -120,6 +128,16 @@ class PackedGemmMatrixFP16 {
     free(pmat_);
   }
 
+  // Inverse of packFromSrc: reads a matrix stored in the blocked layout
+  // computed by addr() and writes it to pmat_ in plain (unpacked) order.
+  //   trans   - Transpose stores element (i, j) at pmat_[i + numRows() * j];
+  //             otherwise at pmat_[i * numCols() + j].
+  //   src_mat - blocked source, read at addr(i, j). It is only read, hence
+  //             const. It must not alias pmat_: elements are permuted, so an
+  //             in-place call would read slots that were already overwritten.
+  // Clears the packed flag on completion.
+  void unpackFromSrc(const matrix_op_t trans, const float16* src_mat) {
+    const bool tr = (trans == matrix_op_t::Transpose);
+    const auto nrows = numRows();
+    const auto ncols = numCols();
+    for (int i = 0; i < nrows; ++i) {
+      for (int j = 0; j < ncols; ++j) {
+        // Widen before multiplying so the flat destination index cannot
+        // overflow int on large matrices; addr() already yields 64 bits.
+        const uint64_t dst = tr ? i + static_cast<uint64_t>(nrows) * j
+                                : static_cast<uint64_t>(i) * ncols + j;
+        pmat_[dst] = src_mat[addr(i, j)];
+      }
+    }
+    packed_ = false;
+  }
+
   // protected:
   // blocked row-major format address arithmetic
   uint64_t addr(const int r_, const int c_) const {
@@ -155,6 +173,19 @@ class PackedGemmMatrixFP16 {
             pmat_[addr(i, j)]);
       }
     }
+    packed_ = true;
+  }
+
+  // Packs an unpacked float16 matrix of the same size into pmat_ using the
+  // blocked layout computed by addr(). Values are copied as-is; there is no
+  // floating type conversion.
+  //   trans - Transpose reads element (i, j) from smat[i + numRows() * j];
+  //           otherwise from smat[i * numCols() + j].
+  //   smat  - unpacked source, indexed over numRows() x numCols() elements.
+  //           It must not alias pmat_: elements are permuted, so an in-place
+  //           call would read slots that were already overwritten.
+  // Sets the packed flag on completion.
+  void packFromSrc(const matrix_op_t trans, const float16* smat) {
+    const bool tr = (trans == matrix_op_t::Transpose);
+    const auto nrows = numRows();
+    const auto ncols = numCols();
+    for (int i = 0; i < nrows; ++i) {
+      for (int j = 0; j < ncols; ++j) {
+        // Widen before multiplying so the flat source index cannot overflow
+        // int on large matrices; addr() already yields a 64-bit offset.
+        const uint64_t src = tr ? i + static_cast<uint64_t>(nrows) * j
+                                : static_cast<uint64_t>(i) * ncols + j;
+        pmat_[addr(i, j)] = smat[src];
+      }
+    }
+    packed_ = true;
   }
 
   const float16& operator()(const int r, const int c) const {
@@ -202,6 +233,7 @@ class PackedGemmMatrixFP16 {
   uint64_t size_;
   int kernel_ncol_blocks_;
   float16* pmat_;
+  bool packed_{false};
 
   friend void cblas_gemm_compute(
       const matrix_op_t transa,
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index eb49086..3267655 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -27,7 +27,26 @@ using namespace fbgemm;
 namespace {
 // The template parameter is transpose of A and B
 class FBGemmFP16Test
-    : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {};
+    : public testing::TestWithParam<pair<matrix_op_t, matrix_op_t>> {
+ protected:
+  vector<vector<int>> GenShapes() const { // builds the random {m, n, k} shapes used by the Test and Unpack cases
+    vector<vector<int>> shapes;
+    random_device r;
+    default_random_engine generator(r()); // seeded from random_device, so shapes differ between runs
+    uniform_int_distribution<int> dm(1, 256); // m is drawn from [1, 256]
+    uniform_int_distribution<int> dnk(1, 1024); // n and k are drawn from [1, 1024]
+    for (int i = 0; i < 10; i++) {
+      int m = dm(generator);
+      int n = dnk(generator);
+      int k = dnk(generator);
+      shapes.push_back({m, n, k});
+      if (m > 10) {
+        shapes.push_back({(m / 10) * 10, n, k}); // same n and k with m rounded down to a multiple of 10
+      }
+    }
+    return shapes; // between 10 and 20 entries, each laid out as {m, n, k}
+  }
+};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -44,21 +63,75 @@ INSTANTIATE_TEST_CASE_P(
           matrix_op_t::Transpose, matrix_op_t::Transpose)*/));
 
 TEST_P(FBGemmFP16Test, Test) {
-  vector<vector<int>> shapes;
-  random_device r;
-  default_random_engine generator(r());
-  uniform_int_distribution<int> dm(1, 256);
-  uniform_int_distribution<int> dnk(1, 1024);
-  for (int i = 0; i < 10; i++) {
-    int m = dm(generator);
-    int n = dnk(generator);
-    int k = dnk(generator);
-    shapes.push_back({m, n, k});
-    if (m > 10) {
-      shapes.push_back({(m / 10) * 10, n, k});
+  auto shapes = GenShapes(); // random {m, n, k} shapes from the fixture; each one is checked against the reference GEMM below
+  float alpha = 1.f, beta = 0.f;
+  matrix_op_t atrans, btrans;
+  tie(atrans, btrans) = GetParam(); // test parameter is the (A, B) transpose pair
+
+  for (auto s : shapes) {
+    int m = s[0];
+    int n = s[1];
+    int k = s[2];
+
+    cerr << "m = " << m << " n = " << n << " k = " << k;
+    if (atrans == matrix_op_t::Transpose) {
+      cerr << " A_transposed";
+    }
+    if (btrans == matrix_op_t::Transpose) {
+      cerr << " B_transposed";
+    }
+    cerr << endl;
+
+    // initialize with small numbers
+    aligned_vector<int> Aint(m * k);
+    aligned_vector<int> Bint(k * n);
+    randFill(Aint, 0, 4);
+    randFill(Bint, 0, 4);
+    aligned_vector<float> A(Aint.begin(), Aint.end()); // integer values converted to float
+    aligned_vector<float> B(Bint.begin(), Bint.end());
+
+    aligned_vector<float> C(m * n, NAN); // NaN-filled; presumably so elements the kernel never writes fail the check - TODO confirm
+
+    aligned_vector<float> A_ref(A), B_ref(B), C_ref(C); // separate copies for the reference path
+
+    if (atrans == matrix_op_t::Transpose) {
+      transpose_matrix(A_ref.data(), k, m);
+    }
+    if (btrans == matrix_op_t::Transpose) {
+      transpose_matrix(B_ref.data(), n, k);
+    }
+
+    // Gold via reference sgemm
+    matmul_fp_ref(m, n, k, k, n, n, A_ref.data(), B_ref.data(), C_ref.data());
+
+    // fbgemm fp16
+    PackedGemmMatrixFP16 Bp(btrans, k, n, alpha, B.data());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    {
+      int num_threads = fbgemm_get_num_threads();
+      int tid = fbgemm_get_thread_num();
+
+      cblas_gemm_compute( // every thread makes this call with its own tid and the shared num_threads
+          atrans, m, A.data(), Bp, beta, C.data(), tid, num_threads);
+    }
+
+    // correctness check
+    for (int i = 0; i < m; ++i) {
+      for (int j = 0; j < n; ++j) {
+        float expected = C_ref[i * n + j];
+        float actual = C[i * n + j];
+        EXPECT_EQ(expected, actual) // exact float equality; inputs are the small integers filled in above
+            << "GEMM results differ at (" << i << ", " << j << "). ref "
+            << expected << " FBGemm " << actual;
+      }
     }
   }
+}
 
+TEST_P(FBGemmFP16Test, Unpack) {
+  auto shapes = GenShapes();
   float alpha = 1.f, beta = 0.f;
   matrix_op_t atrans, btrans;
   tie(atrans, btrans) = GetParam();
@@ -101,6 +174,23 @@ TEST_P(FBGemmFP16Test, Test) {
 
     // fbgemm fp16
     PackedGemmMatrixFP16 Bp(btrans, k, n, alpha, B.data());
+    EXPECT_TRUE(Bp.packed());
+
+    // Test unpack
+    aligned_vector<float16> tmp(Bp.matSize());
+    memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
+    Bp.unpackFromSrc(btrans, tmp.data());
+    EXPECT_FALSE(Bp.packed());
+    memcpy(tmp.data(), Bp.pmat(), Bp.matSize() * sizeof(float16));
+    for (int i = 0; i < k; ++i) {
+      for (int j = 0; j < n; ++j) {
+        EXPECT_EQ(B[i * n + j], cpu_half2float(tmp[i * n + j]));
+      }
+    }
+
+    // Pack it back
+    Bp.packFromSrc(btrans, tmp.data());
+    EXPECT_TRUE(Bp.packed());
 
 #ifdef _OPENMP
 #pragma omp parallel
author	Yinghai Lu <yinghai@fb.com>	2019-08-09 02:12:20 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-08-09 02:17:04 +0300
commit	122135c29b68de5176bd56de6ced936cdc63cb36 (patch)
tree	de97891c2a33b3141ec72a5d648b20d32a13a67b
parent	cf34b9a26b609109b18d6498f0608faddb7a911b (diff)