Implement ::unpack() for PackWeightMatrixForGConv

Summary: Implement ::unpack() for PackWeightMatrixForGConv. Unpack index calculation is the inverse of ::pack(). Reviewed By: dskhudia Differential Revision: D16085552 fbshipit-source-id: b8866365dc425fee2cb985b3e48c627198ebc29a
author: Jaewon Lee <jaewon@fb.com> 2019-07-06 00:58:49 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-07-06 01:03:34 +0300
commit: 64a2c73a425e4113839f2c2b596ea28d632d20f6 (patch)
tree: 228cbd60a528309c4eb6884b03cb78ca8a73b7ba
parent: b0cf97df8e2f368d8e0c1d2e9e1cacbd7638f79d (diff)
3 files changed, 190 insertions, 29 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index 721b12f..87f0907 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -504,6 +504,11 @@ class FBGEMM_API PackWeightMatrixForGConv {
   void pack();
 
   /**
+   * @brief Unpacks the packed buffer into origin_buf in the source layout.
+   */
+  void unpack(T* origin_buf);
+
+  /**
    * @brief Return packed data
    */
   inpType* getBuf() {
@@ -522,6 +527,22 @@ class FBGEMM_API PackWeightMatrixForGConv {
   const T* sdata_;
   T* pdata_;
   bool bufAllocatedHere_;
+
+  /**
+   * @brief Internal function performing both pack & unpack
+   */
+  void pack_unpack_(const T* src, T* dst, bool ispack);
+
+  /**
+   * @brief Get the index of the unpacked data
+   */
+  int unpacked_index_(int r, int s, int k, int g, int c, bool tr);
+
+  /**
+   * @brief Get the index of the packed data
+   */
+  int packed_index_(int r, int s, int k, int g, int c);
+
 };
 
 /**
diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc
index 0fb0e2c..ba6adf3 100644
--- a/src/PackWeightMatrixForGConv.cc
+++ b/src/PackWeightMatrixForGConv.cc
@@ -36,8 +36,61 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
 }
 
 /**
- * @brief Pack weight tensor in a suitable format required for the optimized
- * kernel.
+ * @brief Get the index of the unpacked data for a given <r, s, k, g, c, tr>
+ *
+ * Non-transposed: G (R S C/G) K/G
+ * Transposed: G K/G (R S C/G)
+ * Using inline as this will be called frequently
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpacked_index_(
+    int r, int s, int k, int g, int c, bool tr) {
+  // Get the full dimensions
+  int R = conv_param_.K[0];
+  int S = conv_param_.K[1];
+  int G = conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
+
+  int idx;
+  if (tr) {
+    idx = (((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c;
+  } else {
+    idx = (((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k;
+  }
+  return idx;
+}
+
+/**
+ * @brief Get the index of the packed data for a given <r, s, k, g, c>
+ *
+ * The layout depends on IC_per_G (two groups are interleaved when it is 4).
+ * Using inline as this will be called frequently
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::packed_index_(
+    int r, int s, int k, int g, int c) {
+  // Get the full dimensions
+  int R = conv_param_.K[0];
+  int S = conv_param_.K[1];
+  int G = conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
+
+  int idx;
+  // For IC_per_G == 4, we need to work on 2 groups at a time
+  if (IC_per_G == 4) {
+    idx = (((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 + (g % 2))
+      * IC_per_G + c;
+  } else {
+    idx = ((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) * OC_per_G + k)
+      * 4 + (c % 4);
+  }
+  return idx;
+}
+
+/**
+ * @brief Pack or unpack matrix
  *
  * Let IC_per_G be number of input channels per group and OC_per_G be number of
  * output channels per group.
@@ -53,15 +106,17 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
  * on 2 groups at a time and full SIMD width can be efficiently utilized even
  * while working on 1 group at a time.
  * In this case, the layout is G (C/4) R S K 4
- */
+*/
+
 template <typename T, typename accT, int SPATIAL_DIM>
-void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack_unpack_(
+    const T* src, T* dst, bool ispack) {
   // filters are assumed to be in G RS C/G K/G format
   int R = conv_param_.K[0];
   int S = conv_param_.K[1];
   int G = conv_param_.G;
-  int IC_per_G = conv_param_.IC / conv_param_.G;
-  int OC_per_G = conv_param_.OC / conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
 
   // If transpose option is set, the weight matrix is in layout G K/G (R S C/G)
   // instead of G (R S C/G) K/G
@@ -73,25 +128,13 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
         for (int k = 0; k < OC_per_G; ++k) {
           for (int g = 0; g < G; ++g) {
             for (int c = 0; c < IC_per_G; ++c) {
-              inpType b = tr
-                  ? sdata_
-                        [(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c]
-                  : sdata_
-                        [(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k];
-              if (IC_per_G == 4) {
-                // For IC_per_G == 4, we need to work on 2 groups at a time
-                pdata_
-                    [(((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 +
-                      (g % 2)) *
-                         IC_per_G +
-                     c] = b;
+              int p_idx = packed_index_(r, s, k, g, c);
+              int up_idx = unpacked_index_(r, s, k, g, c, tr);
+              // Pack: src (unpacked) -> dst (packed)
+              if (ispack) {
+                dst[p_idx] = src[up_idx];
               } else {
-                pdata_
-                    [((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) *
-                          OC_per_G +
-                      k) *
-                         4 +
-                     (c % 4)] = b;
+                dst[up_idx] = src[p_idx];
               }
             }
           }
@@ -99,14 +142,54 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
       }
     }
   } else {
+    // For pack & transposed, call transposeConvWeights()
+    // G K/G (R S C/G) => G (R S C/G) K/G
     if (tr) {
-      // conv_ref expects weights to be in G (R S C/G) K/G format
-      transposeConvWeights(conv_param_, sdata_, pdata_);
+      if (ispack) {
+        transposeConvWeights(conv_param_, src, dst);
+      } else {
+      // TODO: Wrap this as an inverseTransposeConvWeights()?
+      // For unpack & transposed, invert transposeConvWeights() inline:
+      // G (R S C/G) K/G => G K/G (R S C/G)
+        for (int r = 0; r < R; ++r) {
+          for (int s = 0; s < S; ++s) {
+            for (int k = 0; k < OC_per_G; ++k) {
+              for (int g = 0; g < G; ++g) {
+                for (int c = 0; c < IC_per_G; ++c) {
+                  dst[(((g * OC_per_G + k) * R + r) * S + s)
+                    * IC_per_G + c] =
+                    src[(((g * R + r) * S + s) * IC_per_G + c)
+                    * OC_per_G + k];
+                }
+              }
+            }
+          }
+        }
+      }  // end if(ispack)
     } else {
       // just copy the data for not supported cases
-      memcpy(pdata_, sdata_, G * R * S * OC_per_G * IC_per_G * sizeof(inpType));
-    }
-  }
+      memcpy(dst, src,
+          G * R * S * OC_per_G * IC_per_G * sizeof(inpType));
+    } // end if(tr)
+  } // end if(fbgemmOptimizedGConv(conv_param_))
+}
+
+/**
+ * @brief Pack weight tensor in a suitable format required for the optimized
+ * kernel.
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
+  pack_unpack_(sdata_, pdata_, true);
+}
+
+/**
+ * @brief Unpack the packed weight tensor (for the optimized kernel)
+ * to the original form.
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpack(T* origin_buf) {
+  pack_unpack_(const_cast<const T*>(pdata_), origin_buf, false);
 }
 
 template class PackWeightMatrixForGConv<int8_t, int32_t, 2>;
diff --git a/test/GConvTest.cc b/test/GConvTest.cc
index 84f0d52..0074535 100644
--- a/test/GConvTest.cc
+++ b/test/GConvTest.cc
@@ -43,6 +43,8 @@ class fbgemmGConvAcc32WithQuantGranularityTest
           QuantizationGranularity,
           bool,
           bool>> {};
+class fbgemmGConvPackTest
+    : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
@@ -61,6 +63,13 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(qGranularityVals),
         ::testing::Bool(), // A symmetric
         ::testing::Bool())); // B symmetric
+
+INSTANTIATE_TEST_CASE_P(
+    InstantiationName,
+    fbgemmGConvPackTest,
+    ::testing::Combine(
+        ::testing::Values(matrix_op_t::NoTranspose),
+        ::testing::ValuesIn(transposeVals)));
 /**
  * @brief Shapes for unit test.
  */
@@ -413,3 +422,51 @@ TEST_P(fbgemmGConvAcc32Test, NoRequantizeTest) {
         static_cast<int32_t>(0));
   } // for each shape
 }
+
+/**
+ * @brief Unit test for packing and unpacking the weight tensor
+ */
+TEST_P(fbgemmGConvPackTest, PackUnpackTest) {
+  vector<conv_param_t<>> shapes(GetShapes_());
+  matrix_op_t atrans, btrans;
+  tie(atrans, btrans) = GetParam();
+
+  for (auto conv_p : shapes) {
+    int R = conv_p.K[0];
+    int S = conv_p.K[1];
+    int IC_per_G = conv_p.IC / conv_p.G;
+    int OC_per_G = conv_p.OC / conv_p.G;
+
+    // Weights -- test the packing/unpacking of only the weights
+    // when btrans == Transpose, the weight matrix is in layout G K/G (R S C/G)
+    // instead of G (R S C/G) K/G
+    int weight_len = R * S * conv_p.G * IC_per_G * OC_per_G;
+    aligned_vector<int8_t> Bint8(weight_len, 0);
+
+    // Random fill the weights
+    randFill<int8_t>(Bint8, -4, 4);
+
+    // Instantiate the object
+    PackWeightMatrixForGConv<int8_t> packedWeights(
+        btrans, conv_p, Bint8.data(), nullptr);
+
+    // Set up a buffer to receive the pack -> unpack round-trip result
+    aligned_vector<int8_t> unpack_buf(weight_len, 0);
+
+    // START Actual pack-unpack operations
+    // Perform packing first. This should populate pdata_ of packedWeights
+    packedWeights.pack();
+
+    // Next perform unpacking
+    packedWeights.unpack(unpack_buf.data());
+    // END actual pack-unpack operations
+
+    // Sanity check
+    for (int i = 0; i < weight_len; ++i) {
+      EXPECT_EQ(Bint8.data()[i], unpack_buf.data()[i])
+        << "Pack/Unpack results differ at index " << i
+        << ", Reference: " << static_cast<int> (Bint8.data()[i])
+        << ", Pack-Unpacked: " << static_cast<int> (unpack_buf.data()[i]);
+    }
+  } // for each shape
+}
author	Jaewon Lee <jaewon@fb.com>	2019-07-06 00:58:49 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-07-06 01:03:34 +0300
commit	64a2c73a425e4113839f2c2b596ea28d632d20f6 (patch)
tree	228cbd60a528309c4eb6884b03cb78ca8a73b7ba
parent	b0cf97df8e2f368d8e0c1d2e9e1cacbd7638f79d (diff)