1 files changed, 112 insertions, 29 deletions
diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc
index 0fb0e2c..ba6adf3 100644
--- a/src/PackWeightMatrixForGConv.cc
+++ b/src/PackWeightMatrixForGConv.cc
@@ -36,8 +36,61 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
 }
 
 /**
- * @brief Pack weight tensor in a suitable format required for the optimized
- * kernel.
+ * @brief Get the index of the unpacked data for a given <r, s, k, g, c, tr>
+ *
+ * Non-transposed: G (R S C/G) K/G
+ * Transposed: G K/G (R S C/G)
+ * Using inline as this will be called frequently
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpacked_index_(
+    int r, int s, int k, int g, int c, bool tr) {
+  // Get the full dimensions
+  int R = conv_param_.K[0];
+  int S = conv_param_.K[1];
+  int G = conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
+
+  int idx;
+  if (tr) {
+    idx = (((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c;
+  } else {
+    idx = (((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k;
+  }
+  return idx;
+}
+
+/**
+ * @brief Get the index of the packed data for a given <r, s, k, g, c>
+ *
+ * The index may differ depending on IC_per_G.
+ * Using inline as this will be called frequently
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+inline int PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::packed_index_(
+    int r, int s, int k, int g, int c) {
+  // Get the full dimensions
+  int R = conv_param_.K[0];
+  int S = conv_param_.K[1];
+  int G = conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
+
+  int idx;
+  // For IC_per_G == 4, we need to work on 2 groups at a time
+  if (IC_per_G == 4) {
+    idx = (((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 + (g % 2))
+      * IC_per_G + c;
+  } else {
+    idx = ((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) * OC_per_G + k)
+      * 4 + (c % 4);
+  }
+  return idx;
+}
+
+/**
+ * @ brief Pack or unpack matrix
  *
  * Let IC_per_G be number of input channels per group and OC_per_G be number of
  * output channels per group.
@@ -53,15 +106,17 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
  * on 2 groups at a time and full SIMD width can be efficiently utilized even
  * while working on 1 group at a time.
  * In this case, the layout is G (C/4) R S K 4
- */
+*/
+
 template <typename T, typename accT, int SPATIAL_DIM>
-void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack_unpack_(
+    const T* src, T* dst, bool ispack) {
   // filters are assumed to be in G RS C/G K/G format
   int R = conv_param_.K[0];
   int S = conv_param_.K[1];
   int G = conv_param_.G;
-  int IC_per_G = conv_param_.IC / conv_param_.G;
-  int OC_per_G = conv_param_.OC / conv_param_.G;
+  int IC_per_G = conv_param_.IC / G;
+  int OC_per_G = conv_param_.OC / G;
 
   // If transpose option is set, the weight matrix is in layout G K/G (R S C/G)
   // instead of G (R S C/G) K/G
@@ -73,25 +128,13 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
         for (int k = 0; k < OC_per_G; ++k) {
           for (int g = 0; g < G; ++g) {
             for (int c = 0; c < IC_per_G; ++c) {
-              inpType b = tr
-                  ? sdata_
-                        [(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c]
-                  : sdata_
-                        [(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k];
-              if (IC_per_G == 4) {
-                // For IC_per_G == 4, we need to work on 2 groups at a time
-                pdata_
-                    [(((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 +
-                      (g % 2)) *
-                         IC_per_G +
-                     c] = b;
+              int p_idx = packed_index_(r, s, k, g, c);
+              int up_idx = unpacked_index_(r, s, k, g, c, tr);
+              // Pack: src (unpacked) -> dst (packed)
+              if (ispack) {
+                dst[p_idx] = src[up_idx];
               } else {
-                pdata_
-                    [((((g * (IC_per_G / 4) + (c / 4)) * R + r) * S + s) *
-                          OC_per_G +
-                      k) *
-                         4 +
-                     (c % 4)] = b;
+                dst[up_idx] = src[p_idx];
               }
             }
           }
@@ -99,14 +142,54 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
       }
     }
   } else {
+    // For pack & transposed, call transposeConvWeights()
+    // G K/G (R S C/G) => G (R S C/G) K/G
     if (tr) {
-      // conv_ref expects weights to be in G (R S C/G) K/G format
-      transposeConvWeights(conv_param_, sdata_, pdata_);
+      if (ispack) {
+        transposeConvWeights(conv_param_, src, dst);
+      } else {
+      // TODO: Wrap this as a inverseTransposeConvWeights()?
+      // For unpack & transposed, call transposeConvWeights()
+      // G (R S C/G) K/G => G K/G (R S C/G)
+        for (int r = 0; r < R; ++r) {
+          for (int s = 0; s < S; ++s) {
+            for (int k = 0; k < OC_per_G; ++k) {
+              for (int g = 0; g < G; ++g) {
+                for (int c = 0; c < IC_per_G; ++c) {
+                  dst[(((g * OC_per_G + k) * R + r) * S + s)
+                    * IC_per_G + c] =
+                    src[(((g * R + r) * S + s) * IC_per_G + c)
+                    * OC_per_G + k];
+                }
+              }
+            }
+          }
+        }
+      }  // end if(ispack)
     } else {
       // just copy the data for not supported cases
-      memcpy(pdata_, sdata_, G * R * S * OC_per_G * IC_per_G * sizeof(inpType));
-    }
-  }
+      memcpy(dst, src,
+          G * R * S * OC_per_G * IC_per_G * sizeof(inpType));
+    } //end if(tr)
+  } // end if(fbgemmOptimizedGConv(conv_param_)
+}
+
+/**
+ * @brief Pack weight tensor in a suitable format required for the optimized
+ * kernel.
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
+  pack_unpack_(sdata_, pdata_, true);
+}
+
+/**
+ * @brief Unpack the packed weight tensor (for the optimized kernel)
+ * to the original form.
+ */
+template <typename T, typename accT, int SPATIAL_DIM>
+void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::unpack(T* origin_buf) {
+  pack_unpack_(const_cast<const T*>(pdata_), origin_buf, false);
 }
 
 template class PackWeightMatrixForGConv<int8_t, int32_t, 2>;