make G slowest moving dim of packed weight of gconv (#62)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/62. This aligns more naturally with the access pattern. Reviewed By: dskhudia. Differential Revision: D13833672. fbshipit-source-id: 2b383b5aaf2f60201c8d433f86b64c9cf32960a2
author: Jongsoo Park <jongsoo@fb.com> 2019-02-01 22:50:44 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-01 22:53:50 +0300
commit: d90dbbacf0307681fef2c7d5f86685c219f433c0 (patch)
tree: d6e16041eb915352a5860dffb432dde1173c6126 /src
parent: 3373890bdeb69129ae1f37d3e764bcbea462806d (diff)
2 files changed, 9 insertions, 7 deletions
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
index ce93b87..7906c04 100644
--- a/src/GroupwiseConvAcc32Avx2.cc
+++ b/src/GroupwiseConvAcc32Avx2.cc
@@ -185,7 +185,7 @@ void GenConvKernel<int32_t>::genForLoadingWeights<inst_set_t::avx2>(
           WRegs_avx2_[r * S_ + s],
           x86::dword_ptr(
               wghts_R_,
-              (r * S_ + s) * G_ * K_per_G_ * C_per_G_ * sizeof(int8_t)));
+              (r * S_ + s) * 2 * K_per_G_ * C_per_G_ * sizeof(int8_t)));
     }
   }
 }
@@ -1387,7 +1387,8 @@ void fbgemmGroupwiseConvBase_(
 
           fpConv(
               actStartGroup,
-              packed_weights.getBuf() + g * K_per_G * C_per_G,
+              packed_weights.getBuf() +
+                  g * conv_param.K[0] * conv_param.K[1] * K_per_G * C_per_G,
               currOutBuf,
               a_zero_point,
               H,
@@ -1574,7 +1575,8 @@ void fbgemmGroupwiseConv(
 
         fpConv(
             actStartGroup,
-            packed_weights.getBuf() + g * K_per_G * C_per_G,
+            packed_weights.getBuf() +
+                g * conv_param.K[0] * conv_param.K[1] * K_per_G * C_per_G,
             currOutBuf,
             a_zero_point,
             H,
diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc
index e6c9b7d..e38fba9 100644
--- a/src/PackWeightMatrixForGConv.cc
+++ b/src/PackWeightMatrixForGConv.cc
@@ -42,10 +42,10 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv(
  * Let IC_per_G be number of input channels per group and OC_per_G be number of
  * output channels per group.
  *
- * For IC_per_G  == 4 && OC_per_G == 4 optimized
+ * For IC_per_G == 4 && OC_per_G == 4 optimized
  * kernel works on 2 groups at a time hence input channels for g and g+1 group
- * are laid out sequentially for each output channel, i.e., the layout is R S
- * (G/2) K (2C)
+ * are laid out sequentially for each output channel, i.e., the layout is (G/2)
+ * R S K (2C) and K (2C) is in each 32B vector.
  * We work on two groups at a time to fully utilize the avx2 SIMD width of
  * 256-bits.
  *
@@ -78,7 +78,7 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() {
                   : sdata_
                         [(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k];
               pdata_
-                  [((((r * S + s) * (G / 2) + (g / 2)) * OC_per_G + k) * 2 +
+                  [(((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 +
                     (g % 2)) *
                        IC_per_G +
                    c] = b;
author	Jongsoo Park <jongsoo@fb.com>	2019-02-01 22:50:44 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-02-01 22:53:50 +0300
commit	d90dbbacf0307681fef2c7d5f86685c219f433c0 (patch)
tree	d6e16041eb915352a5860dffb432dde1173c6126 /src
parent	3373890bdeb69129ae1f37d3e764bcbea462806d (diff)