diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-02-01 22:50:44 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-02-01 22:53:50 +0300 |
commit | d90dbbacf0307681fef2c7d5f86685c219f433c0 (patch) | |
tree | d6e16041eb915352a5860dffb432dde1173c6126 /src | |
parent | 3373890bdeb69129ae1f37d3e764bcbea462806d (diff) |
make G slowest moving dim of packed weight of gconv (#62)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/62
This aligns more naturally with the access pattern.
Reviewed By: dskhudia
Differential Revision: D13833672
fbshipit-source-id: 2b383b5aaf2f60201c8d433f86b64c9cf32960a2
Diffstat (limited to 'src')
-rw-r--r-- | src/GroupwiseConvAcc32Avx2.cc | 8 | ||||
-rw-r--r-- | src/PackWeightMatrixForGConv.cc | 8 |
2 files changed, 9 insertions, 7 deletions
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc index ce93b87..7906c04 100644 --- a/src/GroupwiseConvAcc32Avx2.cc +++ b/src/GroupwiseConvAcc32Avx2.cc @@ -185,7 +185,7 @@ void GenConvKernel<int32_t>::genForLoadingWeights<inst_set_t::avx2>( WRegs_avx2_[r * S_ + s], x86::dword_ptr( wghts_R_, - (r * S_ + s) * G_ * K_per_G_ * C_per_G_ * sizeof(int8_t))); + (r * S_ + s) * 2 * K_per_G_ * C_per_G_ * sizeof(int8_t))); } } } @@ -1387,7 +1387,8 @@ void fbgemmGroupwiseConvBase_( fpConv( actStartGroup, - packed_weights.getBuf() + g * K_per_G * C_per_G, + packed_weights.getBuf() + + g * conv_param.K[0] * conv_param.K[1] * K_per_G * C_per_G, currOutBuf, a_zero_point, H, @@ -1574,7 +1575,8 @@ void fbgemmGroupwiseConv( fpConv( actStartGroup, - packed_weights.getBuf() + g * K_per_G * C_per_G, + packed_weights.getBuf() + + g * conv_param.K[0] * conv_param.K[1] * K_per_G * C_per_G, currOutBuf, a_zero_point, H, diff --git a/src/PackWeightMatrixForGConv.cc b/src/PackWeightMatrixForGConv.cc index e6c9b7d..e38fba9 100644 --- a/src/PackWeightMatrixForGConv.cc +++ b/src/PackWeightMatrixForGConv.cc @@ -42,10 +42,10 @@ PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::PackWeightMatrixForGConv( * Let IC_per_G be number of input channels per group and OC_per_G be number of * output channels per group. * - * For IC_per_G == 4 && OC_per_G == 4 optimized + * For IC_per_G == 4 && OC_per_G == 4 optimized * kernel works on 2 groups at a time hence input channels for g and g+1 group - * are laid out sequentially for each output channel, i.e., the layout is R S - * (G/2) K (2C) + * are laid out sequentially for each output channel, i.e., the layout is (G/2) + * R S K (2C) and K (2C) is in each 32B vector. * We work on two groups at a time to fully utilize the avx2 SIMD width of * 256-bits. 
* @@ -78,7 +78,7 @@ void PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>::pack() { : sdata_ [(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k]; pdata_ - [((((r * S + s) * (G / 2) + (g / 2)) * OC_per_G + k) * 2 + + [(((((g / 2) * R + r) * S + s) * OC_per_G + k) * 2 + (g % 2)) * IC_per_G + c] = b; |