diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2018-11-20 10:31:33 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-11-20 10:33:34 +0300 |
commit | 12c8362ef738879c971ea4d411f419279107057e (patch) | |
tree | a6f2415d66e166d0f26b827a045e074a327d52a2 /src/Fbgemm.cc | |
parent | 282afa288cf45f75cfc38010d07ce77c081729c9 (diff) |
Optimize parallelization performance (#15)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/15
Improve load balancing of the workload across threads.
Reviewed By: jspark1105
Differential Revision: D13108873
fbshipit-source-id: ae75971b5ff2cc7cf19907eb95cf2df071f7bbe3
Diffstat (limited to 'src/Fbgemm.cc')
-rw-r--r-- | src/Fbgemm.cc | 29 |
1 file changed, 17 insertions, 12 deletions
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 0ce13f4..99d0a52 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -46,6 +46,7 @@ void fbgemmPacked( "Accumulation type of both matrices should be the same"); int MCB, KCB; + int MR; // Run time CPU detection if (cpuinfo_initialize()) { @@ -58,6 +59,10 @@ void fbgemmPacked( typename packingAMatrix::inpType, typename packingAMatrix::accType, inst_set_t::avx512>::KCB; + MR = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx512>::MR; } else if (cpuinfo_has_x86_avx2()) { MCB = PackingTraits< typename packingAMatrix::inpType, @@ -67,6 +72,10 @@ void fbgemmPacked( typename packingAMatrix::inpType, typename packingAMatrix::accType, inst_set_t::avx2>::KCB; + MR = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx2>::MR; } else { // TODO: Have default slower path assert(0 && "unsupported architecture"); @@ -88,11 +97,9 @@ void fbgemmPacked( int MDim = packA.numRows(); int KDimPerGroup = packB.numRows() / packB.numGroups(); - int mBlocks = (MDim + MCB - 1) / MCB; int kBlocks = (KDimPerGroup + KCB - 1) / KCB; // remainders - int _mc = MDim % MCB; int _kc = KDimPerGroup % KCB; int kc, mc; @@ -108,9 +115,8 @@ void fbgemmPacked( #endif for (int g = 0; g < packA.numGroups(); ++g) { - int i_per_thread = (mBlocks + num_threads - 1) / num_threads; - int i_begin = std::min(thread_id * i_per_thread, mBlocks); - int i_end = std::min(i_begin + i_per_thread, mBlocks); + int i_begin, i_end; + fbgemmGetRange(num_threads, thread_id, MDim, MR, i_begin, i_end); ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType> exeKernelObj( @@ -123,13 +129,12 @@ void fbgemmPacked( outProcess, thread_id, num_threads); - for (int i = i_begin; i < i_end; ++i) { - mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; - for (int k = 0; k < kBlocks; ++k) { - kc = (k != kBlocks - 1 || _kc == 0) ? 
KCB : _kc; + for (int i = i_begin; i < i_end; i += MCB) { // i is the element index + mc = std::min(i_end - i, MCB); + for (int kb = 0; kb < kBlocks; ++kb) { // kb is the block index + kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc; // pack A matrix - blockA = {i * MCB, mc, g * KDimPerGroup + k * KCB, kc}; - + blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc}; packA.pack(blockA); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN @@ -141,7 +146,7 @@ void fbgemmPacked( t_start = std::chrono::high_resolution_clock::now(); #endif - exeKernelObj.execute(g * kBlocks + k); + exeKernelObj.execute(g * kBlocks + kb); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN t_end = std::chrono::high_resolution_clock::now(); |