Optimize parallelization performance (#15)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/15 Better load balance the workload among different threads. Reviewed By: jspark1105 Differential Revision: D13108873 fbshipit-source-id: ae75971b5ff2cc7cf19907eb95cf2df071f7bbe3
author: Jianyu Huang <jianyuhuang@fb.com> 2018-11-20 10:31:33 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2018-11-20 10:33:34 +0300
commit: 12c8362ef738879c971ea4d411f419279107057e (patch)
tree: a6f2415d66e166d0f26b827a045e074a327d52a2 /src/Fbgemm.cc
parent: 282afa288cf45f75cfc38010d07ce77c081729c9 (diff)
1 files changed, 17 insertions, 12 deletions
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 0ce13f4..99d0a52 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -46,6 +46,7 @@ void fbgemmPacked(
       "Accumulation type of both matrices should be the same");
 
   int MCB, KCB;
+  int MR;
 
   // Run time CPU detection
   if (cpuinfo_initialize()) {
@@ -58,6 +59,10 @@ void fbgemmPacked(
           typename packingAMatrix::inpType,
           typename packingAMatrix::accType,
           inst_set_t::avx512>::KCB;
+      MR = PackingTraits<
+          typename packingAMatrix::inpType,
+          typename packingAMatrix::accType,
+          inst_set_t::avx512>::MR;
     } else if (cpuinfo_has_x86_avx2()) {
       MCB = PackingTraits<
           typename packingAMatrix::inpType,
@@ -67,6 +72,10 @@ void fbgemmPacked(
           typename packingAMatrix::inpType,
           typename packingAMatrix::accType,
           inst_set_t::avx2>::KCB;
+      MR = PackingTraits<
+          typename packingAMatrix::inpType,
+          typename packingAMatrix::accType,
+          inst_set_t::avx2>::MR;
     } else {
       // TODO: Have default slower path
       assert(0 && "unsupported architecture");
@@ -88,11 +97,9 @@ void fbgemmPacked(
   int MDim = packA.numRows();
   int KDimPerGroup = packB.numRows() / packB.numGroups();
 
-  int mBlocks = (MDim + MCB - 1) / MCB;
   int kBlocks = (KDimPerGroup + KCB - 1) / KCB;
 
   // remainders
-  int _mc = MDim % MCB;
   int _kc = KDimPerGroup % KCB;
 
   int kc, mc;
@@ -108,9 +115,8 @@ void fbgemmPacked(
 #endif
 
   for (int g = 0; g < packA.numGroups(); ++g) {
-    int i_per_thread = (mBlocks + num_threads - 1) / num_threads;
-    int i_begin = std::min(thread_id * i_per_thread, mBlocks);
-    int i_end = std::min(i_begin + i_per_thread, mBlocks);
+    int i_begin, i_end;
+    fbgemmGetRange(num_threads, thread_id, MDim, MR, i_begin, i_end);
 
     ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
         exeKernelObj(
@@ -123,13 +129,12 @@ void fbgemmPacked(
             outProcess,
             thread_id,
             num_threads);
-    for (int i = i_begin; i < i_end; ++i) {
-      mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc;
-      for (int k = 0; k < kBlocks; ++k) {
-        kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc;
+    for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
+      mc = std::min(i_end - i, MCB);
+      for (int kb = 0; kb < kBlocks; ++kb) { // kb is the block index
+        kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
         // pack A matrix
-        blockA = {i * MCB, mc, g * KDimPerGroup + k * KCB, kc};
-
+        blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
         packA.pack(blockA);
 
 #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
@@ -141,7 +146,7 @@ void fbgemmPacked(
         t_start = std::chrono::high_resolution_clock::now();
 #endif
 
-        exeKernelObj.execute(g * kBlocks + k);
+        exeKernelObj.execute(g * kBlocks + kb);
 
 #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
         t_end = std::chrono::high_resolution_clock::now();
author	Jianyu Huang <jianyuhuang@fb.com>	2018-11-20 10:31:33 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2018-11-20 10:33:34 +0300
commit	12c8362ef738879c971ea4d411f419279107057e (patch)
tree	a6f2415d66e166d0f26b827a045e074a327d52a2 /src/Fbgemm.cc
parent	282afa288cf45f75cfc38010d07ce77c081729c9 (diff)