Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/bench
diff options
context:
space:
mode:
authorJianyu Huang <jianyuhuang@fb.com>2018-11-20 10:31:33 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2018-11-20 10:33:34 +0300
commit282afa288cf45f75cfc38010d07ce77c081729c9 (patch)
tree4b2c4c1164159e4d77ca11d9aef08c3365b5a367 /bench
parent7346431f9a368573257587dea8b5c74c21c609b9 (diff)
Parallelize the benchmark
Summary: Add omp parallel to parallelize the benchmark Reviewed By: jspark1105 Differential Revision: D13106978 fbshipit-source-id: cdc8ce3db86d38745487ac0cafa5bd656f182604
Diffstat (limited to 'bench')
-rw-r--r--bench/PackedRequantizeAcc16Benchmark.cc267
-rw-r--r--bench/PackedRequantizeAcc32Benchmark.cc90
2 files changed, 206 insertions, 151 deletions
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index 367aa32..de15cce 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -37,18 +37,34 @@ enum class BenchmarkType {
void performance_test() {
vector<vector<int>> shapes = {
- // m, n, k
- {64, 68, 17}, {60, 128, 64}, {25088, 256, 64},
- {25088, 64, 64}, {25088, 64, 576}, {25088, 64, 256},
-
- {6272, 512, 256}, {6272, 128, 256}, {6272, 128, 1152},
- {6272, 512, 128}, {6272, 128, 512},
-
- {1568, 1024, 512}, {1568, 256, 512}, {1568, 256, 2304},
- {1568, 1024, 256}, {1568, 256, 1024},
-
- {392, 2048, 1024}, {392, 512, 1024}, {392, 512, 4608},
- {392, 2048, 512}, {392, 512, 2048},
+ // NOTE: clang-format wants to use a different formatting but the current
+ // formatting should be easier to read.
+ // m, n, k
+ {64, 68, 17},
+ {60, 128, 64},
+
+ {25088, 256, 64},
+ {25088, 64, 64},
+ {25088, 64, 576},
+ {25088, 64, 256},
+
+ {6272, 512, 256},
+ {6272, 128, 256},
+ {6272, 128, 1152},
+ {6272, 512, 128},
+ {6272, 128, 512},
+
+ {1568, 1024, 512},
+ {1568, 256, 512},
+ {1568, 256, 2304},
+ {1568, 1024, 256},
+ {1568, 256, 1024},
+
+ {392, 2048, 1024},
+ {392, 512, 1024},
+ {392, 512, 4608},
+ {392, 2048, 512},
+ {392, 512, 2048},
};
bool flush = true;
std::vector<char> llc;
@@ -227,56 +243,9 @@ void performance_test() {
col_offsets.data(),
nullptr); // bias
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
-
- PackAMatrix<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- m,
- k,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point);
- PackAWithRowOffset<uint8_t, int16_t> packAWithRowOffset(
- matrix_op_t::NoTranspose,
- m,
- k,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
- // no-op output process objects
- DoNothing<int32_t, int32_t> doNothing32BitObj;
- memCopy<> memcopyObj(doNothing32BitObj);
-
- // spmdm -> requantization -> nothing
- // construct an output processing pipeline in reverse order
- // i.e. last output operation first
- // Last operation should always be DoNothing with
- // correct input and output type.
- DoNothing<> doNothingObj{};
- // Requantization back to int8
- ReQuantizeOutput<false> reqObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- bench_type == BenchmarkType::REQUANTIZATION
- ? nullptr
- : packAWithRowOffset.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
CompressedSparseColumn B_csc(k, n);
float density = 0.001f;
@@ -310,16 +279,6 @@ void performance_test() {
}
B_csc.ColPtr()[n] = total_nnz;
- // the top most (first) operation in the output processing
- // pipeline is spmdm
- // outType = final output type after fullly processing through pipeline
- // inType = initial input type at the first call to the whole pipeline
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>
- spmdmObj(reqObj, Aint8.data(), k, B_csc);
-
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_mkl.data(), m, n, n, "C mkl");
ttot = 0;
@@ -334,52 +293,125 @@ void performance_test() {
#endif
llc_flush(llc);
begin = chrono::high_resolution_clock::now();
- switch (bench_type) {
- case BenchmarkType::BARE_BONE:
- fbgemmPacked(
- packA,
- packedB,
- Cint32_fb.data(),
- Cint32_fb.data(),
- n,
- memcopyObj,
- 0,
- 1);
- break;
- case BenchmarkType::REQUANTIZATION:
- fbgemmPacked(
- packA,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- reqObj,
- 0,
- 1);
- break;
- case BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION:
- fbgemmPacked(
- packAWithRowOffset,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- reqObj,
- 0,
- 1);
- break;
- case BenchmarkType::EVERYTHING:
- fbgemmPacked(
- packAWithRowOffset,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- spmdmObj,
- 0,
- 1);
- break;
- };
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+ {
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
+
+ PackAMatrix<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ 1,
+ Aint8_zero_point);
+ PackAWithRowOffset<uint8_t, int16_t> packAWithRowOffset(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ 1,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ // no-op output process objects
+ DoNothing<int32_t, int32_t> doNothing32BitObj;
+ memCopy<> memcopyObj(doNothing32BitObj);
+
+ // spmdm -> requantization -> nothing
+ // construct an output processing pipeline in reverse order
+ // i.e. last output operation first
+ // Last operation should always be DoNothing with
+ // correct input and output type.
+ DoNothing<> doNothingObj{};
+ // Requantization back to int8
+ ReQuantizeOutput<false> reqObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ bench_type == BenchmarkType::REQUANTIZATION
+ ? nullptr
+ : packAWithRowOffset.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+
+ // the top most (first) operation in the output processing
+ // pipeline is spmdm
+      // outType = final output type after fully processing through
+ // pipeline; inType = initial input type at the first call to the
+ // whole pipeline
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc);
+
+#ifdef _OPENMP
+ int num_threads = omp_get_num_threads();
+ int tid = omp_get_thread_num();
+#else
+ int num_threads = 1;
+ int tid = 0;
+#endif
+ // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
+ switch (bench_type) {
+ case BenchmarkType::BARE_BONE:
+ fbgemmPacked(
+ packA,
+ packedB,
+ Cint32_fb.data(),
+ Cint32_fb.data(),
+ n,
+ memcopyObj,
+ tid,
+ num_threads);
+ break;
+ case BenchmarkType::REQUANTIZATION:
+ fbgemmPacked(
+ packA,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ n,
+ reqObj,
+ tid,
+ num_threads);
+ break;
+ case BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION:
+ fbgemmPacked(
+ packAWithRowOffset,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ n,
+ reqObj,
+ tid,
+ num_threads);
+ break;
+ case BenchmarkType::EVERYTHING:
+ fbgemmPacked(
+ packAWithRowOffset,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ n,
+ spmdmObj,
+ tid,
+ num_threads);
+ break;
+ };
+ }
+
end = chrono::high_resolution_clock::now();
if (i >= NWARMUP) {
@@ -432,7 +464,10 @@ void performance_test() {
int main() {
#ifdef _OPENMP
- omp_set_num_threads(1);
+ const char* val = getenv("OMP_NUM_THREADS");
+ if (val == nullptr || !*val) {
+ omp_set_num_threads(1);
+ }
#endif
performance_test();
return 0;
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index 84d7c24..0e98234 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -31,6 +31,7 @@ void performance_test() {
vector<vector<int>> shapes = {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
+ // m, n, k
{156800, 4, 36},
{156800, 8, 36},
{156800, 16, 36},
@@ -214,20 +215,6 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
// offsets before");
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
PackBMatrix<int8_t> packedBN(
matrix_op_t::NoTranspose,
k,
@@ -238,17 +225,6 @@ void performance_test() {
1,
Bint8_zero_point);
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
ttot = 0.0;
runType = "FBGEMM_i8_acc32";
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
@@ -268,15 +244,56 @@ void performance_test() {
#endif
llc_flush(llc);
start = chrono::high_resolution_clock::now();
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- n,
- outputProcObj,
- 0,
- 1);
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+ {
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
+
+ PackAWithRowOffset<uint8_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ 1,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ DoNothing<> doNothingObj{};
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+
+#ifdef _OPENMP
+ int num_threads = omp_get_num_threads();
+ int tid = omp_get_thread_num();
+#else
+ int num_threads = 1;
+ int tid = 0;
+#endif
+ // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
+
end = chrono::high_resolution_clock::now();
if (i >= NWARMUP) {
@@ -324,7 +341,10 @@ void performance_test() {
int main(int /* unused */, char** /* unused */) {
#ifdef _OPENMP
- omp_set_num_threads(1);
+ const char* val = getenv("OMP_NUM_THREADS");
+ if (val == nullptr || !*val) {
+ omp_set_num_threads(1);
+ }
#endif
performance_test();
return 0;