optimize requantize for float out processing (#85)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/85

Optimizing performance of output processing when output is dequantized right away.

Reviewed By: protonu

Differential Revision: D14433141

fbshipit-source-id: f99a8d82000c43e554461acf036462a4e8f7e300
author: Jongsoo Park <jongsoo@fb.com> 2019-03-13 06:14:32 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-03-13 06:17:49 +0300
commit: 6011ce3b0c1fccee549e85b37e475c7a734ad742 (patch)
tree: 7089177b6c7da36c2582da1cf9b42eca9dfb2ea7 /bench/PackedFloatInOutBenchmark.cc
parent: 50b43162fd1742122d01f2704945c78f13e0d73e (diff)
1 files changed, 42 insertions, 0 deletions
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index 1397125..66ca67e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -76,8 +76,21 @@ void performance_test() {
   constexpr int NWARMUP = 4;
   constexpr int NITER = 10;
 
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+  cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+       << endl;
+  cout << "M, "
+       << "N, "
+       << "K, "
+       << "Packing (ms), "
+       << "Kernel (ms), "
+       << "Postprocessing (ms), "
+       << "Total (ms), "
+       << "GOPs" << endl;
+#else
   cout << setw(7) << "M, " << setw(7) << "N, " << setw(7) << "K, " << setw(18)
        << "Type, " << setw(5) << "GOPS" << endl;
+#endif
 
   chrono::time_point<chrono::high_resolution_clock> start, end;
   for (auto shape : shapes) {
@@ -203,7 +216,23 @@ void performance_test() {
 
     ttot = 0;
     type = "FBGEMM_i8_acc32";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    double total_packing_time = 0.0;
+    double total_computing_time = 0.0;
+    double total_kernel_time = 0.0;
+    double total_postprocessing_time = 0.0;
+    double total_run_time = 0.0;
+#endif
+
     for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+      packing_time = 0.0;
+      computing_time = 0.0;
+      kernel_time = 0.0;
+      postprocessing_time = 0.0;
+      run_time = 0.0;
+#endif
+
       llc_flush(llc);
       start = chrono::high_resolution_clock::now();
       fbgemmPacked(
@@ -220,6 +249,13 @@ void performance_test() {
       if (i >= NWARMUP) {
         auto dur = chrono::duration_cast<chrono::nanoseconds>(end - start);
         ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+        total_packing_time += packing_time;
+        total_computing_time += computing_time;
+        total_kernel_time += kernel_time;
+        total_postprocessing_time += postprocessing_time;
+        total_run_time += run_time;
+#endif
       }
     }
     ((volatile char*)(llc.data()));
@@ -237,6 +273,12 @@ void performance_test() {
     // row_offsets.size(), 5);
     // printMatrix(matrix_op_t::NoTranspose, Cfp32_fb.data(),
     // m, n, n, "C fb fp32");
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    cout << total_packing_time / (double)NITER / 1e6 << ", "
+         << total_kernel_time / (double)NITER / 1e6 << ", "
+         << total_postprocessing_time / (double)NITER / 1e6 << ", "
+         << total_run_time / (double)NITER / 1e6 << ", ";
+#endif
     cout << setw(5) << m << ", " << setw(5) << n << ", " << setw(5) << k << ", "
          << setw(16) << type << ", " << setw(5) << fixed << setw(5)
          << setprecision(1) << nops / ttot << endl;
author	Jongsoo Park <jongsoo@fb.com>	2019-03-13 06:14:32 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-03-13 06:17:49 +0300
commit	6011ce3b0c1fccee549e85b37e475c7a734ad742 (patch)
tree	7089177b6c7da36c2582da1cf9b42eca9dfb2ea7 /bench/PackedFloatInOutBenchmark.cc
parent	50b43162fd1742122d01f2704945c78f13e0d73e (diff)