optimize remainder loops of requantization and rowoffset (#54)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/54 The optimizations I implemented in ver 2 don't seem to help (will remove this). It looks like also using JIT for row_offset is the right long-term solution. AVX512 has new instructions that could help the row_offset and requantization computations. Added benchmarks for the row_offset and requantization computations to make measuring their performance easier. Reviewed By: dskhudia Differential Revision: D13561062 fbshipit-source-id: f11678395c4f9e62a64874e1a0b1f8833fda779f
author: Jongsoo Park <jongsoo@fb.com> 2019-01-03 01:31:52 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-01-03 01:34:53 +0300
commit: 172048ae4c0ae44de941e2f42a4a87ac15690858 (patch)
tree: 94236ea13612833f63b326d311f4737ec92f4a60 /bench
parent: 4321c4d96a76d4735f0869ae1adb6fa2197cf377 (diff)
2 files changed, 199 insertions, 0 deletions
diff --git a/bench/RequantizeBenchmark.cc b/bench/RequantizeBenchmark.cc
new file mode 100644
index 0000000..d1430a3
--- /dev/null
+++ b/bench/RequantizeBenchmark.cc
@@ -0,0 +1,145 @@
+#include <chrono>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "BenchUtils.h"
+#include "fbgemm/Fbgemm.h"
+
+using namespace std;
+using namespace fbgemm;
+
+enum class BenchmarkType { // declaration order matters: compared with <
+  BARE_BONE, // no bias; A zero point 0; B zero point [0] forced to 0
+  BIAS, // BARE_BONE plus a bias vector
+  A_ASYMMETRIC, // BIAS plus a non-zero A zero point (-3)
+  B_ASYMMETRIC, // first case where B zero point [0] is no longer forced to 0
+  PER_CHANNEL, // uses QuantizationGranularity::OUT_CHANNEL
+};
+
+// Benchmarks ReQuantizeOutput::f<inst_set_t::avx2> on `len` int32 inputs for
+// several lengths and configurations (see BenchmarkType), printing one row
+// per (len, configuration) with the elements processed per nanosecond.
+void performance_test() {
+  constexpr int NWARMUP = 4; // untimed iterations run before the clock starts
+  constexpr int NITER = 256; // timed iterations
+
+  cout << setw(4) << "len"
+       << ", " << setw(10) << "Type"
+       << ", B_elements_per_sec" << endl;
+
+  for (int len : {1,  2,  3,  4,  5,  7,  8,   9,   15,  16,  17,
+                  31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256}) {
+    aligned_vector<float> C_multiplier(len);
+    randFill<float>(C_multiplier, -8, 8);
+
+    aligned_vector<int32_t> Bint8_zero_point(len), row_offset_buf(len),
+        col_offsets(len), bias_vector(len), input(len);
+    randFill<int32_t>(Bint8_zero_point, -8, 8);
+    randFill<int32_t>(row_offset_buf, -8, 8);
+    randFill<int32_t>(col_offsets, -8, 8);
+    randFill<int32_t>(bias_vector, -8, 8);
+    randFill<int32_t>(input, -8, 8);
+
+    int32_t C_zero_point = -3;
+    block_type_t block{0, 1, 0, len};
+    aligned_vector<uint8_t> output(len);
+    chrono::time_point<chrono::high_resolution_clock> begin, end;
+
+    for (BenchmarkType bench_type : {BenchmarkType::BARE_BONE,
+                                     BenchmarkType::BIAS,
+                                     BenchmarkType::A_ASYMMETRIC,
+                                     BenchmarkType::B_ASYMMETRIC,
+                                     BenchmarkType::PER_CHANNEL}) {
+      int32_t Aint8_zero_point =
+          bench_type < BenchmarkType::A_ASYMMETRIC ? 0 : -3;
+      // Assign element 0 on every iteration. Only zeroing it (as before) left
+      // it at 0 once BARE_BONE had run, so B_ASYMMETRIC and PER_CHANNEL never
+      // saw a non-zero value there.
+      Bint8_zero_point[0] = bench_type < BenchmarkType::B_ASYMMETRIC ? 0 : -5;
+      const int32_t* bias =
+          bench_type == BenchmarkType::BARE_BONE ? nullptr : bias_vector.data();
+
+      DoNothing<> doNothingObj{};
+      if (bench_type == BenchmarkType::PER_CHANNEL) {
+        ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj(
+            doNothingObj,
+            C_multiplier.data(),
+            C_zero_point,
+            Aint8_zero_point,
+            Bint8_zero_point.data(),
+            row_offset_buf.data(),
+            col_offsets.data(),
+            bias,
+            len);
+        // The first NWARMUP calls are untimed; the clock starts after them.
+        for (int i = 0; i < NWARMUP + NITER; ++i) {
+          if (i == NWARMUP) {
+            begin = chrono::high_resolution_clock::now();
+          }
+          reqObj.f<inst_set_t::avx2>(
+              output.data(), input.data(), block, len, len);
+        }
+        end = chrono::high_resolution_clock::now();
+      } else {
+        ReQuantizeOutput<false> reqObj(
+            doNothingObj,
+            C_multiplier.data(),
+            C_zero_point,
+            Aint8_zero_point,
+            Bint8_zero_point.data(),
+            row_offset_buf.data(),
+            col_offsets.data(),
+            bias,
+            len);
+        // The first NWARMUP calls are untimed; the clock starts after them.
+        for (int i = 0; i < NWARMUP + NITER; ++i) {
+          if (i == NWARMUP) {
+            begin = chrono::high_resolution_clock::now();
+          }
+          reqObj.f<inst_set_t::avx2>(
+              output.data(), input.data(), block, len, len);
+        }
+        end = chrono::high_resolution_clock::now();
+      }
+
+      auto duration = chrono::duration_cast<chrono::nanoseconds>(end - begin);
+      cout << setw(4) << len << ", ";
+      switch (bench_type) {
+        case BenchmarkType::BARE_BONE:
+          cout << setw(10) << "bare_bone";
+          break;
+        case BenchmarkType::BIAS:
+          cout << setw(10) << "bias";
+          break;
+        case BenchmarkType::A_ASYMMETRIC:
+          cout << setw(10) << "a_asymmetric";
+          break;
+        case BenchmarkType::B_ASYMMETRIC:
+          cout << setw(10) << "b_asymmetric";
+          break;
+        case BenchmarkType::PER_CHANNEL:
+          cout << setw(10) << "per_channel";
+          break;
+      }
+      cout << ", " << setw(10) << setprecision(3)
+           << static_cast<double>(len) * NITER / duration.count() << endl;
+    } // for each bench_type
+  } // for each length
+} // performance_test
+
+int main() {
+#ifdef _OPENMP
+  // Default to a single thread unless OMP_NUM_THREADS is set and non-empty.
+  const char* const omp_env = getenv("OMP_NUM_THREADS");
+  if (!(omp_env != nullptr && omp_env[0] != '\0')) {
+    omp_set_num_threads(1);
+  }
+#endif
+  performance_test();
+  return 0;
+}
diff --git a/bench/RowOffsetBenchmark.cc b/bench/RowOffsetBenchmark.cc
new file mode 100644
index 0000000..f30911f
--- /dev/null
+++ b/bench/RowOffsetBenchmark.cc
@@ -0,0 +1,54 @@
+#include <chrono>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "BenchUtils.h"
+#include "src/OptimizedKernelsAvx2.h"
+#include "fbgemm/Fbgemm.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// Times reduceAvx2 over a range of buffer lengths and prints one row per
+// length: the length, then elements passed to reduceAvx2 per nanosecond.
+void performance_test() {
+  constexpr int NWARMUP = 4; // untimed iterations run before the clock starts
+  constexpr int NITER = 256; // timed iterations
+
+  cout << setw(4) << "len" << ", B_elements_per_sec" << endl;
+  for (int len : {1,  2,  3,  4,  5,  7,  8,   9,   15,  16,  17,
+                  31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256}) {
+    aligned_vector<uint8_t> a(len);
+    for (int warmup = 0; warmup < NWARMUP; ++warmup) {
+      reduceAvx2(a.data(), len);
+    }
+    const auto begin = chrono::high_resolution_clock::now();
+    for (int iter = 0; iter < NITER; ++iter) {
+      reduceAvx2(a.data(), len);
+    }
+    const auto end = chrono::high_resolution_clock::now();
+
+    const auto elapsed_ns =
+        chrono::duration_cast<chrono::nanoseconds>(end - begin);
+    const double elements = static_cast<double>(len) * NITER;
+    cout << setw(4) << len << ", " << setw(10) << setprecision(3)
+         << elements / elapsed_ns.count() << endl;
+  } // for each length
+} // performance_test
+
+int main() {
+#ifdef _OPENMP
+  // Default to a single thread unless OMP_NUM_THREADS is set and non-empty.
+  const char* const omp_env = getenv("OMP_NUM_THREADS");
+  if (!(omp_env != nullptr && omp_env[0] != '\0')) {
+    omp_set_num_threads(1);
+  }
+#endif
+  performance_test();
+  return 0;
+}
author	Jongsoo Park <jongsoo@fb.com>	2019-01-03 01:31:52 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-01-03 01:34:53 +0300
commit	172048ae4c0ae44de941e2f42a4a87ac15690858 (patch)
tree	94236ea13612833f63b326d311f4737ec92f4a60 /bench
parent	4321c4d96a76d4735f0869ae1adb6fa2197cf377 (diff)