Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/bench
diff options
context:
space:
mode:
author Jongsoo Park <jongsoo@fb.com> 2019-01-03 01:31:52 +0300
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-01-03 01:34:53 +0300
commit 172048ae4c0ae44de941e2f42a4a87ac15690858 (patch)
tree 94236ea13612833f63b326d311f4737ec92f4a60 /bench
parent 4321c4d96a76d4735f0869ae1adb6fa2197cf377 (diff)
optimize remainder loops of requantization and rowoffset (#54)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/54 The optimizations I implemented in ver 2 don't seem to help (will remove this). It also looks like using JIT for row_offset is the right long-term solution. AVX512 has new instructions that could help the row_offset and requantization computations. Added benchmarks for the row_offset and requantization computations to make measuring their performance easier. Reviewed By: dskhudia Differential Revision: D13561062 fbshipit-source-id: f11678395c4f9e62a64874e1a0b1f8833fda779f
Diffstat (limited to 'bench')
-rw-r--r-- bench/RequantizeBenchmark.cc 145
-rw-r--r-- bench/RowOffsetBenchmark.cc 54
2 files changed, 199 insertions, 0 deletions
diff --git a/bench/RequantizeBenchmark.cc b/bench/RequantizeBenchmark.cc
new file mode 100644
index 0000000..d1430a3
--- /dev/null
+++ b/bench/RequantizeBenchmark.cc
@@ -0,0 +1,145 @@
+#include <chrono>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "BenchUtils.h"
+#include "fbgemm/Fbgemm.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// Requantization variants exercised by performance_test().
+//
+// Declaration order is significant: performance_test() compares values with
+// `<` to decide which settings apply to a variant, so the enumerators must
+// stay in this order.
+enum class BenchmarkType {
+  // The only variant that is run with a null bias pointer.
+  BARE_BONE,
+  // Adds the bias vector; A's zero point is still 0.
+  BIAS,
+  // First variant that is run with a non-zero A zero point.
+  A_ASYMMETRIC,
+  // First variant for which element 0 of B's zero points is not forced to 0.
+  B_ASYMMETRIC,
+  // Runs with QuantizationGranularity::OUT_CHANNEL.
+  PER_CHANNEL,
+};
+
+// Measures the throughput of ReQuantizeOutput::f<inst_set_t::avx2> on a
+// buffer of `len` int32 inputs, for a range of lengths and for each
+// BenchmarkType variant, and prints one "len, type, rate" line per
+// combination to stdout.
+//
+// Each measurement runs NWARMUP untimed calls followed by NITER timed calls.
+// The printed rate is len * NITER / elapsed_nanoseconds, i.e. elements per
+// nanosecond, which matches the "B_elements_per_sec" column header.
+void performance_test() {
+  constexpr int NWARMUP = 4;
+  constexpr int NITER = 256;
+  // Non-zero value written to Bint8_zero_point[0] for the variants from
+  // B_ASYMMETRIC onwards. The exact value is arbitrary; it only has to be
+  // non-zero and was picked inside the [-8, 8] range used for the random fill.
+  constexpr int32_t NONZERO_B_ZERO_POINT = -5;
+
+  cout << setw(4) << "len"
+       << ", " << setw(10) << "Type"
+       << ", B_elements_per_sec" << endl;
+
+  for (int len : {1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17,
+                  31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256}) {
+    aligned_vector<float> C_multiplier(len);
+    randFill<float>(C_multiplier, -8, 8);
+
+    aligned_vector<int32_t> Bint8_zero_point(len), row_offset_buf(len),
+        col_offsets(len), bias_vector(len), input(len);
+    randFill<int32_t>(Bint8_zero_point, -8, 8);
+    randFill<int32_t>(row_offset_buf, -8, 8);
+    randFill<int32_t>(col_offsets, -8, 8);
+    randFill<int32_t>(bias_vector, -8, 8);
+    randFill<int32_t>(input, -8, 8);
+
+    int32_t C_zero_point = -3;
+
+    // NOTE(review): presumably {row_start, row_size, col_start, col_size},
+    // i.e. a single row of `len` columns -- confirm against block_type_t.
+    block_type_t block{0, 1, 0, len};
+
+    aligned_vector<uint8_t> output(len);
+
+    chrono::time_point<chrono::high_resolution_clock> begin, end;
+
+    for (BenchmarkType bench_type : {BenchmarkType::BARE_BONE,
+                                     BenchmarkType::BIAS,
+                                     BenchmarkType::A_ASYMMETRIC,
+                                     BenchmarkType::B_ASYMMETRIC,
+                                     BenchmarkType::PER_CHANNEL}) {
+      int32_t Aint8_zero_point =
+          bench_type < BenchmarkType::A_ASYMMETRIC ? 0 : -3;
+      if (bench_type < BenchmarkType::B_ASYMMETRIC) {
+        Bint8_zero_point[0] = 0;
+      } else {
+        // Bint8_zero_point is shared by every iteration of this loop, so the
+        // 0 written for the earlier variants has to be replaced explicitly.
+        // Otherwise B_ASYMMETRIC and PER_CHANNEL would be measured with the
+        // same element-0 zero point as the variants before them. A fixed
+        // value is used because the random fill may itself have produced 0.
+        Bint8_zero_point[0] = NONZERO_B_ZERO_POINT;
+      }
+      // Only the BARE_BONE variant runs without a bias vector.
+      const int32_t* bias =
+          bench_type == BenchmarkType::BARE_BONE ? nullptr : bias_vector.data();
+
+      DoNothing<> doNothingObj{};
+      // The two branches differ only in the quantization granularity template
+      // argument of ReQuantizeOutput; the timing loops are identical.
+      if (bench_type == BenchmarkType::PER_CHANNEL) {
+        ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj(
+            doNothingObj,
+            C_multiplier.data(),
+            C_zero_point,
+            Aint8_zero_point,
+            Bint8_zero_point.data(),
+            row_offset_buf.data(),
+            col_offsets.data(),
+            bias,
+            len);
+
+        for (int i = 0; i < NWARMUP + NITER; ++i) {
+          // Start the clock once the warm-up calls are done.
+          if (i == NWARMUP) {
+            begin = chrono::high_resolution_clock::now();
+          }
+          reqObj.f<inst_set_t::avx2>(
+              output.data(), input.data(), block, len, len);
+        }
+        end = chrono::high_resolution_clock::now();
+      } else {
+        ReQuantizeOutput<false> reqObj(
+            doNothingObj,
+            C_multiplier.data(),
+            C_zero_point,
+            Aint8_zero_point,
+            Bint8_zero_point.data(),
+            row_offset_buf.data(),
+            col_offsets.data(),
+            bias,
+            len);
+
+        for (int i = 0; i < NWARMUP + NITER; ++i) {
+          // Start the clock once the warm-up calls are done.
+          if (i == NWARMUP) {
+            begin = chrono::high_resolution_clock::now();
+          }
+          reqObj.f<inst_set_t::avx2>(
+              output.data(), input.data(), block, len, len);
+        }
+        end = chrono::high_resolution_clock::now();
+      }
+
+      auto duration = chrono::duration_cast<chrono::nanoseconds>(end - begin);
+
+      cout << setw(4) << len << ", ";
+      switch (bench_type) {
+        case BenchmarkType::BARE_BONE:
+          cout << setw(10) << "bare_bone";
+          break;
+        case BenchmarkType::BIAS:
+          cout << setw(10) << "bias";
+          break;
+        case BenchmarkType::A_ASYMMETRIC:
+          cout << setw(10) << "a_asymmetric";
+          break;
+        case BenchmarkType::B_ASYMMETRIC:
+          cout << setw(10) << "b_asymmetric";
+          break;
+        case BenchmarkType::PER_CHANNEL:
+          cout << setw(10) << "per_channel";
+          break;
+      }
+      // Elements processed in the timed calls divided by elapsed nanoseconds.
+      cout << ", " << setw(10) << setprecision(3)
+           << static_cast<double>(len) * NITER / duration.count() << endl;
+    } // for each bench_type
+  } // for each length
+} // performance_test
+
+// Entry point: runs the requantization benchmark once.
+int main() {
+#ifdef _OPENMP
+  // Default to a single OpenMP thread. An explicitly set, non-empty
+  // OMP_NUM_THREADS is left in effect.
+  const char* omp_threads_env = getenv("OMP_NUM_THREADS");
+  const bool threads_set_by_user =
+      omp_threads_env != nullptr && *omp_threads_env != '\0';
+  if (!threads_set_by_user) {
+    omp_set_num_threads(1);
+  }
+#endif
+  performance_test();
+  return 0;
+}
diff --git a/bench/RowOffsetBenchmark.cc b/bench/RowOffsetBenchmark.cc
new file mode 100644
index 0000000..f30911f
--- /dev/null
+++ b/bench/RowOffsetBenchmark.cc
@@ -0,0 +1,54 @@
+#include <chrono>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "BenchUtils.h"
+#include "src/OptimizedKernelsAvx2.h"
+#include "fbgemm/Fbgemm.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// Times reduceAvx2 on uint8 buffers of various lengths and prints one
+// "len, rate" line per length to stdout.
+//
+// Each measurement runs NWARMUP untimed calls followed by NITER timed calls.
+// The printed rate is len * NITER / elapsed_nanoseconds, i.e. elements per
+// nanosecond, which matches the "B_elements_per_sec" column header.
+void performance_test() {
+  constexpr int NWARMUP = 4;
+  constexpr int NITER = 256;
+  using Clock = chrono::high_resolution_clock;
+
+  cout << setw(4) << "len"
+       << ", B_elements_per_sec" << endl;
+
+  const int lengths[] = {1,  2,  3,  4,  5,  7,   8,   9,   15,  16, 17,
+                         31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256};
+  for (int len : lengths) {
+    // The input buffer is used as constructed; it is not filled with random
+    // data, and the value returned by reduceAvx2 is discarded.
+    aligned_vector<uint8_t> a(len);
+
+    Clock::time_point start, stop;
+    const int total_calls = NWARMUP + NITER;
+    for (int call = 0; call < total_calls; ++call) {
+      // Start the clock once the warm-up calls are done.
+      if (call == NWARMUP) {
+        start = Clock::now();
+      }
+      reduceAvx2(a.data(), len);
+    }
+    stop = Clock::now();
+
+    const auto elapsed =
+        chrono::duration_cast<chrono::nanoseconds>(stop - start);
+    const double elements_per_ns =
+        static_cast<double>(len) * NITER / elapsed.count();
+
+    cout << setw(4) << len << ", " << setw(10) << setprecision(3)
+         << elements_per_ns << endl;
+  } // for each length
+} // performance_test
+
+// Entry point: runs the row-offset benchmark once.
+int main() {
+#ifdef _OPENMP
+  // Default to a single OpenMP thread. An explicitly set, non-empty
+  // OMP_NUM_THREADS is left in effect.
+  const char* omp_threads_env = getenv("OMP_NUM_THREADS");
+  const bool threads_set_by_user =
+      omp_threads_env != nullptr && *omp_threads_env != '\0';
+  if (!threads_set_by_user) {
+    omp_set_num_threads(1);
+  }
+#endif
+  performance_test();
+  return 0;
+}