diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-01-03 01:31:52 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-01-03 01:34:53 +0300 |
commit | 172048ae4c0ae44de941e2f42a4a87ac15690858 (patch) | |
tree | 94236ea13612833f63b326d311f4737ec92f4a60 /bench | |
parent | 4321c4d96a76d4735f0869ae1adb6fa2197cf377 (diff) |
optimize remainder loops of requantization and rowoffset (#54)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/54
The optimizations I implemented in ver 2 don't seem to help (will remove them). It looks like also using JIT for row_offset is the right long-term solution. AVX512 has new instructions that could help the row_offset and requantization computations.
Added benchmarks for row_offset and requantization computation to make measuring their performance easier.
Reviewed By: dskhudia
Differential Revision: D13561062
fbshipit-source-id: f11678395c4f9e62a64874e1a0b1f8833fda779f
Diffstat (limited to 'bench')
-rw-r--r-- | bench/RequantizeBenchmark.cc | 145 | ||||
-rw-r--r-- | bench/RowOffsetBenchmark.cc | 54 |
2 files changed, 199 insertions, 0 deletions
diff --git a/bench/RequantizeBenchmark.cc b/bench/RequantizeBenchmark.cc new file mode 100644 index 0000000..d1430a3 --- /dev/null +++ b/bench/RequantizeBenchmark.cc @@ -0,0 +1,145 @@ +#include <chrono> +#include <initializer_list> +#include <iomanip> +#include <iostream> + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "BenchUtils.h" +#include "fbgemm/Fbgemm.h" + +using namespace std; +using namespace fbgemm; + +enum class BenchmarkType { + BARE_BONE, + BIAS, + A_ASYMMETRIC, + B_ASYMMETRIC, + PER_CHANNEL, +}; + +void performance_test() { + constexpr int NWARMUP = 4; + constexpr int NITER = 256; + + cout << setw(4) << "len" + << ", " << setw(10) << "Type" + << ", B_elements_per_sec" << endl; + + for (int len : {1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, + 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256}) { + aligned_vector<float> C_multiplier(len); + randFill<float>(C_multiplier, -8, 8); + + aligned_vector<int32_t> Bint8_zero_point(len), row_offset_buf(len), + col_offsets(len), bias_vector(len), input(len); + randFill<int32_t>(Bint8_zero_point, -8, 8); + randFill<int32_t>(row_offset_buf, -8, 8); + randFill<int32_t>(col_offsets, -8, 8); + randFill<int32_t>(bias_vector, -8, 8); + randFill<int32_t>(input, -8, 8); + + int32_t C_zero_point = -3; + + block_type_t block{0, 1, 0, len}; + + aligned_vector<uint8_t> output(len); + + chrono::time_point<chrono::high_resolution_clock> begin, end; + + for (BenchmarkType bench_type : {BenchmarkType::BARE_BONE, + BenchmarkType::BIAS, + BenchmarkType::A_ASYMMETRIC, + BenchmarkType::B_ASYMMETRIC, + BenchmarkType::PER_CHANNEL}) { + int32_t Aint8_zero_point = + bench_type < BenchmarkType::A_ASYMMETRIC ? 0 : -3; + if (bench_type < BenchmarkType::B_ASYMMETRIC) { + Bint8_zero_point[0] = 0; + } + const int32_t* bias = + bench_type == BenchmarkType::BARE_BONE ? 
nullptr : bias_vector.data(); + + DoNothing<> doNothingObj{}; + if (bench_type == BenchmarkType::PER_CHANNEL) { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_point, + Aint8_zero_point, + Bint8_zero_point.data(), + row_offset_buf.data(), + col_offsets.data(), + bias, + len); + + for (int i = 0; i < NWARMUP + NITER; ++i) { + if (i == NWARMUP) { + begin = chrono::high_resolution_clock::now(); + } + reqObj.f<inst_set_t::avx2>( + output.data(), input.data(), block, len, len); + } + end = chrono::high_resolution_clock::now(); + } else { + ReQuantizeOutput<false> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_point, + Aint8_zero_point, + Bint8_zero_point.data(), + row_offset_buf.data(), + col_offsets.data(), + bias, + len); + + for (int i = 0; i < NWARMUP + NITER; ++i) { + if (i == NWARMUP) { + begin = chrono::high_resolution_clock::now(); + } + reqObj.f<inst_set_t::avx2>( + output.data(), input.data(), block, len, len); + } + end = chrono::high_resolution_clock::now(); + } + + auto duration = chrono::duration_cast<chrono::nanoseconds>(end - begin); + + cout << setw(4) << len << ", "; + switch (bench_type) { + case BenchmarkType::BARE_BONE: + cout << setw(10) << "bare_bone"; + break; + case BenchmarkType::BIAS: + cout << setw(10) << "bias"; + break; + case BenchmarkType::A_ASYMMETRIC: + cout << setw(10) << "a_asymmetric"; + break; + case BenchmarkType::B_ASYMMETRIC: + cout << setw(10) << "b_asymmetric"; + break; + case BenchmarkType::PER_CHANNEL: + cout << setw(10) << "per_channel"; + break; + } + cout << ", " << setw(10) << setprecision(3) + << static_cast<double>(len) * NITER / duration.count() << endl; + } // for each bench_type + } // for each length +} // performance_test + +int main() { +#ifdef _OPENMP + // Use 1 thread unless OMP_NUM_THREADS is explicit set. 
+ const char* val = getenv("OMP_NUM_THREADS"); + if (val == nullptr || !*val) { + omp_set_num_threads(1); + } +#endif + performance_test(); + return 0; +} diff --git a/bench/RowOffsetBenchmark.cc b/bench/RowOffsetBenchmark.cc new file mode 100644 index 0000000..f30911f --- /dev/null +++ b/bench/RowOffsetBenchmark.cc @@ -0,0 +1,54 @@ +#include <chrono> +#include <initializer_list> +#include <iomanip> +#include <iostream> + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "BenchUtils.h" +#include "src/OptimizedKernelsAvx2.h" +#include "fbgemm/Fbgemm.h" + +using namespace std; +using namespace fbgemm; + +void performance_test() { + constexpr int NWARMUP = 4; + constexpr int NITER = 256; + + cout << setw(4) << "len" + << ", B_elements_per_sec" << endl; + + for (int len : {1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, + 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256}) { + aligned_vector<uint8_t> a(len); + + chrono::time_point<chrono::high_resolution_clock> begin, end; + for (int i = 0; i < NWARMUP + NITER; ++i) { + if (i == NWARMUP) { + begin = chrono::high_resolution_clock::now(); + } + reduceAvx2(a.data(), len); + } + end = chrono::high_resolution_clock::now(); + + auto duration = chrono::duration_cast<chrono::nanoseconds>(end - begin); + + cout << setw(4) << len << ", " << setw(10) << setprecision(3) + << static_cast<double>(len) * NITER / duration.count() << endl; + } // for each length +} // performance_test + +int main() { +#ifdef _OPENMP + // Use 1 thread unless OMP_NUM_THREADS is explicit set. + const char* val = getenv("OMP_NUM_THREADS"); + if (val == nullptr || !*val) { + omp_set_num_threads(1); + } +#endif + performance_test(); + return 0; +} |