Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/bench
diff options
context:
space:
mode:
authorJianyu Huang <jianyuhuang@fb.com>2018-11-20 10:31:33 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2018-11-20 10:33:34 +0300
commit282afa288cf45f75cfc38010d07ce77c081729c9 (patch)
tree4b2c4c1164159e4d77ca11d9aef08c3365b5a367 /bench
parent7346431f9a368573257587dea8b5c74c21c609b9 (diff)
Parallelize the benchmark
Summary: Add omp parallel to parallelize the benchmark Reviewed By: jspark1105 Differential Revision: D13106978 fbshipit-source-id: cdc8ce3db86d38745487ac0cafa5bd656f182604
Diffstat (limited to 'bench')
-rw-r--r--bench/PackedRequantizeAcc16Benchmark.cc267
-rw-r--r--bench/PackedRequantizeAcc32Benchmark.cc90
2 files changed, 206 insertions, 151 deletions
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index 367aa32..de15cce 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -37,18 +37,34 @@ enum class BenchmarkType {
void performance_test() {
vector<vector<int>> shapes = {
- // m, n, k
- {64, 68, 17}, {60, 128, 64}, {25088, 256, 64},
- {25088, 64, 64}, {25088, 64, 576}, {25088, 64, 256},
-
- {6272, 512, 256}, {6272, 128, 256}, {6272, 128, 1152},
- {6272, 512, 128}, {6272, 128, 512},
-
- {1568, 1024, 512}, {1568, 256, 512}, {1568, 256, 2304},
- {1568, 1024, 256}, {1568, 256, 1024},
-
- {392, 2048, 1024}, {392, 512, 1024}, {392, 512, 4608},
- {392, 2048, 512}, {392, 512, 2048},
+ // NOTE: clang-format wants to use a different formatting but the current
+ // formatting should be easier to read.
+ // m, n, k
+ {64, 68, 17},
+ {60, 128, 64},
+
+ {25088, 256, 64},
+ {25088, 64, 64},
+ {25088, 64, 576},
+ {25088, 64, 256},
+
+ {6272, 512, 256},
+ {6272, 128, 256},
+ {6272, 128, 1152},
+ {6272, 512, 128},
+ {6272, 128, 512},
+
+ {1568, 1024, 512},
+ {1568, 256, 512},
+ {1568, 256, 2304},
+ {1568, 1024, 256},
+ {1568, 256, 1024},
+
+ {392, 2048, 1024},
+ {392, 512, 1024},
+ {392, 512, 4608},
+ {392, 2048, 512},
+ {392, 512, 2048},
};
bool flush = true;
std::vector<char> llc;
@@ -227,56 +243,9 @@ void performance_test() {
col_offsets.data(),
nullptr); // bias
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
-
- PackAMatrix<uint8_t, int16_t> packA(
- matrix_op_t::NoTranspose,
- m,
- k,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point);
- PackAWithRowOffset<uint8_t, int16_t> packAWithRowOffset(
- matrix_op_t::NoTranspose,
- m,
- k,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
- // no-op output process objects
- DoNothing<int32_t, int32_t> doNothing32BitObj;
- memCopy<> memcopyObj(doNothing32BitObj);
-
- // spmdm -> requantization -> nothing
- // construct an output processing pipeline in reverse order
- // i.e. last output operation first
- // Last operation should always be DoNothing with
- // correct input and output type.
- DoNothing<> doNothingObj{};
- // Requantization back to int8
- ReQuantizeOutput<false> reqObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- bench_type == BenchmarkType::REQUANTIZATION
- ? nullptr
- : packAWithRowOffset.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
CompressedSparseColumn B_csc(k, n);
float density = 0.001f;
@@ -310,16 +279,6 @@ void performance_test() {
}
B_csc.ColPtr()[n] = total_nnz;
- // the top most (first) operation in the output processing
- // pipeline is spmdm
- // outType = final output type after fullly processing through pipeline
- // inType = initial input type at the first call to the whole pipeline
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>
- spmdmObj(reqObj, Aint8.data(), k, B_csc);
-
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_mkl.data(), m, n, n, "C mkl");
ttot = 0;
@@ -334,52 +293,125 @@ void performance_test() {
#endif
llc_flush(llc);
begin = chrono::high_resolution_clock::now();
- switch (bench_type) {
- case BenchmarkType::BARE_BONE:
- fbgemmPacked(
- packA,
- packedB,
- Cint32_fb.data(),
- Cint32_fb.data(),
- n,
- memcopyObj,
- 0,
- 1);
- break;
- case BenchmarkType::REQUANTIZATION:
- fbgemmPacked(
- packA,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- reqObj,
- 0,
- 1);
- break;
- case BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION:
- fbgemmPacked(
- packAWithRowOffset,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- reqObj,
- 0,
- 1);
- break;
- case BenchmarkType::EVERYTHING:
- fbgemmPacked(
- packAWithRowOffset,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- spmdmObj,
- 0,
- 1);
- break;
- };
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+ {
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
+
+ PackAMatrix<uint8_t, int16_t> packA(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ 1,
+ Aint8_zero_point);
+ PackAWithRowOffset<uint8_t, int16_t> packAWithRowOffset(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ 1,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ // no-op output process objects
+ DoNothing<int32_t, int32_t> doNothing32BitObj;
+ memCopy<> memcopyObj(doNothing32BitObj);
+
+ // spmdm -> requantization -> nothing
+ // construct an output processing pipeline in reverse order
+ // i.e. last output operation first
+ // Last operation should always be DoNothing with
+ // correct input and output type.
+ DoNothing<> doNothingObj{};
+ // Requantization back to int8
+ ReQuantizeOutput<false> reqObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ bench_type == BenchmarkType::REQUANTIZATION
+ ? nullptr
+ : packAWithRowOffset.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+
+ // the top most (first) operation in the output processing
+ // pipeline is spmdm
+      // outType = final output type after fully processing through
+ // pipeline; inType = initial input type at the first call to the
+ // whole pipeline
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc);
+
+#ifdef _OPENMP
+ int num_threads = omp_get_num_threads();
+ int tid = omp_get_thread_num();
+#else
+ int num_threads = 1;
+ int tid = 0;
+#endif
+ // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
+ switch (bench_type) {
+ case BenchmarkType::BARE_BONE:
+ fbgemmPacked(
+ packA,
+ packedB,
+ Cint32_fb.data(),
+ Cint32_fb.data(),
+ n,
+ memcopyObj,
+ tid,
+ num_threads);
+ break;
+ case BenchmarkType::REQUANTIZATION:
+ fbgemmPacked(
+ packA,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ n,
+ reqObj,
+ tid,
+ num_threads);
+ break;
+ case BenchmarkType::ROW_OFFSET_AND_REQUANTIZATION:
+ fbgemmPacked(
+ packAWithRowOffset,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ n,
+ reqObj,
+ tid,
+ num_threads);
+ break;
+ case BenchmarkType::EVERYTHING:
+ fbgemmPacked(
+ packAWithRowOffset,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ n,
+ spmdmObj,
+ tid,
+ num_threads);
+ break;
+ };
+ }
+
end = chrono::high_resolution_clock::now();
if (i >= NWARMUP) {
@@ -432,7 +464,10 @@ void performance_test() {
int main() {
#ifdef _OPENMP
- omp_set_num_threads(1);
+ const char* val = getenv("OMP_NUM_THREADS");
+ if (val == nullptr || !*val) {
+ omp_set_num_threads(1);
+ }
#endif
performance_test();
return 0;
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index 84d7c24..0e98234 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -31,6 +31,7 @@ void performance_test() {
vector<vector<int>> shapes = {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
+ // m, n, k
{156800, 4, 36},
{156800, 8, 36},
{156800, 16, 36},
@@ -214,20 +215,6 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
// offsets before");
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
PackBMatrix<int8_t> packedBN(
matrix_op_t::NoTranspose,
k,
@@ -238,17 +225,6 @@ void performance_test() {
1,
Bint8_zero_point);
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
ttot = 0.0;
runType = "FBGEMM_i8_acc32";
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
@@ -268,15 +244,56 @@ void performance_test() {
#endif
llc_flush(llc);
start = chrono::high_resolution_clock::now();
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- n,
- outputProcObj,
- 0,
- 1);
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+ {
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
+
+ PackAWithRowOffset<uint8_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ 1,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ DoNothing<> doNothingObj{};
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+
+#ifdef _OPENMP
+ int num_threads = omp_get_num_threads();
+ int tid = omp_get_thread_num();
+#else
+ int num_threads = 1;
+ int tid = 0;
+#endif
+ // printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
+
end = chrono::high_resolution_clock::now();
if (i >= NWARMUP) {
@@ -324,7 +341,10 @@ void performance_test() {
int main(int /* unused */, char** /* unused */) {
#ifdef _OPENMP
- omp_set_num_threads(1);
+ const char* val = getenv("OMP_NUM_THREADS");
+ if (val == nullptr || !*val) {
+ omp_set_num_threads(1);
+ }
#endif
performance_test();
return 0;