Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: dskhudia <dskhudia@fb.com> 2018-11-03 21:05:52 +0300
committer: dskhudia <dskhudia@fb.com> 2018-11-03 21:05:52 +0300
commit: 505eb847185c9255526813dd39edadcd4e61d8e0 (patch)
tree: 480e3b4f3125d8b0f39047464ae7a7b233585f49 /bench/Im2ColFusedRequantizeAcc16Benchmark.cc
parent: e85b5a12254fa47ca6b56236489253a68fd32104 (diff)
Manually syncing with internal copy
Diffstat (limited to 'bench/Im2ColFusedRequantizeAcc16Benchmark.cc')
-rw-r--r-- bench/Im2ColFusedRequantizeAcc16Benchmark.cc 230
1 files changed, 183 insertions, 47 deletions
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 62010ec..c24f6fa 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -7,6 +7,7 @@
#include <algorithm>
#include <chrono>
#include <cmath>
+#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
@@ -29,44 +30,44 @@ void performance_test() {
conv_param_t(1, 32, 32, 14, 14, 1, 3, 3, 1, 1, 1, 1),
conv_param_t(2, 32, 32, 14, 14, 1, 3, 3, 1, 1, 0, 0),
conv_param_t(2, 32, 32, 14, 14, 1, 3, 3, 1, 1, 1, 1),
- conv_param_t( 1, 272, 272, 47, 125, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 64, 125, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 66, 125, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 67, 100, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 75, 75, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 75, 76, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 75, 100, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 94, 75, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 272, 272, 109, 75, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 24, 63, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 33, 63, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 34, 50, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 36, 63, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 38, 38, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 38, 40, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 544, 544, 47, 38, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 1088, 1088, 7, 7, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 51, 1088, 1088, 7, 7, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 100, 1088, 1088, 7, 7, 1, 3, 3, 1, 1, 1, 1 ),
- conv_param_t( 1, 248, 248, 93, 250, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 128, 250, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 133, 200, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 150, 150, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 150, 151, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 150, 158, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 188, 150, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 248, 248, 225, 150, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 47, 125, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 64, 125, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 66, 125, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 67, 100, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 75, 75, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 75, 76, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 272, 272, 94, 75, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 544, 544, 14, 14, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 51, 544, 544, 14, 14, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 100, 544, 544, 14, 14, 1, 3, 3, 2, 2, 1, 1 ),
- conv_param_t( 1, 8, 8, 4, 4, 1, 3, 3, 1, 1, 1, 1 ),
+ conv_param_t(1, 272, 272, 47, 125, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 64, 125, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 66, 125, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 67, 100, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 75, 75, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 75, 76, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 75, 100, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 94, 75, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 272, 272, 109, 75, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 24, 63, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 33, 63, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 34, 50, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 36, 63, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 38, 38, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 38, 40, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 544, 544, 47, 38, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 1088, 1088, 7, 7, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(51, 1088, 1088, 7, 7, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(100, 1088, 1088, 7, 7, 1, 3, 3, 1, 1, 1, 1),
+ conv_param_t(1, 248, 248, 93, 250, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 128, 250, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 133, 200, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 150, 150, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 150, 151, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 150, 158, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 188, 150, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 248, 248, 225, 150, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 47, 125, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 64, 125, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 66, 125, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 67, 100, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 75, 75, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 75, 76, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 272, 272, 94, 75, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 544, 544, 14, 14, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(51, 544, 544, 14, 14, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(100, 544, 544, 14, 14, 1, 3, 3, 2, 2, 1, 1),
+ conv_param_t(1, 8, 8, 4, 4, 1, 3, 3, 1, 1, 1, 1),
};
bool flush = true;
@@ -79,6 +80,49 @@ void performance_test() {
constexpr int NWARMUP = 4;
constexpr int NITER = 10;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+ << endl;
+ cout << "MB, "
+ << "IC, "
+ << "OC, "
+ << "IH, "
+ << "IW, "
+ << "KH, "
+ << "KW, "
+ << "stride_h, "
+ << "stride_w, "
+ << "pad_h, "
+ << "pad_w, "
+ << "Type, "
+ << "M, "
+ << "N, "
+ << "K, "
+ << "Im2Col (ms), "
+ << "Packing (ms), "
+ << "Kernel (ms), "
+ << "Postprocessing (ms), "
+ << "fbgemmPacked (ms), "
+ << "Total (ms), "
+ << "GOPS" << endl;
+#else
+ cout << setw(8) << "MB, "
+ << "IC, "
+ << "OC, "
+ << "IH, "
+ << "IW, "
+ << "KH, "
+ << "KW, "
+ << "stride_h, "
+ << "stride_w, "
+ << "pad_h, "
+ << "pad_w, "
+ << "Type, "
+ << "M, "
+ << "N, "
+ << "K, " << setw(5) << "GOPS" << endl;
+#endif
+
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
aligned_vector<float> Afp32(
@@ -96,7 +140,7 @@ void performance_test() {
conv_p.KH * conv_p.KW * conv_p.IC * conv_p.OC, 0);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OH * conv_p.OW * conv_p.OC, 0.0f);
+ conv_p.MB * conv_p.OH * conv_p.OW * conv_p.OC, 0);
aligned_vector<int32_t> Cint32_fb(
conv_p.MB * conv_p.OH * conv_p.OW * conv_p.OC, 0);
@@ -104,8 +148,6 @@ void performance_test() {
aligned_vector<int32_t> Cint32_fb2(
conv_p.MB * conv_p.OH * conv_p.OW * conv_p.OC, 0);
- cout << conv_p.toString() << endl;
-
// A matrix (input activations)
randFill(Afp32, 0, 5);
int32_t Aint8_zero_point = 4;
@@ -137,7 +179,9 @@ void performance_test() {
// "B unpacked");
// packedB.printPackedMatrix("B Packed");
- double ttot = 0;
+ double nops = 2.0 * static_cast<double>(NITER) * MDim * NDim * KDim;
+ double ttot = 0.0;
+ string runType;
vector<int32_t> row_offset_buf;
row_offset_buf.resize(
@@ -153,8 +197,25 @@ void performance_test() {
DoNothing<int32_t, int32_t> doNothing32BitObj;
memCopy<> memcopyObj(doNothing32BitObj);
+ runType = "FusedIm2Col";
ttot = 0;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ double im2col_time = 0.0;
+ double total_im2col_time = 0.0;
+ double total_packing_time = 0.0;
+ double total_computing_time = 0.0;
+ double total_kernel_time = 0.0;
+ double total_postprocessing_time = 0.0;
+ double total_run_time = 0.0;
+#endif
for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ packing_time = 0.0;
+ computing_time = 0.0;
+ kernel_time = 0.0;
+ postprocessing_time = 0.0;
+ run_time = 0.0;
+#endif
llc_flush(llc);
begin = chrono::high_resolution_clock::now();
fbgemmPacked(
@@ -171,20 +232,69 @@ void performance_test() {
if (i >= NWARMUP) {
auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ total_packing_time += packing_time;
+ total_computing_time += computing_time;
+ total_kernel_time += kernel_time;
+ total_postprocessing_time += postprocessing_time;
+ total_run_time += run_time;
+#endif
}
}
- cout << fixed << "fused im2col GOPs: "
- << static_cast<double>(NITER) * 2 * MDim * NDim * KDim / ttot << endl;
+
+ cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC
+ << ", " << conv_p.IH << ", " << conv_p.IW << ", " << conv_p.G << ", "
+ << conv_p.KH << ", " << conv_p.KW << ", " << conv_p.stride_h << ", "
+ << conv_p.stride_w << ", " << conv_p.pad_h << ", " << conv_p.pad_w
+ << ", ";
+
+ cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
+ << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
+ << KDim << ", ";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << fixed << setprecision(6) << setw(8) << 0 << ", "
+ << total_packing_time / (double)NITER / 1e6 << ", "
+ << total_kernel_time / (double)NITER / 1e6 << ", "
+ << total_postprocessing_time / (double)NITER / 1e6 << ", "
+ << total_run_time / (double)NITER / 1e6 << ", "
+ << ttot / (double)NITER / 1e6 << ", ";
+#endif
+ cout << setprecision(2) << nops / ttot << endl;
compare_buffers(Cint32_ref.data(), Cint32_fb.data(), MDim, NDim, NDim, 5);
+ runType = "UnfusedIm2Col";
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
ttot = 0;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ total_im2col_time = 0.0;
+ total_packing_time = 0.0;
+ total_computing_time = 0.0;
+ total_kernel_time = 0.0;
+ total_postprocessing_time = 0.0;
+ total_run_time = 0.0;
+#endif
for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ im2col_time = 0.0;
+ packing_time = 0.0;
+ computing_time = 0.0;
+ kernel_time = 0.0;
+ postprocessing_time = 0.0;
+ run_time = 0.0;
+#endif
llc_flush(llc);
begin = chrono::high_resolution_clock::now();
im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_out.data());
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ end = chrono::high_resolution_clock::now();
+ im2col_time =
+ chrono::duration_cast<chrono::nanoseconds>(end - begin).count();
+#endif
+
// printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, KDim,
// KDim, "A_out after im2col unpacked");
@@ -213,8 +323,17 @@ void performance_test() {
if (i >= NWARMUP) {
auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ total_im2col_time += im2col_time;
+ total_packing_time += packing_time;
+ total_computing_time += computing_time;
+ total_kernel_time += kernel_time;
+ total_postprocessing_time += postprocessing_time;
+ total_run_time += run_time;
+#endif
}
}
+
((volatile char*)(llc.data()));
// packedB.printPackedMatrix("bench B Packed");
@@ -225,9 +344,26 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_ref.data(), MDim, NDim, NDim, "C ref fp32");
- cout << fixed << "unfused im2col GOPs: "
- << static_cast<double>(NITER) * 2 * MDim * NDim * KDim / ttot << endl;
- // cout << "total time: " << ttot << " ns" << endl;
+ cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC
+ << ", " << conv_p.IH << ", " << conv_p.IW << ", " << conv_p.G << ", "
+ << conv_p.KH << ", " << conv_p.KW << ", " << conv_p.stride_h << ", "
+ << conv_p.stride_w << ", " << conv_p.pad_h << ", " << conv_p.pad_w
+ << ", ";
+
+ cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
+ << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
+ << KDim << ", ";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << fixed << setprecision(6) << setw(8)
+ << total_im2col_time / (double)NITER / 1e6 << ", "
+ << total_packing_time / (double)NITER / 1e6 << ", "
+ << total_kernel_time / (double)NITER / 1e6 << ", "
+ << total_postprocessing_time / (double)NITER / 1e6 << ", "
+ << total_run_time / (double)NITER / 1e6 << ", "
+ << ttot / (double)NITER / 1e6 << ", ";
+#endif
+ cout << setprecision(2) << nops / ttot << endl;
+
compare_buffers(Cint32_ref.data(), Cint32_fb2.data(), MDim, NDim, NDim, 5);
} // shapes
}