Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/bench
diff options
context:
space:
mode:
authorDaya Khudia <dskhudia@fb.com>2019-06-05 22:44:57 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2019-06-05 22:50:08 +0300
commit8197494f3ae280941639c72bc1342a9faa8e2ad6 (patch)
treee0f7fb9b8aad61c46b7d2a79f019255514c03985 /bench
parent77868418c7963572167690ef069b06cbfe67de1f (diff)
Unified convolution interface
Summary: We want to combine three different convolution interfaces under one top level function. Reviewed By: protonu Differential Revision: D15399811 fbshipit-source-id: 7390616d92783506fc156f0f6017f10b5f7f8e30
Diffstat (limited to 'bench')
-rw-r--r--bench/ConvUnifiedBenchmark.cc309
-rw-r--r--bench/Depthwise3DBenchmark.cc2
-rw-r--r--bench/DepthwiseBenchmark.cc2
3 files changed, 311 insertions, 2 deletions
diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
new file mode 100644
index 0000000..59079c7
--- /dev/null
+++ b/bench/ConvUnifiedBenchmark.cc
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "BenchUtils.h"
+#include "fbgemm/Fbgemm.h"
+#include "src/RefImplementations.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// 2D conv shapes
+vector<conv_param_t<2>> shapes_2d = {
+ // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
+ // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
+ // 2D convolutions
+ // regular
+ conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ // groupwise
+ conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+
+ // DW
+ conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+};
+
+// 3D conv shapes
+vector<conv_param_t<3>> shapes_3d = {
+ // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, stride_w},
+ // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
+ // Regular
+ conv_param_t<3>(1, 64, 64, {32, 56, 56}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+ // Depthwise
+ conv_param_t<3>(1, 64, 64, {32, 56, 56}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1})
+};
+
+template <int SPATIAL_DIM, typename Acc_t>
+void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
+ bool flush = true;
+ std::vector<char> llc;
+
+ if (flush) {
+ llc.resize(128 * 1024 * 1024, 1.0);
+ }
+
+ constexpr int NWARMUP = 4;
+ constexpr int NITER = 10;
+
+ string header = "MB, IC, OC, ";
+ if (SPATIAL_DIM == 3) {
+ header += "IT, ";
+ }
+ header += "IH, IW, G, ";
+ if (SPATIAL_DIM == 3) {
+ header += "KT, ";
+ }
+ header += "KH, KW, ";
+ if (SPATIAL_DIM == 3) {
+ header += "stride_t, ";
+ }
+ header += "stride_h, stride_w, ";
+ if (SPATIAL_DIM == 3) {
+ header += "pad_t, ";
+ }
+ header += "pad_h, pad_w, ";
+
+ header += "Type, M, N, K, ";
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+ << endl;
+ cout << header << "Im2Col (ms), "
+ << "Packing (ms), "
+ << "Kernel (ms), "
+ << "Postprocessing (ms), "
+ << "fbgemmPacked (ms), "
+ << "Total (ms), "
+ << "GOPS" << endl;
+#else
+ cout << setw(6) << header << setw(5) << "GOPS" << endl;
+#endif
+
+ chrono::time_point<chrono::high_resolution_clock> begin, end;
+
+ for (auto conv_p : shapes) {
+ if (conv_p.IC % conv_p.G != 0 || conv_p.OC % conv_p.G != 0) {
+ // invalid shapes
+ continue;
+ }
+ int im_in_dim = accumulate(
+ conv_p.IN_DIM.begin(), conv_p.IN_DIM.end(), 1, multiplies<int>());
+ aligned_vector<uint8_t> Aint8(conv_p.MB * im_in_dim * conv_p.IC);
+
+ int kernel_dim =
+ accumulate(conv_p.K.begin(), conv_p.K.end(), 1, multiplies<int>());
+ aligned_vector<int8_t> Bint8(
+ kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G));
+
+ int im_out_dim = accumulate(
+ conv_p.OUT_DIM.begin(), conv_p.OUT_DIM.end(), 1, multiplies<int>());
+ aligned_vector<int32_t> Cint32_ref(conv_p.MB * im_out_dim * conv_p.OC);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb2(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
+
+ // A matrix (input activations)
+ randFill<uint8_t>(Aint8, 0, 5);
+ int32_t Aint8_zero_point = 4;
+
+ // B matrix (weights)
+ randFill<int8_t>(Bint8, -4, 4);
+ aligned_vector<int32_t> Bint8_zero_point(1);
+ randFill(Bint8_zero_point, -3, -1);
+
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2);
+ int32_t C_zero_point = 5;
+
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
+
+ // reference implementation
+ conv_ref(
+ conv_p,
+ Aint8.data(),
+ Aint8_zero_point,
+ Bint8.data(),
+ Cint32_ref.data());
+
+ // matrix dimensions after im2col
+ int MDim = conv_p.MB * im_out_dim;
+ int NDim = conv_p.OC / conv_p.G;
+ int KDim = kernel_dim * conv_p.IC;
+ int KDimPerGroup = KDim / conv_p.G;
+
+ int OC_per_G = conv_p.OC / conv_p.G;
+
+ // computing row offset
+ vector<int32_t> row_offsets(MDim);
+ vector<uint8_t> Aint8_im2col(MDim * KDim);
+ im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+
+ // computing column offset
+ vector<int32_t> col_offsets(conv_p.OC);
+ for (int g = 0; g < conv_p.G; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ KDimPerGroup,
+ OC_per_G,
+ OC_per_G,
+ Bint8.data() + g * KDimPerGroup * OC_per_G,
+ Bint8_zero_point.data(),
+ col_offsets.data() + g * OC_per_G,
+ conv_p.OC);
+ }
+
+ for (int g = 0; g < conv_p.G; ++g) {
+ row_offsets_u8acc32_ref(
+ MDim,
+ KDimPerGroup,
+ KDim,
+ Aint8_im2col.data() + g * KDimPerGroup,
+ row_offsets.data());
+
+ requantize_u8acc32_ref(
+ MDim,
+ NDim,
+ conv_p.G * NDim,
+ Cint32_ref.data() + g * NDim,
+ Cint8_ref.data() + g * NDim,
+ C_multiplier.data() + g * NDim / conv_p.OC,
+ C_zero_point,
+ Aint8_zero_point,
+ Bint8_zero_point.data() + g * NDim / conv_p.OC,
+ row_offsets.data(),
+ col_offsets.data() + g * NDim,
+ nullptr,
+ conv_p.OC);
+ }
+
+ double nops = 2.0 * static_cast<double>(NITER) * MDim * NDim * KDim;
+ double ttot = 0.0;
+ string runType;
+
+ PackWeightsForConv<SPATIAL_DIM> packedB(conv_p, Bint8.data());
+
+ // no-op output process objects
+ DoNothing<> doNothingObj{};
+ ReQuantizeOutput<false, QuantizationGranularity::TENSOR> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_point,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ nullptr, // row offsets
+ col_offsets.data(),
+ nullptr, // bias
+ conv_p.OC,
+ conv_p.G);
+
+ runType = "UniConv";
+ ttot = 0;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ double im2col_time = 0.0;
+ double total_im2col_time = 0.0;
+ double total_packing_time = 0.0;
+ double total_computing_time = 0.0;
+ double total_kernel_time = 0.0;
+ double total_postprocessing_time = 0.0;
+ double total_run_time = 0.0;
+#endif
+ for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ packing_time = 0.0;
+ computing_time = 0.0;
+ kernel_time = 0.0;
+ postprocessing_time = 0.0;
+ run_time = 0.0;
+#endif
+ llc_flush(llc);
+ begin = chrono::high_resolution_clock::now();
+ fbgemmConv(
+ conv_p,
+ Aint8.data(),
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ outputProcObj,
+ 0,
+ 1);
+ end = chrono::high_resolution_clock::now();
+
+ if (i >= NWARMUP) {
+ auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
+ ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ total_packing_time += packing_time;
+ total_computing_time += computing_time;
+ total_kernel_time += kernel_time;
+ total_postprocessing_time += postprocessing_time;
+ total_run_time += run_time;
+#endif
+ }
+ }
+
+ cout << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC << ", ";
+ for (int i = 0; i < SPATIAL_DIM; ++i) {
+ cout << conv_p.IN_DIM[i] << ", ";
+ }
+ cout << conv_p.G << ", ";
+ for (int i = 0; i < SPATIAL_DIM; ++i) {
+ cout << conv_p.K[i] << ", ";
+ }
+ for (int i = 0; i < SPATIAL_DIM; ++i) {
+ cout << conv_p.stride[i] << ", ";
+ }
+ for (int i = 0; i < SPATIAL_DIM; ++i) {
+ cout << conv_p.pad[i] << ", ";
+ }
+
+ cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
+ << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
+ << KDim << ", ";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << fixed << setprecision(6) << setw(8) << 0 << ", "
+ << total_packing_time / (double)NITER / 1e6 << ", "
+ << total_kernel_time / (double)NITER / 1e6 << ", "
+ << total_postprocessing_time / (double)NITER / 1e6 << ", "
+ << total_run_time / (double)NITER / 1e6 << ", "
+ << ttot / (double)NITER / 1e6 << ", ";
+#endif
+ cout << setprecision(2) << nops / ttot << endl;
+
+ compare_buffers(
+ Cint8_ref.data(),
+ Cint8_fb.data(),
+ MDim,
+ NDim * conv_p.G,
+ NDim * conv_p.G,
+ 5);
+ } // shapes
+}
+
+int main() {
+#ifdef _OPENMP
+ // Use 1 thread unless OMP_NUM_THREADS is explicit set.
+ const char* val = getenv("OMP_NUM_THREADS");
+ if (val == nullptr || !*val) {
+ omp_set_num_threads(1);
+ }
+#endif
+ // performance_test<int16_t>();
+ performance_test<2, int32_t>(shapes_2d);
+ performance_test<3, int32_t>(shapes_3d);
+ return 0;
+}
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index c5f8ed9..0efdcac 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -20,7 +20,7 @@
#include "AlignedVec.h"
#include "BenchUtils.h"
#include "fbgemm/Utils.h"
-#include "src/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
#include "src/RefImplementations.h"
using namespace std;
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 780d83c..96921a1 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -18,7 +18,7 @@
#include "AlignedVec.h"
#include "BenchUtils.h"
#include "fbgemm/Utils.h"
-#include "src/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
#include "src/RefImplementations.h"
using namespace std;