Unified convolution interface

Summary: We want to combine three different convolution interfaces under one top level function. Reviewed By: protonu Differential Revision: D15399811 fbshipit-source-id: 7390616d92783506fc156f0f6017f10b5f7f8e30
author: Daya Khudia <dskhudia@fb.com> 2019-06-05 22:44:57 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-06-05 22:50:08 +0300
commit: 8197494f3ae280941639c72bc1342a9faa8e2ad6 (patch)
tree: e0f7fb9b8aad61c46b7d2a79f019255514c03985 /bench
parent: 77868418c7963572167690ef069b06cbfe67de1f (diff)
3 files changed, 311 insertions, 2 deletions
diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
new file mode 100644
index 0000000..59079c7
--- /dev/null
+++ b/bench/ConvUnifiedBenchmark.cc
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "BenchUtils.h"
+#include "fbgemm/Fbgemm.h"
+#include "src/RefImplementations.h"
+
+using namespace std;
+using namespace fbgemm;
+
+// Benchmark shapes for 2D convolutions.
+// Argument order: MB, IC, OC, {IH, IW}, G, {KH, KW}, {stride_h, stride_w},
+// {pad_h_top, pad_w_left, pad_h_bottom, pad_w_right}
+vector<conv_param_t<2>> shapes_2d = {
+    // Regular convolution (a single group)
+    conv_param_t<2>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+
+    // Groupwise convolution (32 groups, 4 channels per group)
+    conv_param_t<2>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+
+    // Depthwise convolution (G == IC == OC)
+    conv_param_t<2>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+};
+
+// 3D conv shapes. Arguments: MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW},
+// {stride_t, stride_h, stride_w},
+// {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
+vector<conv_param_t<3>> shapes_3d = {
+    conv_param_t<3>( // regular convolution (a single group)
+        1, 64, 64, {32, 56, 56}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+    conv_param_t<3>( // depthwise convolution (G == IC == OC)
+        1, 64, 64, {32, 56, 56}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+};
+
+// Benchmarks fbgemmConv on every shape in `shapes` and checks its output.
+//
+// For each shape whose IC and OC are divisible by G, this fills random uint8
+// activations and int8 weights, derives a requantized uint8 reference from
+// conv_ref, runs fbgemmConv NWARMUP + NITER times (calling llc_flush before
+// each run and timing only the last NITER), prints one CSV row ending in GOPS,
+// and compares the fbgemmConv output with the reference via compare_buffers.
+// SPATIAL_DIM is the number of spatial dimensions (main() passes 2 and 3);
+// Acc_t is not referenced anywhere in the body.
+template <int SPATIAL_DIM, typename Acc_t>
+void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
+  bool flush = true;
+  std::vector<char> llc;
+
+  if (flush) {
+    llc.resize(128 * 1024 * 1024, 1);
+  }
+
+  constexpr int NWARMUP = 4;
+  constexpr int NITER = 10;
+
+  // CSV header; the 3D-only columns are added when SPATIAL_DIM == 3.
+  string header = "MB, IC, OC, ";
+  if (SPATIAL_DIM == 3) {
+    header += "IT, ";
+  }
+  header += "IH, IW, G, ";
+  if (SPATIAL_DIM == 3) {
+    header += "KT, ";
+  }
+  header += "KH, KW, ";
+  if (SPATIAL_DIM == 3) {
+    header += "stride_t, ";
+  }
+  header += "stride_h, stride_w, ";
+  if (SPATIAL_DIM == 3) {
+    header += "pad_t, ";
+  }
+  header += "pad_h, pad_w, ";
+
+  header += "Type, M, N, K, ";
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+  cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+       << endl;
+  cout << header << "Im2Col (ms), "
+       << "Packing (ms), "
+       << "Kernel (ms), "
+       << "Postprocessing (ms), "
+       << "fbgemmPacked (ms), "
+       << "Total (ms), "
+       << "GOPS" << endl;
+#else
+  cout << setw(6) << header << setw(5) << "GOPS" << endl;
+#endif
+
+  chrono::time_point<chrono::high_resolution_clock> begin, end;
+
+  for (const auto& conv_p : shapes) {
+    if (conv_p.IC % conv_p.G != 0 || conv_p.OC % conv_p.G != 0) {
+      // invalid shapes
+      continue;
+    }
+    int im_in_dim = accumulate(
+        conv_p.IN_DIM.begin(), conv_p.IN_DIM.end(), 1, multiplies<int>());
+    aligned_vector<uint8_t> Aint8(conv_p.MB * im_in_dim * conv_p.IC);
+
+    int kernel_dim =
+        accumulate(conv_p.K.begin(), conv_p.K.end(), 1, multiplies<int>());
+    aligned_vector<int8_t> Bint8(
+        kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G));
+
+    int im_out_dim = accumulate(
+        conv_p.OUT_DIM.begin(), conv_p.OUT_DIM.end(), 1, multiplies<int>());
+    aligned_vector<int32_t> Cint32_ref(conv_p.MB * im_out_dim * conv_p.OC);
+    aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+    aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
+
+    // A matrix (input activations)
+    randFill<uint8_t>(Aint8, 0, 5);
+    int32_t Aint8_zero_point = 4;
+
+    // B matrix (weights)
+    randFill<int8_t>(Bint8, -4, 4);
+    aligned_vector<int32_t> Bint8_zero_point(1);
+    randFill(Bint8_zero_point, -3, -1);
+
+    aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+    randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2);
+    int32_t C_zero_point = 5;
+
+    // reference implementation
+    conv_ref(
+        conv_p,
+        Aint8.data(),
+        Aint8_zero_point,
+        Bint8.data(),
+        Cint32_ref.data());
+
+    // matrix dimensions after im2col
+    int MDim = conv_p.MB * im_out_dim;
+    int NDim = conv_p.OC / conv_p.G;
+    int KDim = kernel_dim * conv_p.IC;
+    int KDimPerGroup = KDim / conv_p.G;
+    int OC_per_G = conv_p.OC / conv_p.G;
+
+    // im2col'd activations, from which the reference row offsets are computed
+    vector<int32_t> row_offsets(MDim);
+    vector<uint8_t> Aint8_im2col(MDim * KDim);
+    im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+
+    // computing column offset
+    vector<int32_t> col_offsets(conv_p.OC);
+    for (int g = 0; g < conv_p.G; ++g) {
+      col_offsets_with_zero_pt_s8acc32_ref(
+          KDimPerGroup,
+          OC_per_G,
+          OC_per_G,
+          Bint8.data() + g * KDimPerGroup * OC_per_G,
+          Bint8_zero_point.data(),
+          col_offsets.data() + g * OC_per_G,
+          conv_p.OC);
+    }
+
+    // Reference requantization, one group at a time. g * NDim / conv_p.OC is 0
+    // for every g < G, so all groups share the single multiplier / zero point.
+    for (int g = 0; g < conv_p.G; ++g) {
+      row_offsets_u8acc32_ref(
+          MDim,
+          KDimPerGroup,
+          KDim,
+          Aint8_im2col.data() + g * KDimPerGroup,
+          row_offsets.data());
+
+      requantize_u8acc32_ref(
+          MDim,
+          NDim,
+          conv_p.G * NDim,
+          Cint32_ref.data() + g * NDim,
+          Cint8_ref.data() + g * NDim,
+          C_multiplier.data() + g * NDim / conv_p.OC,
+          C_zero_point,
+          Aint8_zero_point,
+          Bint8_zero_point.data() + g * NDim / conv_p.OC,
+          row_offsets.data(),
+          col_offsets.data() + g * NDim,
+          nullptr,
+          conv_p.OC);
+    }
+
+    double nops = 2.0 * static_cast<double>(NITER) * MDim * NDim * KDim;
+    double ttot = 0.0;
+    const string runType = "UniConv";
+
+    PackWeightsForConv<SPATIAL_DIM> packedB(conv_p, Bint8.data());
+
+    // output processing: TENSOR-granularity requantization built on DoNothing
+    DoNothing<> doNothingObj{};
+    ReQuantizeOutput<false, QuantizationGranularity::TENSOR> outputProcObj(
+        doNothingObj,
+        C_multiplier.data(),
+        C_zero_point,
+        Aint8_zero_point,
+        Bint8_zero_point.data(),
+        nullptr, // row offsets
+        col_offsets.data(),
+        nullptr, // bias
+        conv_p.OC,
+        conv_p.G);
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    double total_packing_time = 0.0;
+    double total_kernel_time = 0.0;
+    double total_postprocessing_time = 0.0;
+    double total_run_time = 0.0;
+#endif
+    for (auto i = 0; i < NWARMUP + NITER; ++i) {
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+      packing_time = 0.0;
+      computing_time = 0.0;
+      kernel_time = 0.0;
+      postprocessing_time = 0.0;
+      run_time = 0.0;
+#endif
+      llc_flush(llc);
+      begin = chrono::high_resolution_clock::now();
+      fbgemmConv(
+          conv_p,
+          Aint8.data(),
+          packedB,
+          Cint8_fb.data(),
+          Cint32_fb.data(),
+          outputProcObj,
+          0,
+          1);
+      end = chrono::high_resolution_clock::now();
+
+      if (i >= NWARMUP) {
+        auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
+        ttot += dur.count();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+        total_packing_time += packing_time;
+        total_kernel_time += kernel_time;
+        total_postprocessing_time += postprocessing_time;
+        total_run_time += run_time;
+#endif
+      }
+    }
+
+    cout << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC << ", ";
+    for (int i = 0; i < SPATIAL_DIM; ++i) {
+      cout << conv_p.IN_DIM[i] << ", ";
+    }
+    cout << conv_p.G << ", ";
+    for (int i = 0; i < SPATIAL_DIM; ++i) {
+      cout << conv_p.K[i] << ", ";
+    }
+    for (int i = 0; i < SPATIAL_DIM; ++i) {
+      cout << conv_p.stride[i] << ", ";
+    }
+    for (int i = 0; i < SPATIAL_DIM; ++i) {
+      cout << conv_p.pad[i] << ", ";
+    }
+
+    cout << setw(13) << runType << ", " << fixed << setw(6) << MDim << ", "
+         << setw(6) << NDim << ", " << setw(6) << KDim << ", ";
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+    cout << fixed << setprecision(6) << setw(8) << 0 << ", "
+         << total_packing_time / (double)NITER / 1e6 << ", "
+         << total_kernel_time / (double)NITER / 1e6 << ", "
+         << total_postprocessing_time / (double)NITER / 1e6 << ", "
+         << total_run_time / (double)NITER / 1e6 << ", "
+         << ttot / (double)NITER / 1e6 << ", ";
+#endif
+    cout << setprecision(2) << nops / ttot << endl;
+
+    compare_buffers(
+        Cint8_ref.data(),
+        Cint8_fb.data(),
+        MDim,
+        NDim * conv_p.G,
+        NDim * conv_p.G,
+        5);
+  } // shapes
+}
+
+int main() {
+#ifdef _OPENMP
+  // Run single-threaded unless OMP_NUM_THREADS is set to a non-empty value.
+  const char* omp_threads_env = getenv("OMP_NUM_THREADS");
+  if (omp_threads_env == nullptr || *omp_threads_env == '\0') {
+    omp_set_num_threads(1);
+  }
+#endif
+  // Benchmark the 2D shapes first, then the 3D shapes (Acc_t = int32_t).
+  performance_test<2, int32_t>(shapes_2d);
+  performance_test<3, int32_t>(shapes_3d);
+  return 0;
+}
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index c5f8ed9..0efdcac 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -20,7 +20,7 @@
 #include "AlignedVec.h"
 #include "BenchUtils.h"
 #include "fbgemm/Utils.h"
-#include "src/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
 #include "src/RefImplementations.h"
 
 using namespace std;
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 780d83c..96921a1 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -18,7 +18,7 @@
 #include "AlignedVec.h"
 #include "BenchUtils.h"
 #include "fbgemm/Utils.h"
-#include "src/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
 #include "src/RefImplementations.h"
 
 using namespace std;
author	Daya Khudia <dskhudia@fb.com>	2019-06-05 22:44:57 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-06-05 22:50:08 +0300
commit	8197494f3ae280941639c72bc1342a9faa8e2ad6 (patch)
tree	e0f7fb9b8aad61c46b7d2a79f019255514c03985 /bench
parent	77868418c7963572167690ef069b06cbfe67de1f (diff)