github.com/marian-nmt/FBGEMM.git
author     Daya S Khudia <dskhudia@fb.com>  2018-10-13 01:48:13 +0300
committer  Daya S Khudia <dskhudia@fb.com>  2018-10-31 00:56:00 +0300
commit     e85b5a12254fa47ca6b56236489253a68fd32104 (patch)
tree       d62190c53913c65e136fb26dc89bfab38144e2c3 /bench/I8SpmdmBenchmark.cc
Initial commit
Diffstat (limited to 'bench/I8SpmdmBenchmark.cc')
-rw-r--r--  bench/I8SpmdmBenchmark.cc  212
1 file changed, 212 insertions, 0 deletions
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
new file mode 100644
index 0000000..f97d152
--- /dev/null
+++ b/bench/I8SpmdmBenchmark.cc
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <numeric>
+#include <random>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "fbgemm/FbgemmI8Spmdm.h"
+#include "src/RefImplementations.h"
+#include "BenchUtils.h"
+
+using namespace std;
+using namespace fbgemm2;
+
+int main() {
+  const vector<array<int, 3>> shapes = {
+      // M, N, K
+      {1024, 1024, 1024},
+      {511, 512, 512},
+  };
+
+  // SpMDM is often memory BW bound so we want to flush LLC.
+  bool flush = true;
+  std::vector<char> llc;
+  if (flush) {
+    llc.resize(128 * 1024 * 1024, 1.0);
+  }
+
+  constexpr int NWARMUP = 4;
+  constexpr int NITER = 16;
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ cout << "WARNING: the timer may be inaccurate when used by multiple threads."
+ << endl;
+ cout << "M, "
+ << "N, "
+ << "K, "
+ << "Density, "
+ << "Accumulation, "
+ << "Initialize (ms), "
+ << "Transpose uint8 (ms), "
+ << "Transpose 32xN (ms), "
+ << "Compute (ms), "
+ << "Transpose 32xN (ms), "
+ << "Total (ms), "
+ << "GB/s, "
+ << "GOPs" << endl;
+#else
+ cout << "M, "
+ << "N, "
+ << "K, "
+ << "Density, "
+ << "Accumulation, "
+ << "GB/s, "
+ << "GOPs" << endl;
+#endif
+
+  for (const auto& shape : shapes) {
+    for (float density : {0.0001f, 0.001f, 0.01f, 0.1f, 1.0f}) {
+      for (bool accumulation : {false, true}) {
+        int M = shape[0];
+        int N = shape[1];
+        int K = shape[2];
+
+        cout << M << ", " << N << ", " << K << ", ";
+
+        aligned_vector<uint8_t> A(M * K);
+        randFill(A, 0, 255);
+
+        fbgemm2::CompressedSparseColumn B_csc(K, N);
+        vector<int32_t> C(M * N);
+        vector<int32_t> C_ref(C.size());
+
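+        // Seed C_ref (and, before each timed run, C) with non-zero values so
+        // the accumulation == true case actually adds into existing output.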
+        for (int i = 0; i < M; ++i) {
+          for (int j = 0; j < N; ++j) {
+            C_ref[i * N + j] = i + j;
+          }
+        }
+
+        // deterministic random number
+        std::default_random_engine eng;
+        binomial_distribution<> per_col_nnz_dist(K, density);
+        uniform_int_distribution<> value_dist(
+            numeric_limits<int8_t>::min() / 2,
+            numeric_limits<int8_t>::max() / 2);
+
+        vector<int> row_indices(K);
+
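+        // Build B column by column in CSC form: each column gets a binomially
+        // distributed number of non-zeros placed at randomly chosen, sorted
+        // row indices, with values drawn from half the int8 range.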
+        int total_nnz = 0;
+        for (int j = 0; j < N; ++j) {
+          B_csc.ColPtr()[j] = total_nnz;
+
+          int nnz_of_j = per_col_nnz_dist(eng);
+          total_nnz += nnz_of_j;
+
+          iota(row_indices.begin(), row_indices.end(), 0);
+          shuffle(row_indices.begin(), row_indices.end(), eng);
+          sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
+
+          for (int k = 0; k < nnz_of_j; ++k) {
+            B_csc.RowIdx().push_back(row_indices[k]);
+            B_csc.Values().push_back(value_dist(eng));
+          }
+        }
+        B_csc.ColPtr()[N] = total_nnz;
+
+        double ttot = 0;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+        double total_initial_time = 0.0;
+        double total_transpose_uint8_time = 0.0;
+        double total_transpose_32xN_time = 0.0;
+        double total_compute_time = 0.0;
+        double total_transpose_Nx32_time = 0.0;
+        double total_run_time = 0.0;
+#endif
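+        // Work and traffic per run: 2 ops (multiply + add) per non-zero of B
+        // for each of the M rows of A; bytes counts the int32 C output, the
+        // uint8 A input, and the compressed B data (row indices sized as
+        // int16, values as int8) plus the int32 column pointers.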
+        double ops = double(NITER) * B_csc.NumOfNonZeros() * M * 2;
+        double bytes = double(NITER) *
+            (M * N * sizeof(int32_t) + M * K +
+             B_csc.NumOfNonZeros() * (sizeof(int16_t) + sizeof(int8_t)) +
+             B_csc.ColPtr().size() * sizeof(int32_t));
+
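+        // Compute the expected result once with the scalar reference
+        // implementation from src/RefImplementations.h.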
+        spmdm_ref(M, A.data(), K, B_csc, accumulation, C_ref.data(), N);
+
+        chrono::time_point<chrono::system_clock> t_begin, t_end;
+        for (int iter = 0; iter < NWARMUP + NITER; ++iter) {
+          for (int i = 0; i < M; ++i) {
+            for (int j = 0; j < N; ++j) {
+              C[i * N + j] = i + j;
+            }
+          }
+          llc_flush(llc);
+
+          t_begin = chrono::system_clock::now();
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+          spmdm_initial_time = 0.0;
+          spmdm_transpose_uint8_time = 0.0;
+          spmdm_transpose_32xN_time = 0.0;
+          spmdm_compute_time = 0.0;
+          spmdm_transpose_Nx32_time = 0.0;
+          spmdm_run_time = 0.0;
+#endif
+
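+          // When the time breakdown is measured, run single-threaded since the
+          // breakdown timers are shared across threads (see the WARNING above);
+          // otherwise split the work across OpenMP threads.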
+#ifndef FBGEMM_MEASURE_TIME_BREAKDOWN
+#pragma omp parallel
+#endif
+          {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+            int num_threads = 1;
+            int tid = 0;
+#else
+            int num_threads = omp_get_num_threads();
+            int tid = omp_get_thread_num();
+#endif
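+            // Statically partition the M rows across threads in chunks that
+            // are multiples of 32, matching the 32-row blocks the kernel
+            // transposes and computes on.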
+            int i_per_thread =
+                ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
+            int i_begin = std::min(tid * i_per_thread, M);
+            int i_end = std::min(i_begin + i_per_thread, M);
+
+            block_type_t block = {i_begin, i_end - i_begin, 0, N};
+            B_csc.SpMDM(
+                block, A.data(), K, accumulation, C.data() + i_begin * N, N);
+          }
+          t_end = chrono::system_clock::now();
+          if (iter >= NWARMUP) {
+            double dt = chrono::duration<double>(t_end - t_begin).count();
+            // double dt = chrono::duration_cast<chrono::nanoseconds>(t_end -
+            // t_begin).count();
+            ttot += dt;
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+            total_initial_time += spmdm_initial_time;
+            total_transpose_uint8_time += spmdm_transpose_uint8_time;
+            total_transpose_32xN_time += spmdm_transpose_32xN_time;
+            total_compute_time += spmdm_compute_time;
+            total_transpose_Nx32_time += spmdm_transpose_Nx32_time;
+            total_run_time += spmdm_run_time;
+#endif
+          }
+        }
+
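+        // Check the benchmarked output against the reference result.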
+        compare_buffers(C_ref.data(), C.data(), M, N, N, 5, 0);
+
+        cout << fixed << B_csc.Density() << ", " << accumulation << ", ";
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+        cout << fixed << total_initial_time / (double)NITER / 1e6 << ", "
+             << total_transpose_uint8_time / (double)NITER / 1e6 << ", "
+             << total_transpose_32xN_time / (double)NITER / 1e6 << ", "
+             << total_compute_time / (double)NITER / 1e6 << ", "
+             << total_transpose_Nx32_time / (double)NITER / 1e6 << ", "
+             << total_run_time / (double)NITER / 1e6 << ", ";
+#endif
+        // Report performance
+        cout << fixed << bytes / ttot / 1e9 << ", " << ops / ttot / 1e9 << endl;
+
+      } // accumulation
+    } // for each density
+  } // for each shape
+
+  return 0;
+}