diff options
author | Daya S Khudia <dskhudia@fb.com> | 2018-10-13 01:48:13 +0300 |
---|---|---|
committer | Daya S Khudia <dskhudia@fb.com> | 2018-10-31 00:56:00 +0300 |
commit | e85b5a12254fa47ca6b56236489253a68fd32104 (patch) | |
tree | d62190c53913c65e136fb26dc89bfab38144e2c3 /bench/I8SpmdmBenchmark.cc |
Initial commit
Diffstat (limited to 'bench/I8SpmdmBenchmark.cc')
-rw-r--r-- | bench/I8SpmdmBenchmark.cc | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc new file mode 100644 index 0000000..f97d152 --- /dev/null +++ b/bench/I8SpmdmBenchmark.cc @@ -0,0 +1,212 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include <algorithm> +#include <array> +#include <cassert> +#include <chrono> +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <random> + +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "fbgemm/FbgemmI8Spmdm.h" +#include "src/RefImplementations.h" +#include "BenchUtils.h" + +using namespace std; +using namespace fbgemm2; + +int main() { + const vector<array<int, 3>> shapes = { + // M, N, K + {1024, 1024, 1024}, + {511, 512, 512}, + }; + + // SpMDM is often memory BW bound so we want to flush LLC. + bool flush = true; + std::vector<char> llc; + if (flush) { + llc.resize(128 * 1024 * 1024, 1.0); + } + + constexpr int NWARMUP = 4; + constexpr int NITER = 16; + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + cout << "WARNING: the timer may be inaccurate when used by multiple threads." + << endl; + cout << "M, " + << "N, " + << "K, " + << "Density, " + << "Accumulation, " + << "Initialize (ms), " + << "Transpose uint8 (ms), " + << "Transpose 32xN (ms), " + << "Compute (ms), " + << "Transpose 32xN (ms), " + << "Total (ms), " + << "GB/s, " + << "GOPs" << endl; +#else + cout << "M, " + << "N, " + << "K, " + << "Density, " + << "Accumulation, " + << "GB/s, " + << "GOPs" << endl; +#endif + + for (const auto& shape : shapes) { + for (float density : {0.0001f, 0.001f, 0.01f, 0.1f, 1.0f}) { + for (bool accumulation : {false, true}) { + int M = shape[0]; + int N = shape[1]; + int K = shape[2]; + + cout << M << ", " << N << ", " << K << ", "; + + aligned_vector<uint8_t> A(M * K); + randFill(A, 0, 255); + + fbgemm2::CompressedSparseColumn B_csc(K, N); + vector<int32_t> C(M * N); + vector<int32_t> C_ref(C.size()); + + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + C_ref[i * N + j] = i + j; + } + } + + // deterministic random number + std::default_random_engine eng; + binomial_distribution<> per_col_nnz_dist(K, density); + uniform_int_distribution<> value_dist( + numeric_limits<int8_t>::min() / 2, + numeric_limits<int8_t>::max() / 2); + + vector<int> row_indices(K); + + int total_nnz = 0; + for (int j = 0; j < N; ++j) { + B_csc.ColPtr()[j] = total_nnz; + + int nnz_of_j = per_col_nnz_dist(eng); + total_nnz += nnz_of_j; + + iota(row_indices.begin(), row_indices.end(), 0); + shuffle(row_indices.begin(), row_indices.end(), eng); + sort(row_indices.begin(), row_indices.begin() + nnz_of_j); + + for (int k = 0; k < nnz_of_j; ++k) { + B_csc.RowIdx().push_back(row_indices[k]); + B_csc.Values().push_back(value_dist(eng)); + } + } + B_csc.ColPtr()[N] = total_nnz; + + double ttot = 0; +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + double total_initial_time = 0.0; + double total_transpose_uint8_time = 0.0; + double total_transpose_32xN_time = 0.0; + double total_compute_time = 0.0; + double total_transpose_Nx32_time = 0.0; + double total_run_time = 0.0; +#endif + double ops = double(NITER) * B_csc.NumOfNonZeros() * M * 2; + double bytes = double(NITER) * + (M * N * sizeof(int32_t) + M * K + + B_csc.NumOfNonZeros() * (sizeof(int16_t) + sizeof(int8_t)) + + B_csc.ColPtr().size() * sizeof(int32_t)); + + spmdm_ref(M, A.data(), K, B_csc, accumulation, C_ref.data(), N); + + chrono::time_point<chrono::system_clock> t_begin, t_end; + for (int iter = 0; iter < NWARMUP + NITER; ++iter) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { + C[i * N + j] = i + j; + } + } + llc_flush(llc); + + t_begin = chrono::system_clock::now(); +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + spmdm_initial_time = 0.0; + spmdm_transpose_uint8_time = 0.0; + spmdm_transpose_32xN_time = 0.0; + spmdm_compute_time = 0.0; + spmdm_transpose_Nx32_time = 0.0; + spmdm_run_time = 0.0; +#endif + +#ifndef FBGEMM_MEASURE_TIME_BREAKDOWN +#pragma omp parallel +#endif + { +#if defined (FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) + int num_threads = 1; + int tid = 0; +#else + int num_threads = omp_get_num_threads(); + int tid = omp_get_thread_num(); +#endif + int i_per_thread = + ((M + 31) / 32 + num_threads - 1) / num_threads * 32; + int i_begin = std::min(tid * i_per_thread, M); + int i_end = std::min(i_begin + i_per_thread, M); + + block_type_t block = {i_begin, i_end - i_begin, 0, N}; + B_csc.SpMDM( + block, A.data(), K, accumulation, C.data() + i_begin * N, N); + } + t_end = chrono::system_clock::now(); + if (iter >= NWARMUP) { + double dt = chrono::duration<double>(t_end - t_begin).count(); + // double dt = chrono::duration_cast<chrono::nanoseconds>(t_end - + // t_begin).count(); + ttot += dt; +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + total_initial_time += spmdm_initial_time; + total_transpose_uint8_time += spmdm_transpose_uint8_time; + total_transpose_32xN_time += spmdm_transpose_32xN_time; + total_compute_time += spmdm_compute_time; + total_transpose_Nx32_time += spmdm_transpose_Nx32_time; + total_run_time += spmdm_run_time; +#endif + } + } + + compare_buffers(C_ref.data(), C.data(), M, N, N, 5, 0); + + cout << fixed << B_csc.Density() << ", " << accumulation << ", "; + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + cout << fixed << total_initial_time / (double)NITER / 1e6 << ", " + << total_transpose_uint8_time / (double)NITER / 1e6 << ", " + << total_transpose_32xN_time / (double)NITER / 1e6 << ", " + << total_compute_time / (double)NITER / 1e6 << ", " + << total_transpose_Nx32_time / (double)NITER / 1e6 << ", " + << total_run_time / (double)NITER / 1e6 << ", "; +#endif + // Report performance + cout << fixed << bytes / ttot / 1e9 << ", " << ops / ttot / 1e9 << endl; + + } // accumulation + } // for each density + } // for each shape + + return 0; +} |