Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/bench
diff options
context:
space:
mode:
author: Jongsoo Park <jongsoo@fb.com> 2018-11-19 07:17:54 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2018-11-19 07:19:53 +0300
commit: 6d00b52ee80787065561ac16942e55042ed6c760 (patch)
tree: 95986eb1a6e4a05e43c7b3aa21c31addedf9453b /bench
parent: 0581331ce7c2b528bc3c726c64d3baf74d12c959 (diff)
clang-format (#11)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/11

clang format of fbgemm

Reviewed By: dskhudia
Differential Revision: D13115202
fbshipit-source-id: 6dab29cb8b5f4fabcc165019663351567a2a2952
Diffstat (limited to 'bench')
-rw-r--r-- bench/AlignedVec.h | 64
-rw-r--r-- bench/BenchUtils.cc | 24
-rw-r--r-- bench/BenchUtils.h | 2
-rw-r--r-- bench/Depthwise3DBenchmark.cc | 45
-rw-r--r-- bench/DepthwiseBenchmark.cc | 33
-rw-r--r-- bench/FP16Benchmark.cc | 2
-rw-r--r-- bench/I8SpmdmBenchmark.cc | 4
-rw-r--r-- bench/Im2ColFusedRequantizeAcc16Benchmark.cc | 8
-rw-r--r-- bench/Im2ColFusedRequantizeAcc32Benchmark.cc | 8
-rw-r--r-- bench/PackedFloatInOutBenchmark.cc | 72
-rw-r--r-- bench/PackedRequantizeAcc16Benchmark.cc | 2
-rw-r--r-- bench/PackedRequantizeAcc32Benchmark.cc | 78
12 files changed, 182 insertions, 160 deletions
diff --git a/bench/AlignedVec.h b/bench/AlignedVec.h
index 30fd266..fd4b88e 100644
--- a/bench/AlignedVec.h
+++ b/bench/AlignedVec.h
@@ -12,63 +12,73 @@
* <http://blogs.msdn.com/b/vcblog/archive/2008/08/28/the-mallocator.aspx>
*
*/
-template <typename T, std::size_t Alignment> class aligned_allocator {
-public:
+template <typename T, std::size_t Alignment>
+class aligned_allocator {
+ public:
// The following will be the same for virtually all allocators.
- typedef T *pointer;
- typedef const T *const_pointer;
- typedef T &reference;
- typedef const T &const_reference;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
- T *address(T &r) const { return &r; }
+ T* address(T& r) const {
+ return &r;
+ }
- const T *address(const T &s) const { return &s; }
+ const T* address(const T& s) const {
+ return &s;
+ }
std::size_t max_size() const {
// The following has been carefully written to be independent of
// the definition of size_t and to avoid signed/unsigned warnings.
return (static_cast<std::size_t>(0) - static_cast<std::size_t>(1)) /
- sizeof(T);
+ sizeof(T);
}
// The following must be the same for all allocators.
- template <typename U> struct rebind {
+ template <typename U>
+ struct rebind {
typedef aligned_allocator<U, Alignment> other;
};
- bool operator!=(const aligned_allocator &other) const {
+ bool operator!=(const aligned_allocator& other) const {
return !(*this == other);
}
- void construct(T *const p, const T &t) const {
- void *const pv = static_cast<void *>(p);
+ void construct(T* const p, const T& t) const {
+ void* const pv = static_cast<void*>(p);
new (pv) T(t);
}
- void destroy(T *const p) const { p->~T(); }
+ void destroy(T* const p) const {
+ p->~T();
+ }
// Returns true if and only if storage allocated from *this
// can be deallocated from other, and vice versa.
// Always returns true for stateless allocators.
- bool operator==(const aligned_allocator & /*other*/) const { return true; }
+ bool operator==(const aligned_allocator& /*other*/) const {
+ return true;
+ }
// Default constructor, copy constructor, rebinding constructor, and
// destructor. Empty for stateless allocators.
aligned_allocator() {}
- aligned_allocator(const aligned_allocator &) {}
+ aligned_allocator(const aligned_allocator&) {}
template <typename U>
- aligned_allocator(const aligned_allocator<U, Alignment> &) {}
+ aligned_allocator(const aligned_allocator<U, Alignment>&) {}
~aligned_allocator() {}
// The following will be different for each allocator.
- T *allocate(const std::size_t n) const {
+ T* allocate(const std::size_t n) const {
// The return value of allocate(0) is unspecified.
// Mallocator returns NULL in order to avoid depending
// on malloc(0)'s implementation-defined behavior
@@ -88,7 +98,7 @@ public:
}
// Mallocator wraps malloc().
- void *pv = nullptr;
+ void* pv = nullptr;
posix_memalign(&pv, Alignment, n * sizeof(T));
// pv = aligned_alloc(Alignment, n * sizeof(T));
@@ -98,14 +108,16 @@ public:
throw std::bad_alloc();
}
- return static_cast<T *>(pv);
+ return static_cast<T*>(pv);
}
- void deallocate(T *const p, const std::size_t /*n*/) const { free(p); }
+ void deallocate(T* const p, const std::size_t /*n*/) const {
+ free(p);
+ }
// The following will be the same for all allocators that ignore hints.
template <typename U>
- T *allocate(const std::size_t n, const U * /* const hint */) const {
+ T* allocate(const std::size_t n, const U* /* const hint */) const {
return allocate(n);
}
@@ -116,9 +128,11 @@ public:
// "assignment operator could not be generated because a
// base class assignment operator is inaccessible" within
// the STL headers, but that warning is useless.
-private:
- aligned_allocator &operator=(const aligned_allocator &) { assert(0); }
+ private:
+ aligned_allocator& operator=(const aligned_allocator&) {
+ assert(0);
+ }
};
template <typename T>
-using aligned_vector = std::vector<T, aligned_allocator<T, 64> >;
+using aligned_vector = std::vector<T, aligned_allocator<T, 64>>;
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index 7b4cde4..db40ee0 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -12,27 +12,23 @@ namespace fbgemm {
std::default_random_engine eng;
template <typename T>
-void randFill(aligned_vector<T> &vec, const int low, const int high) {
+void randFill(aligned_vector<T>& vec, const int low, const int high) {
std::random_device r;
std::uniform_int_distribution<int> dis(low, high);
- for (auto &v : vec) {
+ for (auto& v : vec) {
v = static_cast<T>(dis(eng));
}
}
-template
-void randFill<float>(aligned_vector<float> &vec,
- const int low, const int high);
-template
-void randFill<uint8_t>(aligned_vector<uint8_t> &vec,
- const int low, const int high);
-template
-void randFill<int8_t>(aligned_vector<int8_t> &vec,
- const int low, const int high);
+template void
+randFill<float>(aligned_vector<float>& vec, const int low, const int high);
+template void
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+template void
+randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
-template
-void randFill<int>(aligned_vector<int> &vec,
- const int low, const int high);
+template void
+randFill<int>(aligned_vector<int>& vec, const int low, const int high);
void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 5c16a06..8ca99df 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -11,7 +11,7 @@
namespace fbgemm {
template <typename T>
-void randFill(aligned_vector<T> &vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, const int low, const int high);
void llc_flush(std::vector<char>& llc);
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index 417ddd1..f53eeea 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -18,10 +18,10 @@
#endif
#include "AlignedVec.h"
-#include "src/FbgemmI8Depthwise.h"
+#include "BenchUtils.h"
#include "fbgemm/Utils.h"
+#include "src/FbgemmI8Depthwise.h"
#include "src/RefImplementations.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
@@ -33,9 +33,9 @@ int main() {
if (flush) {
llc.resize(128 * 1024 * 1024, 1.0);
}
-#define llc_flush() \
- for (auto i = 0; i < llc.size(); i++) { \
- llc[i]++; \
+#define llc_flush() \
+ for (auto i = 0; i < llc.size(); i++) { \
+ llc[i]++; \
}
constexpr int NWARMUP = 4;
@@ -116,10 +116,10 @@ int main() {
Packed3x3x3ConvMatrix Bp(K, B.data());
double ttot = 0;
- double bytes =
- double(NITER) *
- (K * (N * (2. * sizeof(int32_t) * T_OUT * H_OUT * W_OUT + T * H * W) +
- K_T * K_H * K_W));
+ double bytes = double(NITER) *
+ (K *
+ (N * (2. * sizeof(int32_t) * T_OUT * H_OUT * W_OUT + T * H * W) +
+ K_T * K_H * K_W));
double ops =
double(NITER) * N * T_OUT * H_OUT * W_OUT * K * K_T * K_H * K_W * 2;
chrono::time_point<chrono::system_clock> t_begin, t_end;
@@ -183,8 +183,14 @@ int main() {
} // n
// Report performance
- printf("N = %d K = %d T = %d H = %d W = %d stride = %d\n", N, K, T, H, W,
- stride_h);
+ printf(
+ "N = %d K = %d T = %d H = %d W = %d stride = %d\n",
+ N,
+ K,
+ T,
+ H,
+ W,
+ stride_h);
printf("GB/s = %f Gops/s = %f\n", bytes / ttot / 1e9, ops / ttot / 1e9);
ttot = 0;
@@ -236,9 +242,8 @@ int main() {
for (int h = 0; h < H_OUT; ++h) {
for (int w = 0; w < W_OUT; ++w) {
for (int g = 0; g < K; ++g) {
- uint8_t expected =
- C_uint8_ref[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K +
- g];
+ uint8_t expected = C_uint8_ref
+ [(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + g];
uint8_t actual =
C_uint8[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + g];
if (expected != actual) {
@@ -255,9 +260,15 @@ int main() {
} // n
// Report performance
- printf("N = %d K = %d T = %d H = %d W = %d stride = %d with requantization "
- "fused\n",
- N, K, T, H, W, stride_h);
+ printf(
+ "N = %d K = %d T = %d H = %d W = %d stride = %d with requantization "
+ "fused\n",
+ N,
+ K,
+ T,
+ H,
+ W,
+ stride_h);
printf("GB/s = %f Gops/s = %f\n", bytes / ttot / 1e9, ops / ttot / 1e9);
} // for each shape
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 0bf2d73..8e6d83d 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -16,10 +16,10 @@
#endif
#include "AlignedVec.h"
-#include "src/FbgemmI8Depthwise.h"
+#include "BenchUtils.h"
#include "fbgemm/Utils.h"
+#include "src/FbgemmI8Depthwise.h"
#include "src/RefImplementations.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
@@ -27,6 +27,8 @@ using namespace fbgemm;
int main() {
// From Xray OCR
vector<vector<int>> shapes = {
+ // NOTE: clang-format wants to use a different formatting but the current
+ // formatting should be easier to read.
// N, G, H_in, W_in, stride
{ 1, 272, 47, 125, 1, },
{ 1, 272, 64, 125, 1, },
@@ -135,9 +137,9 @@ int main() {
if (flush) {
llc.resize(128 * 1024 * 1024, 1.0);
}
-#define llc_flush() \
- for (auto i = 0; i < llc.size(); i++) { \
- llc[i]++; \
+#define llc_flush() \
+ for (auto i = 0; i < llc.size(); i++) { \
+ llc[i]++; \
}
constexpr int NWARMUP = 4;
@@ -209,8 +211,7 @@ int main() {
Packed3x3ConvMatrix Bp(G, B.data());
double ttot = 0;
- double bytes =
- double(NITER) *
+ double bytes = double(NITER) *
(G * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S));
double ops = double(NITER) * N * H_OUT * W_OUT * G * R * S * 2;
chrono::time_point<chrono::system_clock> t_begin, t_end;
@@ -256,9 +257,9 @@ int main() {
int32_t expected = C_ref[((n * H_OUT + h) * W_OUT + w) * G + g];
int32_t actual = C[((n * H_OUT + h) * W_OUT + w) * G + g];
if (expected != actual) {
- cerr << "Depthwise 3x3 results differ at (" << n << ", "
- << h << ", " << w << ", " << g << "). expected "
- << expected << " actual " << actual << endl;
+ cerr << "Depthwise 3x3 results differ at (" << n << ", " << h
+ << ", " << w << ", " << g << "). expected " << expected
+ << " actual " << actual << endl;
return -1;
}
assert(expected == actual);
@@ -320,9 +321,9 @@ int main() {
C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * G + g];
uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * G + g];
if (expected != actual) {
- cerr << "Depthwise 3x3 results differ at (" << n << ", "
- << h << ", " << w << ", " << g << "). expected "
- << (int)expected << " actual " << (int)actual << endl;
+ cerr << "Depthwise 3x3 results differ at (" << n << ", " << h
+ << ", " << w << ", " << g << "). expected " << (int)expected
+ << " actual " << (int)actual << endl;
return -1;
}
assert(expected == actual);
@@ -334,7 +335,11 @@ int main() {
// Report performance
printf(
"N = %d G = %d H = %d W = %d stride = %d with requantization fused\n",
- N, G, H, W, stride_h);
+ N,
+ G,
+ H,
+ W,
+ stride_h);
printf("GB/s = %f Gops/s = %f\n", bytes / ttot / 1e9, ops / ttot / 1e9);
} // for each shape
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index 8fbe878..c03f18a 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -16,9 +16,9 @@
#include <omp.h>
#endif
+#include "AlignedVec.h"
#include "bench/BenchUtils.h"
#include "fbgemm/FbgemmFP16.h"
-#include "AlignedVec.h"
using namespace std;
using namespace fbgemm;
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
index d361bb5..07b73dc 100644
--- a/bench/I8SpmdmBenchmark.cc
+++ b/bench/I8SpmdmBenchmark.cc
@@ -17,9 +17,9 @@
#include <omp.h>
#endif
+#include "BenchUtils.h"
#include "fbgemm/FbgemmI8Spmdm.h"
#include "src/RefImplementations.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
@@ -156,7 +156,7 @@ int main() {
#pragma omp parallel
#endif
{
-#if defined (FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
int num_threads = 1;
int tid = 0;
#else
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 8827b4c..e3c9da2 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -16,9 +16,9 @@
#include <omp.h>
#endif
+#include "BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
@@ -189,11 +189,7 @@ void performance_test() {
PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t> packA(
- conv_p,
- Aint8.data(),
- nullptr,
- Aint8_zero_point,
- row_offset_buf.data());
+ conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index b87f7d7..153dc3b 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -16,9 +16,9 @@
#include <omp.h>
#endif
+#include "BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
@@ -191,11 +191,7 @@ void performance_test() {
PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t> packA(
- conv_p,
- Aint8.data(),
- nullptr,
- Aint8_zero_point,
- row_offset_buf.data());
+ conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
PackBMatrix<int8_t, int32_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index 29a7547..dc9536e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -19,50 +19,52 @@
#include <mkl.h>
#endif
+#include "BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"
#include "test/QuantizationHelpers.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
void performance_test() {
vector<vector<int>> shapes = {
- {1, 128, 512},
- {1, 1024, 256},
- {1, 2048, 512},
- {1, 4096, 1024},
-
- {6, 256, 1024},
- {6, 256, 2048},
- {6, 512, 512},
- {6, 1024, 256},
- {6, 2048, 256},
- {6, 2048, 512},
- {6, 4096, 256},
- {6, 4096, 1024},
- {6, 4096, 2048},
-
- {10, 2048, 256},
- {10, 4096, 1024},
-
- {20, 2048, 256},
- {20, 4096, 1024},
-
- {102, 1024, 512},
- {102, 2323, 256},
- {102, 512, 256},
-
- {1, 800, 3200},
- {1, 800, 8000},
-
- {16, 256, 1500},
- {16, 256, 1567},
- {1, 128, 2876},
- {16, 128, 1567},
- {1, 128, 2722},
- {16, 256, 512},
+ // NOTE: clang-format wants to use a different formatting but the current
+ // formatting should be easier to read.
+ {1, 128, 512},
+ {1, 1024, 256},
+ {1, 2048, 512},
+ {1, 4096, 1024},
+
+ {6, 256, 1024},
+ {6, 256, 2048},
+ {6, 512, 512},
+ {6, 1024, 256},
+ {6, 2048, 256},
+ {6, 2048, 512},
+ {6, 4096, 256},
+ {6, 4096, 1024},
+ {6, 4096, 2048},
+
+ {10, 2048, 256},
+ {10, 4096, 1024},
+
+ {20, 2048, 256},
+ {20, 4096, 1024},
+
+ {102, 1024, 512},
+ {102, 2323, 256},
+ {102, 512, 256},
+
+ {1, 800, 3200},
+ {1, 800, 8000},
+
+ {16, 256, 1500},
+ {16, 256, 1567},
+ {1, 128, 2876},
+ {16, 128, 1567},
+ {1, 128, 2722},
+ {16, 256, 512},
};
bool flush = true;
std::vector<char> llc;
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index 4974ba2..367aa32 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -20,9 +20,9 @@
#include <mkl.h>
#endif
+#include "BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index 94bb899..84d7c24 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -19,53 +19,55 @@
#include <mkl.h>
#endif
+#include "BenchUtils.h"
#include "fbgemm/Fbgemm.h"
#include "src/RefImplementations.h"
#include "test/QuantizationHelpers.h"
-#include "BenchUtils.h"
using namespace std;
using namespace fbgemm;
void performance_test() {
vector<vector<int>> shapes = {
- {156800, 4, 36},
- {156800, 8, 36},
- {156800, 16, 36},
- {1, 128, 512},
- {1, 1024, 256},
- {1, 2048, 512},
- {1, 4096, 1024},
-
- {6, 256, 1024},
- {6, 256, 2048},
- {6, 512, 512},
- {6, 1024, 256},
- {6, 2048, 256},
- {6, 2048, 512},
- {6, 4096, 256},
- {6, 4096, 1024},
- {6, 4096, 2048},
-
- {10, 2048, 256},
- {10, 4096, 1024},
-
- {20, 2048, 256},
- {20, 4096, 1024},
-
- {102, 1024, 512},
- {102, 2323, 256},
- {102, 512, 256},
-
- {1, 800, 3200},
- {1, 800, 8000},
-
- {16, 256, 1500},
- {16, 256, 1567},
- {1, 128, 2876},
- {16, 128, 1567},
- {1, 128, 2722},
- {16, 256, 512},
+ // NOTE: clang-format wants to use a different formatting but the current
+ // formatting should be easier to read.
+ {156800, 4, 36},
+ {156800, 8, 36},
+ {156800, 16, 36},
+ {1, 128, 512},
+ {1, 1024, 256},
+ {1, 2048, 512},
+ {1, 4096, 1024},
+
+ {6, 256, 1024},
+ {6, 256, 2048},
+ {6, 512, 512},
+ {6, 1024, 256},
+ {6, 2048, 256},
+ {6, 2048, 512},
+ {6, 4096, 256},
+ {6, 4096, 1024},
+ {6, 4096, 2048},
+
+ {10, 2048, 256},
+ {10, 4096, 1024},
+
+ {20, 2048, 256},
+ {20, 4096, 1024},
+
+ {102, 1024, 512},
+ {102, 2323, 256},
+ {102, 512, 256},
+
+ {1, 800, 3200},
+ {1, 800, 8000},
+
+ {16, 256, 1500},
+ {16, 256, 1567},
+ {1, 128, 2876},
+ {16, 128, 1567},
+ {1, 128, 2722},
+ {16, 256, 512},
};
bool flush = true;
std::vector<char> llc;