diff options
author | Nikolay Bogoychev <nheart@gmail.com> | 2019-06-18 14:30:49 +0300 |
---|---|---|
committer | Nikolay Bogoychev <nheart@gmail.com> | 2019-06-18 14:30:49 +0300 |
commit | 5915ec8ff3d3afcaf98e5c69594fc993b599ab51 (patch) | |
tree | 8bbb7ec5cc7b852660ec7ac159b0a5abce3834b0 | |
parent | fe8146452aecc39f4e348be23928e7ed9baaefc1 (diff) |
Backport benchmarks [add127]
-rw-r--r-- | CMakeLists.txt | 8 | ||||
-rw-r--r-- | benchmarks/benchmark.cc (renamed from benchmark.cc) | 0 | ||||
-rw-r--r-- | benchmarks/biasmultiply.cc | 133 |
3 files changed, 138 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50dc250..b0d858b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,10 +26,12 @@ if(NOT AVX512)
-  set_source_files_properties(example.cc test/quantize_test.cc test/multiply_test.cc benchmark.cc intgemm.cc PROPERTIES COMPILE_DEFINITIONS "INTGEMM_NO_AVX512")
+  set_source_files_properties(example.cc test/quantize_test.cc test/multiply_test.cc benchmarks/benchmark.cc benchmarks/biasmultiply.cc intgemm.cc PROPERTIES COMPILE_DEFINITIONS "INTGEMM_NO_AVX512")
 endif()
 
-foreach(exe example benchmark)
-  add_executable(${exe} ${exe}.cc intgemm.cc)
+foreach(exe benchmark biasmultiply)
+  add_executable(${exe} benchmarks/${exe}.cc intgemm.cc)
 endforeach()
 
+add_executable(example example.cc intgemm.cc)
+
 include_directories(.)
 add_executable(tests
   test/common.cc
diff --git a/benchmark.cc b/benchmarks/benchmark.cc
index 5e3d9d8..5e3d9d8 100644
--- a/benchmark.cc
+++ b/benchmarks/benchmark.cc
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
new file mode 100644
index 0000000..d5c27c4
--- /dev/null
+++ b/benchmarks/biasmultiply.cc
@@ -0,0 +1,133 @@
+#include "intgemm.h"
+#include "aligned.h"
+#include <chrono>
+#include <cstdlib>
+#include <iostream>
+#include <random>
+
+using namespace intgemm;
+
+// Elapsed times are accumulated as fractional seconds.
+using Seconds = std::chrono::duration<double>;
+
+// Common signature of testNew<Routine> and testOld<Routine>, so either can be
+// handed to timeShapes.
+using BenchFn = Seconds (*)(Index A_rows, Index width, Index B_cols);
+
+// Overwrite every element of values with a fresh draw from dist.
+template <class Vector>
+void fillRandom(Vector &values, std::mt19937 &gen, std::uniform_real_distribution<float> &dist) {
+  for (auto &it : values) {
+    it = dist(gen);
+  }
+}
+
+// Time a single Routine::Multiply8new call on random A (A_rows * width floats),
+// B (width * B_cols floats) and bias (B_cols floats). Input generation, PrepareA,
+// PrepareB and PrepareBiasFor8 run before the clock starts; only the multiply call is timed.
+template <class Routine>
+Seconds testNew(Index A_rows, Index width, Index B_cols) {
+  AlignedVector<float> A(A_rows * width);
+  AlignedVector<float> B(width * B_cols);
+  AlignedVector<float> bias(B_cols);
+  std::mt19937 gen; // default seed: every call sees the same pseudo-random data
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+  fillRandom(A, gen, dist);
+  fillRandom(B, gen, dist);
+  fillRandom(bias, gen, dist);
+
+  float alpha = 2.0f;
+  float quant_mult = 127/alpha;
+  float unquant_mult = 1.0/(quant_mult*quant_mult);
+
+  AlignedVector<uint8_t> A_prep(A.size());
+  AlignedVector<int8_t> B_prep(B.size());
+  Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
+  Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
+
+  AlignedVector<float> test_C(A_rows * B_cols);
+
+  Routine::PrepareBiasFor8(B.begin(), bias.begin(), alpha, width, B_cols);
+  // steady_clock is monotonic, so the interval cannot be skewed by wall-clock adjustments.
+  auto start = std::chrono::steady_clock::now();
+  Routine::Multiply8new(A_prep.begin(), B_prep.begin(), BiasAddUnquantizeC(test_C.begin(), bias.begin(), unquant_mult), A_rows, width, B_cols);
+  auto end = std::chrono::steady_clock::now();
+
+  return end - start;
+}
+
+// Same measurement as testNew, but for Routine::Multiply with an int8_t-prepared A
+// and without the PrepareBiasFor8 step.
+template <class Routine>
+Seconds testOld(Index A_rows, Index width, Index B_cols) {
+  AlignedVector<float> A(A_rows * width);
+  AlignedVector<float> B(width * B_cols);
+  AlignedVector<float> bias(B_cols);
+  std::mt19937 gen; // default seed, as in testNew
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+  fillRandom(A, gen, dist);
+  fillRandom(B, gen, dist);
+  fillRandom(bias, gen, dist);
+
+  float alpha = 2.0f;
+  float quant_mult = 127/alpha;
+  float unquant_mult = 1.0/(quant_mult*quant_mult);
+
+  AlignedVector<int8_t> A_prep(A.size());
+  AlignedVector<int8_t> B_prep(B.size());
+  Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
+  Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
+
+  AlignedVector<float> test_C(A_rows * B_cols);
+
+  auto start = std::chrono::steady_clock::now();
+  Routine::Multiply(A_prep.begin(), B_prep.begin(), BiasAddUnquantizeC(test_C.begin(), bias.begin(), unquant_mult), A_rows, width, B_cols);
+  auto end = std::chrono::steady_clock::now();
+
+  return end - start;
+}
+
+// Sum the time of one initial(1, 64, 8) call plus repeat rounds of bench over the
+// six benchmark shapes. The initial call is presumably a warm-up; note that its
+// time is part of the returned total.
+Seconds timeShapes(BenchFn initial, BenchFn bench, int repeat) {
+  Seconds total = initial(1, 64, 8);
+  for (int i = 0; i < repeat; ++i) {
+    total += bench(8, 256, 256);
+    total += bench(8, 2048, 256);
+    total += bench(320, 256, 256);
+    total += bench(472, 256, 256);
+    total += bench(248, 256, 256);
+    total += bench(200, 256, 256);
+  }
+  return total;
+}
+
+// Print one result line in the benchmark's output format.
+void report(int repeat, const char *label, Seconds total) {
+  std::cout << repeat << " iterations of " << label << " took: " << total.count() << " seconds." << std::endl;
+}
+
+// Usage: biasmultiply [repeat]. repeat defaults to 1000 rounds per configuration.
+int main(int argc, char ** argv) {
+  int repeat = 1000;
+  if (argc > 1) {
+    repeat = atoi(argv[1]);
+  }
+
+  // NOTE(review): every "New" total also starts from a testOld call; confirm that is intended.
+  report(repeat, "Old SSSE3", timeShapes(testOld<SSSE3_8bit>, testOld<SSSE3_8bit>, repeat));
+  report(repeat, "New SSSE3", timeShapes(testOld<SSSE3_8bit>, testNew<SSSE3_8bit>, repeat));
+  report(repeat, "Old AVX2", timeShapes(testOld<AVX2_8bit>, testOld<AVX2_8bit>, repeat));
+  report(repeat, "New AVX2", timeShapes(testOld<AVX2_8bit>, testNew<AVX2_8bit>, repeat));
+
+#ifndef INTGEMM_NO_AVX512
+  // CMakeLists.txt defines INTGEMM_NO_AVX512 for this file under if(NOT AVX512);
+  // presumably AVX512_8bit is not declared in that configuration - TODO confirm.
+  if (kCPU < CPU_AVX512BW) return 0;
+  report(repeat, "Old AVX512", timeShapes(testOld<AVX512_8bit>, testOld<AVX512_8bit>, repeat));
+  report(repeat, "New AVX512", timeShapes(testOld<AVX512_8bit>, testNew<AVX512_8bit>, repeat));
+#endif
+
+  return 0;
+}