// Benchmark comparing intgemm's 8-bit multiply with and without bias against
// the "shifted" variant (PrepareBias + Multiply8Shift) on each instruction set.
//
// NOTE(review): the text this file was recovered from had everything between
// '<' and '>' stripped. The standard header names, the template parameter
// lists, the AlignedVector element types, the duration representation and the
// loop headers below were reconstructed from how the names are used. Confirm
// them against the library headers before relying on the numbers.

#include "../intgemm.h"
#include "../aligned.h"

#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>

using namespace intgemm;

namespace {

// Signature shared by the three benchmark routines below.
using BenchFn = std::chrono::duration<double> (*)(Index, Index, Index);

// One matrix-multiply problem size: A is A_rows x width, B is width x B_cols.
struct Shape {
  Index A_rows;
  Index width;
  Index B_cols;
};

// Problem sizes timed on every repetition, in the order they are run.
const Shape kShapes[] = {
    {8, 256, 256},
    {8, 2048, 256},
    {320, 256, 256},
    {472, 256, 256},
    {248, 256, 256},
    {200, 256, 256},
};

// Overwrites every element of `values` with the next draw from `dist`.
void FillUniform(AlignedVector<float> &values, std::mt19937 &gen,
                 std::uniform_real_distribution<float> &dist) {
  for (auto &value : values) {
    value = dist(gen);
  }
}

// Runs `bench` once on a 1x64x8 problem, then `repeat` times over kShapes,
// and prints the summed time as "<repeat> iterations of <label> took: ...".
void Report(const char *label, BenchFn bench, int repeat) {
  std::chrono::duration<double> total = bench(1, 64, 8);
  for (int i = 0; i < repeat; ++i) {
    for (const Shape &shape : kShapes) {
      total += bench(shape.A_rows, shape.width, shape.B_cols);
    }
  }
  // std::endl is kept so each result is flushed as soon as it is available.
  std::cout << repeat << " iterations of " << label << " took: " << total.count() << " seconds." << std::endl;
}

} // namespace

// Unused two-argument placeholder; kept so the overload set is unchanged.
template <class Routine>
void testOld(Index /*rows*/, Index /*cols*/) {
}

/// Times one Routine::Multiply8Shift call with the bias callback.
///
/// Inputs are drawn from [-1, 1) with a default-seeded std::mt19937, so every
/// call sees the same data. Only the multiply is timed; PrepareA, PrepareB and
/// PrepareBias run before the clock starts.
///
/// @param A_rows rows of A
/// @param width  columns of A / rows of B
/// @param B_cols columns of B
/// @return time spent inside Multiply8Shift
template <class Routine>
std::chrono::duration<double> testNew(Index A_rows, Index width, Index B_cols) {
  AlignedVector<float> A(A_rows * width);
  AlignedVector<float> B(width * B_cols);
  AlignedVector<float> bias(B_cols);
  std::mt19937 gen;
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  FillUniform(A, gen, dist);
  FillUniform(B, gen, dist);
  FillUniform(bias, gen, dist);

  float alpha = 2.0f;
  float quant_mult = 127/alpha;
  float unquant_mult = 1.0/(quant_mult*quant_mult);

  // NOTE(review): element types were lost; uint8_t for A assumes the shifted
  // path quantizes A to unsigned values - confirm against PrepareA's overloads.
  AlignedVector<uint8_t> A_prep(A.size());
  AlignedVector<int8_t> B_prep(B.size());
  Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
  Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);

  AlignedVector<float> test_C(A_rows * B_cols);

  // Minus one to invert add_ps later on.
  float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f);
  // The prepared bias is written back over `bias` in place.
  Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));

  // steady_clock is monotonic, so the interval cannot be skewed by wall-clock
  // adjustments the way a system_clock interval can.
  auto start = std::chrono::steady_clock::now();
  Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
  auto end = std::chrono::steady_clock::now();

  std::chrono::duration<double> elapsed_seconds = end-start;
  return elapsed_seconds;
}

/// Times one Routine::Multiply call with the bias callback.
///
/// Same data generation and quantization as testNew, but without PrepareBias
/// and using the plain Multiply entry point.
///
/// @return time spent inside Multiply
template <class Routine>
std::chrono::duration<double> testOld(Index A_rows, Index width, Index B_cols) {
  AlignedVector<float> A(A_rows * width);
  AlignedVector<float> B(width * B_cols);
  AlignedVector<float> bias(B_cols);
  std::mt19937 gen;
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  FillUniform(A, gen, dist);
  FillUniform(B, gen, dist);
  FillUniform(bias, gen, dist);

  float alpha = 2.0f;
  float quant_mult = 127/alpha;
  float unquant_mult = 1.0/(quant_mult*quant_mult);

  // NOTE(review): element types reconstructed as int8_t - confirm.
  AlignedVector<int8_t> A_prep(A.size());
  AlignedVector<int8_t> B_prep(B.size());
  Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
  Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);

  AlignedVector<float> test_C(A_rows * B_cols);

  auto start = std::chrono::steady_clock::now();
  Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
  auto end = std::chrono::steady_clock::now();

  std::chrono::duration<double> elapsed_seconds = end-start;
  return elapsed_seconds;
}

/// Times one Routine::Multiply call with the unquantize-only callback.
///
/// Identical to testOld except that no bias vector is generated or added.
///
/// @return time spent inside Multiply
template <class Routine>
std::chrono::duration<double> testOld_nobias(Index A_rows, Index width, Index B_cols) {
  AlignedVector<float> A(A_rows * width);
  AlignedVector<float> B(width * B_cols);
  std::mt19937 gen;
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  FillUniform(A, gen, dist);
  FillUniform(B, gen, dist);

  float alpha = 2.0f;
  float quant_mult = 127/alpha;
  float unquant_mult = 1.0/(quant_mult*quant_mult);

  // NOTE(review): element types reconstructed as int8_t - confirm.
  AlignedVector<int8_t> A_prep(A.size());
  AlignedVector<int8_t> B_prep(B.size());
  Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
  Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);

  AlignedVector<float> test_C(A_rows * B_cols);

  auto start = std::chrono::steady_clock::now();
  Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWrite(unquant_mult, test_C.begin()));
  auto end = std::chrono::steady_clock::now();

  std::chrono::duration<double> elapsed_seconds = end-start;
  return elapsed_seconds;
}

// Runs every benchmark for each instruction set and prints one line per
// result. argv[1], if present, overrides the default of 1000 repetitions.
//
// NOTE(review): the kernel class names used as template arguments were lost
// from the source text. SSSE3_8bit / AVX2_8bit / AVX512_8bit /
// AVX512VNNI_8bit are assumed from the variable names and output labels -
// confirm against the declarations in ../intgemm.h.
int main(int argc, char ** argv) {
  int repeat = 1000;
  if (argc > 1) {
    repeat = std::atoi(argv[1]);
  }

  // The "Shifted" rows warm up with testNew. The previous code seeded them
  // with a testOld run, which counted the old kernel's time in the new total.
  Report("SSSE3 without bias", testOld_nobias<SSSE3_8bit>, repeat);
  Report("SSSE3", testOld<SSSE3_8bit>, repeat);
  Report("Shifted SSSE3", testNew<SSSE3_8bit>, repeat);

  Report("AVX2 without bias", testOld_nobias<AVX2_8bit>, repeat);
  Report("AVX2", testOld<AVX2_8bit>, repeat);
  Report("Shifted AVX2", testNew<AVX2_8bit>, repeat);

#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
  // Stop here when the running CPU is below this instruction set.
  if (kCPU < CPUType::AVX512BW) return 0;
  Report("AVX512 without bias", testOld_nobias<AVX512_8bit>, repeat);
  Report("AVX512", testOld<AVX512_8bit>, repeat);
  Report("Shifted AVX512", testNew<AVX512_8bit>, repeat);
#endif

#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
  if (kCPU < CPUType::AVX512VNNI) return 0;
  Report("AVX512VNNI without bias", testOld_nobias<AVX512VNNI_8bit>, repeat);
  Report("AVX512VNNI", testOld<AVX512VNNI_8bit>, repeat);
  Report("Shifted AVX512VNNI", testNew<AVX512VNNI_8bit>, repeat);
#endif
}