author	Nikolay Bogoychev <nheart@gmail.com>	2020-11-15 19:50:02 +0300
committer	GitHub <noreply@github.com>	2020-11-15 19:50:02 +0300
commit	8abde25b13c3ab210c0dec8e23f4944e3953812d
tree	90b591ee994252ddd44d593276b4ef895bbcb5aa
parent	874ceebbf53a85691b326495100b6361a2166cec
parent	8f28282c3bd854922da638024d2659be52e892e9
Merge pull request #2 from kpu/master
Merge with latest intgemm master
-rw-r--r--  .github/workflows/mac.yml (renamed from .github/workflows/release.yml)  |    7
-rw-r--r--  .github/workflows/ubuntu-gcc5-debug.yml                                 |   27
-rw-r--r--  .github/workflows/ubuntu.yml                                            |   25
-rw-r--r--  .github/workflows/windows.yml                                           |   25
-rw-r--r--  CMakeLists.txt                                                          |   11
-rw-r--r--  LICENSE                                                                 |   30
-rw-r--r--  README.md                                                               |    9
-rw-r--r--  benchmarks/benchmark.cc                                                 |   48
-rw-r--r--  benchmarks/benchmark_quantizer.cc                                       |   18
-rw-r--r--  benchmarks/biasmultiply.cc                                              |  172
-rw-r--r--  example.cc                                                              |   13
-rw-r--r--  intgemm.cc                                                              |   71
-rw-r--r--  intgemm/aligned.h (renamed from aligned.h)                              |    1
-rw-r--r--  intgemm/avx2_gemm.h (renamed from avx2_gemm.h)                          |  101
-rw-r--r--  intgemm/avx512_gemm.h (renamed from avx512_gemm.h)                      |  106
-rw-r--r--  intgemm/avx512vnni_gemm.h (renamed from avx512vnni_gemm.h)              |   15
-rw-r--r--  intgemm/callbacks.h (renamed from callbacks.h)                          |    2
-rw-r--r--  intgemm/callbacks/configs.h (renamed from callbacks/configs.h)          |    0
-rw-r--r--  intgemm/callbacks/implementations.inl (renamed from callbacks/implementations.inl)  |   18
-rw-r--r--  intgemm/callbacks/output_buffer_info.h (renamed from callbacks/output_buffer_info.h)  |    0
-rw-r--r--  intgemm/interleave.h (renamed from interleave.h)                        |   49
-rw-r--r--  intgemm/intgemm.cc                                                      |   71
-rw-r--r--  intgemm/intgemm.h (renamed from intgemm.h)                              |   28
-rw-r--r--  intgemm/intgemm_config.h.in (renamed from intgemm_config.h.in)          |    0
-rw-r--r--  intgemm/intrinsics.h (renamed from intrinsics.h)                        |   51
-rw-r--r--  intgemm/kernels.h (renamed from kernels.h)                              |    2
-rw-r--r--  intgemm/kernels/implementations.inl (renamed from kernels/implementations.inl)  |   34
-rw-r--r--  intgemm/multiply.h (renamed from multiply.h)                            |   28
-rw-r--r--  intgemm/sse2_gemm.h (renamed from sse2_gemm.h)                          |   44
-rw-r--r--  intgemm/ssse3_gemm.h (renamed from ssse3_gemm.h)                        |   64
-rw-r--r--  intgemm/stats.h (renamed from stats.h)                                  |    0
-rw-r--r--  intgemm/stats.inl (renamed from stats.inl)                              |    2
-rw-r--r--  intgemm/types.h (renamed from types.h)                                  |    0
-rw-r--r--  intgemm/utils.h (renamed from utils.h)                                  |    0
-rw-r--r--  intgemm/vec_traits.h (renamed from vec_traits.h)                        |    0
-rw-r--r--  test/3rd_party/LICENSE_1_0.txt                                          |   24
-rw-r--r--  test/3rd_party/catch.hpp (renamed from 3rd_party/catch.hpp)             |    0
-rw-r--r--  test/add127_test.cc                                                     |  210
-rw-r--r--  test/kernels/add_bias_test.cc                                           |    4
-rw-r--r--  test/kernels/bitwise_not_test.cc                                        |    4
-rw-r--r--  test/kernels/downcast_test.cc                                           |    4
-rw-r--r--  test/kernels/exp_test.cc                                                |    4
-rw-r--r--  test/kernels/floor_test.cc                                              |    4
-rw-r--r--  test/kernels/multiply_sat_test.cc                                       |   54
-rw-r--r--  test/kernels/multiply_test.cc                                           |    5
-rw-r--r--  test/kernels/quantize_test.cc                                           |    4
-rw-r--r--  test/kernels/relu_test.cc                                               |    5
-rw-r--r--  test/kernels/rescale_test.cc                                            |    5
-rw-r--r--  test/kernels/sigmoid_test.cc                                            |    4
-rw-r--r--  test/kernels/tanh_test.cc                                               |    4
-rw-r--r--  test/kernels/unquantize_test.cc                                         |    4
-rw-r--r--  test/kernels/upcast_test.cc                                             |    9
-rw-r--r--  test/kernels/write_test.cc                                              |    4
-rw-r--r--  test/multiply_test.cc                                                   |  230
-rw-r--r--  test/prepare_b_quantized_transposed.cc                                  |   24
-rw-r--r--  test/prepare_b_transposed.cc                                            |   24
-rw-r--r--  test/quantize_test.cc                                                   |   38
-rw-r--r--  test/test.cc                                                            |   10
-rw-r--r--  test/test.h                                                             |   16
-rw-r--r--  test/utils_test.cc                                                      |    2
-rw-r--r--  test_mull.cpp                                                           |  328
61 files changed, 885 insertions, 1211 deletions
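Most of the diff below is a mechanical migration: headers moved into an intgemm/ subdirectory and the per-ISA structs (SSSE3_8bit, AVX2_16bit, AVX512_8bit, AVX512VNNI_8bit, ...) became Kernels8/Kernels16 inside namespaces (ssse3, avx2, avx512bw, avx512vnni). A minimal sketch of what that looks like from a caller's side; the function and buffers here are illustrative, not from this diff:

```C++
#include "intgemm/intgemm.h"  // was: #include "intgemm.h"

#include <cstdint>

void QuantizeExample(const float *in, std::int8_t *out, intgemm::Index size) {
  // The runtime-dispatched entry points keep their old names; size must
  // satisfy the backend's multiple-of-16 and alignment asserts.
  intgemm::Int8::Quantize(in, out, 127.0f / 2.0f, size);
  // Direct references to a specific ISA now use the namespaced names, e.g.
  // intgemm::avx2::Kernels8 instead of intgemm::AVX2_8bit (guard such calls
  // with INTGEMM_COMPILER_SUPPORTS_* and a kCPU check).
}
```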
diff --git a/.github/workflows/release.yml b/.github/workflows/mac.yml
index 4fb9b3f..767cf1a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: Release
+name: Mac
 
 on:
   push:
@@ -8,10 +8,7 @@ on:
 
 jobs:
   build:
-    strategy:
-      matrix:
-        runs-on: [ubuntu-latest, macOS-latest, windows-latest]
-    runs-on: ${{ matrix.runs-on }}
+    runs-on: macOS-latest
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/ubuntu-gcc5-debug.yml b/.github/workflows/ubuntu-gcc5-debug.yml
new file mode 100644
index 0000000..1323828
--- /dev/null
+++ b/.github/workflows/ubuntu-gcc5-debug.yml
@@ -0,0 +1,27 @@
+name: Ubuntu gcc5 debug
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: ubuntu-18.04
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: install
+      run: sudo apt-get install -y gcc-5 g++-5
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake -DCMAKE_C_COMPILER=gcc-5 -DCMAKE_CXX_COMPILER=g++-5 -DCMAKE_BUILD_TYPE=Debug ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
new file mode 100644
index 0000000..f0e457b
--- /dev/null
+++ b/.github/workflows/ubuntu.yml
@@ -0,0 +1,25 @@
+name: Ubuntu
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
new file mode 100644
index 0000000..d64fefd
--- /dev/null
+++ b/.github/workflows/windows.yml
@@ -0,0 +1,25 @@
+name: Windows
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: windows-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c675315..d1885f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,12 +33,16 @@ if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
   message(WARNING "${Orange}Not building AVX512VNNI-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
 endif()
 
-# Generate configure file
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/intgemm_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/intgemm_config.h)
+add_library(intgemm STATIC intgemm/intgemm.cc)
+
+# Generate configure file
+configure_file(intgemm/intgemm_config.h.in intgemm/intgemm_config.h)
+# Ensure it is included by users.
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(intgemm PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 
-add_library(intgemm STATIC intgemm.cc)
+# This isn't necessary since intgemm uses entirely relative paths, but source code depending on it may want to #include <intgemm/intgemm.h>.
+target_include_directories(intgemm INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
 option(USE_OPENMP "Use OpenMP" OFF)
 if (USE_OPENMP)
@@ -80,7 +84,6 @@ add_executable(tests
   test/kernels/downcast_test.cc
   test/kernels/exp_test.cc
   test/kernels/floor_test.cc
-  test/kernels/multiply_sat_test.cc
   test/kernels/multiply_test.cc
   test/kernels/quantize_test.cc
   test/kernels/relu_test.cc
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -10,6 +10,36 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
+
+test/3rd_party/catch.hpp
+Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved.
+Distributed under the Boost Software License, Version 1.0. (See accompanying
+file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+
 The original 16-bit SSE2 code came from:
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
 [![Build SSE](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-SSE.svg?label=SSE)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-SSE/)
 [![Build AVX2](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX2.svg?label=AVX2)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX2/)
 [![Build AVX512BW](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX512BW.svg?label=AVX512BW)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX512BW/)
+![Build Ubuntu](https://github.com/kpu/intgemm/workflows/Ubuntu/badge.svg)
+![Build Ubuntu debug](https://github.com/kpu/intgemm/workflows/Ubuntu%20debug/badge.svg)
+![Build Ubuntu OpenMP](https://github.com/kpu/intgemm/workflows/Ubuntu%20OpenMP/badge.svg)
+![Build Windows](https://github.com/kpu/intgemm/workflows/Windows/badge.svg)
+![Build Mac](https://github.com/kpu/intgemm/workflows/Mac/badge.svg)
 
 # Integer Matrix Multiplication
 
@@ -25,7 +30,7 @@ A full example appears in [example.cc](example.cc).
 Both A and B should be prepared before multiplication.
 
 ```C++
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
 
 /* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
  * A is A_rows x width.
@@ -51,7 +56,7 @@ The last argument of `Multiply` is a callback which is usually used to perform
 For 8-bit, you can make use of a slightly faster implementation, assuming you can determine the quantization multipliers and prepare the biases offline:
 
 ```C++
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
 
 /* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
  * A is A_rows x width.
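Condensing the README snippets above into one end-to-end 16-bit example (a minimal sketch assembled from those snippets and the example.cc changes later in this diff; the sizes are arbitrary but respect the tile-multiple requirements):

```C++
#include "intgemm/intgemm.h"
#include "intgemm/aligned.h"
#include "intgemm/callbacks.h"

#include <random>

int main() {
  using intgemm::Index;
  const Index A_rows = 4, width = 64, B_cols = 8;  // width % 32 == 0 and B_cols % 8 == 0 for 16-bit.
  intgemm::AlignedVector<float> A(A_rows * width), B(width * B_cols);
  std::mt19937 gen;
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  for (float &f : A) f = dist(gen);
  for (float &f : B) f = dist(gen);
  const float quant_mult = 1024.0f;  // 16-bit: leave headroom for accumulation.
  intgemm::AlignedVector<int16_t> A_prepared(A.size()), B_prepared(B.size());
  // Quantize A to row-major int16; quantize and rearrange B into the CPU-dependent layout.
  intgemm::Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
  intgemm::Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
  intgemm::AlignedVector<float> C(A_rows * B_cols);
  // The callback unquantizes while writing the row-major result.
  intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
      intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
  return 0;
}
```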
diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc
index ebd0920..c6133bf 100644
--- a/benchmarks/benchmark.cc
+++ b/benchmarks/benchmark.cc
@@ -1,12 +1,12 @@
-#include "../aligned.h"
-#include "intgemm_config.h"
-#include "../avx512_gemm.h"
-#include "../sse2_gemm.h"
-#include "../avx2_gemm.h"
-#include "../ssse3_gemm.h"
-#include "../intgemm.h"
-#include "../stats.h"
-#include "../callbacks.h"
+#include "../intgemm/aligned.h"
+#include "intgemm/intgemm_config.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/stats.h"
+#include "../intgemm/callbacks.h"
 
 #include <algorithm>
 #include <cassert>
@@ -43,7 +43,7 @@ struct RandomMatrices {
 };
 
 template <class Backend> double Run(const RandomMatrices &m) {
-  typedef typename Backend::Integer Integer;
+  using Integer = typename Backend::Integer;
   float quant_mult = 127.0f / 2.0f;
   float unquant_mult = 1.0f / (quant_mult * quant_mult);
   AlignedVector<Integer> A_prepared(m.A_rows * m.width);
@@ -145,45 +145,45 @@ int main(int, char ** argv) {
   std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<SSSE3_8bit>(matrices, end, stats.ssse3_8bit);
+    RunAll<ssse3::Kernels8>(matrices, end, stats.ssse3_8bit);
   }
 
   std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<SSE2_16bit>(matrices, end, stats.sse2_16bit);
+    RunAll<sse2::Kernels16>(matrices, end, stats.sse2_16bit);
   }
 
   std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX2_8bit>(matrices, end, stats.avx2_8bit);
+    RunAll<avx2::Kernels8>(matrices, end, stats.avx2_8bit);
   }
 
   std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX2_16bit>(matrices, end, stats.avx2_16bit);
+    RunAll<avx2::Kernels16>(matrices, end, stats.avx2_16bit);
   }
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX512_8bit>(matrices, end, stats.avx512_8bit);
+    RunAll<avx512bw::Kernels8>(matrices, end, stats.avx512_8bit);
   }
 
   std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX512_16bit>(matrices, end, stats.avx512_16bit);
+    RunAll<avx512bw::Kernels16>(matrices, end, stats.avx512_16bit);
   }
 #endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
   std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX512VNNI_8bit>(matrices, end, stats.avx512vnni_8bit);
+    RunAll<avx512vnni::Kernels8>(matrices, end, stats.avx512vnni_8bit);
   }
 #endif
 
@@ -193,18 +193,18 @@ int main(int, char ** argv) {
   }
   for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
     std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
-    Print<SSSE3_8bit>(stats.ssse3_8bit, i);
-    Print<AVX2_8bit>(stats.avx2_8bit, i);
+    Print<ssse3::Kernels8>(stats.ssse3_8bit, i);
+    Print<avx2::Kernels8>(stats.avx2_8bit, i);
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-    Print<AVX512_8bit>(stats.avx512_8bit, i);
+    Print<avx512bw::Kernels8>(stats.avx512_8bit, i);
 #endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-    Print<AVX512VNNI_8bit>(stats.avx512vnni_8bit, i);
+    Print<avx512vnni::Kernels8>(stats.avx512vnni_8bit, i);
 #endif
-    Print<SSE2_16bit>(stats.sse2_16bit, i);
-    Print<AVX2_16bit>(stats.avx2_16bit, i);
+    Print<sse2::Kernels16>(stats.sse2_16bit, i);
+    Print<avx2::Kernels16>(stats.avx2_16bit, i);
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-    Print<AVX512_16bit>(stats.avx512_16bit, i);
+    Print<avx512bw::Kernels16>(stats.avx512_16bit, i);
 #endif
   }
   return 0;
diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc
index 16bf67f..5f36bd7 100644
--- a/benchmarks/benchmark_quantizer.cc
+++ b/benchmarks/benchmark_quantizer.cc
@@ -1,8 +1,8 @@
-#include "../intgemm.h"
-#include "../aligned.h"
-#include "../ssse3_gemm.h"
-#include "../avx2_gemm.h"
-#include "../avx512_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
 
 #include <chrono>
 #include <iomanip>
@@ -14,7 +14,7 @@ namespace {
 
 float MaxAbsoluteBaseline(const float *begin, const float *end) {
   auto res = std::minmax_element(begin, end);
-  return std::max(fabsf(*res.first), fabsf(*res.second));
+  return std::max(std::fabs(*res.first), std::fabs(*res.second));
 }
 
 void BenchmarkMaxAbsolute() {
@@ -63,10 +63,10 @@ int main() {
   for (float &element : in) {
     element = dist(gen);
   }
-  QuantizerBench<intgemm::SSSE3_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
-  QuantizerBench<intgemm::AVX2_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+  QuantizerBench<intgemm::ssse3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+  QuantizerBench<intgemm::avx2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-  QuantizerBench<intgemm::AVX512_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+  QuantizerBench<intgemm::avx512bw::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
 #endif
   }
 }
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index da46bb9..490bf3b 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -1,5 +1,5 @@
-#include "../intgemm.h"
-#include "../aligned.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
 
 #include <chrono>
 #include <random>
 #include <iostream>
@@ -125,149 +125,149 @@ int main(int argc, char ** argv) {
     repeat = atoi(argv[1]);
   }
 
-  std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<ssse3::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(8, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(8, 2048, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(320, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(472, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(248, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(200, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 2048, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(320, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(472, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(248, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldSSSE3 = testOld<SSSE3_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldSSSE3 += testOld<SSSE3_8bit>(8, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(8, 2048, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(320, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(472, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(248, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(200, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(8, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(8, 2048, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(320, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(472, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(248, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeSSSE3 = testOld<SSSE3_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeSSSE3 += testNew<SSSE3_8bit>(8, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(8, 2048, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(320, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(472, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(248, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(200, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 2048, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(320, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(472, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(248, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<avx2::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(8, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(8, 2048, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(320, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(472, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(248, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(200, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 2048, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(320, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(472, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(248, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX2 = testOld<AVX2_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX2 += testOld<AVX2_8bit>(8, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(8, 2048, 256);
-    oldAVX2 += testOld<AVX2_8bit>(320, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(472, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(248, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(200, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(8, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(8, 2048, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(320, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(472, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(248, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeAVX2 = testOld<AVX2_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeAVX2 += testNew<AVX2_8bit>(8, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(8, 2048, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(320, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(472, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(248, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(200, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(8, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(8, 2048, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(320, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(472, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(248, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   if (kCPU < CPUType::AVX512BW) return 0;
-  std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(8, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(8, 2048, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(320, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(472, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(248, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(200, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 2048, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(320, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(472, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(248, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX512 = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512 += testOld<AVX512_8bit>(8, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(8, 2048, 256);
-    oldAVX512 += testOld<AVX512_8bit>(320, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(472, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(248, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(200, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(8, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(8, 2048, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(320, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(472, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(248, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeAVX512 = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeAVX512 += testNew<AVX512_8bit>(8, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(8, 2048, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(320, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(472, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(248, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(200, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 2048, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(320, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(472, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(248, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(200, 256, 256);
  }
 
   std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
 #endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
   if (kCPU < CPUType::AVX512VNNI) return 0;
-  std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(8, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(8, 2048, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(320, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(472, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(248, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(200, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 2048, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(320, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(472, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(248, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(8, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(8, 2048, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(320, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(472, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(248, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(200, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 2048, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(320, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(472, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(248, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeAVX512VNNI = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(8, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(8, 2048, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(320, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(472, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(248, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(200, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 2048, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(320, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(472, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(248, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
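Every block in biasmultiply.cc above follows the same pattern: warm up once on a tiny problem, then accumulate wall-clock time over `repeat` rounds of fixed shapes before printing one total. In miniature (an illustrative sketch, not library code):

```C++
#include <chrono>
#include <iostream>

// Time one call and return the elapsed wall-clock duration.
template <class F> std::chrono::duration<double> TimeOnce(F &&f) {
  auto start = std::chrono::steady_clock::now();
  f();
  return std::chrono::steady_clock::now() - start;
}

int main() {
  std::chrono::duration<double> total = TimeOnce([] { /* warm-up shape, e.g. (1, 64, 8) */ });
  for (int i = 0; i < 100; ++i) {
    total += TimeOnce([] { /* e.g. testNew<avx2::Kernels8>(8, 256, 256) */ });
  }
  std::cout << "100 iterations took: " << total.count() << " seconds." << std::endl;
  return 0;
}
```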
diff --git a/example.cc b/example.cc
--- a/example.cc
+++ b/example.cc
@@ -1,11 +1,11 @@
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
 
 // This is just for AlignedVector, which helps manage 64-byte aligned memory.
 // Feel free to manage memory yourself.
-#include "aligned.h"
-#include "callbacks.h"
+#include "intgemm/aligned.h"
+#include "intgemm/callbacks.h"
 
 #include <cassert>
-#include <math.h>
+#include <cmath>
 #include <random>
 
 int main() {
@@ -54,7 +54,7 @@ int main() {
   // Do the actual multiply.
   intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
   // Sanity check.  C will be row major.
-  assert(fabsf(C[0] - top_left_reference) < 0.05f);
+  assert(std::fabs(C[0] - top_left_reference) < 0.05f);
 }
 
 // 8-bit multiplication.
@@ -73,6 +73,7 @@ int main() {
   // Do the actual multiply.
   intgemm::Int8::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
   // Sanity check.  C will be row major.
-  assert(fabsf(C[0] - top_left_reference) < 0.05f);
+  assert(std::fabs(C[0] - top_left_reference) < 0.05f);
 }
+  return 0;
 }
diff --git a/intgemm.cc b/intgemm.cc
deleted file mode 100644
index 2708952..0000000
--- a/intgemm.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "intgemm.h"
-#include "stats.h"
-
-namespace intgemm {
-
-float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
-  throw UnsupportedCPU();
-}
-
-MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
-  throw UnsupportedCPU();
-}
-
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
-
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
-
-void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBQuantizedTransposed, AVX512_16bit::PrepareBQuantizedTransposed, AVX2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
-
-void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBTransposed, AVX512_16bit::PrepareBTransposed, AVX2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
-
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-
-const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
-
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::Quantize, AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-
-void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::QuantizeU, AVX512_8bit::QuantizeU, AVX2_8bit::QuantizeU, SSSE3_8bit::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI_8bit::PrepareB, AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-
-void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBQuantizedTransposed, AVX512_8bit::PrepareBQuantizedTransposed, AVX2_8bit::PrepareBQuantizedTransposed, SSSE3_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
-
-void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBTransposed, AVX512_8bit::PrepareBTransposed, AVX2_8bit::PrepareBTransposed, SSSE3_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
-
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI_8bit::SelectColumnsB, AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-
-const char *const Int8::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::QuantizeU, AVX512_8bit::QuantizeU, AVX2_8bit::QuantizeU, SSSE3_8bit::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-
-const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
-
-#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
-namespace avx512bw {
-using avx2::MaxAbsolute;
-using avx2::VectorMeanStd;
-} // namespace avx512bw
-#endif
-
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
-
-MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd);
-
-constexpr const char *const Unsupported_16bit::kName;
-constexpr const char *const Unsupported_8bit::kName;
-constexpr const char *const SSE2_16bit::kName;
-constexpr const char *const SSSE3_8bit::kName;
-constexpr const char *const AVX2_8bit::kName;
-constexpr const char *const AVX2_16bit::kName;
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-constexpr const char *const AVX512_8bit::kName;
-constexpr const char *const AVX512_16bit::kName;
-#endif
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-constexpr const char *const AVX512VNNI_8bit::kName;
-#endif
-
-}
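The deleted intgemm.cc above (now relocated to intgemm/intgemm.cc) picks implementations once, at static initialization: each entry point is a function pointer assigned by ChooseCPU from candidates ordered AVX512VNNI, AVX512BW, AVX2, SSSE3, SSE2, unsupported. A simplified sketch of that dispatch pattern; DetectCPU and the kernels here are hypothetical stand-ins, not intgemm's API:

```C++
#include <cstdint>

using QuantizeFn = void (*)(const float *, std::int8_t *, float, int);

// Hypothetical per-ISA kernels; intgemm's are the Kernels8/Kernels16 statics.
void QuantizeAVX2(const float *, std::int8_t *, float, int) { /* ... */ }
void QuantizeSSSE3(const float *, std::int8_t *, float, int) { /* ... */ }

enum class Tier { AVX2, SSSE3, Unsupported };
// Stub: real code queries cpuid once and caches the result.
Tier DetectCPU() { return Tier::SSSE3; }

QuantizeFn ChooseQuantize() {
  switch (DetectCPU()) {
    case Tier::AVX2: return QuantizeAVX2;
    case Tier::SSSE3: return QuantizeSSSE3;
    default: return nullptr;  // intgemm instead installs a thrower (UnsupportedCPU).
  }
}

// Resolved once at load time; every later call is a branch-free indirect call.
QuantizeFn Quantize = ChooseQuantize();
```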
diff --git a/aligned.h b/intgemm/aligned.h
index 8ad7242..7500a8c 100644
--- a/aligned.h
+++ b/intgemm/aligned.h
@@ -1,7 +1,6 @@
 #pragma once
 #include <cstdlib>
 #include <new>
-#include <stdlib.h>
 
 #ifdef _MSC_VER
 #include <malloc.h>
 #endif
diff --git a/avx2_gemm.h b/intgemm/avx2_gemm.h
index a929361..5e81475 100644
--- a/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -6,57 +6,46 @@
 #include "types.h"
 
 #include <cstdint>
-#include <stdint.h>
 #include <cstring>
 
 namespace intgemm {
-
 namespace avx2 {
 
-INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
-  return kernels::quantize(loadu_ps<__m256>(input), quant_mult_reg);
+INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+  return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
 }
 
 INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
  public:
-  typedef __m256i Register;
-
-  INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
-  INTGEMM_AVX2 Register Consecutive(const float *input) const {
-    return Tile(input, input + 8);
+  INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+    return Tile(mult_reg, input, input + 8);
   }
 
-  INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
-    return Tile(
+  INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+    return Tile(mult_reg,
       input,
       input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
   }
 
-  INTGEMM_AVX2 Register ForReshape(const float *input, Index cols) const {
+  INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
     // 8 rows in the first 128-bit register, 8 in the second register.
-    return Tile(input, input + 8 * cols);
+    return Tile(mult_reg, input, input + 8 * cols);
   }
 
  private:
-  INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
-    __m256i g0 = QuantizerGrab(input0, mult_);
-    __m256i g1 = QuantizerGrab(input1, mult_);
-    __m256i packed = _mm256_packs_epi32(g0, g1);
+  INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
+    Register g0 = QuantizerGrab(input0, mult_reg);
+    Register g1 = QuantizerGrab(input1, mult_reg);
+    Register packed = _mm256_packs_epi32(g0, g1);
     // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
     // Technically this could be removed if the PrepareB did the same reordering internally.
     return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
   }
-
-  const __m256 mult_;
 };
 
-} // namespace
-
-
-struct AVX2_16bit {
+struct Kernels16 {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
@@ -68,10 +57,10 @@ struct AVX2_16bit {
   INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile16 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 16, output += 16) {
-      *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
     }
   }
 
@@ -83,7 +72,7 @@ struct AVX2_16bit {
     PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
   }*/
   INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
-  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int16_t)
+  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
   INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t)
 
   INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
@@ -97,26 +86,21 @@ struct AVX2_16bit {
   static const CPUType kUses = CPUType::AVX2;
 };
 
-namespace avx2 {
 /* Read 8 floats at a time from input0, input1, input2, and input3.  Quantize
  * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
  * the result into one register and return it. */
 class QuantizeTile8 {
  public:
-  typedef __m256i Register;
-
-  INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
-  INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
-    return Tile(input, input + 8, input + 16, input + 24);
+  INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
+    return Tile(quant_mult, input, input + 8, input + 16, input + 24);
   }
 
-  INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
-    return TileU(input, input + 8, input + 16, input + 24);
+  INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
+    return TileU(quant_mult, input, input + 8, input + 16, input + 24);
   }
 
-  INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+  INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
     const float* inputs[4];
     for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
       while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -127,24 +111,24 @@ class QuantizeTile8 {
       input += sizeof(Register) / sizeof(float);
       cols_left -= sizeof(Register) / sizeof(float);
     }
-    return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+    return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
   }
 
-  INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
+  INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
     // Put higher rows in the second half of the register.  These will jumble
     // around in the same way then conveniently land in the right place.
-    return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+    return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
   }
 
-  INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+  INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
     // Looking at the assembly, gcc has pulled this outside the loops calling this.
     const __m256i neg127 = _mm256_set1_epi8(-127);
     const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     // Grab 4 registers at a time in 32-bit format.
-    __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-    __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-    __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-    __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+    __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+    __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+    __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+    __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
     // Pack 32-bit to 16-bit.
     __m256i packed0 = _mm256_packs_epi32(g0, g1);
     __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -161,16 +145,16 @@ class QuantizeTile8 {
 
  private:
   //A version that produces uint8_ts
-  INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+  INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
     // Looking at the assembly, gcc has pulled this outside the loops calling this.
     const __m256i neg127 = _mm256_set1_epi8(-127);
     const __m256i pos127 = _mm256_set1_epi8(127);
     const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     // Grab 4 registers at a time in 32-bit format.
-    __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-    __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-    __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-    __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+    __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+    __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+    __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+    __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
     // Pack 32-bit to 16-bit.
     __m256i packed0 = _mm256_packs_epi32(g0, g1);
     __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -185,13 +169,9 @@ class QuantizeTile8 {
     // and the values are only used for GEMM.
     return _mm256_permutevar8x32_epi32(packed, shuffle_param);
   }
-
-  const __m256 mult_;
 };
 
-} // namespace
-
-struct AVX2_8bit {
+struct Kernels8 {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
@@ -199,9 +179,9 @@ struct AVX2_8bit {
     Quantize(input, output, quant_mult, rows * cols);
   }
  private:
-  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
  public:
-  INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE(INTGEMM_AVX2)
 
   // Currently A is prepared by quantization but this could theoretically change.
   INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
@@ -212,10 +192,10 @@ struct AVX2_8bit {
   INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
     assert(size % 32 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile8 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 32, output += 32) {
-      *reinterpret_cast<__m256i*>(output) = q.ConsecutiveU(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
     }
   }
 
@@ -224,7 +204,7 @@ struct AVX2_8bit {
   static const Index kBTileCol = 8;
 
   INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
-  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int8_t)
+  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
   INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t)
 
   INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
@@ -242,4 +222,5 @@ struct AVX2_8bit {
   static const CPUType kUses = CPUType::AVX2;
 };
 
+} // namespace avx2
 } // namespace intgemm
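For reference, a scalar model of what QuantizeTile8 above computes per element (an illustrative sketch, not library code): scale, round, saturate via the pack instructions, and clamp -128 up to -127 (the role of the neg127 constant) so every magnitude fits in ±127. The SIMD path rounds with _mm256_cvtps_epi32, i.e. round-to-nearest-even, which std::nearbyint matches under the default rounding mode; the vector code additionally interleaves outputs into the GEMM layout, which a scalar version has no need for.

```C++
#include <algorithm>
#include <cmath>
#include <cstdint>

inline std::int8_t QuantizeScalar(float value, float quant_mult) {
  // Scale and round to nearest (ties to even, matching cvtps under default FP state).
  int rounded = static_cast<int>(std::nearbyint(value * quant_mult));
  // Saturate; the lower bound is -127 rather than -128 per the neg127 clamp.
  rounded = std::min(127, std::max(-127, rounded));
  return static_cast<std::int8_t>(rounded);
}
```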
class QuantizeTile16 { public: - typedef __m512i Register; - - /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - - INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { + INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) { auto input0 = input; auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0); - auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_); - auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_); + auto g0 = QuantizerGrabHalves(input0, input1, quant_mult); + auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult); auto packed = packs_epi32(g0, g1); return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); } - INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const { - __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_); - __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_); + INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) { + __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult); + __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult); __m512i packed = packs_epi32(g0, g1); // Permute within 256-bit lanes, so same as INTGEMM_AVX2 return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); } - - private: - const __m512 mult_reg_; }; class QuantizeTile8 { public: - typedef __m512i Register; - - /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - - INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { + INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) { static const __m512i neg127 = _mm512_set1_epi8(-127); static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); @@ -118,10 +101,10 @@ class QuantizeTile8 { cols_left -= sizeof(Register) / sizeof(float); } - auto g0 = QuantizerGrab(inputs[0], mult_reg_); - auto g1 = QuantizerGrab(inputs[1], mult_reg_); - auto g2 = QuantizerGrab(inputs[2], mult_reg_); - auto g3 = QuantizerGrab(inputs[3], mult_reg_); + auto g0 = QuantizerGrab(inputs[0], quant_mult); + auto g1 = QuantizerGrab(inputs[1], quant_mult); + auto g2 = QuantizerGrab(inputs[2], quant_mult); + auto g3 = QuantizerGrab(inputs[3], quant_mult); auto packed0 = packs_epi32(g0, g1); auto packed1 = packs_epi32(g2, g3); @@ -130,17 +113,17 @@ class QuantizeTile8 { return _mm512_permutexvar_epi32(shuffle_param, packed); } - INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const { + INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) { // TODO: try alternative: _mm512_cvtsepi32_epi8 ? const __m512i neg127 = _mm512_set1_epi8(-127); // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc. 
const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // 32-bit format. - __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_); - __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_); - __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_); - __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_); + __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult); + __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult); + __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult); + __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult); // Pack 32-bit to 16-bit. __m512i packed0 = packs_epi32(g0, g1); __m512i packed1 = packs_epi32(g2, g3); @@ -151,14 +134,9 @@ class QuantizeTile8 { // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63 return _mm512_permutexvar_epi32(shuffle_param, packed); } - - private: - const __m512 mult_reg_; }; -} // namespace - -struct AVX512_16bit { +struct Kernels16 { typedef int16_t Integer; // Currently A is prepared by quantization but this could theoretically change. @@ -181,7 +159,7 @@ struct AVX512_16bit { const float *end = input + size; for (; input != end; input += 16, output += 16) { // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg)); + _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg)); } } @@ -189,19 +167,15 @@ struct AVX512_16bit { // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 32; static const Index kBTileCol = 8; -/* - INTGEMM_AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols); - } -*/ + /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int16_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile16, int16_t) + INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, QuantizeTile16) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile16, int16_t) /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); + SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); } /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ @@ -212,7 +186,7 @@ struct AVX512_16bit { static const CPUType kUses = CPUType::AVX512BW; }; -struct AVX512_8bit { +struct Kernels8 { typedef int8_t Integer; // Currently A is prepared by quantization but this could theoretically change. 
@@ -237,7 +211,7 @@ struct AVX512_8bit { const std::size_t kBatch = sizeof(__m512i) / sizeof(float); #pragma omp for for (std::size_t i = 0; i < count; i += kBatch) { - __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg); + __m512i asint = QuantizerGrab(input + i, quant_mult_reg); asint = _mm512_max_epi32(asint, neg127); // There doesn't seem to be an unmasked version. _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint); @@ -263,7 +237,7 @@ struct AVX512_8bit { if (!overhang) return; // We needed a branch anyway for the empty case. const __m512i neg127 = _mm512_set1_epi32(-127); const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); - __m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg); + __m512i asint = QuantizerGrab(fast_input_end, quant_mult_reg); asint = _mm512_max_epi32(asint, neg127); _mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint); } @@ -287,7 +261,7 @@ struct AVX512_8bit { const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); const float *end = input + size; for (; input < end; input += 16, output += 16) { - __m512i asint = avx512f::QuantizerGrab(input, quant_mult_reg); + __m512i asint = QuantizerGrab(input, quant_mult_reg); asint = _mm512_min_epi32(asint, pos127); asint = _mm512_add_epi32(asint, pos127); asint = _mm512_max_epi32(asint, zero); @@ -298,26 +272,21 @@ struct AVX512_8bit { // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 64; static const Index kBTileCol = 8; -/* - INTGEMM_AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols); - }*/ + /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int8_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile8, int8_t) + INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, QuantizeTile8) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile8, int8_t) /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); + SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); } // Special AVX512 implementation due to having 32 registers (so I don't have to // allocate registers manually) and no sign instruction. template <typename Callback> INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; - //typedef __m256 Float; // For quantization we only do 8 at a time. // This is copy-paste from Multiply8_SSE2OrAVX2. assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); @@ -325,7 +294,7 @@ struct AVX512_8bit { assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); // There's 8 results for INTGEMM_AVX2 to handle. auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + const Index simd_width = width / sizeof(Register); // Added for AVX512. 
Register zeros = setzero_si<Register>(); // Go over 8 columns of B at a time. @@ -436,6 +405,7 @@ struct AVX512_8bit { static const CPUType kUses = CPUType::AVX512BW; }; +} // namespace avx512bw } // namespace intgemm #endif diff --git a/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h index 22c5c4e..c660168 100644 --- a/avx512vnni_gemm.h +++ b/intgemm/avx512vnni_gemm.h @@ -1,12 +1,13 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI #include "avx512_gemm.h" #include "types.h" namespace intgemm { +namespace avx512vnni { // Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663 INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { @@ -17,16 +18,15 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { #endif } -struct AVX512VNNI_8bit : public AVX512_8bit { +struct Kernels8 : public avx512bw::Kernels8 { template <typename Callback> INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + const Index simd_width = width / sizeof(Register); Register zeros = setzero_si<Register>(); // Go over 8 columns of B at a time. #pragma omp for @@ -82,13 +82,12 @@ struct AVX512VNNI_8bit : public AVX512_8bit { template <typename Callback> INTGEMM_AVX512VNNI static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + const Index simd_width = width / sizeof(Register); Register zeros = setzero_si<Register>(); // Go over 8 columns of B at a time. #pragma omp for @@ -124,12 +123,11 @@ struct AVX512VNNI_8bit : public AVX512_8bit { template <typename Callback> INTGEMM_AVX512VNNI static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + Index simd_width = width / sizeof(Register); Register zeros = setzero_si<Register>(); const Register a = set1_epi8<Register>(1); // Go over 8 columns of B at a time. 
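[Editor's note] The VNNI8 helper above wraps _mm512_dpbusds_epi32, spelled as inline asm on affected GCC versions to dodge the extra vmovdqa64 from GCC bug 94663. Per 32-bit lane, that instruction accumulates four unsigned-byte-times-signed-byte products with signed saturation; a scalar sketch of one lane, with an illustrative name:

#include <cstdint>
#include <limits>

// One 32-bit lane of vpdpbusds: c += a0*b0 + a1*b1 + a2*b2 + a3*b3, where a
// holds uint8_t, b holds int8_t, and the accumulation saturates to int32_t.
int32_t DpbusdsLane(int32_t c, const uint8_t a[4], const int8_t b[4]) {
  int64_t sum = c;
  for (int k = 0; k < 4; ++k) sum += static_cast<int64_t>(a[k]) * b[k];
  if (sum > std::numeric_limits<int32_t>::max()) return std::numeric_limits<int32_t>::max();
  if (sum < std::numeric_limits<int32_t>::min()) return std::numeric_limits<int32_t>::min();
  return static_cast<int32_t>(sum);
}

This fused multiply-add is why the VNNI Multiply loop needs neither the madd_epi16 pairing nor the sign trick of the older 8-bit kernels.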
@@ -164,6 +162,7 @@ struct AVX512VNNI_8bit : public AVX512_8bit { static const CPUType kUses = CPUType::AVX512VNNI; }; +} // namespace avx512vnni } // namespace intgemm #endif diff --git a/callbacks.h b/intgemm/callbacks.h index 24f9009..23d3be1 100644 --- a/callbacks.h +++ b/intgemm/callbacks.h @@ -3,7 +3,7 @@ #include "callbacks/configs.h" #include "callbacks/output_buffer_info.h" -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "intrinsics.h" #include "kernels.h" #include "types.h" diff --git a/callbacks/configs.h b/intgemm/callbacks/configs.h index 1222448..1222448 100644 --- a/callbacks/configs.h +++ b/intgemm/callbacks/configs.h diff --git a/callbacks/implementations.inl b/intgemm/callbacks/implementations.inl index d2b7d95..47d2aa4 100644 --- a/callbacks/implementations.inl +++ b/intgemm/callbacks/implementations.inl @@ -129,7 +129,14 @@ public: } CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) { - auto result = kernels::unquantize(input, unquant_mult); + // Workaround gcc 5 internal compiler error that can't read register members in debug. + vf mult_reg; +#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER) + asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult)); +#else + mult_reg = unquant_mult; +#endif + auto result = kernels::unquantize(input, mult_reg); kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx); } @@ -164,7 +171,14 @@ public: } CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) { - auto result = kernels::unquantize(input, unquant_mult); + // Workaround gcc 5 internal compiler error that can't read register members in debug. + vf mult_reg; +#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER) + asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult)); +#else + mult_reg = unquant_mult; +#endif + auto result = kernels::unquantize(input, mult_reg); result = kernels::add_bias(result, config.bias_addr, info.col_idx); kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx); } diff --git a/callbacks/output_buffer_info.h b/intgemm/callbacks/output_buffer_info.h index 213aef4..213aef4 100644 --- a/callbacks/output_buffer_info.h +++ b/intgemm/callbacks/output_buffer_info.h diff --git a/interleave.h b/intgemm/interleave.h index 231be46..1ec686b 100644 --- a/interleave.h +++ b/intgemm/interleave.h @@ -1,12 +1,11 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "intrinsics.h" #include "types.h" #include <algorithm> #include <cassert> -#include <stdint.h> namespace intgemm { @@ -180,11 +179,9 @@ template <class Register> static inline void Transpose8InLane( // ... ... 
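[Editor's note] The callbacks change above works around a GCC 5 internal compiler error: debug (unoptimized) builds crash when a vector-register-typed class member is read directly, so the member is loaded through inline asm on that compiler only. A minimal sketch of the pattern, with illustrative member and method names and __m128 standing in for the width-generic vf:

#include <immintrin.h>

struct UnquantizeState {
  __m128 unquant_mult;  // vector-typed member that trips the GCC 5 ICE

  __m128 LoadMult() const {
    __m128 mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
    // Only unoptimized GCC 5 takes this path; the load goes through memory.
    asm("movaps %1, %0" : "=x"(mult_reg) : "m"(unquant_mult));
#else
    mult_reg = unquant_mult;  // every other compiler reads the member directly
#endif
    return mult_reg;
  }
};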
#define INTGEMM_PREPARE_B_8(target, QuantClass) \ target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \ - typedef typename QuantClass Quantizer; \ - typedef typename Quantizer::Register Register; \ - Quantizer q = Quantizer(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ /* Currently all multipliers have a stride of 8 columns.*/ \ - const int kColStride = 8; \ + const Index kColStride = 8; \ assert(cols % kColStride == 0); \ assert(rows % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ @@ -196,14 +193,14 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl This isn't quite Transpose8InLane because it's half the number of columns, \ so each register starts with two rows instead of being one row. \ The quantizers know to skip a row.*/ \ - output[0] = q.ForReshape(input + cols * (r ) + c, cols); \ - output[1] = q.ForReshape(input + cols * (r + 1) + c, cols); \ - output[2] = q.ForReshape(input + cols * (r + 4) + c, cols); \ - output[3] = q.ForReshape(input + cols * (r + 5) + c, cols); \ - output[4] = q.ForReshape(input + cols * (r + 8) + c, cols); \ - output[5] = q.ForReshape(input + cols * (r + 9) + c, cols); \ - output[6] = q.ForReshape(input + cols * (r + 12) + c, cols); \ - output[7] = q.ForReshape(input + cols * (r + 13) + c, cols); \ + output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \ + output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \ + output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \ + output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \ + output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \ + output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \ + output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \ + output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \ Interleave8(output[0], output[1]); \ Interleave8(output[2], output[3]); \ Interleave8(output[4], output[5]); \ @@ -215,9 +212,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl #define INTGEMM_PREPARE_B_16(target, QuantClass) \ target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \ - typedef typename QuantClass Quantizer; \ - typedef typename Quantizer::Register Register; \ - Quantizer q = Quantizer(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ assert(cols % 8 == 0); \ assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ @@ -226,8 +221,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f for (Index c = 0; c < cols; c += 8) { \ for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \ /* gcc unrolls this loop and uses registers for output[k]*/ \ - for (int k = 0; k < 8; ++k) { \ - output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \ + for (Index k = 0; k < 8; ++k) { \ + output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \ } \ Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \ } \ @@ -241,9 +236,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f * * cols and rows describe size of transposed B. 
*/ -#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, cpu_type, Integer) \ +#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, Integer) \ target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \ - using Register = vector_t<cpu_type, Integer>; \ const Index RegisterElems = sizeof(Register) / sizeof(Integer); \ const Index kColStride = 8; \ \ @@ -268,7 +262,6 @@ target static inline void PrepareBQuantizedTransposed(const Integer* input, Inte */ #define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, Integer) \ target static inline void PrepareBTransposed(const float* input, Integer* output, float quant_mult, Index cols, Index rows) { \ - using Register = typename Quantizer::Register; \ const Index RegisterElemsInt = sizeof(Register) / sizeof(Integer); \ const Index kColStride = 8; \ \ @@ -277,13 +270,13 @@ target static inline void PrepareBTransposed(const float* input, Integer* output assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ \ - Quantizer quantizer(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ Register* output_it = reinterpret_cast<Register*>(output); \ Index r = 0; \ Index c = 0; \ while (r < rows) { \ for (Index ri = 0; ri < 8; ++ri) \ - *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \ + *output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \ c += RegisterElemsInt; \ while (c >= cols) { \ r += kColStride; \ @@ -299,14 +292,14 @@ target static inline void SelectColumnsOfB(const Register *input, Register *outp assert(rows_bytes % sizeof(Register) == 0); \ assert((cols_end - cols_begin) % 8 == 0); \ /* Do columns for multiples of 8.*/ \ - int register_rows = rows_bytes / sizeof(Register); \ + Index register_rows = rows_bytes / sizeof(Register); \ const Register *starts[8]; \ for (; cols_begin != cols_end; cols_begin += 8) { \ - for (int k = 0; k < 8; ++k) { \ + for (Index k = 0; k < 8; ++k) { \ starts[k] = input + (cols_begin[k] & 7) + (cols_begin[k] & ~7) * register_rows; \ } \ - for (int r = 0; r < register_rows; ++r) { \ - for (int k = 0; k < 8; ++k) { \ + for (Index r = 0; r < register_rows; ++r) { \ + for (Index k = 0; k < 8; ++k) { \ *(output++) = *starts[k]; \ starts[k] += 8; \ } \ diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc new file mode 100644 index 0000000..f859b9a --- /dev/null +++ b/intgemm/intgemm.cc @@ -0,0 +1,71 @@ +#include "intgemm.h" +#include "stats.h" + +namespace intgemm { + +float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) { + throw UnsupportedCPU(); +} + +MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) { + throw UnsupportedCPU(); +} + +void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize); + +void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB); + +void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index 
B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); + +void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); + +void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB); + +const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName); + +void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); + +void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); + +void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); + +void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); + +void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); + +void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); + +const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); + +void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = 
ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); + +const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); + +const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED); + +#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW) +namespace avx512bw { +using avx2::MaxAbsolute; +using avx2::VectorMeanStd; +} // namespace avx512bw +#endif + +float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute); + +MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd); + +constexpr const char *const Unsupported_16bit::kName; +constexpr const char *const Unsupported_8bit::kName; +constexpr const char *const sse2::Kernels16::kName; +constexpr const char *const ssse3::Kernels8::kName; +constexpr const char *const avx2::Kernels8::kName; +constexpr const char *const avx2::Kernels16::kName; +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW +constexpr const char *const avx512bw::Kernels8::kName; +constexpr const char *const avx512bw::Kernels16::kName; +#endif +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI +constexpr const char *const avx512vnni::Kernels8::kName; +#endif + +} diff --git a/intgemm.h b/intgemm/intgemm.h index db7d2fe..8e2da02 100644 --- a/intgemm.h +++ b/intgemm/intgemm.h @@ -39,11 +39,9 @@ * passing unquant_mult = \lambda / (A_quant_mult * B_quant_mult). */ -// Yes, both headers due to the debacle about int32_t #include <cstdint> -#include <stdint.h> -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "types.h" #include "sse2_gemm.h" #include "ssse3_gemm.h" @@ -126,7 +124,15 @@ struct Unsupported_8bit { #ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI // These won't ever be called in this capacity, but it does let the code below compile. 
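[Editor's note] All the function pointers above are bound once, at static initialization, by the ChooseCPU idiom: one candidate per CPU tier, newest first, and the first tier the host supports wins. A self-contained sketch of the idiom follows; real intgemm detects the CPU at runtime via CPUID, whereas this stand-in uses compile-time macros purely to stay compilable on its own:

enum class CPUType { UNSUPPORTED, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

inline CPUType DetectCPU() {
#if defined(__AVX512VNNI__)
  return CPUType::AVX512VNNI;
#elif defined(__AVX512BW__)
  return CPUType::AVX512BW;
#elif defined(__AVX2__)
  return CPUType::AVX2;
#elif defined(__SSSE3__)
  return CPUType::SSSE3;
#elif defined(__SSE2__)
  return CPUType::SSE2;
#else
  return CPUType::UNSUPPORTED;
#endif
}

// One argument per tier, newest first; returns the best supported candidate.
template <class T>
T ChooseCPU(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
  switch (DetectCPU()) {
    case CPUType::AVX512VNNI: return avx512vnni;
    case CPUType::AVX512BW:   return avx512bw;
    case CPUType::AVX2:       return avx2;
    case CPUType::SSSE3:      return ssse3;
    case CPUType::SSE2:       return sse2;
    default:                  return unsupported;
  }
}

Note how the tables above exploit the argument order: 16-bit kernels reuse the sse2 implementation for both SSE2 and SSSE3 slots, and 8-bit kernels fall back to Unsupported_8bit below SSSE3 because pmaddubsw is unavailable there.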
-typedef Unsupported_8bit AVX512VNNI_8bit; +namespace avx512vnni { +typedef Unsupported_8bit Kernels8; +} // namespace avx512vnni +#endif +#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW +namespace avx512bw { +typedef Unsupported_8bit Kernels8; +typedef Unsupported_16bit Kernels16; +} // namespace avx512bw #endif /* Returns: @@ -274,7 +280,7 @@ private: }; template <typename Callback> -void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI_8bit>, OMPParallelWrap<Callback, AVX512_8bit>, OMPParallelWrap<Callback, AVX2_8bit>, OMPParallelWrap<Callback, SSSE3_8bit>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>); +void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512vnni::Kernels8>, OMPParallelWrap<Callback, avx512bw::Kernels8>, OMPParallelWrap<Callback, avx2::Kernels8>, OMPParallelWrap<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>); /* * 8-bit matrix multiplication with shifting A by 127 @@ -338,14 +344,14 @@ private: template <class Callback> void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU( - OMPParallelWrap8Shift<Callback, AVX512VNNI_8bit>, - OMPParallelWrap8Shift<Callback, AVX512_8bit>, - OMPParallelWrap8Shift<Callback, AVX2_8bit>, - OMPParallelWrap8Shift<Callback, SSSE3_8bit>, + OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>, + OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>, + OMPParallelWrap8Shift<Callback, avx2::Kernels8>, + OMPParallelWrap8Shift<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>); template <class Callback> -void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); +void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias<Callback>, avx512bw::Kernels8::PrepareBias<Callback>, avx2::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); /* * 16-bit matrix multiplication @@ -401,7 +407,7 @@ private: }; template <typename Callback> -void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512_16bit> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512_16bit>, OMPParallelWrap<Callback, AVX2_16bit>, OMPParallelWrap<Callback, SSE2_16bit>, OMPParallelWrap<Callback, SSE2_16bit>, Unsupported_16bit::Multiply<Callback>); +void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO VNNI 16-bit. 
*/, OMPParallelWrap<Callback, avx512bw::Kernels16>, OMPParallelWrap<Callback, avx2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, Unsupported_16bit::Multiply<Callback>); extern const CPUType kCPU; diff --git a/intgemm_config.h.in b/intgemm/intgemm_config.h.in index 920e9ae..920e9ae 100644 --- a/intgemm_config.h.in +++ b/intgemm/intgemm_config.h.in diff --git a/intrinsics.h b/intgemm/intrinsics.h index bf79e43..480f421 100644 --- a/intrinsics.h +++ b/intgemm/intrinsics.h @@ -1,6 +1,6 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "types.h" #include <tmmintrin.h> @@ -9,7 +9,6 @@ #include <xmmintrin.h> #include <cstdint> -#include <stdint.h> /* * NOTE: Please keep intrinsics in alphabetical order. @@ -162,17 +161,17 @@ template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() { INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) { return _mm_sign_epi8(first, second); } -INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a, int8_t b) { - return _mm_slli_epi16(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a) { + return _mm_slli_epi16(a, imm8); } -INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a, int8_t b) { - return _mm_srai_epi16(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a) { + return _mm_srai_epi16(a, imm8); } -INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a, int8_t b) { - return _mm_srai_epi32(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a) { + return _mm_srai_epi32(a, imm8); } -INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a, int8_t b) { - return _mm_srli_epi16(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a) { + return _mm_srli_epi16(a, imm8); } INTGEMM_SSE2 static inline void storeu_ps(float* mem_addr, __m128 a) { _mm_storeu_ps(mem_addr, a); @@ -343,17 +342,17 @@ template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() { INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) { return _mm256_sign_epi8(first, second); } -INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a, int8_t b) { - return _mm256_slli_epi16(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a) { + return _mm256_slli_epi16(a, imm8); } -INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a, int8_t b) { - return _mm256_srai_epi16(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a) { + return _mm256_srai_epi16(a, imm8); } -INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a, int8_t b) { - return _mm256_srai_epi32(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a) { + return _mm256_srai_epi32(a, imm8); } -INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a, int8_t b) { - return _mm256_srli_epi16(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a) { + return _mm256_srli_epi16(a, imm8); } INTGEMM_AVX2 static inline void storeu_ps(float* mem_addr, __m256 a) { _mm256_storeu_ps(mem_addr, a); @@ -540,17 +539,17 @@ template <> INTGEMM_AVX512BW inline __m512 load_ps<__m512>(const float* from) { /* * Missing sign_epi8 */ -INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a, int8_t b) { - return _mm512_slli_epi16(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a) { + return _mm512_slli_epi16(a, imm8); } -INTGEMM_AVX512BW 
static inline __m512i srai_epi16(__m512i a, int8_t b) { - return _mm512_srai_epi16(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi16(__m512i a) { + return _mm512_srai_epi16(a, imm8); } -INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a, int8_t b) { - return _mm512_srai_epi32(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a) { + return _mm512_srai_epi32(a, imm8); } -INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a, int8_t b) { - return _mm512_srli_epi16(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a) { + return _mm512_srli_epi16(a, imm8); } INTGEMM_AVX512BW static inline void storeu_ps(float* mem_addr, __m512 a) { _mm512_storeu_ps(mem_addr, a); diff --git a/kernels.h b/intgemm/kernels.h index 84631b5..ee35966 100644 --- a/kernels.h +++ b/intgemm/kernels.h @@ -1,6 +1,6 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "intrinsics.h" #include "types.h" #include "utils.h" diff --git a/kernels/implementations.inl b/intgemm/kernels/implementations.inl index 2ec9f1f..4f1b39f 100644 --- a/kernels/implementations.inl +++ b/intgemm/kernels/implementations.inl @@ -145,8 +145,8 @@ CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply(vector_t<CPUTy template <> CPU_ATTR inline vi multiply<int8_t>(vi a, vi b) { auto even = mullo_epi16(a, b); - auto odd = mullo_epi16(srli_epi16(a, 8), srli_epi16(b, 8)); - return or_si(slli_epi16(odd, 8), srli_epi16(slli_epi16(even, 8), 8)); + auto odd = mullo_epi16(srli_epi16<8>(a), srli_epi16<8>(b)); + return or_si(slli_epi16<8>(odd), srli_epi16<8>(slli_epi16<8>(even))); } template <> @@ -235,7 +235,7 @@ CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int16_t> upcast8to16(vi inpu static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0); input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input))); - auto negatives = _mm512_cmp_epi8_mask(input, vzero, _MM_CMPINT_LT); + auto negatives = _mm512_cmp_epi8_mask(input, vzero, 1 /* _MM_CMPINT_LT */); auto higher_byte = _mm512_mask_blend_epi8(negatives, vzero, vmax_negative); #endif @@ -258,7 +258,7 @@ CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int> upcast16to32(vi input) static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0); input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input))); - auto negatives = _mm512_cmp_epi16_mask(input, vzero, _MM_CMPINT_LT); + auto negatives = _mm512_cmp_epi16_mask(input, vzero, 1 /* _MM_CMPINT_LT */); auto higher_byte = _mm512_mask_blend_epi16(negatives, vzero, vmax_negative); #endif @@ -296,32 +296,6 @@ CPU_ATTR static inline vi bitwise_not(vi v) { } /* - * Multiply with saturation (elemwise) - */ -template <typename Type> -CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply_sat(vector_t<CPUType::CPU_NAME, Type> a, vector_t<CPUType::CPU_NAME, Type> b, uint8_t right_shift); - -template <> -CPU_ATTR inline vi multiply_sat<int8_t>(vi a, vi b, uint8_t right_shift) { - auto upcasted_a = upcast8to16(a); - auto upcasted_b = upcast8to16(b); - auto low = srai_epi16(multiply<int16_t>(upcasted_a.first, upcasted_b.first), right_shift); - auto hi = srai_epi16(multiply<int16_t>(upcasted_a.second, upcasted_b.second), right_shift); - - return downcast16to8(low, hi); -} - -template <> -CPU_ATTR inline vi multiply_sat<int16_t>(vi a, vi b, uint8_t right_shift) { - auto upcasted_a = 
upcast16to32(a); - auto upcasted_b = upcast16to32(b); - auto low = srai_epi32(multiply<int32_t>(upcasted_a.first, upcasted_b.first), right_shift); - auto hi = srai_epi32(multiply<int32_t>(upcasted_a.second, upcasted_b.second), right_shift); - - return downcast32to16(low, hi); -} - -/* * Floor */ CPU_ATTR static inline vf floor(vf input) { diff --git a/multiply.h b/intgemm/multiply.h index 80940d4..e201e09 100644 --- a/multiply.h +++ b/intgemm/multiply.h @@ -1,6 +1,6 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "interleave.h" #include "intrinsics.h" #include "vec_traits.h" @@ -47,16 +47,16 @@ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i p // Quantize function used for SSSE3 and AVX2. // Separate function for thread to work around gcc 7 bug that doesn't imbue // target attributes across #pragma omp parallel. -#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \ +#define INTGEMM_QUANTIZE_THREAD(target) \ target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \ - name::QuantizeTile8 q(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ INTGEMM_OMP_FOR \ for (std::size_t i = 0; i < count; i += sizeof(Register)) { \ - *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \ + *reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \ } \ } -#define INTGEMM_QUANTIZE(target, Register, name) \ +#define INTGEMM_QUANTIZE(target) \ target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ @@ -68,7 +68,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa } \ std::size_t overhang = size & (kBatch - 1); \ if (!overhang) return; \ - name::QuantizeTile8 q(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ /* Each does size(Register) / 32 == kBatch / 4 floats at a time. * If we're allowed to read one of them, then we can read the whole register. */ \ const float *inputs[4]; \ @@ -80,7 +80,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa for (; i < 4; ++i) { \ inputs[i] = &input[fast_end]; \ } \ - Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \ + Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \ std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \ } @@ -159,7 +159,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / (sizeof(Register) / sizeof(int16_t)); \ + const Index simd_width = width / (sizeof(Register) / sizeof(int16_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ INTGEMM_OMP_FOR \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \ @@ -169,7 +169,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \ /* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. 
\ Iterate over shared (inner) dimension.*/ \ - int k = 0; \ + Index k = 0; \ Register a = *(A_row + k); \ Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \ Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \ @@ -216,7 +216,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \ assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ + const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ const Register a = set1_epi8<Register>(1); \ INTGEMM_OMP_FOR \ @@ -225,7 +225,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const /*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \ /* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \ Iterate over shared (inner) dimension.*/ \ - int k = 0; \ + Index k = 0; \ Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ @@ -291,7 +291,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ + const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ INTGEMM_OMP_FOR \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \ @@ -301,7 +301,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \ /* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. 
\ Iterate over shared (inner) dimension.*/ \ - int k = 0; \ + Index k = 0; \ Register a = *(A_row + k); \ Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ @@ -538,7 +538,7 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / sizeof(Register); \ + const Index simd_width = width / sizeof(Register); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ INTGEMM_OMP_FOR \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \ diff --git a/sse2_gemm.h b/intgemm/sse2_gemm.h index 69f6741..cd49efe 100644 --- a/sse2_gemm.h +++ b/intgemm/sse2_gemm.h @@ -5,12 +5,10 @@ #include "types.h" #include <cstdint> -#include <stdint.h> // 8 bit is in ssse3_gemm.h namespace intgemm { - namespace sse2 { INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { @@ -21,37 +19,30 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i) class QuantizeTile16 { public: - typedef __m128i Register; - - INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - - INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const { - return Tile(input, input + 4); + INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) { + return Tile(mult_reg, input, input + 4); } - INTGEMM_SSE2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { - return Tile( + INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) { + return Tile(mult_reg, input, input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0)); } - INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const { - return Consecutive(input); + INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) { + return Consecutive(mult_reg, input); } private: - INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const { - __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input1, mult_reg_); + INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) { + __m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg); + __m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg); return _mm_packs_epi32(g0, g1); } - - const __m128 mult_reg_; }; -} //namespace -// This should be pure INTGEMM_SSE2 (and below). -struct SSE2_16bit { +// This should be pure SSE2 (and below). +struct Kernels16 { typedef int16_t Integer; // Currently A is prepared by quantization but this could theoretically change. 
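[Editor's note] The 16-bit Multiply loops above are built on madd_epi16, which multiplies adjacent pairs of 16-bit lanes and sums each pair into a 32-bit lane. A scalar model of one output lane, with an illustrative name:

#include <cstdint>

// One 32-bit lane of madd_epi16: the sum of two adjacent 16-bit products.
// Computed in 64 bits here because the single corner case (both pairs being
// -32768 * -32768) overflows int32_t; the hardware produces INT32_MIN there,
// which the unsigned cast below reproduces.
int32_t MaddEpi16Lane(int16_t a0, int16_t b0, int16_t a1, int16_t b1) {
  int64_t sum = static_cast<int64_t>(a0) * b0 + static_cast<int64_t>(a1) * b1;
  return static_cast<int32_t>(static_cast<uint32_t>(sum));
}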
@@ -63,10 +54,10 @@ struct SSE2_16bit { assert(size % 8 == 0); assert(reinterpret_cast<uintptr_t>(input) % 16 == 0); assert(reinterpret_cast<uintptr_t>(output) % 16 == 0); - sse2::QuantizeTile16 q(quant_mult); + FRegister q = set1_ps<FRegister>(quant_mult); const float *end = input + size; for (; input != end; input += 8, output += 8) { - *reinterpret_cast<__m128i*>(output) = q.Consecutive(input); + *reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input); } } @@ -74,13 +65,13 @@ struct SSE2_16bit { static const Index kBTileRow = 8; static const Index kBTileCol = 8; - INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, CPUType::SSE2, int16_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, sse2::QuantizeTile16, int16_t) + INTGEMM_PREPARE_B_16(INTGEMM_SSE2, QuantizeTile16) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, QuantizeTile16, int16_t) INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { //TODO #DEFINE - sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end); + SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end); } INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2) @@ -89,4 +80,5 @@ struct SSE2_16bit { static const CPUType kUses = CPUType::SSE2; }; +} // namespace sse2 } // namespace intgemm diff --git a/ssse3_gemm.h b/intgemm/ssse3_gemm.h index fd3ab8c..865fe12 100644 --- a/ssse3_gemm.h +++ b/intgemm/ssse3_gemm.h @@ -6,13 +6,11 @@ #include "types.h" #include <cstdint> -#include <stdint.h> #include <cstring> // 16-bit is in sse2_gemm.h namespace intgemm { - namespace ssse3 { INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { @@ -23,24 +21,20 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i) class QuantizeTile8 { public: - typedef __m128i Register; - - INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - - INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const { + INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) { // Skip a row. 
- return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4); + return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4); } - INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const { - return Tile(input, input + 4, input + 8, input + 12); + INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) { + return Tile(mult_reg, input, input + 4, input + 8, input + 12); } - INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const { - return TileU(input, input + 4, input + 8, input + 12); + INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) { + return TileU(mult_reg, input, input + 4, input + 8, input + 12); } - INTGEMM_SSSE3 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { + INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) { const float* inputs[4]; for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { while (cols_left < sizeof(Register) / sizeof(float)) { @@ -51,16 +45,16 @@ class QuantizeTile8 { input += sizeof(Register) / sizeof(float); cols_left -= sizeof(Register) / sizeof(float); } - return Tile(inputs[0], inputs[1], inputs[2], inputs[3]); + return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]); } // Quantize 16xfloat into 16xint8_t - INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const { + INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) { const __m128i neg128 = _mm_set1_epi8(-128); - __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input1, mult_reg_); - __m128i g2 = QuantizerGrab(input2, mult_reg_); - __m128i g3 = QuantizerGrab(input3, mult_reg_); + __m128i g0 = QuantizerGrab(input0, mult_reg); + __m128i g1 = QuantizerGrab(input1, mult_reg); + __m128i g2 = QuantizerGrab(input2, mult_reg); + __m128i g3 = QuantizerGrab(input3, mult_reg); __m128i packed0 = _mm_packs_epi32(g0, g1); __m128i packed1 = _mm_packs_epi32(g2, g3); __m128i packed = _mm_packs_epi16(packed0, packed1); @@ -78,13 +72,13 @@ class QuantizeTile8 { } private: - INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const { + INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) { const __m128i neg128 = _mm_set1_epi8(-128); const __m128i pos127 = _mm_set1_epi8(127); - __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input1, mult_reg_); - __m128i g2 = QuantizerGrab(input2, mult_reg_); - __m128i g3 = QuantizerGrab(input3, mult_reg_); + __m128i g0 = QuantizerGrab(input0, mult_reg); + __m128i g1 = QuantizerGrab(input1, mult_reg); + __m128i g2 = QuantizerGrab(input2, mult_reg); + __m128i g3 = QuantizerGrab(input3, mult_reg); __m128i packed0 = _mm_packs_epi32(g0, g1); __m128i packed1 = _mm_packs_epi32(g2, g3); __m128i packed = _mm_packs_epi16(packed0, packed1); @@ -100,15 +94,10 @@ class QuantizeTile8 { return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127); // No permute needed. packs is in order for SSE. } - - private: - const __m128 mult_reg_; }; -} // namespace - -// pmaddubsw (the 8-bit multiply) is INTGEMM_SSSE3, so pedantically that's the version we need. 
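[Editor's note] As the comment above says, pmaddubsw (maddubs_epi16) is what pins the 8-bit path to SSSE3: it multiplies an unsigned byte by a signed byte pairwise and sums each pair into a 16-bit lane with signed saturation. A scalar model of one lane, with an illustrative name:

#include <cstdint>

// One 16-bit lane of pmaddubsw: two unsigned-times-signed byte products,
// summed with saturation to the int16_t range. The unsigned operand is why
// the Shift variant biases A by 127 into unsigned range (QuantizeU above).
int16_t MaddubsLane(uint8_t a0, int8_t b0, uint8_t a1, int8_t b1) {
  int32_t sum = static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
  if (sum > 32767) return 32767;
  if (sum < -32768) return -32768;
  return static_cast<int16_t>(sum);
}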
-struct SSSE3_8bit { +// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need. +struct Kernels8 { typedef int8_t Integer; // Currently A is prepared by quantization but this could theoretically change. @@ -117,9 +106,9 @@ struct SSSE3_8bit { } private: - INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3) + INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3) public: - INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3) + INTGEMM_QUANTIZE(INTGEMM_SSSE3) // Version with unsigned int + 127 // Currently A is prepared by quantization but this could theoretically change. @@ -131,10 +120,10 @@ struct SSSE3_8bit { assert(size % 16 == 0); assert(reinterpret_cast<uintptr_t>(input) % 16 == 0); assert(reinterpret_cast<uintptr_t>(output) % 16 == 0); - ssse3::QuantizeTile8 q(quant_mult); + FRegister q = set1_ps<FRegister>(quant_mult); const float *end = input + size; for (; input != end; input += 16, output += 16) { - *reinterpret_cast<__m128i*>(output) = q.ConsecutiveU(input); + *reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input); } } @@ -143,8 +132,8 @@ struct SSSE3_8bit { static const Index kBTileCol = 8; INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, CPUType::SSE2, int8_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t) INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); @@ -161,4 +150,5 @@ struct SSSE3_8bit { static const CPUType kUses = CPUType::SSSE3; }; +} // namespace ssse3 } // namespace intgemm diff --git a/stats.h b/intgemm/stats.h index 6f9eda2..6f9eda2 100644 --- a/stats.h +++ b/intgemm/stats.h diff --git a/stats.inl b/intgemm/stats.inl index 7fc7afb..d6a850e 100644 --- a/stats.inl +++ b/intgemm/stats.inl @@ -54,7 +54,7 @@ INTGEMM_TARGET static inline float MaxAbsolute(const float *begin_float, const f } #else for (const float *i = end_reg; i < end_float; ++i) { - ret = std::max(ret, fabsf(*i)); + ret = std::max(ret, std::fabs(*i)); } #endif return ret; diff --git a/types.h b/intgemm/types.h index da0429f..da0429f 100644 --- a/types.h +++ b/intgemm/types.h diff --git a/utils.h b/intgemm/utils.h index a520ea0..a520ea0 100644 --- a/utils.h +++ b/intgemm/utils.h diff --git a/vec_traits.h b/intgemm/vec_traits.h index 86265b2..86265b2 100644 --- a/vec_traits.h +++ b/intgemm/vec_traits.h diff --git a/test/3rd_party/LICENSE_1_0.txt b/test/3rd_party/LICENSE_1_0.txt new file mode 100644 index 0000000..7925d62 --- /dev/null +++ b/test/3rd_party/LICENSE_1_0.txt @@ -0,0 +1,24 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, 
and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + diff --git a/3rd_party/catch.hpp b/test/3rd_party/catch.hpp index 1850fff..1850fff 100644 --- a/3rd_party/catch.hpp +++ b/test/3rd_party/catch.hpp diff --git a/test/add127_test.cc b/test/add127_test.cc index 28c241f..b7ce49b 100644 --- a/test/add127_test.cc +++ b/test/add127_test.cc @@ -282,51 +282,51 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In // Bias TEST_CASE("PrepareBias SSSE3", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestPrepareBias<SSSE3_8bit>(256,256); - TestPrepareBias<SSSE3_8bit>(2048,256); - TestPrepareBias<SSSE3_8bit>(512,512); + TestPrepareBias<ssse3::Kernels8>(256,256); + TestPrepareBias<ssse3::Kernels8>(2048,256); + TestPrepareBias<ssse3::Kernels8>(512,512); } TEST_CASE("PrepareBias AVX2", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestPrepareBias<AVX2_8bit>(256,256); - TestPrepareBias<AVX2_8bit>(2048,256); - TestPrepareBias<AVX2_8bit>(512,512); + TestPrepareBias<avx2::Kernels8>(256,256); + TestPrepareBias<avx2::Kernels8>(2048,256); + TestPrepareBias<avx2::Kernels8>(512,512); } TEST_CASE("PrepareBias AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestPrepareBias<AVX512_8bit>(256,256); - TestPrepareBias<AVX512_8bit>(2048,256); - TestPrepareBias<AVX512_8bit>(512,512); + TestPrepareBias<avx512bw::Kernels8>(256,256); + TestPrepareBias<avx512bw::Kernels8>(2048,256); + TestPrepareBias<avx512bw::Kernels8>(512,512); #endif } //A TEST_CASE("PrepareA SSSE3", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestPrepareA<SSSE3_8bit>(64,64); - TestPrepareA<SSSE3_8bit>(256,256); - TestPrepareA<SSSE3_8bit>(512,512); - TestPrepareA<SSSE3_8bit>(2048,256); + TestPrepareA<ssse3::Kernels8>(64,64); + TestPrepareA<ssse3::Kernels8>(256,256); + TestPrepareA<ssse3::Kernels8>(512,512); + TestPrepareA<ssse3::Kernels8>(2048,256); } TEST_CASE("PrepareA AVX2", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestPrepareA<AVX2_8bit>(64,64); - TestPrepareA<AVX2_8bit>(256,256); - TestPrepareA<AVX2_8bit>(512,512); - TestPrepareA<AVX2_8bit>(2048,256); + TestPrepareA<avx2::Kernels8>(64,64); + TestPrepareA<avx2::Kernels8>(256,256); + TestPrepareA<avx2::Kernels8>(512,512); + TestPrepareA<avx2::Kernels8>(2048,256); } TEST_CASE("PrepareA AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestPrepareA<AVX512_8bit>(64,64); - TestPrepareA<AVX512_8bit>(256,256); - TestPrepareA<AVX512_8bit>(512,512); - TestPrepareA<AVX512_8bit>(2048,256); + TestPrepareA<avx512bw::Kernels8>(64,64); + TestPrepareA<avx512bw::Kernels8>(256,256); + TestPrepareA<avx512bw::Kernels8>(512,512); + TestPrepareA<avx512bw::Kernels8>(2048,256); #endif } @@ -334,144 +334,144 @@ TEST_CASE("PrepareA AVX512F", "[Add127]") { TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - 
TestMultiplyBiasNew<SSSE3_8bit>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f); - TestMultiplyBiasNew<SSSE3_8bit>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f); - TestMultiplyBiasNew<SSSE3_8bit>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f); - TestMultiplyBiasNew<SSSE3_8bit>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<SSSE3_8bit>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); - TestMultiplyBiasNew<SSSE3_8bit>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<SSSE3_8bit>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); + TestMultiplyBiasNew<ssse3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f); + TestMultiplyBiasNew<ssse3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f); + TestMultiplyBiasNew<ssse3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f); + TestMultiplyBiasNew<ssse3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<ssse3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); + TestMultiplyBiasNew<ssse3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<ssse3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); } TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBiasNew<AVX2_8bit>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f); - TestMultiplyBiasNew<AVX2_8bit>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f); - TestMultiplyBiasNew<AVX2_8bit>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f); - TestMultiplyBiasNew<AVX2_8bit>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<AVX2_8bit>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); - TestMultiplyBiasNew<AVX2_8bit>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<AVX2_8bit>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); + TestMultiplyBiasNew<avx2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f); + TestMultiplyBiasNew<avx2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f); + TestMultiplyBiasNew<avx2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f); + TestMultiplyBiasNew<avx2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<avx2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); + TestMultiplyBiasNew<avx2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<avx2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBiasNew<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + 
TestMultiplyBiasNew<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyBiasNew<AVX512VNNI_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif //Multiply old vs new TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyShiftNonShift<SSSE3_8bit>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<SSSE3_8bit>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); - TestMultiplyShiftNonShift<SSSE3_8bit>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad - TestMultiplyShiftNonShift<SSSE3_8bit>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f); - TestMultiplyShiftNonShift<SSSE3_8bit>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f); - TestMultiplyShiftNonShift<SSSE3_8bit>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f); - TestMultiplyShiftNonShift<SSSE3_8bit>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad + TestMultiplyShiftNonShift<ssse3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f); } TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyShiftNonShift<AVX2_8bit>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX2_8bit>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); - TestMultiplyShiftNonShift<AVX2_8bit>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad - TestMultiplyShiftNonShift<AVX2_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftNonShift<AVX2_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - 
TestMultiplyShiftNonShift<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftNonShift<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad + TestMultiplyShiftNonShift<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyShiftNonShift<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f); - TestMultiplyShiftNonShift<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(472, 256, 256, 
0.00001f, 0.33f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f); } #endif //Multiply Shift vs int shift implementation TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyShiftInt<SSSE3_8bit>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyShiftInt<AVX2_8bit>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyShiftInt<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); + 
TestMultiplyShiftInt<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyShiftInt<AVX512VNNI_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); } #endif diff --git a/test/kernels/add_bias_test.cc b/test/kernels/add_bias_test.cc index 7c299f0..492c669 100644 --- a/test/kernels/add_bias_test.cc +++ b/test/kernels/add_bias_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <numeric> diff --git a/test/kernels/bitwise_not_test.cc b/test/kernels/bitwise_not_test.cc index 1408db3..e908c43 100644 --- a/test/kernels/bitwise_not_test.cc +++ b/test/kernels/bitwise_not_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdlib> #include <numeric> diff --git a/test/kernels/downcast_test.cc b/test/kernels/downcast_test.cc index 6c1f3ab..5f9db66 100644 --- a/test/kernels/downcast_test.cc +++ b/test/kernels/downcast_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/exp_test.cc b/test/kernels/exp_test.cc index b76a2e1..838e228 100644 --- a/test/kernels/exp_test.cc +++ b/test/kernels/exp_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git 
a/test/kernels/floor_test.cc b/test/kernels/floor_test.cc index 01b607f..2659c3f 100644 --- a/test/kernels/floor_test.cc +++ b/test/kernels/floor_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc deleted file mode 100644 index 444eae4..0000000 --- a/test/kernels/multiply_sat_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" - -#include <stdint.h> -#include <cstdint> -#include <cstddef> -#include <numeric> - -namespace intgemm { - -template <CPUType CPUType_, typename Type_> -void kernel_multiply_sat_test() { - if (kCPU < CPUType_) - return; - - using vec_t = vector_t<CPUType_, Type_>; - constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(Type_); - - AlignedVector<Type_> input1(VECTOR_LENGTH); - AlignedVector<Type_> input2(VECTOR_LENGTH); - AlignedVector<Type_> output(VECTOR_LENGTH); - - std::iota(input1.begin(), input1.end(), static_cast<Type_>(-VECTOR_LENGTH / 2)); - std::iota(input2.begin(), input2.end(), static_cast<Type_>(-VECTOR_LENGTH / 3)); - - // TODO: try all shifts. The shift must be an immediate. - int8_t shift = 1; - *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift); - for (std::size_t i = 0; i < output.size(); ++i) { - auto ref = (int64_t(input1[i]) * input2[i]) >> shift; - auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref))); - CHECK(output[i] == ref_sat); - } -} - -template INTGEMM_SSE2 void kernel_multiply_sat_test<CPUType::SSE2, int8_t>(); -template INTGEMM_SSE2 void kernel_multiply_sat_test<CPUType::SSE2, int16_t>(); -KERNEL_TEST_CASE("multiply_sat/int8 SSE2") { return kernel_multiply_sat_test<CPUType::SSE2, int8_t>(); } -KERNEL_TEST_CASE("multiply_sat/int16 SSE2") { return kernel_multiply_sat_test<CPUType::SSE2, int16_t>(); } - -template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int8_t>(); -template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); -KERNEL_TEST_CASE("multiply_sat/int8 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int8_t>(); } -KERNEL_TEST_CASE("multiply_sat/int16 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); } - -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); -template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>(); -KERNEL_TEST_CASE("multiply_sat/int8 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); } -KERNEL_TEST_CASE("multiply_sat/int16 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>(); } -#endif - -} diff --git a/test/kernels/multiply_test.cc b/test/kernels/multiply_test.cc index ca8c54c..029e3ac 100644 --- a/test/kernels/multiply_test.cc +++ b/test/kernels/multiply_test.cc @@ -1,9 +1,8 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdint> -#include <stdint.h> #include <numeric> namespace intgemm { diff --git a/test/kernels/quantize_test.cc b/test/kernels/quantize_test.cc index f163a45..ae3c068 100644 --- a/test/kernels/quantize_test.cc +++ 
b/test/kernels/quantize_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <numeric> diff --git a/test/kernels/relu_test.cc b/test/kernels/relu_test.cc index 3ad6dd8..6fcef98 100644 --- a/test/kernels/relu_test.cc +++ b/test/kernels/relu_test.cc @@ -1,8 +1,7 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" -#include <stdint.h> #include <cstdint> #include <numeric> diff --git a/test/kernels/rescale_test.cc b/test/kernels/rescale_test.cc index ae13984..280b513 100644 --- a/test/kernels/rescale_test.cc +++ b/test/kernels/rescale_test.cc @@ -1,9 +1,8 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdint> -#include <stdint.h> #include <numeric> namespace intgemm { diff --git a/test/kernels/sigmoid_test.cc b/test/kernels/sigmoid_test.cc index 7f7392d..af9dad1 100644 --- a/test/kernels/sigmoid_test.cc +++ b/test/kernels/sigmoid_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/tanh_test.cc b/test/kernels/tanh_test.cc index 4ba099c..e2c36f5 100644 --- a/test/kernels/tanh_test.cc +++ b/test/kernels/tanh_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/unquantize_test.cc b/test/kernels/unquantize_test.cc index 20c3d6a..ee4bc80 100644 --- a/test/kernels/unquantize_test.cc +++ b/test/kernels/unquantize_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <numeric> diff --git a/test/kernels/upcast_test.cc b/test/kernels/upcast_test.cc index cc782b5..92be1bd 100644 --- a/test/kernels/upcast_test.cc +++ b/test/kernels/upcast_test.cc @@ -1,9 +1,10 @@ +// This test triggers an internal compiler error in gcc 5. 
+#if defined(__OPTIMIZE__) || defined(__clang__) || defined(__INTEL_COMPILER) || !defined(__GNUC__) || (__GNUC__ != 5) #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdint> -#include <stdint.h> #include <numeric> namespace intgemm { @@ -72,6 +73,7 @@ template INTGEMM_AVX512BW void kernel_upcast16to32_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("upcast16to32 AVX512BW") { return kernel_upcast16to32_test<CPUType::AVX512BW>(); } #endif + template <CPUType CPUType_> void kernel_upcast8to32_test() { if (kCPU < CPUType_) @@ -107,3 +109,4 @@ KERNEL_TEST_CASE("upcast8to32 AVX512BW") { return kernel_upcast8to32_test<CPUTyp #endif } +#endif diff --git a/test/kernels/write_test.cc b/test/kernels/write_test.cc index a0189fe..c263fca 100644 --- a/test/kernels/write_test.cc +++ b/test/kernels/write_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/multiply_test.cc b/test/multiply_test.cc index a1138a0..6c16edd 100644 --- a/test/multiply_test.cc +++ b/test/multiply_test.cc @@ -1,10 +1,10 @@ #include "test.h" -#include "../aligned.h" -#include "../callbacks.h" -#include "../interleave.h" -#include "../intgemm.h" -#include "../multiply.h" -#include "../stats.h" +#include "../intgemm/aligned.h" +#include "../intgemm/callbacks.h" +#include "../intgemm/interleave.h" +#include "../intgemm/intgemm.h" +#include "../intgemm/multiply.h" +#include "../intgemm/stats.h" #include <algorithm> #include <cassert> @@ -66,7 +66,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) { it = dist(gen); } - typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; // Call Prepare AlignedVector<Integer> test(input.size()); Routine::PrepareB(input.begin(), test.begin(), 1, rows, cols); @@ -85,30 +85,30 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) { TEST_CASE("Prepare AVX512", "[prepare]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestPrepare<AVX512_8bit>(64, 8); - TestPrepare<AVX512_8bit>(256, 32); - TestPrepare<AVX512_16bit>(64, 8); - TestPrepare<AVX512_16bit>(256, 32); + TestPrepare<avx512bw::Kernels8>(64, 8); + TestPrepare<avx512bw::Kernels8>(256, 32); + TestPrepare<avx512bw::Kernels16>(64, 8); + TestPrepare<avx512bw::Kernels16>(256, 32); #endif } TEST_CASE("Prepare AVX2", "[prepare]") { if (kCPU < CPUType::AVX2) return; - TestPrepare<AVX2_8bit>(64, 32); - TestPrepare<AVX2_16bit>(64, 32); + TestPrepare<avx2::Kernels8>(64, 32); + TestPrepare<avx2::Kernels16>(64, 32); } TEST_CASE("Prepare SSSE3", "[prepare]") { if (kCPU < CPUType::SSSE3) return; - TestPrepare<SSSE3_8bit>(16, 8); - TestPrepare<SSSE3_8bit>(32, 16); - TestPrepare<SSSE3_8bit>(32, 32); + TestPrepare<ssse3::Kernels8>(16, 8); + TestPrepare<ssse3::Kernels8>(32, 16); + TestPrepare<ssse3::Kernels8>(32, 32); } TEST_CASE("Prepare SSE2", "[prepare]") { if (kCPU < CPUType::SSE2) return; - TestPrepare<SSE2_16bit>(8, 8); - TestPrepare<SSE2_16bit>(32, 32); + TestPrepare<sse2::Kernels16>(8, 8); + TestPrepare<sse2::Kernels16>(32, 32); } template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 16) { @@ -119,7 +119,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 for (auto& it : input) { it = dist(gen); } - 
typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; AlignedVector<Integer> prepared(input.size()); Routine::PrepareB(input.begin(), prepared.begin(), 1, rows, cols); @@ -150,27 +150,27 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 TEST_CASE("SelectColumnsB AVX512", "[select]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestSelectColumnsB<AVX512_8bit>(); - TestSelectColumnsB<AVX512_16bit>(256, 256); + TestSelectColumnsB<avx512bw::Kernels8>(); + TestSelectColumnsB<avx512bw::Kernels16>(256, 256); #endif } TEST_CASE("SelectColumnsB AVX2", "[select]") { if (kCPU < CPUType::AVX2) return; - TestSelectColumnsB<AVX2_8bit>(256, 256); - TestSelectColumnsB<AVX2_16bit>(256, 256); + TestSelectColumnsB<avx2::Kernels8>(256, 256); + TestSelectColumnsB<avx2::Kernels16>(256, 256); } TEST_CASE("SelectColumnsB SSSE3", "[select]") { if (kCPU < CPUType::SSSE3) return; - TestSelectColumnsB<SSSE3_8bit>(); - TestSelectColumnsB<SSSE3_8bit>(256, 256); + TestSelectColumnsB<ssse3::Kernels8>(); + TestSelectColumnsB<ssse3::Kernels8>(256, 256); } TEST_CASE("SelectColumnsB SSE2", "[select]") { if (kCPU < CPUType::SSE2) return; - TestSelectColumnsB<SSE2_16bit>(); - TestSelectColumnsB<SSE2_16bit>(256, 256); + TestSelectColumnsB<sse2::Kernels16>(); + TestSelectColumnsB<sse2::Kernels16>(256, 256); } template <class Register> void TestMax() { @@ -187,8 +187,8 @@ TEST_CASE("Max", "[max]") { } void CompareMaxAbs(const float *begin, const float *end, float test, std::size_t offset) { - float largest = fabs(*std::max_element(begin, end)); - float smallest = fabs(*std::min_element(begin, end)); + float largest = std::fabs(*std::max_element(begin, end)); + float smallest = std::fabs(*std::min_element(begin, end)); largest = std::max(largest, smallest); CHECK_MESSAGE(largest == test, "Error: " << largest << " versus " << test << " in length " << (end - begin) << " offset " << offset); } @@ -255,7 +255,7 @@ TEST_CASE("MaxAbsolute AVX512BW", "[max]") { template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_cols, float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) { - typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; std::ostringstream info; info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n'; @@ -307,7 +307,7 @@ template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_co //Require different number of arguments. I don't think the refactoring is worth it. 
template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index B_cols, float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) { - typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; std::ostringstream info; info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n'; @@ -358,145 +358,145 @@ template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index TEST_CASE ("Multiply SSE2 16bit", "[multiply]") { if (kCPU < CPUType::SSE2) return; - TestMultiply<SSE2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiply<SSE2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiply<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::SSE2) return; - TestMultiplyBias<SSE2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiplyBias<SSE2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiplyBias<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") { if (kCPU < CPUType::SSSE3) return; - TestMultiply<SSSE3_8bit>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); - TestMultiply<SSSE3_8bit>(8, 2048, 256, 33, 33, 4.4f, 4.4f); - TestMultiply<SSSE3_8bit>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); - TestMultiply<SSSE3_8bit>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); - TestMultiply<SSSE3_8bit>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); - TestMultiply<SSSE3_8bit>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); + TestMultiply<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); + TestMultiply<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f); + TestMultiply<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); + TestMultiply<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); + TestMultiply<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); + TestMultiply<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); } TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyBias<SSSE3_8bit>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); - TestMultiplyBias<SSSE3_8bit>(8, 2048, 256, 33, 33, 4.4f, 4.4f); - TestMultiplyBias<SSSE3_8bit>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); - TestMultiplyBias<SSSE3_8bit>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); - TestMultiplyBias<SSSE3_8bit>(248, 256, 
256, 1.7f, 1.7f, 0.1f, 0.012f); - TestMultiplyBias<SSSE3_8bit>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); + TestMultiplyBias<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); + TestMultiplyBias<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f); + TestMultiplyBias<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); + TestMultiplyBias<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); + TestMultiplyBias<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); + TestMultiplyBias<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); } TEST_CASE ("Multiply AVX2 8bit", "[multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiply<AVX2_8bit>(8, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(8, 2048, 256, 19, 19, 1.8f, 1.8f); - TestMultiply<AVX2_8bit>(320, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(472, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(248, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(200, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f); + TestMultiply<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f); } TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBias<AVX2_8bit>(8, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(8, 2048, 256, 19, 19, 1.8f, 1.8f); - TestMultiplyBias<AVX2_8bit>(320, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(472, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(248, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(200, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f); + TestMultiplyBias<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f); } TEST_CASE ("Multiply AVX2 16bit", "[multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiply<AVX2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiply<AVX2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiply<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBias<AVX2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiplyBias<AVX2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(8, 256, 256, .1f, 1, 
0.01f); + TestMultiplyBias<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiplyBias<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512 8bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiply<AVX512_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiply<AVX512_8bit>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); - TestMultiply<AVX512_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiply<AVX512_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiply<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiply<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); + TestMultiply<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiply<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBias<AVX512_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiplyBias<AVX512_8bit>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); - TestMultiplyBias<AVX512_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiplyBias<AVX512_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias<AVX512_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias<AVX512_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiplyBias<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiplyBias<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); + TestMultiplyBias<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiplyBias<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiply<AVX512VNNI_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiply<AVX512VNNI_8bit>(8, 2048, 256, 0, 0.55f, 0.25f); - TestMultiply<AVX512VNNI_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiply<AVX512VNNI_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512VNNI_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512VNNI_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiply<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiply<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f); + TestMultiply<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiply<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyBias<AVX512VNNI_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiplyBias<AVX512VNNI_8bit>(8, 2048, 256, 0, 0.55f, 0.25f); - TestMultiplyBias<AVX512VNNI_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiplyBias<AVX512VNNI_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - 
TestMultiplyBias<AVX512VNNI_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias<AVX512VNNI_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiplyBias<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiplyBias<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f); + TestMultiplyBias<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiplyBias<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } #endif TEST_CASE ("Multiply AVX512 16bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiply<AVX512_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(8, 2048, 256, .1f, 1, 0.011f); - TestMultiply<AVX512_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f); + TestMultiply<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBias<AVX512_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(8, 2048, 256, .1f, 1, 0.011f); - TestMultiplyBias<AVX512_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f); + TestMultiplyBias<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } #endif diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc index 938cc28..1437e0a 100644 --- a/test/prepare_b_quantized_transposed.cc +++ b/test/prepare_b_quantized_transposed.cc @@ -1,13 +1,13 @@ #include "test.h" -#include "../aligned.h" -#include "../avx2_gemm.h" -#include "../avx512_gemm.h" -#include "../sse2_gemm.h" -#include "../ssse3_gemm.h" +#include "../intgemm/aligned.h" +#include "../intgemm/avx2_gemm.h" +#include "../intgemm/avx512_gemm.h" +#include "../intgemm/sse2_gemm.h" +#include "../intgemm/ssse3_gemm.h" +#include <cmath> #include <cstring> #include <iostream> -#include <math.h> namespace intgemm { namespace { @@ -62,22 +62,22 @@ TEST_CASE("PrepareBQuantizedTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany<SSE2_16bit>(32, 128)); + CHECK(TestMany<sse2::Kernels16>(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany<SSSE3_8bit>(32, 128)); + CHECK(TestMany<ssse3::Kernels8>(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany<AVX2_8bit>(32, 128)); - 
CHECK(TestMany<AVX2_16bit>(32, 128)); + CHECK(TestMany<avx2::Kernels8>(32, 128)); + CHECK(TestMany<avx2::Kernels16>(32, 128)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW @@ -85,8 +85,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany<AVX512_8bit>(64, 128)); - CHECK(TestMany<AVX512_16bit>(64, 128)); + CHECK(TestMany<avx512bw::Kernels8>(64, 128)); + CHECK(TestMany<avx512bw::Kernels16>(64, 128)); } #endif diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc index 5969724..bc35138 100644 --- a/test/prepare_b_transposed.cc +++ b/test/prepare_b_transposed.cc @@ -1,13 +1,13 @@ #include "test.h" -#include "../aligned.h" -#include "../avx2_gemm.h" -#include "../avx512_gemm.h" -#include "../sse2_gemm.h" -#include "../ssse3_gemm.h" +#include "../intgemm/aligned.h" +#include "../intgemm/avx2_gemm.h" +#include "../intgemm/avx512_gemm.h" +#include "../intgemm/sse2_gemm.h" +#include "../intgemm/ssse3_gemm.h" +#include <cmath> #include <cstring> #include <iostream> -#include <math.h> namespace intgemm { namespace { @@ -63,22 +63,22 @@ TEST_CASE("PrepareBTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany<SSE2_16bit>(4, 128, 2.0f)); + CHECK(TestMany<sse2::Kernels16>(4, 128, 2.0f)); } TEST_CASE("PrepareBTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany<SSSE3_8bit>(4, 128, 2.0f)); + CHECK(TestMany<ssse3::Kernels8>(4, 128, 2.0f)); } TEST_CASE("PrepareBTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany<AVX2_8bit>(8, 128, 2.0f)); - CHECK(TestMany<AVX2_16bit>(8, 128, 2.0f)); + CHECK(TestMany<avx2::Kernels8>(8, 128, 2.0f)); + CHECK(TestMany<avx2::Kernels16>(8, 128, 2.0f)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW @@ -86,8 +86,8 @@ TEST_CASE("PrepareBTransposed AVX2", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany<AVX512_8bit>(16, 128, 2.0f)); - CHECK(TestMany<AVX512_16bit>(16, 128, 2.0f)); + CHECK(TestMany<avx512bw::Kernels8>(16, 128, 2.0f)); + CHECK(TestMany<avx512bw::Kernels16>(16, 128, 2.0f)); } #endif diff --git a/test/quantize_test.cc b/test/quantize_test.cc index d2f6304..550ec66 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -1,14 +1,14 @@ #include "test.h" -#include "../aligned.h" -#include "../avx2_gemm.h" -#include "../avx512_gemm.h" -#include "../sse2_gemm.h" -#include "../ssse3_gemm.h" -#include "../stats.h" - +#include "../intgemm/aligned.h" +#include "../intgemm/avx2_gemm.h" +#include "../intgemm/avx512_gemm.h" +#include "../intgemm/sse2_gemm.h" +#include "../intgemm/ssse3_gemm.h" +#include "../intgemm/stats.h" + +#include <cmath> #include <cstring> #include <iostream> -#include <math.h> namespace intgemm { namespace { @@ -61,8 +61,8 @@ void testVectorMeanStd(int num_items, bool absolute=false) { MeanStd reference = VectorMeanStd(inputVec, num_items, absolute); MeanStd fast = Backend(inputVec.begin(), inputVec.end(), absolute); - float meanDifference = fabsf(reference.mean - fast.mean); - float stdDifference = fabsf(reference.stddev - fast.stddev); + float meanDifference = std::fabs(reference.mean - fast.mean); + float stdDifference = std::fabs(reference.stddev - fast.stddev); float eps = 0.00002f; //Accumulating horizontal sums can lead to errors. 
CHECK_MESSAGE(meanDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.mean << " actual: " << fast.mean); @@ -73,15 +73,15 @@ void testVectorMeanStd(int num_items, bool absolute=false) { template <class I> bool IsOff(float from, I ref, I test) { if (ref == test) return false; if (ref - test > 1 && test - ref > 1) return true; - float off_test = fabs((float)test - from); - float off_ref = fabs((float)ref - from); + float off_test = std::fabs(static_cast<float>(test) - from); + float off_ref = std::fabs(static_cast<float>(ref) - from); // Allow 0.5 to round either way. if (off_test > 0.49 && off_test < 0.51 && off_ref > 0.49 && off_ref < 0.51) return false; return true; } template <class Backend> bool Test(const float *input_unaligned, float quant_mult, std::size_t size) { - typedef typename Backend::Integer Integer; + using Integer = typename Backend::Integer; bool success = true; AlignedVector<float> input(size); std::memcpy(input.begin(), input_unaligned, sizeof(float) * size); @@ -120,24 +120,24 @@ template <class Backend> void TestMany(std::size_t grow) { TEST_CASE ("Quantize SSE2", "[quantize]") { if (kCPU < CPUType::SSE2) return; - TestMany<SSE2_16bit>(8); + TestMany<sse2::Kernels16>(8); } TEST_CASE ("Quantize SSSE3", "[quantize]") { if (kCPU < CPUType::SSSE3) return; - TestMany<SSSE3_8bit>(1); + TestMany<ssse3::Kernels8>(1); } TEST_CASE ("Quantize AVX2", "[quantize]") { if (kCPU < CPUType::AVX2) return; - TestMany<AVX2_8bit>(1); - TestMany<AVX2_16bit>(16); + TestMany<avx2::Kernels8>(1); + TestMany<avx2::Kernels16>(16); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Quantize AVX512", "[quantize]") { if (kCPU < CPUType::AVX512BW) return; - TestMany<AVX512_8bit>(1); - TestMany<AVX512_16bit>(16); + TestMany<avx512bw::Kernels8>(1); + TestMany<avx512bw::Kernels16>(16); } #endif diff --git a/test/test.cc b/test/test.cc index 3559738..45c27ad 100644 --- a/test/test.cc +++ b/test/test.cc @@ -1,6 +1,8 @@ #define CATCH_CONFIG_RUNNER #include "test.h" +#include <cmath> + int main(int argc, char ** argv) { return Catch::Session().run(argc, argv); } @@ -13,13 +15,13 @@ void CompareMSE(const float *float_ref, const float *int_ref, const float *int_t for (std::size_t i = 0; i < size; ++i) { float int_diff = int_ref[i] - int_test[i]; float float_diff = float_ref[i] - int_test[i]; - CHECK_MESSAGE(fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]); - CHECK_MESSAGE(fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]); + CHECK_MESSAGE(std::fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]); + CHECK_MESSAGE(std::fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]); int_sum += int_diff * int_diff; float_sum += float_diff * float_diff; } - CHECK_MESSAGE(fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size)); - CHECK_MESSAGE(fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size)); + CHECK_MESSAGE(std::fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size)); + CHECK_MESSAGE(std::fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE 
= " << sqrt(int_sum / size)); } } // namespace intgemm diff --git a/test/test.h b/test/test.h index f145681..1f884c5 100644 --- a/test/test.h +++ b/test/test.h @@ -1,12 +1,12 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" -#include "../3rd_party/catch.hpp" -#include "../intgemm.h" -#include "../aligned.h" +#include "3rd_party/catch.hpp" +#include "../intgemm/intgemm.h" +#include "../intgemm/aligned.h" -#include <math.h> +#include <cmath> #include <sstream> #include <iostream> #include <iomanip> @@ -18,7 +18,7 @@ #define CHECK_EPS(actual, expected, epsilon) \ do { \ - if (fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \ + if (std::fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \ else { CHECK((actual) == (expected)); } \ } while(0) @@ -39,8 +39,8 @@ void CompareEps(const Type* reference, const Type* actual, Index size, Type epsi for (Index i = 0; i < size; ++i) { INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]); // Ratio to maximum value. - float threshold = epsilon * std::max<float>(0.01f, fabsf(reference[i])); - CHECK(fabsf(reference[i] - actual[i]) < threshold); + float threshold = epsilon * std::max<float>(0.01f, std::fabs(reference[i])); + CHECK(std::fabs(reference[i] - actual[i]) < threshold); } } diff --git a/test/utils_test.cc b/test/utils_test.cc index 00b5277..e7d07e8 100644 --- a/test/utils_test.cc +++ b/test/utils_test.cc @@ -1,5 +1,5 @@ #include "test.h" -#include "../utils.h" +#include "../intgemm/utils.h" namespace intgemm { namespace { diff --git a/test_mull.cpp b/test_mull.cpp deleted file mode 100644 index 42924a6..0000000 --- a/test_mull.cpp +++ /dev/null @@ -1,328 +0,0 @@ -#include "intgemm.cc" -#include "aligned.h" -#include <iostream> -#include <random> -#include <string> -#include <algorithm> -#include <fstream> -#include <sstream> - - -/*Adapted from https://www.bfilipek.com/2018/07/string-view-perf-followup.html . We should probably go string_view way -inline void tokenizeLine(std::string& str, std::vector<std::string>& output, - std::string delimeter = " ") { - auto first = std::begin(str); - - while (first != str.end()) { - const auto second = std::find_first_of(first, std::end(str), std::begin(delimeter), std::end(delimeter)); - - if (first != second) { - output.emplace_back(str.substr(std::distance(std::begin(str), first), std::distance(first, second))); - } - - if (second == str.end()) - break; - - first = std::next(second); - } -} - -//This is a different parsing method, without stringStream -template<class StringType> -void ReadInFile(StringType infile) { - std::ifstream in(infile); - std::string line; - - //First line, Info about the matrix - std::getline(in, line); - std::istringstream iss(line); - std::string temp1, temp2, temp3, temp4; - int RowsA, ColsA, RowsB, ColsB; - if (!(iss >> temp1 >> RowsA >> temp2 >> ColsA >> temp3 >> RowsB >> temp4 >> ColsB)) { - std::cerr << "Error parsing line 1 " << std::endl; - exit(1); - } - - //Second line, get QuantMult - std::getline(in, line); - std::istringstream iss2(line); - float quantMultA, quantMultB; - if (!(iss2 >> temp1 >> quantMultA >> temp2 >> quantMultA)) { - std::cerr << "Error parsing line 2 " << std::endl; - exit(1); - } - std::getline(in, line); //Just some text - //Fourth line, AQuant - std::vector<int> AQuant; - std::getline(in, line); - std::vector<std::string> tmp_container; - tokenizeLine(line, tmp_container); - if (tmp_container.size() != RowsA*ColsA) { - std::cerr << "Error parsing matrix A. Size mismatch. 
Expected " << RowsA*ColsA << " got " << tmp_container.size() << std::endl; - } - for (auto&& num : tmp_container) { - AQuant.push_back(std::stoi(num)); - } - tmp_container.resize(0); - - std::getline(in, line); //Just some text - //Sixth line, B_raw - std::vector<float> B_raw; - std::getline(in, line); - tokenizeLine(line, tmp_container); - if (tmp_container.size() != RowsB*ColsB) { - std::cerr << "Error parsing matrix B. Size mismatch. Expected " << RowsB*ColsB << " got " << tmp_container.size() << std::endl; - } - for (auto&& num : tmp_container) { - B_raw.push_back(std::stof(num)); - } - tmp_container.resize(0); - - std::getline(in, line); //Just some text - //Eight line, Bias - std::vector<float> Bias; - std::getline(in, line); - tokenizeLine(line, tmp_container); - if (tmp_container.size() != ColsB) { - std::cerr << "Error parsing bias. Size mismatch. Expected " << ColsB << " got " << tmp_container.size() << std::endl; - } - for (auto&& num : tmp_container) { - Bias.push_back(std::stof(num)); - } - tmp_container.resize(0); - -} - -*/ -template<class StringType> -void ReadInFile(StringType infile) { - std::ifstream in(infile); - std::string line; - - //First line, Info about the matrix - std::getline(in, line); - std::istringstream iss(line); - std::string temp1, temp2, temp3, temp4; - int RowsA, ColsA, RowsB, ColsB; - if (!(iss >> temp1 >> RowsA >> temp2 >> ColsA >> temp3 >> RowsB >> temp4 >> ColsB)) { - std::cerr << "Error parsing line 1 " << std::endl; - exit(1); - } - - //Second line, get QuantMult - std::getline(in, line); - std::istringstream iss2(line); - float quantMultA, quantMultB; - if (!(iss2 >> temp1 >> quantMultA >> temp2 >> quantMultA)) { - std::cerr << "Error parsing line 2 " << std::endl; - exit(1); - } - std::getline(in, line); //Just some text for human readability - - //4th line, AQuant - std::vector<int> AQuant; - std::getline(in, line); - std::istringstream iss3(line); - for (int i = 0; i < RowsA*ColsA; i++) { - int num; - if (!(iss3 >> num)) { - std::cerr << "Error parsing matrix A at " << i << std::endl;; - } - AQuant.push_back(num); - } - - std::getline(in, line); //Just some text for human readability - //6th line, B_raw - std::vector<float> B_raw; - std::getline(in, line); - std::istringstream iss4(line); - for (int i = 0; i < RowsB*ColsB; i++) { - float num; - if (!(iss4 >> num)) { - std::cerr << "Error parsing matrix B " << std::endl; - } - B_raw.push_back(num); - } - - std::getline(in, line); //Just some text for human readability - //8th line, Bias - std::vector<float> Bias; - std::getline(in, line); - std::istringstream iss5(line); - for (int i = 0; i < ColsB; i++) { - float num; - if (!(iss5 >> num)) { - std::cerr << "Error parsing matrix bias " << std::endl; - } - Bias.push_back(num); - } -} - -using namespace intgemm; -template<class T> -void printMatrix(T* data, Index rows, Index cols) { - std::cout << "["; - for (int i = 0; i<rows; i++) { - std::cout << "["; - for (int j =0; j<cols; j++) { - std::cout << (float)data[i*cols + j]; - if (j != cols - 1) { - std::cout << ", "; - } - } - std::cout << "]"; - if (i != rows -1) { - std::cout << ',' << std::endl; - } - } - std::cout << "]" << std::endl; -} - -void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias) { - for (Index r = 0; r < A_rows; ++r) { - for (Index c = 0; c < B_cols; ++c) { - float sum = 0.0f; - for (Index k = 0; k < width; ++k) { - sum += A[r * width + k] * B[k * B_cols + c]; - } - if (bias) { - C[r * B_cols + c] = sum + 
bias[c]; - } else { - C[r * B_cols + c] = sum; - } - } - } -} - -// Compute A*B slowly from integers. -template <class Integer> -void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias) { - for (Index r = 0; r < A_rows; ++r) { - for (Index c = 0; c < B_cols; ++c) { - int32_t sum = 0; - for (Index k = 0; k < width; ++k) { - sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]); - } - if (bias) { - C[r * B_cols + c] = sum * unquant_mult + bias[c]; - } else { - C[r * B_cols + c] = sum * unquant_mult; - } - } - } -} - -int main() { - - const Index A_rows = 1; - const Index width = 2048; - const Index B_cols = 8; - - AlignedVector<float> A(A_rows * width); - AlignedVector<float> B(width * B_cols); - AlignedVector<float> bias(B_cols); - - float alpha = 2.0f; - float quant_mult = 127/alpha; - float unquant_mult = 1.0 / (quant_mult * quant_mult); - - std::mt19937 gen; - std::uniform_real_distribution<float> dist(-2.0f, 2.0f); - - for (auto& it : A) { - it = dist(gen); - } - for (auto& it : B) { - it = dist(gen); - } - for (auto& it : bias) { - it = dist(gen); - } - - AlignedVector<float> bias_orig(B_cols); - for (int i = 0; i < bias.size(); i++) { - bias_orig[i] = bias[i]; - } - - AlignedVector<int8_t> A_prep(A.size()); - AlignedVector<int8_t> B_prep(B.size()); - - AVX2_8bit::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width); - AVX2_8bit::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols); - /* - std::cout << "A:" << std::endl; - printMatrix(A.begin(), A_rows, width); - std::cout << "B:" << std::endl; - printMatrix(B.begin(), width, B_cols); - std::cout << "bias:" << std::endl; - printMatrix(bias.begin(), 1, B_cols);*/ - - - AlignedVector<float> test_C(A_rows * B_cols); - - AVX2_8bit::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin())); - //AVX2_8bit::Multiply(A_prep.begin(), B_prep.begin(), JustUnquantizeC(test_C.begin(), unquant_mult), A_rows, width, B_cols); - std::cout << "Old multiply:" << std::endl; - printMatrix(test_C.begin(), A_rows, B_cols); - - //NEEEXT - AlignedVector<uint8_t> A_prep2(A.size()); - AVX2_8bit::PrepareA(A.begin(), A_prep2.begin(), quant_mult, A_rows, width); - - AVX2_8bit::PrepareBiasFor8(B.begin(), bias.begin(), alpha, width, B_cols); - - //printMatrix(bias.begin(), 1, B_cols); //Print bias - - AVX2_8bit::Multiply8Shift(reinterpret_cast<uint8_t*>(A_prep2.begin()), B_prep.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin())); - //AVX2_8bit::Multiply8Shift(reinterpret_cast<uint8_t*>(A_prep.begin()), B_prep.begin(), JustUnquantizeC(test_C.begin(), unquant_mult), A_rows, width, B_cols); - - AlignedVector<int16_t> A_prep3(A.size()); - AlignedVector<int16_t> B_prep3(B.size()); - std::cout << "New multiply:" << std::endl; - printMatrix(test_C.begin(), A_rows, B_cols); - for (int i = 0; i < A_prep2.size(); i++) { - A_prep3[i] = A_prep2[i]; - } - AVX2_16bit::PrepareB(B.begin(), B_prep3.begin(), quant_mult, width, B_cols); - AVX2_16bit::Multiply(A_prep3.begin(), B_prep3.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin())); - - std::cout << "New multiply, 16 bit:" << std::endl; - printMatrix(test_C.begin(), A_rows, B_cols); - - //FULL INTS - AlignedVector<float> C_slowint(A_rows * B_cols); - AlignedVector<int8_t> B_quant(width * B_cols); - 
AVX2_8bit::Quantize(B.begin(), B_quant.begin(), quant_mult, B.size()); - - SlowRefInt(A_prep.begin(), B_quant.begin(), C_slowint.begin(), - unquant_mult, A_rows, width, B_cols, bias_orig.begin()); - - - std::cout << "Reference int8:" << std::endl; - printMatrix(C_slowint.begin(), A_rows, B_cols); - - //FULL INT16 - AlignedVector<int16_t> A_prep4(A.size()); - for (int i = 0; i < A_prep2.size(); i++) { - A_prep4[i] = A_prep[i]; - } - - AlignedVector<float> C_slowint2(A_rows * B_cols); - AlignedVector<int16_t> B_quant2(width * B_cols); - AVX2_16bit::Quantize(B.begin(), B_quant2.begin(), quant_mult, B.size()); - - SlowRefInt(A_prep4.begin(), B_quant2.begin(), C_slowint2.begin(), - unquant_mult, A_rows, width, B_cols, bias_orig.begin()); - - - std::cout << "Reference int16:" << std::endl; - printMatrix(C_slowint2.begin(), A_rows, B_cols); - - //FLOATS - AlignedVector<float> C(A_rows * B_cols); - - SlowRefFloat(A.begin(), B.begin(), C.begin(), A_rows, width, B_cols, bias_orig.begin()); - std::cout << "Reference float:" << std::endl; - printMatrix(C.begin(), A_rows, B_cols); - -}
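
Note on the shift-based 8-bit path exercised by the [Add127] tests and by the deleted test_mull.cpp above: for the shifted multiply, A is prepared as unsigned 8-bit values, i.e. the signed quantized value offset by +127. That offset contributes an extra 127 * (column sum of B) term to every output, so the bias is pre-compensated once per B matrix and the shifted multiply plus prepared bias reproduces the unshifted result. Below is a minimal sketch of the flow, reusing the pre-rename AVX2_8bit calls exactly as the deleted file used them (after this merge the same kernels live under avx2::Kernels8); the sizes, alpha, and fill step are illustrative assumptions, not part of the commit.

#include "intgemm.h"  // pre-rename layout; intgemm/intgemm.h after this merge
#include "aligned.h"  // pre-rename layout; intgemm/aligned.h after this merge

using namespace intgemm;

void ShiftedMultiplySketch() {
  const Index A_rows = 1, width = 2048, B_cols = 8;  // assumed shapes
  AlignedVector<float> A(A_rows * width), B(width * B_cols), bias(B_cols);
  // ... fill A, B, and bias with values in [-alpha, alpha] ...
  const float alpha = 2.0f;                     // assumed range bound on the inputs
  const float quant_mult = 127.0f / alpha;
  const float unquant_mult = 1.0f / (quant_mult * quant_mult);

  // A is prepared as uint8 (its signed quantized value shifted by +127);
  // B is prepared as int8 as usual.
  AlignedVector<uint8_t> A_prep(A.size());
  AlignedVector<int8_t> B_prep(B.size());
  AVX2_8bit::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
  AVX2_8bit::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);

  // Fold the 127 * colsum(B) term introduced by the shift into the bias,
  // so the shifted product plus bias matches the unshifted result.
  AVX2_8bit::PrepareBiasFor8(B.begin(), bias.begin(), alpha, width, B_cols);

  AlignedVector<float> C(A_rows * B_cols);
  AVX2_8bit::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols,
      UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), C.begin()));
}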