github.com/marian-nmt/intgemm/intgemm.git
author    Nikolay Bogoychev <nheart@gmail.com>  2020-11-15 19:50:02 +0300
committer GitHub <noreply@github.com>  2020-11-15 19:50:02 +0300
commit    8abde25b13c3ab210c0dec8e23f4944e3953812d
tree      90b591ee994252ddd44d593276b4ef895bbcb5aa
parent    874ceebbf53a85691b326495100b6361a2166cec
parent    8f28282c3bd854922da638024d2659be52e892e9

Merge pull request #2 from kpu/master

Merge with latest intgemm master
-rw-r--r--  .github/workflows/mac.yml (renamed from .github/workflows/release.yml)  |  7
-rw-r--r--  .github/workflows/ubuntu-gcc5-debug.yml  |  27
-rw-r--r--  .github/workflows/ubuntu.yml  |  25
-rw-r--r--  .github/workflows/windows.yml  |  25
-rw-r--r--  CMakeLists.txt  |  11
-rw-r--r--  LICENSE  |  30
-rw-r--r--  README.md  |  9
-rw-r--r--  benchmarks/benchmark.cc  |  48
-rw-r--r--  benchmarks/benchmark_quantizer.cc  |  18
-rw-r--r--  benchmarks/biasmultiply.cc  |  172
-rw-r--r--  example.cc  |  13
-rw-r--r--  intgemm.cc  |  71
-rw-r--r--  intgemm/aligned.h (renamed from aligned.h)  |  1
-rw-r--r--  intgemm/avx2_gemm.h (renamed from avx2_gemm.h)  |  101
-rw-r--r--  intgemm/avx512_gemm.h (renamed from avx512_gemm.h)  |  106
-rw-r--r--  intgemm/avx512vnni_gemm.h (renamed from avx512vnni_gemm.h)  |  15
-rw-r--r--  intgemm/callbacks.h (renamed from callbacks.h)  |  2
-rw-r--r--  intgemm/callbacks/configs.h (renamed from callbacks/configs.h)  |  0
-rw-r--r--  intgemm/callbacks/implementations.inl (renamed from callbacks/implementations.inl)  |  18
-rw-r--r--  intgemm/callbacks/output_buffer_info.h (renamed from callbacks/output_buffer_info.h)  |  0
-rw-r--r--  intgemm/interleave.h (renamed from interleave.h)  |  49
-rw-r--r--  intgemm/intgemm.cc  |  71
-rw-r--r--  intgemm/intgemm.h (renamed from intgemm.h)  |  28
-rw-r--r--  intgemm/intgemm_config.h.in (renamed from intgemm_config.h.in)  |  0
-rw-r--r--  intgemm/intrinsics.h (renamed from intrinsics.h)  |  51
-rw-r--r--  intgemm/kernels.h (renamed from kernels.h)  |  2
-rw-r--r--  intgemm/kernels/implementations.inl (renamed from kernels/implementations.inl)  |  34
-rw-r--r--  intgemm/multiply.h (renamed from multiply.h)  |  28
-rw-r--r--  intgemm/sse2_gemm.h (renamed from sse2_gemm.h)  |  44
-rw-r--r--  intgemm/ssse3_gemm.h (renamed from ssse3_gemm.h)  |  64
-rw-r--r--  intgemm/stats.h (renamed from stats.h)  |  0
-rw-r--r--  intgemm/stats.inl (renamed from stats.inl)  |  2
-rw-r--r--  intgemm/types.h (renamed from types.h)  |  0
-rw-r--r--  intgemm/utils.h (renamed from utils.h)  |  0
-rw-r--r--  intgemm/vec_traits.h (renamed from vec_traits.h)  |  0
-rw-r--r--  test/3rd_party/LICENSE_1_0.txt  |  24
-rw-r--r--  test/3rd_party/catch.hpp (renamed from 3rd_party/catch.hpp)  |  0
-rw-r--r--  test/add127_test.cc  |  210
-rw-r--r--  test/kernels/add_bias_test.cc  |  4
-rw-r--r--  test/kernels/bitwise_not_test.cc  |  4
-rw-r--r--  test/kernels/downcast_test.cc  |  4
-rw-r--r--  test/kernels/exp_test.cc  |  4
-rw-r--r--  test/kernels/floor_test.cc  |  4
-rw-r--r--  test/kernels/multiply_sat_test.cc  |  54
-rw-r--r--  test/kernels/multiply_test.cc  |  5
-rw-r--r--  test/kernels/quantize_test.cc  |  4
-rw-r--r--  test/kernels/relu_test.cc  |  5
-rw-r--r--  test/kernels/rescale_test.cc  |  5
-rw-r--r--  test/kernels/sigmoid_test.cc  |  4
-rw-r--r--  test/kernels/tanh_test.cc  |  4
-rw-r--r--  test/kernels/unquantize_test.cc  |  4
-rw-r--r--  test/kernels/upcast_test.cc  |  9
-rw-r--r--  test/kernels/write_test.cc  |  4
-rw-r--r--  test/multiply_test.cc  |  230
-rw-r--r--  test/prepare_b_quantized_transposed.cc  |  24
-rw-r--r--  test/prepare_b_transposed.cc  |  24
-rw-r--r--  test/quantize_test.cc  |  38
-rw-r--r--  test/test.cc  |  10
-rw-r--r--  test/test.h  |  16
-rw-r--r--  test/utils_test.cc  |  2
-rw-r--r--  test_mull.cpp  |  328
61 files changed, 885 insertions(+), 1211 deletions(-)
diff --git a/.github/workflows/release.yml b/.github/workflows/mac.yml
index 4fb9b3f..767cf1a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: Release
+name: Mac
on:
push:
@@ -8,10 +8,7 @@ on:
jobs:
build:
- strategy:
- matrix:
- runs-on: [ubuntu-latest, macOS-latest, windows-latest]
- runs-on: ${{ matrix.runs-on }}
+ runs-on: macOS-latest
steps:
- uses: actions/checkout@v2
diff --git a/.github/workflows/ubuntu-gcc5-debug.yml b/.github/workflows/ubuntu-gcc5-debug.yml
new file mode 100644
index 0000000..1323828
--- /dev/null
+++ b/.github/workflows/ubuntu-gcc5-debug.yml
@@ -0,0 +1,27 @@
+name: Ubuntu gcc5 debug
+
+on:
+ push:
+ branches: [master, static]
+ pull_request:
+ branches: [master, static]
+
+jobs:
+ build:
+ runs-on: ubuntu-18.04
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: install
+ run: sudo apt-get install -y gcc-5 g++-5
+ - name: cmake
+ run: |
+ cmake -E make_directory build
+ cd build
+ cmake -DCMAKE_C_COMPILER=gcc-5 -DCMAKE_CXX_COMPILER=g++-5 -DCMAKE_BUILD_TYPE=Debug ..
+ - name: Compile
+ working-directory: build
+ run: cmake --build . -j2
+ - name: Test
+ working-directory: build
+ run: ctest -j2
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
new file mode 100644
index 0000000..f0e457b
--- /dev/null
+++ b/.github/workflows/ubuntu.yml
@@ -0,0 +1,25 @@
+name: Ubuntu
+
+on:
+ push:
+ branches: [master, static]
+ pull_request:
+ branches: [master, static]
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: cmake
+ run: |
+ cmake -E make_directory build
+ cd build
+ cmake ..
+ - name: Compile
+ working-directory: build
+ run: cmake --build . -j2
+ - name: Test
+ working-directory: build
+ run: ctest -j2
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
new file mode 100644
index 0000000..d64fefd
--- /dev/null
+++ b/.github/workflows/windows.yml
@@ -0,0 +1,25 @@
+name: Windows
+
+on:
+ push:
+ branches: [master, static]
+ pull_request:
+ branches: [master, static]
+
+jobs:
+ build:
+ runs-on: windows-latest
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: cmake
+ run: |
+ cmake -E make_directory build
+ cd build
+ cmake ..
+ - name: Compile
+ working-directory: build
+ run: cmake --build . -j2
+ - name: Test
+ working-directory: build
+ run: ctest -j2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c675315..d1885f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,12 +33,16 @@ if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
message(WARNING "${Orange}Not building AVX512VNNI-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
endif()
-# Generate configure file
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/intgemm_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/intgemm_config.h)
+add_library(intgemm STATIC intgemm/intgemm.cc)
+# Generate configure file
+configure_file(intgemm/intgemm_config.h.in intgemm/intgemm_config.h)
+#Ensure it is included by users.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(intgemm PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-add_library(intgemm STATIC intgemm.cc)
+# This isn't necessary since intgemm uses entirely relative paths but source code depending on it may want to #include <intgemm/intgemm.h>
+target_include_directories(intgemm INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
option(USE_OPENMP "Use OpenMP" OFF)
if (USE_OPENMP)
@@ -80,7 +84,6 @@ add_executable(tests
test/kernels/downcast_test.cc
test/kernels/exp_test.cc
test/kernels/floor_test.cc
- test/kernels/multiply_sat_test.cc
test/kernels/multiply_test.cc
test/kernels/quantize_test.cc
test/kernels/relu_test.cc
diff --git a/LICENSE b/LICENSE
index 9fcb60f..0d57f7b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -10,6 +10,36 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
+test/3rd_party/catch.hpp
+Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved.
+Distributed under the Boost Software License, Version 1.0. (See accompanying
+file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+
The original 16-bit SSE2 code came from:
diff --git a/README.md b/README.md
index 8947ca9..30469fc 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
[![Build SSE](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-SSE.svg?label=SSE)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-SSE/)
[![Build AVX2](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX2.svg?label=AVX2)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX2/)
[![Build AVX512BW](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX512BW.svg?label=AVX512BW)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX512BW/)
+![Build Ubuntu](https://github.com/kpu/intgemm/workflows/Ubuntu/badge.svg)
+![Build Ubuntu debug](https://github.com/kpu/intgemm/workflows/Ubuntu%20debug/badge.svg)
+![Build Ubuntu OpenMP](https://github.com/kpu/intgemm/workflows/Ubuntu%20OpenMP/badge.svg)
+![Build Windows](https://github.com/kpu/intgemm/workflows/Windows/badge.svg)
+![Build Mac](https://github.com/kpu/intgemm/workflows/Mac/badge.svg)
# Integer Matrix Multiplication
@@ -25,7 +30,7 @@ A full example appears in [example.cc](example.cc).
Both A and B should be prepared before multiplication.
```C++
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
/* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
* A is A_rows x width.
@@ -51,7 +56,7 @@ The last argument of `Multiply` is a callback which is usually used to perform
For 8-bit, you can make use of a slightly faster implementation, assuming you can determine the quantization multipliers and prepare the biases offline:
```C++
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
/* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
* A is A_rows x width.
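To make the README excerpt above concrete without jumping ahead to example.cc, here is a minimal sketch of the 16-bit prepare-then-multiply flow it describes. It assumes `intgemm::Int16`, `AlignedVector`, and `callbacks::UnquantizeAndWrite` behave as in the example.cc hunks later in this patch; the 1024 quantization multiplier and the alignment/size preconditions are illustrative, not mandated by the patch.
```C++
// Sketch only: names follow the headers renamed in this patch; quant_mult and
// the assumption that inputs are 64-byte aligned multiples of the tile sizes
// are illustrative.
#include "intgemm/intgemm.h"
#include "intgemm/aligned.h"
#include "intgemm/callbacks.h"

void MultiplySketch(const float *A, const float *B, float *C,
                    intgemm::Index A_rows, intgemm::Index width, intgemm::Index B_cols) {
  const float quant_mult = 1024.0f;  // assumed 16-bit multiplier, as in the README example
  intgemm::AlignedVector<int16_t> A_prepared(A_rows * width);
  intgemm::AlignedVector<int16_t> B_prepared(width * B_cols);
  // Quantize A (kept row major) and quantize + rearrange B into the CPU-dependent layout.
  intgemm::Int16::PrepareA(A, A_prepared.begin(), quant_mult, A_rows, width);
  intgemm::Int16::PrepareB(B, B_prepared.begin(), quant_mult, width, B_cols);
  // Multiply and unquantize the int32 accumulators into row-major C, as example.cc does.
  intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
      intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C));
}
```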
diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc
index ebd0920..c6133bf 100644
--- a/benchmarks/benchmark.cc
+++ b/benchmarks/benchmark.cc
@@ -1,12 +1,12 @@
-#include "../aligned.h"
-#include "intgemm_config.h"
-#include "../avx512_gemm.h"
-#include "../sse2_gemm.h"
-#include "../avx2_gemm.h"
-#include "../ssse3_gemm.h"
-#include "../intgemm.h"
-#include "../stats.h"
-#include "../callbacks.h"
+#include "../intgemm/aligned.h"
+#include "intgemm/intgemm_config.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/stats.h"
+#include "../intgemm/callbacks.h"
#include <algorithm>
#include <cassert>
@@ -43,7 +43,7 @@ struct RandomMatrices {
};
template <class Backend> double Run(const RandomMatrices &m) {
- typedef typename Backend::Integer Integer;
+ using Integer = typename Backend::Integer;
float quant_mult = 127.0f / 2.0f;
float unquant_mult = 1.0f / (quant_mult * quant_mult);
AlignedVector<Integer> A_prepared(m.A_rows * m.width);
@@ -145,45 +145,45 @@ int main(int, char ** argv) {
std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<SSSE3_8bit>(matrices, end, stats.ssse3_8bit);
+ RunAll<ssse3::Kernels8>(matrices, end, stats.ssse3_8bit);
}
std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<SSE2_16bit>(matrices, end, stats.sse2_16bit);
+ RunAll<sse2::Kernels16>(matrices, end, stats.sse2_16bit);
}
std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<AVX2_8bit>(matrices, end, stats.avx2_8bit);
+ RunAll<avx2::Kernels8>(matrices, end, stats.avx2_8bit);
}
std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<AVX2_16bit>(matrices, end, stats.avx2_16bit);
+ RunAll<avx2::Kernels16>(matrices, end, stats.avx2_16bit);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<AVX512_8bit>(matrices, end, stats.avx512_8bit);
+ RunAll<avx512bw::Kernels8>(matrices, end, stats.avx512_8bit);
}
std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<AVX512_16bit>(matrices, end, stats.avx512_16bit);
+ RunAll<avx512bw::Kernels16>(matrices, end, stats.avx512_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
- RunAll<AVX512VNNI_8bit>(matrices, end, stats.avx512vnni_8bit);
+ RunAll<avx512vnni::Kernels8>(matrices, end, stats.avx512vnni_8bit);
}
#endif
@@ -193,18 +193,18 @@ int main(int, char ** argv) {
}
for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
- Print<SSSE3_8bit>(stats.ssse3_8bit, i);
- Print<AVX2_8bit>(stats.avx2_8bit, i);
+ Print<ssse3::Kernels8>(stats.ssse3_8bit, i);
+ Print<avx2::Kernels8>(stats.avx2_8bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- Print<AVX512_8bit>(stats.avx512_8bit, i);
+ Print<avx512bw::Kernels8>(stats.avx512_8bit, i);
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
- Print<AVX512VNNI_8bit>(stats.avx512vnni_8bit, i);
+ Print<avx512vnni::Kernels8>(stats.avx512vnni_8bit, i);
#endif
- Print<SSE2_16bit>(stats.sse2_16bit, i);
- Print<AVX2_16bit>(stats.avx2_16bit, i);
+ Print<sse2::Kernels16>(stats.sse2_16bit, i);
+ Print<avx2::Kernels16>(stats.avx2_16bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- Print<AVX512_16bit>(stats.avx512_16bit, i);
+ Print<avx512bw::Kernels16>(stats.avx512_16bit, i);
#endif
}
return 0;
diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc
index 16bf67f..5f36bd7 100644
--- a/benchmarks/benchmark_quantizer.cc
+++ b/benchmarks/benchmark_quantizer.cc
@@ -1,8 +1,8 @@
-#include "../intgemm.h"
-#include "../aligned.h"
-#include "../ssse3_gemm.h"
-#include "../avx2_gemm.h"
-#include "../avx512_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
#include <chrono>
#include <iomanip>
@@ -14,7 +14,7 @@ namespace {
float MaxAbsoluteBaseline(const float *begin, const float *end) {
auto res = std::minmax_element(begin, end);
- return std::max(fabsf(*res.first), fabsf(*res.second));
+ return std::max(std::fabs(*res.first), std::fabs(*res.second));
}
void BenchmarkMaxAbsolute() {
@@ -63,10 +63,10 @@ int main() {
for (float &element : in) {
element = dist(gen);
}
- QuantizerBench<intgemm::SSSE3_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
- QuantizerBench<intgemm::AVX2_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+ QuantizerBench<intgemm::ssse3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+ QuantizerBench<intgemm::avx2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- QuantizerBench<intgemm::AVX512_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+ QuantizerBench<intgemm::avx512bw::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
}
}
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index da46bb9..490bf3b 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -1,5 +1,5 @@
-#include "../intgemm.h"
-#include "../aligned.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
#include <chrono>
#include <random>
#include <iostream>
@@ -125,149 +125,149 @@ int main(int argc, char ** argv) {
repeat = atoi(argv[1]);
}
- std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<ssse3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(8, 256, 256);
- oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(8, 2048, 256);
- oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(320, 256, 256);
- oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(472, 256, 256);
- oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(248, 256, 256);
- oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(200, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 2048, 256);
+ oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(320, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(472, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(248, 256, 256);
+ oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldSSSE3 = testOld<SSSE3_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldSSSE3 += testOld<SSSE3_8bit>(8, 256, 256);
- oldSSSE3 += testOld<SSSE3_8bit>(8, 2048, 256);
- oldSSSE3 += testOld<SSSE3_8bit>(320, 256, 256);
- oldSSSE3 += testOld<SSSE3_8bit>(472, 256, 256);
- oldSSSE3 += testOld<SSSE3_8bit>(248, 256, 256);
- oldSSSE3 += testOld<SSSE3_8bit>(200, 256, 256);
+ oldSSSE3 += testOld<ssse3::Kernels8>(8, 256, 256);
+ oldSSSE3 += testOld<ssse3::Kernels8>(8, 2048, 256);
+ oldSSSE3 += testOld<ssse3::Kernels8>(320, 256, 256);
+ oldSSSE3 += testOld<ssse3::Kernels8>(472, 256, 256);
+ oldSSSE3 += testOld<ssse3::Kernels8>(248, 256, 256);
+ oldSSSE3 += testOld<ssse3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeSSSE3 = testOld<SSSE3_8bit>(1, 64, 8);
+ std::chrono::duration<double> newTimeSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeSSSE3 += testNew<SSSE3_8bit>(8, 256, 256);
- newTimeSSSE3 += testNew<SSSE3_8bit>(8, 2048, 256);
- newTimeSSSE3 += testNew<SSSE3_8bit>(320, 256, 256);
- newTimeSSSE3 += testNew<SSSE3_8bit>(472, 256, 256);
- newTimeSSSE3 += testNew<SSSE3_8bit>(248, 256, 256);
- newTimeSSSE3 += testNew<SSSE3_8bit>(200, 256, 256);
+ newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 256, 256);
+ newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 2048, 256);
+ newTimeSSSE3 += testNew<ssse3::Kernels8>(320, 256, 256);
+ newTimeSSSE3 += testNew<ssse3::Kernels8>(472, 256, 256);
+ newTimeSSSE3 += testNew<ssse3::Kernels8>(248, 256, 256);
+ newTimeSSSE3 += testNew<ssse3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<avx2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX2_nobias += testOld_nobias<AVX2_8bit>(8, 256, 256);
- oldAVX2_nobias += testOld_nobias<AVX2_8bit>(8, 2048, 256);
- oldAVX2_nobias += testOld_nobias<AVX2_8bit>(320, 256, 256);
- oldAVX2_nobias += testOld_nobias<AVX2_8bit>(472, 256, 256);
- oldAVX2_nobias += testOld_nobias<AVX2_8bit>(248, 256, 256);
- oldAVX2_nobias += testOld_nobias<AVX2_8bit>(200, 256, 256);
+ oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 256, 256);
+ oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 2048, 256);
+ oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(320, 256, 256);
+ oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(472, 256, 256);
+ oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(248, 256, 256);
+ oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX2 = testOld<AVX2_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX2 += testOld<AVX2_8bit>(8, 256, 256);
- oldAVX2 += testOld<AVX2_8bit>(8, 2048, 256);
- oldAVX2 += testOld<AVX2_8bit>(320, 256, 256);
- oldAVX2 += testOld<AVX2_8bit>(472, 256, 256);
- oldAVX2 += testOld<AVX2_8bit>(248, 256, 256);
- oldAVX2 += testOld<AVX2_8bit>(200, 256, 256);
+ oldAVX2 += testOld<avx2::Kernels8>(8, 256, 256);
+ oldAVX2 += testOld<avx2::Kernels8>(8, 2048, 256);
+ oldAVX2 += testOld<avx2::Kernels8>(320, 256, 256);
+ oldAVX2 += testOld<avx2::Kernels8>(472, 256, 256);
+ oldAVX2 += testOld<avx2::Kernels8>(248, 256, 256);
+ oldAVX2 += testOld<avx2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeAVX2 = testOld<AVX2_8bit>(1, 64, 8);
+ std::chrono::duration<double> newTimeAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeAVX2 += testNew<AVX2_8bit>(8, 256, 256);
- newTimeAVX2 += testNew<AVX2_8bit>(8, 2048, 256);
- newTimeAVX2 += testNew<AVX2_8bit>(320, 256, 256);
- newTimeAVX2 += testNew<AVX2_8bit>(472, 256, 256);
- newTimeAVX2 += testNew<AVX2_8bit>(248, 256, 256);
- newTimeAVX2 += testNew<AVX2_8bit>(200, 256, 256);
+ newTimeAVX2 += testNew<avx2::Kernels8>(8, 256, 256);
+ newTimeAVX2 += testNew<avx2::Kernels8>(8, 2048, 256);
+ newTimeAVX2 += testNew<avx2::Kernels8>(320, 256, 256);
+ newTimeAVX2 += testNew<avx2::Kernels8>(472, 256, 256);
+ newTimeAVX2 += testNew<avx2::Kernels8>(248, 256, 256);
+ newTimeAVX2 += testNew<avx2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (kCPU < CPUType::AVX512BW) return 0;
- std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512_nobias += testOld_nobias<AVX512_8bit>(8, 256, 256);
- oldAVX512_nobias += testOld_nobias<AVX512_8bit>(8, 2048, 256);
- oldAVX512_nobias += testOld_nobias<AVX512_8bit>(320, 256, 256);
- oldAVX512_nobias += testOld_nobias<AVX512_8bit>(472, 256, 256);
- oldAVX512_nobias += testOld_nobias<AVX512_8bit>(248, 256, 256);
- oldAVX512_nobias += testOld_nobias<AVX512_8bit>(200, 256, 256);
+ oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 256, 256);
+ oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 2048, 256);
+ oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(320, 256, 256);
+ oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(472, 256, 256);
+ oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(248, 256, 256);
+ oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX512 = testOld<AVX512_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512 += testOld<AVX512_8bit>(8, 256, 256);
- oldAVX512 += testOld<AVX512_8bit>(8, 2048, 256);
- oldAVX512 += testOld<AVX512_8bit>(320, 256, 256);
- oldAVX512 += testOld<AVX512_8bit>(472, 256, 256);
- oldAVX512 += testOld<AVX512_8bit>(248, 256, 256);
- oldAVX512 += testOld<AVX512_8bit>(200, 256, 256);
+ oldAVX512 += testOld<avx512bw::Kernels8>(8, 256, 256);
+ oldAVX512 += testOld<avx512bw::Kernels8>(8, 2048, 256);
+ oldAVX512 += testOld<avx512bw::Kernels8>(320, 256, 256);
+ oldAVX512 += testOld<avx512bw::Kernels8>(472, 256, 256);
+ oldAVX512 += testOld<avx512bw::Kernels8>(248, 256, 256);
+ oldAVX512 += testOld<avx512bw::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeAVX512 = testOld<AVX512_8bit>(1, 64, 8);
+ std::chrono::duration<double> newTimeAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeAVX512 += testNew<AVX512_8bit>(8, 256, 256);
- newTimeAVX512 += testNew<AVX512_8bit>(8, 2048, 256);
- newTimeAVX512 += testNew<AVX512_8bit>(320, 256, 256);
- newTimeAVX512 += testNew<AVX512_8bit>(472, 256, 256);
- newTimeAVX512 += testNew<AVX512_8bit>(248, 256, 256);
- newTimeAVX512 += testNew<AVX512_8bit>(200, 256, 256);
+ newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 256, 256);
+ newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 2048, 256);
+ newTimeAVX512 += testNew<avx512bw::Kernels8>(320, 256, 256);
+ newTimeAVX512 += testNew<avx512bw::Kernels8>(472, 256, 256);
+ newTimeAVX512 += testNew<avx512bw::Kernels8>(248, 256, 256);
+ newTimeAVX512 += testNew<avx512bw::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (kCPU < CPUType::AVX512VNNI) return 0;
- std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(8, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(8, 2048, 256);
- oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(320, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(472, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(248, 256, 256);
- oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(200, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 2048, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(320, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(472, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(248, 256, 256);
+ oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
- std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512_8bit>(1, 64, 8);
+ std::chrono::duration<double> oldAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- oldAVX512VNNI += testOld<AVX512VNNI_8bit>(8, 256, 256);
- oldAVX512VNNI += testOld<AVX512VNNI_8bit>(8, 2048, 256);
- oldAVX512VNNI += testOld<AVX512VNNI_8bit>(320, 256, 256);
- oldAVX512VNNI += testOld<AVX512VNNI_8bit>(472, 256, 256);
- oldAVX512VNNI += testOld<AVX512VNNI_8bit>(248, 256, 256);
- oldAVX512VNNI += testOld<AVX512VNNI_8bit>(200, 256, 256);
+ oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 256, 256);
+ oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 2048, 256);
+ oldAVX512VNNI += testOld<avx512vnni::Kernels8>(320, 256, 256);
+ oldAVX512VNNI += testOld<avx512vnni::Kernels8>(472, 256, 256);
+ oldAVX512VNNI += testOld<avx512vnni::Kernels8>(248, 256, 256);
+ oldAVX512VNNI += testOld<avx512vnni::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
- std::chrono::duration<double> newTimeAVX512VNNI = testOld<AVX512_8bit>(1, 64, 8);
+ std::chrono::duration<double> newTimeAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
- newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(8, 256, 256);
- newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(8, 2048, 256);
- newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(320, 256, 256);
- newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(472, 256, 256);
- newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(248, 256, 256);
- newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(200, 256, 256);
+ newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 256, 256);
+ newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 2048, 256);
+ newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(320, 256, 256);
+ newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(472, 256, 256);
+ newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(248, 256, 256);
+ newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
diff --git a/example.cc b/example.cc
index 292bd6b..5f558d0 100644
--- a/example.cc
+++ b/example.cc
@@ -1,11 +1,11 @@
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
// This is just for AlignedVector, which helps manage 64-byte aligned memory.
// Feel free to manage memory yourself.
-#include "aligned.h"
-#include "callbacks.h"
+#include "intgemm/aligned.h"
+#include "intgemm/callbacks.h"
#include <cassert>
-#include <math.h>
+#include <cmath>
#include <random>
int main() {
@@ -54,7 +54,7 @@ int main() {
// Do the actual multiply.
intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
// Sanity check. C will be row major.
- assert(fabsf(C[0] - top_left_reference) < 0.05f);
+ assert(std::fabs(C[0] - top_left_reference) < 0.05f);
}
// 8-bit multiplication.
@@ -73,6 +73,7 @@ int main() {
// Do the actual multiply.
intgemm::Int8::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
// Sanity check. C will be row major.
- assert(fabsf(C[0] - top_left_reference) < 0.05f);
+ assert(std::fabs(C[0] - top_left_reference) < 0.05f);
}
+ return 0;
}
diff --git a/intgemm.cc b/intgemm.cc
deleted file mode 100644
index 2708952..0000000
--- a/intgemm.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "intgemm.h"
-#include "stats.h"
-
-namespace intgemm {
-
-float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
- throw UnsupportedCPU();
-}
-
-MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
- throw UnsupportedCPU();
-}
-
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
-
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
-
-void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBQuantizedTransposed, AVX512_16bit::PrepareBQuantizedTransposed, AVX2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
-
-void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBTransposed, AVX512_16bit::PrepareBTransposed, AVX2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
-
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-
-const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
-
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::Quantize, AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-
-void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::QuantizeU, AVX512_8bit::QuantizeU, AVX2_8bit::QuantizeU, SSSE3_8bit::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI_8bit::PrepareB, AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-
-void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBQuantizedTransposed, AVX512_8bit::PrepareBQuantizedTransposed, AVX2_8bit::PrepareBQuantizedTransposed, SSSE3_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
-
-void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBTransposed, AVX512_8bit::PrepareBTransposed, AVX2_8bit::PrepareBTransposed, SSSE3_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
-
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI_8bit::SelectColumnsB, AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-
-const char *const Int8::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::QuantizeU, AVX512_8bit::QuantizeU, AVX2_8bit::QuantizeU, SSSE3_8bit::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-
-const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
-
-#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
-namespace avx512bw {
-using avx2::MaxAbsolute;
-using avx2::VectorMeanStd;
-} // namespace avx512bw
-#endif
-
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
-
-MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd);
-
-constexpr const char *const Unsupported_16bit::kName;
-constexpr const char *const Unsupported_8bit::kName;
-constexpr const char *const SSE2_16bit::kName;
-constexpr const char *const SSSE3_8bit::kName;
-constexpr const char *const AVX2_8bit::kName;
-constexpr const char *const AVX2_16bit::kName;
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-constexpr const char *const AVX512_8bit::kName;
-constexpr const char *const AVX512_16bit::kName;
-#endif
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-constexpr const char *const AVX512VNNI_8bit::kName;
-#endif
-
-}
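The deleted intgemm.cc above (reborn as intgemm/intgemm.cc in the new layout) binds every public entry point to the best available backend exactly once through ChooseCPU. The following reduced sketch shows that dispatch pattern; CPUTypeSketch mirrors the CPUType values used above, while DetectCPUSketch is a hypothetical stand-in for the library's real CPU-feature probe.
```C++
// Sketch of the ChooseCPU-style dispatch used in the file above; not the library's code.
enum class CPUTypeSketch { UNSUPPORTED, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

// Hypothetical stand-in: the real library inspects CPU feature flags at startup.
inline CPUTypeSketch DetectCPUSketch() { return CPUTypeSketch::SSE2; }

// Return the first implementation the running CPU can execute, mirroring calls like
// ChooseCPU(AVX512VNNI_8bit::Quantize, AVX512_8bit::Quantize, AVX2_8bit::Quantize, ...).
template <class F>
F ChooseCPUSketch(F vnni, F avx512bw, F avx2, F ssse3, F sse2, F fallback) {
  switch (DetectCPUSketch()) {
    case CPUTypeSketch::AVX512VNNI: return vnni;
    case CPUTypeSketch::AVX512BW:   return avx512bw;
    case CPUTypeSketch::AVX2:       return avx2;
    case CPUTypeSketch::SSSE3:      return ssse3;
    case CPUTypeSketch::SSE2:       return sse2;
    default:                        return fallback;
  }
}
```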
diff --git a/aligned.h b/intgemm/aligned.h
index 8ad7242..7500a8c 100644
--- a/aligned.h
+++ b/intgemm/aligned.h
@@ -1,7 +1,6 @@
#pragma once
#include <cstdlib>
#include <new>
-#include <stdlib.h>
#ifdef _MSC_VER
#include <malloc.h>
#endif
diff --git a/avx2_gemm.h b/intgemm/avx2_gemm.h
index a929361..5e81475 100644
--- a/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -6,57 +6,46 @@
#include "types.h"
#include <cstdint>
-#include <stdint.h>
#include <cstring>
namespace intgemm {
-
namespace avx2 {
-INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
- return kernels::quantize(loadu_ps<__m256>(input), quant_mult_reg);
+INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+ return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
class QuantizeTile16 {
public:
- typedef __m256i Register;
-
- INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
- INTGEMM_AVX2 Register Consecutive(const float *input) const {
- return Tile(input, input + 8);
+ INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+ return Tile(mult_reg, input, input + 8);
}
- INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
- return Tile(
+ INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+ return Tile(mult_reg,
input,
input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
}
- INTGEMM_AVX2 Register ForReshape(const float *input, Index cols) const {
+ INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// 8 rows in the first 128-bit register, 8 in the second register.
- return Tile(input, input + 8 * cols);
+ return Tile(mult_reg, input, input + 8 * cols);
}
private:
- INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
- __m256i g0 = QuantizerGrab(input0, mult_);
- __m256i g1 = QuantizerGrab(input1, mult_);
- __m256i packed = _mm256_packs_epi32(g0, g1);
+ INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
+ Register g0 = QuantizerGrab(input0, mult_reg);
+ Register g1 = QuantizerGrab(input1, mult_reg);
+ Register packed = _mm256_packs_epi32(g0, g1);
// Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
// Technically this could be removed if the PrepareB did the same reordering internally.
return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
-
- const __m256 mult_;
};
-} // namespace
-
-
-struct AVX2_16bit {
+struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
@@ -68,10 +57,10 @@ struct AVX2_16bit {
INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- avx2::QuantizeTile16 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+ *reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
@@ -83,7 +72,7 @@ struct AVX2_16bit {
PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
}*/
INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
- INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int16_t)
+ INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t)
INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
@@ -97,26 +86,21 @@ struct AVX2_16bit {
static const CPUType kUses = CPUType::AVX2;
};
-namespace avx2 {
/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
* them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
* the result into one register and return it.
*/
class QuantizeTile8 {
public:
- typedef __m256i Register;
-
- INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
- INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
- return Tile(input, input + 8, input + 16, input + 24);
+ INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
+ return Tile(quant_mult, input, input + 8, input + 16, input + 24);
}
- INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
- return TileU(input, input + 8, input + 16, input + 24);
+ INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
+ return TileU(quant_mult, input, input + 8, input + 16, input + 24);
}
- INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -127,24 +111,24 @@ class QuantizeTile8 {
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
- return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+ return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
}
- INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
+ INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
// Put higher rows in the second half of the register. These will jumble
// around in the same way then conveniently land in the right place.
- return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+ return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
}
- INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, mult_);
- __m256i g1 = avx2::QuantizerGrab(input1, mult_);
- __m256i g2 = avx2::QuantizerGrab(input2, mult_);
- __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+ __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -161,16 +145,16 @@ class QuantizeTile8 {
private:
//A version that produces uint8_ts
- INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i pos127 = _mm256_set1_epi8(127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, mult_);
- __m256i g1 = avx2::QuantizerGrab(input1, mult_);
- __m256i g2 = avx2::QuantizerGrab(input2, mult_);
- __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+ __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -185,13 +169,9 @@ class QuantizeTile8 {
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
-
- const __m256 mult_;
};
-} // namespace
-
-struct AVX2_8bit {
+struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
@@ -199,9 +179,9 @@ struct AVX2_8bit {
Quantize(input, output, quant_mult, rows * cols);
}
private:
- INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
public:
- INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
+ INTGEMM_QUANTIZE(INTGEMM_AVX2)
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
@@ -212,10 +192,10 @@ struct AVX2_8bit {
INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 32 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- avx2::QuantizeTile8 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 32, output += 32) {
- *reinterpret_cast<__m256i*>(output) = q.ConsecutiveU(input);
+ *reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
@@ -224,7 +204,7 @@ struct AVX2_8bit {
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
- INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int8_t)
+ INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
@@ -242,4 +222,5 @@ struct AVX2_8bit {
static const CPUType kUses = CPUType::AVX2;
};
+} // namespace avx2
} // namespace intgemm
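The avx2_gemm.h hunks above turn QuantizeTile16 and QuantizeTile8 from small stateful objects (storing the broadcast multiplier in mult_) into static helpers that receive the broadcast register as a parameter. Below is a reduced, AVX2-only illustration of the resulting shape, written against plain immintrin.h rather than the library's macro-guarded kernels (compile with -mavx2).
```C++
// Illustration only: the library routes the multiply/round step through
// kernels::quantize; plain cvtps (round-to-nearest by default) is used here.
#include <immintrin.h>

// Load 8 floats, scale by the broadcast multiplier, round to int32.
static inline __m256i QuantizerGrabSketch(const float *input, __m256 quant_mult_reg) {
  return _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_loadu_ps(input), quant_mult_reg));
}

// Quantize 16 consecutive floats to int16_t, passing the multiplier register in
// instead of storing it as a member, as the refactored QuantizeTile16 does.
static inline __m256i QuantizeTile16Sketch(__m256 quant_mult_reg, const float *input) {
  __m256i g0 = QuantizerGrabSketch(input, quant_mult_reg);
  __m256i g1 = QuantizerGrabSketch(input + 8, quant_mult_reg);
  __m256i packed = _mm256_packs_epi32(g0, g1);
  // Undo the lane interleaving of _mm256_packs_epi32 (0 1 2 3 8 9 10 11 4 5 6 7 ...).
  return _mm256_permute4x64_epi64(packed, 0xd8);
}
```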
diff --git a/avx512_gemm.h b/intgemm/avx512_gemm.h
index d53e48e..f9fb1eb 100644
--- a/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -1,6 +1,6 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
@@ -12,10 +12,7 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include <cstdlib>
/* AVX512 implementation.
* This uses INTGEMM_AVX512BW, INTGEMM_AVX512DQ, and might use AVX512VL
@@ -34,8 +31,7 @@ namespace intgemm {
// So conversion in memory uses these, but I also implement a wider version for
// rearranging B.
-// Convert to 16-bit signed integers.
-namespace avx512f {
+namespace avx512bw {
// Load from memory, multiply, and convert to int32_t.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
@@ -60,7 +56,7 @@ INTGEMM_AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
- __m512 appended = avx512f::Concat(loadu_ps<__m256>(input0), loadu_ps<__m256>(input1));
+ __m512 appended = Concat(loadu_ps<__m256>(input0), loadu_ps<__m256>(input1));
appended = _mm512_mul_ps(appended, quant_mult_reg);
return _mm512_cvtps_epi32(appended);
}
@@ -70,40 +66,27 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const f
// being used for the quantizer.
class QuantizeTile16 {
public:
- typedef __m512i Register;
-
- /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
- INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
auto input0 = input;
auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
- auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_);
- auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_);
+ auto g0 = QuantizerGrabHalves(input0, input1, quant_mult);
+ auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult);
auto packed = packs_epi32(g0, g1);
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
- INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
- __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
+ INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
+ __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult);
+ __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult);
__m512i packed = packs_epi32(g0, g1);
// Permute within 256-bit lanes, so same as INTGEMM_AVX2
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
-
- private:
- const __m512 mult_reg_;
};
class QuantizeTile8 {
public:
- typedef __m512i Register;
-
- /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
- INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
static const __m512i neg127 = _mm512_set1_epi8(-127);
static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
@@ -118,10 +101,10 @@ class QuantizeTile8 {
cols_left -= sizeof(Register) / sizeof(float);
}
- auto g0 = QuantizerGrab(inputs[0], mult_reg_);
- auto g1 = QuantizerGrab(inputs[1], mult_reg_);
- auto g2 = QuantizerGrab(inputs[2], mult_reg_);
- auto g3 = QuantizerGrab(inputs[3], mult_reg_);
+ auto g0 = QuantizerGrab(inputs[0], quant_mult);
+ auto g1 = QuantizerGrab(inputs[1], quant_mult);
+ auto g2 = QuantizerGrab(inputs[2], quant_mult);
+ auto g3 = QuantizerGrab(inputs[3], quant_mult);
auto packed0 = packs_epi32(g0, g1);
auto packed1 = packs_epi32(g2, g3);
@@ -130,17 +113,17 @@ class QuantizeTile8 {
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
- INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
+ INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
// 32-bit format.
- __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
- __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
- __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
+ __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult);
+ __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
+ __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
+ __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
// Pack 32-bit to 16-bit.
__m512i packed0 = packs_epi32(g0, g1);
__m512i packed1 = packs_epi32(g2, g3);
@@ -151,14 +134,9 @@ class QuantizeTile8 {
// 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
-
- private:
- const __m512 mult_reg_;
};
-} // namespace
-
-struct AVX512_16bit {
+struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
@@ -181,7 +159,7 @@ struct AVX512_16bit {
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
// There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg));
+ _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg));
}
}
@@ -189,19 +167,15 @@ struct AVX512_16bit {
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
-/*
- INTGEMM_AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols);
- }
-*/
+
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
- INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16)
- INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int16_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile16, int16_t)
+ INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, QuantizeTile16)
+ INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile16, int16_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
+ SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
}
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
@@ -212,7 +186,7 @@ struct AVX512_16bit {
static const CPUType kUses = CPUType::AVX512BW;
};
-struct AVX512_8bit {
+struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
@@ -237,7 +211,7 @@ struct AVX512_8bit {
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
#pragma omp for
for (std::size_t i = 0; i < count; i += kBatch) {
- __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg);
+ __m512i asint = QuantizerGrab(input + i, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
@@ -263,7 +237,7 @@ struct AVX512_8bit {
if (!overhang) return; // We needed a branch anyway for the empty case.
const __m512i neg127 = _mm512_set1_epi32(-127);
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
- __m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg);
+ __m512i asint = QuantizerGrab(fast_input_end, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
_mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint);
}
@@ -287,7 +261,7 @@ struct AVX512_8bit {
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for (; input < end; input += 16, output += 16) {
- __m512i asint = avx512f::QuantizerGrab(input, quant_mult_reg);
+ __m512i asint = QuantizerGrab(input, quant_mult_reg);
asint = _mm512_min_epi32(asint, pos127);
asint = _mm512_add_epi32(asint, pos127);
asint = _mm512_max_epi32(asint, zero);
@@ -298,26 +272,21 @@ struct AVX512_8bit {
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 64;
static const Index kBTileCol = 8;
-/*
- INTGEMM_AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols);
- }*/
+
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
- INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8)
- INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int8_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile8, int8_t)
+ INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, QuantizeTile8)
+ INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile8, int8_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
+ SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
}
// Special AVX512 implementation due to having 32 registers (so I don't have to
// allocate registers manually) and no sign instruction.
template <typename Callback>
INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
- typedef __m512i Register;
- //typedef __m256 Float; // For quantization we only do 8 at a time.
// This is copy-paste from Multiply8_SSE2OrAVX2.
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
@@ -325,7 +294,7 @@ struct AVX512_8bit {
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
// There's 8 results for INTGEMM_AVX2 to handle.
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
- const int simd_width = width / sizeof(Register);
+ const Index simd_width = width / sizeof(Register);
// Added for AVX512.
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
@@ -436,6 +405,7 @@ struct AVX512_8bit {
static const CPUType kUses = CPUType::AVX512BW;
};
+} // namespace avx512bw
} // namespace intgemm
#endif
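
The avx512_gemm.h changes above fold the file into namespace avx512bw, rename AVX512_16bit/AVX512_8bit to Kernels16/Kernels8, and turn the quantizer helpers into static functions that receive the broadcast multiplier instead of storing it in a mult_reg_ member. A minimal sketch of that calling pattern, written directly against AVX-512 intrinsics rather than the intgemm classes (GrabQuantized and QuantizeRow are hypothetical names, and size is assumed to be a multiple of 16):

#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Multiply by the quantization constant and round-convert to int32, as the
// QuantizerGrab helper above does.
static inline __m512i GrabQuantized(const float *input, __m512 quant_mult) {
  return _mm512_cvtps_epi32(_mm512_mul_ps(_mm512_loadu_ps(input), quant_mult));
}

void QuantizeRow(const float *input, int16_t *output, float quant_mult, std::size_t size) {
  const __m512 mult = _mm512_set1_ps(quant_mult);    // broadcast once, pass everywhere
  for (std::size_t i = 0; i < size; i += 16) {
    __m512i ints = GrabQuantized(input + i, mult);
    // Saturating 32-bit to 16-bit store; mask 0xffff writes all sixteen lanes.
    _mm512_mask_cvtsepi32_storeu_epi16(output + i, 0xffff, ints);
  }
}

Keeping the helpers stateless is what lets the PrepareB macros later in this diff broadcast quant_mult once and reuse the same register for every tile.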
diff --git a/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h
index 22c5c4e..c660168 100644
--- a/avx512vnni_gemm.h
+++ b/intgemm/avx512vnni_gemm.h
@@ -1,12 +1,13 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
#include "avx512_gemm.h"
#include "types.h"
namespace intgemm {
+namespace avx512vnni {
// Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
@@ -17,16 +18,15 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
#endif
}
-struct AVX512VNNI_8bit : public AVX512_8bit {
+struct Kernels8 : public avx512bw::Kernels8 {
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
- typedef __m512i Register;
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
- const int simd_width = width / sizeof(Register);
+ const Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
@@ -82,13 +82,12 @@ struct AVX512VNNI_8bit : public AVX512_8bit {
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
- typedef __m512i Register;
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
- const int simd_width = width / sizeof(Register);
+ const Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
@@ -124,12 +123,11 @@ struct AVX512VNNI_8bit : public AVX512_8bit {
template <typename Callback>
INTGEMM_AVX512VNNI static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
- typedef __m512i Register;
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
- const int simd_width = width / sizeof(Register);
+ Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
const Register a = set1_epi8<Register>(1);
// Go over 8 columns of B at a time.
@@ -164,6 +162,7 @@ struct AVX512VNNI_8bit : public AVX512_8bit {
static const CPUType kUses = CPUType::AVX512VNNI;
};
+} // namespace avx512vnni
} // namespace intgemm
#endif
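
avx512vnni_gemm.h gets the same namespacing, and its Kernels8 now inherits the preparation routines from avx512bw::Kernels8, overriding only the multiply paths. Those paths center on the VNNI8 helper, which wraps vpdpbusds with a workaround for GCC bug 94663. A hedged sketch of what the plain intrinsic computes (DotAccumulate is a hypothetical name; compile with AVX512VNNI enabled):

#include <immintrin.h>

// Per 32-bit lane, accumulate the dot product of four unsigned bytes of a
// with four signed bytes of b into c, with signed saturation:
//   c += a0*b0 + a1*b1 + a2*b2 + a3*b3
static inline void DotAccumulate(__m512i &c, __m512i a_unsigned8, __m512i b_signed8) {
  c = _mm512_dpbusds_epi32(c, a_unsigned8, b_signed8);
}

The unsigned-times-signed operand order is also why Multiply8Shift above takes uint8_t A (shifted by 127) and int8_t B.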
diff --git a/callbacks.h b/intgemm/callbacks.h
index 24f9009..23d3be1 100644
--- a/callbacks.h
+++ b/intgemm/callbacks.h
@@ -3,7 +3,7 @@
#include "callbacks/configs.h"
#include "callbacks/output_buffer_info.h"
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "kernels.h"
#include "types.h"
diff --git a/callbacks/configs.h b/intgemm/callbacks/configs.h
index 1222448..1222448 100644
--- a/callbacks/configs.h
+++ b/intgemm/callbacks/configs.h
diff --git a/callbacks/implementations.inl b/intgemm/callbacks/implementations.inl
index d2b7d95..47d2aa4 100644
--- a/callbacks/implementations.inl
+++ b/intgemm/callbacks/implementations.inl
@@ -129,7 +129,14 @@ public:
}
CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) {
- auto result = kernels::unquantize(input, unquant_mult);
+ // Work around a gcc 5 internal compiler error: it cannot read register members in debug builds.
+ vf mult_reg;
+#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+ asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
+#else
+ mult_reg = unquant_mult;
+#endif
+ auto result = kernels::unquantize(input, mult_reg);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
@@ -164,7 +171,14 @@ public:
}
CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) {
- auto result = kernels::unquantize(input, unquant_mult);
+ // Work around a gcc 5 internal compiler error: it cannot read register members in debug builds.
+ vf mult_reg;
+#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+ asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
+#else
+ mult_reg = unquant_mult;
+#endif
+ auto result = kernels::unquantize(input, mult_reg);
result = kernels::add_bias(result, config.bias_addr, info.col_idx);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
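
Both callback hunks above apply the same fix: in unoptimized builds, gcc 5 hits an internal compiler error when a vector register member is read directly, so the member is copied into a local first, via inline asm under exactly that compiler and a plain assignment otherwise. A self-contained sketch of the pattern (UnquantizeExample and unquant_mult_ are hypothetical; the real callbacks are generated per CPU from this .inl):

#include <immintrin.h>

struct UnquantizeExample {
  __m256 unquant_mult_;

  __m256 operator()(__m256i input) const {
    __m256 mult;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
    // Force the member through memory so gcc 5 at -O0 does not crash on it.
    asm("vmovdqa %1, %0" : "=x"(mult) : "m"(unquant_mult_));
#else
    mult = unquant_mult_;
#endif
    // Convert int32 results back to float and scale by the unquantization multiplier.
    return _mm256_mul_ps(_mm256_cvtepi32_ps(input), mult);
  }
};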
diff --git a/callbacks/output_buffer_info.h b/intgemm/callbacks/output_buffer_info.h
index 213aef4..213aef4 100644
--- a/callbacks/output_buffer_info.h
+++ b/intgemm/callbacks/output_buffer_info.h
diff --git a/interleave.h b/intgemm/interleave.h
index 231be46..1ec686b 100644
--- a/interleave.h
+++ b/intgemm/interleave.h
@@ -1,12 +1,11 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "types.h"
#include <algorithm>
#include <cassert>
-#include <stdint.h>
namespace intgemm {
@@ -180,11 +179,9 @@ template <class Register> static inline void Transpose8InLane(
// ... ...
#define INTGEMM_PREPARE_B_8(target, QuantClass) \
target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
- typedef typename QuantClass Quantizer; \
- typedef typename Quantizer::Register Register; \
- Quantizer q = Quantizer(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
/* Currently all multipliers have a stride of 8 columns.*/ \
- const int kColStride = 8; \
+ const Index kColStride = 8; \
assert(cols % kColStride == 0); \
assert(rows % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
@@ -196,14 +193,14 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
This isn't quite Transpose8InLane because it's half the number of columns, \
so each register starts with two rows instead of being one row. \
The quantizers know to skip a row.*/ \
- output[0] = q.ForReshape(input + cols * (r ) + c, cols); \
- output[1] = q.ForReshape(input + cols * (r + 1) + c, cols); \
- output[2] = q.ForReshape(input + cols * (r + 4) + c, cols); \
- output[3] = q.ForReshape(input + cols * (r + 5) + c, cols); \
- output[4] = q.ForReshape(input + cols * (r + 8) + c, cols); \
- output[5] = q.ForReshape(input + cols * (r + 9) + c, cols); \
- output[6] = q.ForReshape(input + cols * (r + 12) + c, cols); \
- output[7] = q.ForReshape(input + cols * (r + 13) + c, cols); \
+ output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \
+ output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \
+ output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \
+ output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \
+ output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \
+ output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \
+ output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \
+ output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \
Interleave8(output[0], output[1]); \
Interleave8(output[2], output[3]); \
Interleave8(output[4], output[5]); \
@@ -215,9 +212,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
#define INTGEMM_PREPARE_B_16(target, QuantClass) \
target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
- typedef typename QuantClass Quantizer; \
- typedef typename Quantizer::Register Register; \
- Quantizer q = Quantizer(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
assert(cols % 8 == 0); \
assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
@@ -226,8 +221,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
for (Index c = 0; c < cols; c += 8) { \
for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
/* gcc unrolls this loop and uses registers for output[k]*/ \
- for (int k = 0; k < 8; ++k) { \
- output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \
+ for (Index k = 0; k < 8; ++k) { \
+ output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \
} \
Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
} \
@@ -241,9 +236,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
*
* cols and rows describe size of transposed B.
*/
-#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, cpu_type, Integer) \
+#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, Integer) \
target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \
- using Register = vector_t<cpu_type, Integer>; \
const Index RegisterElems = sizeof(Register) / sizeof(Integer); \
const Index kColStride = 8; \
\
@@ -268,7 +262,6 @@ target static inline void PrepareBQuantizedTransposed(const Integer* input, Inte
*/
#define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, Integer) \
target static inline void PrepareBTransposed(const float* input, Integer* output, float quant_mult, Index cols, Index rows) { \
- using Register = typename Quantizer::Register; \
const Index RegisterElemsInt = sizeof(Register) / sizeof(Integer); \
const Index kColStride = 8; \
\
@@ -277,13 +270,13 @@ target static inline void PrepareBTransposed(const float* input, Integer* output
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
\
- Quantizer quantizer(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
Register* output_it = reinterpret_cast<Register*>(output); \
Index r = 0; \
Index c = 0; \
while (r < rows) { \
for (Index ri = 0; ri < 8; ++ri) \
- *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \
+ *output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \
c += RegisterElemsInt; \
while (c >= cols) { \
r += kColStride; \
@@ -299,14 +292,14 @@ target static inline void SelectColumnsOfB(const Register *input, Register *outp
assert(rows_bytes % sizeof(Register) == 0); \
assert((cols_end - cols_begin) % 8 == 0); \
/* Do columns for multiples of 8.*/ \
- int register_rows = rows_bytes / sizeof(Register); \
+ Index register_rows = rows_bytes / sizeof(Register); \
const Register *starts[8]; \
for (; cols_begin != cols_end; cols_begin += 8) { \
- for (int k = 0; k < 8; ++k) { \
+ for (Index k = 0; k < 8; ++k) { \
starts[k] = input + (cols_begin[k] & 7) + (cols_begin[k] & ~7) * register_rows; \
} \
- for (int r = 0; r < register_rows; ++r) { \
- for (int k = 0; k < 8; ++k) { \
+ for (Index r = 0; r < register_rows; ++r) { \
+ for (Index k = 0; k < 8; ++k) { \
*(output++) = *starts[k]; \
starts[k] += 8; \
} \
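
The interleave.h macros above follow the quantizer refactor: instead of constructing a QuantClass with the multiplier baked in, they broadcast quant_mult into an FRegister once and call static ForReshape/ConsecutiveWithWrapping, Register is now the namespace-level typedef rather than a member typedef, and loop counters switch from int to Index. A tiny non-SIMD illustration of that stateful-to-stateless move (names hypothetical, no saturation, not the intgemm B layout):

#include <cstddef>
#include <cstdint>

// After the refactor: no constructor, no stored multiplier; the caller owns it.
struct QuantizeExample {
  static int8_t One(float mult, float x) { return static_cast<int8_t>(x * mult); }
};

inline void PrepareExample(const float *in, int8_t *out, float quant_mult, std::size_t n) {
  const float q = quant_mult;  // stands in for FRegister q = set1_ps<FRegister>(quant_mult)
  for (std::size_t i = 0; i < n; ++i) out[i] = QuantizeExample::One(q, in[i]);
}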
diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc
new file mode 100644
index 0000000..f859b9a
--- /dev/null
+++ b/intgemm/intgemm.cc
@@ -0,0 +1,71 @@
+#include "intgemm.h"
+#include "stats.h"
+
+namespace intgemm {
+
+float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
+ throw UnsupportedCPU();
+}
+
+MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
+ throw UnsupportedCPU();
+}
+
+void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize);
+
+void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
+
+void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
+
+void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
+
+void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
+
+const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName);
+
+void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
+
+void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
+
+void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
+
+void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
+
+void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
+
+void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
+
+const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+
+void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
+
+const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+
+const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
+
+#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
+namespace avx512bw {
+using avx2::MaxAbsolute;
+using avx2::VectorMeanStd;
+} // namespace avx512bw
+#endif
+
+float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
+
+MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd);
+
+constexpr const char *const Unsupported_16bit::kName;
+constexpr const char *const Unsupported_8bit::kName;
+constexpr const char *const sse2::Kernels16::kName;
+constexpr const char *const ssse3::Kernels8::kName;
+constexpr const char *const avx2::Kernels8::kName;
+constexpr const char *const avx2::Kernels16::kName;
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+constexpr const char *const avx512bw::Kernels8::kName;
+constexpr const char *const avx512bw::Kernels16::kName;
+#endif
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
+constexpr const char *const avx512vnni::Kernels8::kName;
+#endif
+
+}
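
intgemm/intgemm.cc is new as a file but is essentially the old dispatch table with namespaced kernel names: every public function pointer is bound once, during static initialization, to the best implementation for the running CPU, with candidates ordered AVX512VNNI, AVX512BW, AVX2, SSSE3, SSE2, unsupported. A simplified sketch of that selection (ChooseImpl and DetectArch are assumed names, not the library's ChooseCPU):

enum class Arch { Unsupported, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

// Stub for illustration only; real code consults CPU feature detection.
inline Arch DetectArch() { return Arch::SSE2; }

// Return the candidate for the best instruction set the CPU reports, in the
// same order the bindings above pass them.
template <class F>
F ChooseImpl(F vnni, F avx512bw, F avx2, F ssse3, F sse2, F fallback) {
  switch (DetectArch()) {
    case Arch::AVX512VNNI: return vnni;
    case Arch::AVX512BW:   return avx512bw;
    case Arch::AVX2:       return avx2;
    case Arch::SSSE3:      return ssse3;
    case Arch::SSE2:       return sse2;
    default:               return fallback;
  }
}
// Usage mirrors the bindings above, e.g.
//   void (*Quantize)(const float *, int16_t *, float, Index) =
//       ChooseImpl(vnni_impl, bw_impl, avx2_impl, ssse3_impl, sse2_impl, stub_impl);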
diff --git a/intgemm.h b/intgemm/intgemm.h
index db7d2fe..8e2da02 100644
--- a/intgemm.h
+++ b/intgemm/intgemm.h
@@ -39,11 +39,9 @@
* passing unquant_mult = \lambda / (A_quant_mult * B_quant_mult).
*/
-// Yes, both headers due to the debacle about int32_t
#include <cstdint>
-#include <stdint.h>
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#include "types.h"
#include "sse2_gemm.h"
#include "ssse3_gemm.h"
@@ -126,7 +124,15 @@ struct Unsupported_8bit {
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
// These won't ever be called in this capacity, but it does let the code below compile.
-typedef Unsupported_8bit AVX512VNNI_8bit;
+namespace avx512vnni {
+typedef Unsupported_8bit Kernels8;
+} // namespace avx512vnni
+#endif
+#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW
+namespace avx512bw {
+typedef Unsupported_8bit Kernels8;
+typedef Unsupported_16bit Kernels16;
+} // namespace avx512bw
#endif
/* Returns:
@@ -274,7 +280,7 @@ private:
};
template <typename Callback>
-void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI_8bit>, OMPParallelWrap<Callback, AVX512_8bit>, OMPParallelWrap<Callback, AVX2_8bit>, OMPParallelWrap<Callback, SSSE3_8bit>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
+void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512vnni::Kernels8>, OMPParallelWrap<Callback, avx512bw::Kernels8>, OMPParallelWrap<Callback, avx2::Kernels8>, OMPParallelWrap<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
/*
* 8-bit matrix multiplication with shifting A by 127
@@ -338,14 +344,14 @@ private:
template <class Callback>
void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
- OMPParallelWrap8Shift<Callback, AVX512VNNI_8bit>,
- OMPParallelWrap8Shift<Callback, AVX512_8bit>,
- OMPParallelWrap8Shift<Callback, AVX2_8bit>,
- OMPParallelWrap8Shift<Callback, SSSE3_8bit>,
+ OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>,
+ OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>,
+ OMPParallelWrap8Shift<Callback, avx2::Kernels8>,
+ OMPParallelWrap8Shift<Callback, ssse3::Kernels8>,
Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>);
template <class Callback>
-void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
+void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias<Callback>, avx512bw::Kernels8::PrepareBias<Callback>, avx2::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
/*
* 16-bit matrix multiplication
@@ -401,7 +407,7 @@ private:
};
template <typename Callback>
-void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512_16bit> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512_16bit>, OMPParallelWrap<Callback, AVX2_16bit>, OMPParallelWrap<Callback, SSE2_16bit>, OMPParallelWrap<Callback, SSE2_16bit>, Unsupported_16bit::Multiply<Callback>);
+void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, avx512bw::Kernels16>, OMPParallelWrap<Callback, avx2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
extern const CPUType kCPU;
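
The intgemm.h hunks complete the rename on the dispatch side: when the compiler cannot target AVX512BW or AVX512VNNI, the namespaced kernels are typedef'd to the Unsupported_8bit/Unsupported_16bit stubs so the ChooseCPU expressions still name valid types, and the per-Callback MultiplyImpl/PrepareBiasImpl pointers now select among the namespaced kernels. A small sketch of that fallback-typedef idiom under hypothetical names:

#include <cstdint>
#include <stdexcept>

struct Unsupported8Example {
  static void Quantize(const float *, int8_t *, float, int) {
    throw std::runtime_error("this build or CPU lacks the required instruction set");
  }
};

#ifndef EXAMPLE_COMPILER_SUPPORTS_AVX512BW
// Alias the namespaced kernel to the stub so dispatch tables still compile.
namespace avx512bw_example { typedef Unsupported8Example Kernels8; }
#endif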
diff --git a/intgemm_config.h.in b/intgemm/intgemm_config.h.in
index 920e9ae..920e9ae 100644
--- a/intgemm_config.h.in
+++ b/intgemm/intgemm_config.h.in
diff --git a/intrinsics.h b/intgemm/intrinsics.h
index bf79e43..480f421 100644
--- a/intrinsics.h
+++ b/intgemm/intrinsics.h
@@ -1,6 +1,6 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#include "types.h"
#include <tmmintrin.h>
@@ -9,7 +9,6 @@
#include <xmmintrin.h>
#include <cstdint>
-#include <stdint.h>
/*
* NOTE: Please keep intrinsics in alphabetical order.
@@ -162,17 +161,17 @@ template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() {
INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
return _mm_sign_epi8(first, second);
}
-INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a, int8_t b) {
- return _mm_slli_epi16(a, b);
+template <int imm8> INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a) {
+ return _mm_slli_epi16(a, imm8);
}
-INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a, int8_t b) {
- return _mm_srai_epi16(a, b);
+template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a) {
+ return _mm_srai_epi16(a, imm8);
}
-INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a, int8_t b) {
- return _mm_srai_epi32(a, b);
+template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a) {
+ return _mm_srai_epi32(a, imm8);
}
-INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a, int8_t b) {
- return _mm_srli_epi16(a, b);
+template <int imm8> INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a) {
+ return _mm_srli_epi16(a, imm8);
}
INTGEMM_SSE2 static inline void storeu_ps(float* mem_addr, __m128 a) {
_mm_storeu_ps(mem_addr, a);
@@ -343,17 +342,17 @@ template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() {
INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
return _mm256_sign_epi8(first, second);
}
-INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a, int8_t b) {
- return _mm256_slli_epi16(a, b);
+template <int imm8> INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a) {
+ return _mm256_slli_epi16(a, imm8);
}
-INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a, int8_t b) {
- return _mm256_srai_epi16(a, b);
+template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a) {
+ return _mm256_srai_epi16(a, imm8);
}
-INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a, int8_t b) {
- return _mm256_srai_epi32(a, b);
+template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a) {
+ return _mm256_srai_epi32(a, imm8);
}
-INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a, int8_t b) {
- return _mm256_srli_epi16(a, b);
+template <int imm8> INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a) {
+ return _mm256_srli_epi16(a, imm8);
}
INTGEMM_AVX2 static inline void storeu_ps(float* mem_addr, __m256 a) {
_mm256_storeu_ps(mem_addr, a);
@@ -540,17 +539,17 @@ template <> INTGEMM_AVX512BW inline __m512 load_ps<__m512>(const float* from) {
/*
* Missing sign_epi8
*/
-INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a, int8_t b) {
- return _mm512_slli_epi16(a, b);
+template <int imm8> INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a) {
+ return _mm512_slli_epi16(a, imm8);
}
-INTGEMM_AVX512BW static inline __m512i srai_epi16(__m512i a, int8_t b) {
- return _mm512_srai_epi16(a, b);
+template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi16(__m512i a) {
+ return _mm512_srai_epi16(a, imm8);
}
-INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a, int8_t b) {
- return _mm512_srai_epi32(a, b);
+template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a) {
+ return _mm512_srai_epi32(a, imm8);
}
-INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a, int8_t b) {
- return _mm512_srli_epi16(a, b);
+template <int imm8> INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a) {
+ return _mm512_srli_epi16(a, imm8);
}
INTGEMM_AVX512BW static inline void storeu_ps(float* mem_addr, __m512 a) {
_mm512_storeu_ps(mem_addr, a);
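
All the shift wrappers in intrinsics.h change from a runtime int8_t count to a template parameter, so the count is a guaranteed compile-time constant, which is how the kernels use them anyway (literal shifts such as srli_epi16<8>). A sketch of the new call shape with the SSE2 wrapper:

#include <emmintrin.h>

template <int imm8> static inline __m128i ShiftLeft16(__m128i a) {
  return _mm_slli_epi16(a, imm8);   // imm8 is a compile-time constant here
}

__m128i Demo(__m128i a) {
  return ShiftLeft16<8>(a);         // call sites move from f(a, 8) to f<8>(a)
}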
diff --git a/kernels.h b/intgemm/kernels.h
index 84631b5..ee35966 100644
--- a/kernels.h
+++ b/intgemm/kernels.h
@@ -1,6 +1,6 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "types.h"
#include "utils.h"
diff --git a/kernels/implementations.inl b/intgemm/kernels/implementations.inl
index 2ec9f1f..4f1b39f 100644
--- a/kernels/implementations.inl
+++ b/intgemm/kernels/implementations.inl
@@ -145,8 +145,8 @@ CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply(vector_t<CPUTy
template <>
CPU_ATTR inline vi multiply<int8_t>(vi a, vi b) {
auto even = mullo_epi16(a, b);
- auto odd = mullo_epi16(srli_epi16(a, 8), srli_epi16(b, 8));
- return or_si(slli_epi16(odd, 8), srli_epi16(slli_epi16(even, 8), 8));
+ auto odd = mullo_epi16(srli_epi16<8>(a), srli_epi16<8>(b));
+ return or_si(slli_epi16<8>(odd), srli_epi16<8>(slli_epi16<8>(even)));
}
template <>
@@ -235,7 +235,7 @@ CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int16_t> upcast8to16(vi inpu
static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input)));
- auto negatives = _mm512_cmp_epi8_mask(input, vzero, _MM_CMPINT_LT);
+ auto negatives = _mm512_cmp_epi8_mask(input, vzero, 1 /* _MM_CMPINT_LT */);
auto higher_byte = _mm512_mask_blend_epi8(negatives, vzero, vmax_negative);
#endif
@@ -258,7 +258,7 @@ CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int> upcast16to32(vi input)
static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input)));
- auto negatives = _mm512_cmp_epi16_mask(input, vzero, _MM_CMPINT_LT);
+ auto negatives = _mm512_cmp_epi16_mask(input, vzero, 1 /* _MM_CMPINT_LT */);
auto higher_byte = _mm512_mask_blend_epi16(negatives, vzero, vmax_negative);
#endif
@@ -296,32 +296,6 @@ CPU_ATTR static inline vi bitwise_not(vi v) {
}
/*
- * Multiply with saturation (elemwise)
- */
-template <typename Type>
-CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply_sat(vector_t<CPUType::CPU_NAME, Type> a, vector_t<CPUType::CPU_NAME, Type> b, uint8_t right_shift);
-
-template <>
-CPU_ATTR inline vi multiply_sat<int8_t>(vi a, vi b, uint8_t right_shift) {
- auto upcasted_a = upcast8to16(a);
- auto upcasted_b = upcast8to16(b);
- auto low = srai_epi16(multiply<int16_t>(upcasted_a.first, upcasted_b.first), right_shift);
- auto hi = srai_epi16(multiply<int16_t>(upcasted_a.second, upcasted_b.second), right_shift);
-
- return downcast16to8(low, hi);
-}
-
-template <>
-CPU_ATTR inline vi multiply_sat<int16_t>(vi a, vi b, uint8_t right_shift) {
- auto upcasted_a = upcast16to32(a);
- auto upcasted_b = upcast16to32(b);
- auto low = srai_epi32(multiply<int32_t>(upcasted_a.first, upcasted_b.first), right_shift);
- auto hi = srai_epi32(multiply<int32_t>(upcasted_a.second, upcasted_b.second), right_shift);
-
- return downcast32to16(low, hi);
-}
-
-/*
* Floor
*/
CPU_ATTR static inline vf floor(vf input) {
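
In kernels/implementations.inl, the 8-bit element-wise multiply adopts the templated shifts, _MM_CMPINT_LT is spelled as its literal value 1 for toolchains whose headers lack the enumerator, and the multiply_sat kernels are removed. The even/odd trick behind multiply<int8_t> is easiest to see in plain SSE2 (a standalone sketch, not the generic vi/CPU_ATTR version above):

#include <emmintrin.h>

// Each 8-bit lane keeps the low byte of its own product (wrap-around), built
// from two 16-bit multiplies over the even and the odd bytes.
static inline __m128i MulBytes(__m128i a, __m128i b) {
  __m128i even = _mm_mullo_epi16(a, b);                                  // low bytes hold even-lane products
  __m128i odd  = _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srli_epi16(b, 8));
  __m128i odd_hi  = _mm_slli_epi16(odd, 8);                              // odd products into the high bytes
  __m128i even_lo = _mm_srli_epi16(_mm_slli_epi16(even, 8), 8);          // zero the high bytes of even products
  return _mm_or_si128(odd_hi, even_lo);
}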
diff --git a/multiply.h b/intgemm/multiply.h
index 80940d4..e201e09 100644
--- a/multiply.h
+++ b/intgemm/multiply.h
@@ -1,6 +1,6 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
#include "interleave.h"
#include "intrinsics.h"
#include "vec_traits.h"
@@ -47,16 +47,16 @@ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i p
// Quantize function used for SSSE3 and AVX2.
// Separate function for thread to work around gcc 7 bug that doesn't imbue
// target attributes across #pragma omp parallel.
-#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+#define INTGEMM_QUANTIZE_THREAD(target) \
target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
- name::QuantizeTile8 q(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
INTGEMM_OMP_FOR \
for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
- *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ *reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \
} \
}
-#define INTGEMM_QUANTIZE(target, Register, name) \
+#define INTGEMM_QUANTIZE(target) \
target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
@@ -68,7 +68,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
} \
std::size_t overhang = size & (kBatch - 1); \
if (!overhang) return; \
- name::QuantizeTile8 q(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
/* Each does size(Register) / 32 == kBatch / 4 floats at a time.
* If we're allowed to read one of them, then we can read the whole register. */ \
const float *inputs[4]; \
@@ -80,7 +80,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
for (; i < 4; ++i) { \
inputs[i] = &input[fast_end]; \
} \
- Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \
+ Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \
std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \
}
@@ -159,7 +159,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
- const int simd_width = width / (sizeof(Register) / sizeof(int16_t)); \
+ const Index simd_width = width / (sizeof(Register) / sizeof(int16_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
@@ -169,7 +169,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
- int k = 0; \
+ Index k = 0; \
Register a = *(A_row + k); \
Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \
Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
@@ -216,7 +216,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
- const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
+ const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
const Register a = set1_epi8<Register>(1); \
INTGEMM_OMP_FOR \
@@ -225,7 +225,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
/*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
- int k = 0; \
+ Index k = 0; \
Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
@@ -291,7 +291,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
- const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
+ const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
@@ -301,7 +301,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
- int k = 0; \
+ Index k = 0; \
Register a = *(A_row + k); \
Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
@@ -538,7 +538,7 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
- const int simd_width = width / sizeof(Register); \
+ const Index simd_width = width / sizeof(Register); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
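
The multiply.h hunks are of two kinds: the quantize macros move to the static QuantizeTile8 interface with a broadcast FRegister, and signed int loop counters become Index. The 16-bit multiply these macros build is driven by madd_epi16; one accumulation step looks like this in standalone SSE2 (a sketch, not the macro body):

#include <emmintrin.h>

// Multiply eight int16 pairs, add adjacent products into four int32 sums,
// and accumulate into the running total.
static inline __m128i MaddAccumulate(__m128i acc, __m128i a_row, __m128i b_col) {
  return _mm_add_epi32(acc, _mm_madd_epi16(a_row, b_col));
}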
diff --git a/sse2_gemm.h b/intgemm/sse2_gemm.h
index 69f6741..cd49efe 100644
--- a/sse2_gemm.h
+++ b/intgemm/sse2_gemm.h
@@ -5,12 +5,10 @@
#include "types.h"
#include <cstdint>
-#include <stdint.h>
// 8 bit is in ssse3_gemm.h
namespace intgemm {
-
namespace sse2 {
INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
@@ -21,37 +19,30 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
- typedef __m128i Register;
-
- INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const {
- return Tile(input, input + 4);
+ INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) {
+ return Tile(mult_reg, input, input + 4);
}
- INTGEMM_SSE2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
- return Tile(
+ INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+ return Tile(mult_reg,
input,
input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
}
- INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const {
- return Consecutive(input);
+ INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) {
+ return Consecutive(mult_reg, input);
}
private:
- INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const {
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input1, mult_reg_);
+ INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) {
+ __m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg);
+ __m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg);
return _mm_packs_epi32(g0, g1);
}
-
- const __m128 mult_reg_;
};
-} //namespace
-// This should be pure INTGEMM_SSE2 (and below).
-struct SSE2_16bit {
+// This should be pure SSE2 (and below).
+struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
@@ -63,10 +54,10 @@ struct SSE2_16bit {
assert(size % 8 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- sse2::QuantizeTile16 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 8, output += 8) {
- *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+ *reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
@@ -74,13 +65,13 @@ struct SSE2_16bit {
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
- INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16)
- INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, CPUType::SSE2, int16_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, sse2::QuantizeTile16, int16_t)
+ INTGEMM_PREPARE_B_16(INTGEMM_SSE2, QuantizeTile16)
+ INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, int16_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, QuantizeTile16, int16_t)
INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
//TODO #DEFINE
- sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
+ SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2)
@@ -89,4 +80,5 @@ struct SSE2_16bit {
static const CPUType kUses = CPUType::SSE2;
};
+} // namespace sse2
} // namespace intgemm
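
sse2_gemm.h follows the same template: QuantizeTile16 becomes a set of static helpers taking the multiplier register, SSE2_16bit becomes sse2::Kernels16, and the sse2:: qualifiers disappear now that the struct lives inside the namespace. The Tile step it relies on, in standalone SSE2 intrinsics (a sketch mirroring QuantizeTile16::Tile above):

#include <emmintrin.h>

// Quantize 8 consecutive floats to int16: scale, round-convert to int32, then
// pack two int32 registers into one int16 register with signed saturation.
static inline __m128i Quantize8(const float *input, __m128 mult) {
  __m128i g0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult));
  __m128i g1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input + 4), mult));
  return _mm_packs_epi32(g0, g1);
}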
diff --git a/ssse3_gemm.h b/intgemm/ssse3_gemm.h
index fd3ab8c..865fe12 100644
--- a/ssse3_gemm.h
+++ b/intgemm/ssse3_gemm.h
@@ -6,13 +6,11 @@
#include "types.h"
#include <cstdint>
-#include <stdint.h>
#include <cstring>
// 16-bit is in sse2_gemm.h
namespace intgemm {
-
namespace ssse3 {
INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
@@ -23,24 +21,20 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
class QuantizeTile8 {
public:
- typedef __m128i Register;
-
- INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const {
+ INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// Skip a row.
- return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4);
+ return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4);
}
- INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const {
- return Tile(input, input + 4, input + 8, input + 12);
+ INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+ return Tile(mult_reg, input, input + 4, input + 8, input + 12);
}
- INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const {
- return TileU(input, input + 4, input + 8, input + 12);
+ INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) {
+ return TileU(mult_reg, input, input + 4, input + 8, input + 12);
}
- INTGEMM_SSSE3 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -51,16 +45,16 @@ class QuantizeTile8 {
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
- return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+ return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]);
}
// Quantize 16xfloat into 16xint8_t
- INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input1, mult_reg_);
- __m128i g2 = QuantizerGrab(input2, mult_reg_);
- __m128i g3 = QuantizerGrab(input3, mult_reg_);
+ __m128i g0 = QuantizerGrab(input0, mult_reg);
+ __m128i g1 = QuantizerGrab(input1, mult_reg);
+ __m128i g2 = QuantizerGrab(input2, mult_reg);
+ __m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -78,13 +72,13 @@ class QuantizeTile8 {
}
private:
- INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
const __m128i pos127 = _mm_set1_epi8(127);
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input1, mult_reg_);
- __m128i g2 = QuantizerGrab(input2, mult_reg_);
- __m128i g3 = QuantizerGrab(input3, mult_reg_);
+ __m128i g0 = QuantizerGrab(input0, mult_reg);
+ __m128i g1 = QuantizerGrab(input1, mult_reg);
+ __m128i g2 = QuantizerGrab(input2, mult_reg);
+ __m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -100,15 +94,10 @@ class QuantizeTile8 {
return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127);
// No permute needed. packs is in order for SSE.
}
-
- private:
- const __m128 mult_reg_;
};
-} // namespace
-
-// pmaddubsw (the 8-bit multiply) is INTGEMM_SSSE3, so pedantically that's the version we need.
-struct SSSE3_8bit {
+// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
+struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
@@ -117,9 +106,9 @@ struct SSSE3_8bit {
}
private:
- INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3)
public:
- INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
+ INTGEMM_QUANTIZE(INTGEMM_SSSE3)
// Version with unsigned int + 127
// Currently A is prepared by quantization but this could theoretically change.
@@ -131,10 +120,10 @@ struct SSSE3_8bit {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- ssse3::QuantizeTile8 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m128i*>(output) = q.ConsecutiveU(input);
+ *reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
@@ -143,8 +132,8 @@ struct SSSE3_8bit {
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
- INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, CPUType::SSE2, int8_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t)
+ INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
@@ -161,4 +150,5 @@ struct SSSE3_8bit {
static const CPUType kUses = CPUType::SSSE3;
};
+} // namespace ssse3
} // namespace intgemm
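
ssse3_gemm.h gets the matching 8-bit treatment: Tile, TileU, Consecutive and friends become static functions taking an FRegister, SSSE3_8bit becomes ssse3::Kernels8, and the QUANTIZE macros drop their Register/namespace arguments. The signed packing chain at its core, in standalone SSE2 intrinsics (a sketch; the repository's Tile additionally keeps -128 out of the result, and TileU shifts values up by 127 into unsigned range):

#include <emmintrin.h>

// Quantize 16 consecutive floats to int8: scale and round-convert each group
// of four, saturate-pack 32-bit to 16-bit, then 16-bit to 8-bit.
static inline __m128i Pack16(__m128 mult, const float *i0, const float *i1,
                             const float *i2, const float *i3) {
  __m128i g0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(i0), mult));
  __m128i g1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(i1), mult));
  __m128i g2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(i2), mult));
  __m128i g3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(i3), mult));
  __m128i packed0 = _mm_packs_epi32(g0, g1);
  __m128i packed1 = _mm_packs_epi32(g2, g3);
  return _mm_packs_epi16(packed0, packed1);
}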
diff --git a/stats.h b/intgemm/stats.h
index 6f9eda2..6f9eda2 100644
--- a/stats.h
+++ b/intgemm/stats.h
diff --git a/stats.inl b/intgemm/stats.inl
index 7fc7afb..d6a850e 100644
--- a/stats.inl
+++ b/intgemm/stats.inl
@@ -54,7 +54,7 @@ INTGEMM_TARGET static inline float MaxAbsolute(const float *begin_float, const f
}
#else
for (const float *i = end_reg; i < end_float; ++i) {
- ret = std::max(ret, fabsf(*i));
+ ret = std::max(ret, std::fabs(*i));
}
#endif
return ret;
diff --git a/types.h b/intgemm/types.h
index da0429f..da0429f 100644
--- a/types.h
+++ b/intgemm/types.h
diff --git a/utils.h b/intgemm/utils.h
index a520ea0..a520ea0 100644
--- a/utils.h
+++ b/intgemm/utils.h
diff --git a/vec_traits.h b/intgemm/vec_traits.h
index 86265b2..86265b2 100644
--- a/vec_traits.h
+++ b/intgemm/vec_traits.h
diff --git a/test/3rd_party/LICENSE_1_0.txt b/test/3rd_party/LICENSE_1_0.txt
new file mode 100644
index 0000000..7925d62
--- /dev/null
+++ b/test/3rd_party/LICENSE_1_0.txt
@@ -0,0 +1,24 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
diff --git a/3rd_party/catch.hpp b/test/3rd_party/catch.hpp
index 1850fff..1850fff 100644
--- a/3rd_party/catch.hpp
+++ b/test/3rd_party/catch.hpp
diff --git a/test/add127_test.cc b/test/add127_test.cc
index 28c241f..b7ce49b 100644
--- a/test/add127_test.cc
+++ b/test/add127_test.cc
@@ -282,51 +282,51 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In
// Bias
TEST_CASE("PrepareBias SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestPrepareBias<SSSE3_8bit>(256,256);
- TestPrepareBias<SSSE3_8bit>(2048,256);
- TestPrepareBias<SSSE3_8bit>(512,512);
+ TestPrepareBias<ssse3::Kernels8>(256,256);
+ TestPrepareBias<ssse3::Kernels8>(2048,256);
+ TestPrepareBias<ssse3::Kernels8>(512,512);
}
TEST_CASE("PrepareBias AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestPrepareBias<AVX2_8bit>(256,256);
- TestPrepareBias<AVX2_8bit>(2048,256);
- TestPrepareBias<AVX2_8bit>(512,512);
+ TestPrepareBias<avx2::Kernels8>(256,256);
+ TestPrepareBias<avx2::Kernels8>(2048,256);
+ TestPrepareBias<avx2::Kernels8>(512,512);
}
TEST_CASE("PrepareBias AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- TestPrepareBias<AVX512_8bit>(256,256);
- TestPrepareBias<AVX512_8bit>(2048,256);
- TestPrepareBias<AVX512_8bit>(512,512);
+ TestPrepareBias<avx512bw::Kernels8>(256,256);
+ TestPrepareBias<avx512bw::Kernels8>(2048,256);
+ TestPrepareBias<avx512bw::Kernels8>(512,512);
#endif
}
//A
TEST_CASE("PrepareA SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestPrepareA<SSSE3_8bit>(64,64);
- TestPrepareA<SSSE3_8bit>(256,256);
- TestPrepareA<SSSE3_8bit>(512,512);
- TestPrepareA<SSSE3_8bit>(2048,256);
+ TestPrepareA<ssse3::Kernels8>(64,64);
+ TestPrepareA<ssse3::Kernels8>(256,256);
+ TestPrepareA<ssse3::Kernels8>(512,512);
+ TestPrepareA<ssse3::Kernels8>(2048,256);
}
TEST_CASE("PrepareA AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestPrepareA<AVX2_8bit>(64,64);
- TestPrepareA<AVX2_8bit>(256,256);
- TestPrepareA<AVX2_8bit>(512,512);
- TestPrepareA<AVX2_8bit>(2048,256);
+ TestPrepareA<avx2::Kernels8>(64,64);
+ TestPrepareA<avx2::Kernels8>(256,256);
+ TestPrepareA<avx2::Kernels8>(512,512);
+ TestPrepareA<avx2::Kernels8>(2048,256);
}
TEST_CASE("PrepareA AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- TestPrepareA<AVX512_8bit>(64,64);
- TestPrepareA<AVX512_8bit>(256,256);
- TestPrepareA<AVX512_8bit>(512,512);
- TestPrepareA<AVX512_8bit>(2048,256);
+ TestPrepareA<avx512bw::Kernels8>(64,64);
+ TestPrepareA<avx512bw::Kernels8>(256,256);
+ TestPrepareA<avx512bw::Kernels8>(512,512);
+ TestPrepareA<avx512bw::Kernels8>(2048,256);
#endif
}
@@ -334,144 +334,144 @@ TEST_CASE("PrepareA AVX512F", "[Add127]") {
TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyBiasNew<SSSE3_8bit>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f);
- TestMultiplyBiasNew<SSSE3_8bit>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f);
- TestMultiplyBiasNew<SSSE3_8bit>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f);
- TestMultiplyBiasNew<SSSE3_8bit>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<SSSE3_8bit>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
- TestMultiplyBiasNew<SSSE3_8bit>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<SSSE3_8bit>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<ssse3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyBiasNew<AVX2_8bit>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
- TestMultiplyBiasNew<AVX2_8bit>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f);
- TestMultiplyBiasNew<AVX2_8bit>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f);
- TestMultiplyBiasNew<AVX2_8bit>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<AVX2_8bit>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
- TestMultiplyBiasNew<AVX2_8bit>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
- TestMultiplyBiasNew<AVX2_8bit>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<avx2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
+ TestMultiplyBiasNew<avx2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<avx2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f);
+ TestMultiplyBiasNew<avx2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<avx2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
+ TestMultiplyBiasNew<avx2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
+ TestMultiplyBiasNew<avx2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyBiasNew<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
- TestMultiplyBiasNew<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
- TestMultiplyBiasNew<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyBiasNew<AVX512VNNI_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
- TestMultiplyBiasNew<AVX512VNNI_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
- TestMultiplyBiasNew<AVX512VNNI_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512VNNI_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512VNNI_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyBiasNew<AVX512VNNI_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyBiasNew<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
//Multiply old vs new
TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyShiftNonShift<SSSE3_8bit>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<SSSE3_8bit>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
- TestMultiplyShiftNonShift<SSSE3_8bit>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad
- TestMultiplyShiftNonShift<SSSE3_8bit>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f);
- TestMultiplyShiftNonShift<SSSE3_8bit>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f);
- TestMultiplyShiftNonShift<SSSE3_8bit>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f);
- TestMultiplyShiftNonShift<SSSE3_8bit>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f);
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f);
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f);
+ TestMultiplyShiftNonShift<ssse3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
}
TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyShiftNonShift<AVX2_8bit>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<AVX2_8bit>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
- TestMultiplyShiftNonShift<AVX2_8bit>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad
- TestMultiplyShiftNonShift<AVX2_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftNonShift<AVX2_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
- TestMultiplyShiftNonShift<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftNonShift<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
+ TestMultiplyShiftNonShift<avx2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<avx2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
+ TestMultiplyShiftNonShift<avx2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad
+ TestMultiplyShiftNonShift<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftNonShift<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
+ TestMultiplyShiftNonShift<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftNonShift<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyShiftNonShift<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
- TestMultiplyShiftNonShift<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<AVX512_8bit>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f);
- TestMultiplyShiftNonShift<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
- TestMultiplyShiftNonShift<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
+ TestMultiplyShiftNonShift<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f);
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
- TestMultiplyShiftNonShift<AVX512VNNI_8bit>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
+ TestMultiplyShiftNonShift<avx512vnni::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f);
}
#endif
//Multiply Shift vs int shift implementation
TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyShiftInt<SSSE3_8bit>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<SSSE3_8bit>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<SSSE3_8bit>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f);
- TestMultiplyShiftInt<SSSE3_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<SSSE3_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<SSSE3_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<SSSE3_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<ssse3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyShiftInt<AVX2_8bit>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX2_8bit>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<AVX2_8bit>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f);
- TestMultiplyShiftInt<AVX2_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<AVX2_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
- TestMultiplyShiftInt<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
+ TestMultiplyShiftInt<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyShiftInt<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
- TestMultiplyShiftInt<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyShiftInt<AVX512VNNI_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
- TestMultiplyShiftInt<AVX512VNNI_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
- TestMultiplyShiftInt<AVX512VNNI_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512VNNI_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512VNNI_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
- TestMultiplyShiftInt<AVX512VNNI_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
+ TestMultiplyShiftInt<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
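The hunks above are a mechanical rename of the per-architecture GEMM classes to namespaced kernel structs. Collected from the substitutions in this patch, the mapping amounts to the aliases sketched below; this is illustrative only, intgemm does not ship these aliases, and the includes assume the repository root is on the include path.

// Illustrative summary of the rename applied throughout this patch. These
// aliases are not part of intgemm; they only restate how the removed class
// names correspond to the new namespaced kernel structs.
#include "intgemm/intgemm_config.h"
#include "intgemm/sse2_gemm.h"
#include "intgemm/ssse3_gemm.h"
#include "intgemm/avx2_gemm.h"
#include "intgemm/avx512_gemm.h"

namespace old_names {
using SSE2_16bit  = intgemm::sse2::Kernels16;
using SSSE3_8bit  = intgemm::ssse3::Kernels8;
using AVX2_8bit   = intgemm::avx2::Kernels8;
using AVX2_16bit  = intgemm::avx2::Kernels16;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
using AVX512_8bit  = intgemm::avx512bw::Kernels8;
using AVX512_16bit = intgemm::avx512bw::Kernels16;
#endif
// AVX512VNNI_8bit maps to intgemm::avx512vnni::Kernels8 in the same way when
// INTGEMM_COMPILER_SUPPORTS_AVX512VNNI is defined.
} // namespace old_names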
diff --git a/test/kernels/add_bias_test.cc b/test/kernels/add_bias_test.cc
index 7c299f0..492c669 100644
--- a/test/kernels/add_bias_test.cc
+++ b/test/kernels/add_bias_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <numeric>
diff --git a/test/kernels/bitwise_not_test.cc b/test/kernels/bitwise_not_test.cc
index 1408db3..e908c43 100644
--- a/test/kernels/bitwise_not_test.cc
+++ b/test/kernels/bitwise_not_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstdlib>
#include <numeric>
diff --git a/test/kernels/downcast_test.cc b/test/kernels/downcast_test.cc
index 6c1f3ab..5f9db66 100644
--- a/test/kernels/downcast_test.cc
+++ b/test/kernels/downcast_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
diff --git a/test/kernels/exp_test.cc b/test/kernels/exp_test.cc
index b76a2e1..838e228 100644
--- a/test/kernels/exp_test.cc
+++ b/test/kernels/exp_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
diff --git a/test/kernels/floor_test.cc b/test/kernels/floor_test.cc
index 01b607f..2659c3f 100644
--- a/test/kernels/floor_test.cc
+++ b/test/kernels/floor_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc
deleted file mode 100644
index 444eae4..0000000
--- a/test/kernels/multiply_sat_test.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
-
-#include <stdint.h>
-#include <cstdint>
-#include <cstddef>
-#include <numeric>
-
-namespace intgemm {
-
-template <CPUType CPUType_, typename Type_>
-void kernel_multiply_sat_test() {
- if (kCPU < CPUType_)
- return;
-
- using vec_t = vector_t<CPUType_, Type_>;
- constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(Type_);
-
- AlignedVector<Type_> input1(VECTOR_LENGTH);
- AlignedVector<Type_> input2(VECTOR_LENGTH);
- AlignedVector<Type_> output(VECTOR_LENGTH);
-
- std::iota(input1.begin(), input1.end(), static_cast<Type_>(-VECTOR_LENGTH / 2));
- std::iota(input2.begin(), input2.end(), static_cast<Type_>(-VECTOR_LENGTH / 3));
-
- // TODO: try all shifts. The shift must be an immediate.
- int8_t shift = 1;
- *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift);
- for (std::size_t i = 0; i < output.size(); ++i) {
- auto ref = (int64_t(input1[i]) * input2[i]) >> shift;
- auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref)));
- CHECK(output[i] == ref_sat);
- }
-}
-
-template INTGEMM_SSE2 void kernel_multiply_sat_test<CPUType::SSE2, int8_t>();
-template INTGEMM_SSE2 void kernel_multiply_sat_test<CPUType::SSE2, int16_t>();
-KERNEL_TEST_CASE("multiply_sat/int8 SSE2") { return kernel_multiply_sat_test<CPUType::SSE2, int8_t>(); }
-KERNEL_TEST_CASE("multiply_sat/int16 SSE2") { return kernel_multiply_sat_test<CPUType::SSE2, int16_t>(); }
-
-template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int8_t>();
-template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int16_t>();
-KERNEL_TEST_CASE("multiply_sat/int8 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int8_t>(); }
-KERNEL_TEST_CASE("multiply_sat/int16 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); }
-
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>();
-template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>();
-KERNEL_TEST_CASE("multiply_sat/int8 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); }
-KERNEL_TEST_CASE("multiply_sat/int16 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>(); }
-#endif
-
-}
diff --git a/test/kernels/multiply_test.cc b/test/kernels/multiply_test.cc
index ca8c54c..029e3ac 100644
--- a/test/kernels/multiply_test.cc
+++ b/test/kernels/multiply_test.cc
@@ -1,9 +1,8 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstdint>
-#include <stdint.h>
#include <numeric>
namespace intgemm {
diff --git a/test/kernels/quantize_test.cc b/test/kernels/quantize_test.cc
index f163a45..ae3c068 100644
--- a/test/kernels/quantize_test.cc
+++ b/test/kernels/quantize_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <numeric>
diff --git a/test/kernels/relu_test.cc b/test/kernels/relu_test.cc
index 3ad6dd8..6fcef98 100644
--- a/test/kernels/relu_test.cc
+++ b/test/kernels/relu_test.cc
@@ -1,8 +1,7 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
-#include <stdint.h>
#include <cstdint>
#include <numeric>
diff --git a/test/kernels/rescale_test.cc b/test/kernels/rescale_test.cc
index ae13984..280b513 100644
--- a/test/kernels/rescale_test.cc
+++ b/test/kernels/rescale_test.cc
@@ -1,9 +1,8 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstdint>
-#include <stdint.h>
#include <numeric>
namespace intgemm {
diff --git a/test/kernels/sigmoid_test.cc b/test/kernels/sigmoid_test.cc
index 7f7392d..af9dad1 100644
--- a/test/kernels/sigmoid_test.cc
+++ b/test/kernels/sigmoid_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
diff --git a/test/kernels/tanh_test.cc b/test/kernels/tanh_test.cc
index 4ba099c..e2c36f5 100644
--- a/test/kernels/tanh_test.cc
+++ b/test/kernels/tanh_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
diff --git a/test/kernels/unquantize_test.cc b/test/kernels/unquantize_test.cc
index 20c3d6a..ee4bc80 100644
--- a/test/kernels/unquantize_test.cc
+++ b/test/kernels/unquantize_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <numeric>
diff --git a/test/kernels/upcast_test.cc b/test/kernels/upcast_test.cc
index cc782b5..92be1bd 100644
--- a/test/kernels/upcast_test.cc
+++ b/test/kernels/upcast_test.cc
@@ -1,9 +1,10 @@
+// This test triggers an internal compiler error in gcc 5.
+#if defined(__OPTIMIZE__) || defined(__clang__) || defined(__INTEL_COMPILER) || !defined(__GNUC__) || (__GNUC__ != 5)
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstdint>
-#include <stdint.h>
#include <numeric>
namespace intgemm {
@@ -72,6 +73,7 @@ template INTGEMM_AVX512BW void kernel_upcast16to32_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast16to32 AVX512BW") { return kernel_upcast16to32_test<CPUType::AVX512BW>(); }
#endif
+
template <CPUType CPUType_>
void kernel_upcast8to32_test() {
if (kCPU < CPUType_)
@@ -107,3 +109,4 @@ KERNEL_TEST_CASE("upcast8to32 AVX512BW") { return kernel_upcast8to32_test<CPUTyp
#endif
}
+#endif
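For clarity, the guard added above skips this translation unit only for unoptimized GCC 5 builds: __OPTIMIZE__ is defined by GCC and Clang whenever optimization is enabled, so optimized GCC 5 builds and all other compilers still compile the test. A minimal restatement of the pattern, with a placeholder body:

// Workaround pattern used above: compile the file everywhere except GCC 5
// without optimization, where these kernels trigger an internal compiler
// error. The body below is a placeholder, not intgemm code.
#if defined(__OPTIMIZE__) || defined(__clang__) || defined(__INTEL_COMPILER) || \
    !defined(__GNUC__) || (__GNUC__ != 5)
// ... code affected by the GCC 5 internal compiler error goes here ...
#endif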
diff --git a/test/kernels/write_test.cc b/test/kernels/write_test.cc
index a0189fe..c263fca 100644
--- a/test/kernels/write_test.cc
+++ b/test/kernels/write_test.cc
@@ -1,6 +1,6 @@
#include "../test.h"
-#include "../../aligned.h"
-#include "../../kernels.h"
+#include "../../intgemm/aligned.h"
+#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index a1138a0..6c16edd 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -1,10 +1,10 @@
#include "test.h"
-#include "../aligned.h"
-#include "../callbacks.h"
-#include "../interleave.h"
-#include "../intgemm.h"
-#include "../multiply.h"
-#include "../stats.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/callbacks.h"
+#include "../intgemm/interleave.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/multiply.h"
+#include "../intgemm/stats.h"
#include <algorithm>
#include <cassert>
@@ -66,7 +66,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
it = dist(gen);
}
- typedef typename Routine::Integer Integer;
+ using Integer = typename Routine::Integer;
// Call Prepare
AlignedVector<Integer> test(input.size());
Routine::PrepareB(input.begin(), test.begin(), 1, rows, cols);
@@ -85,30 +85,30 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
TEST_CASE("Prepare AVX512", "[prepare]") {
if (kCPU < CPUType::AVX512BW) return;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- TestPrepare<AVX512_8bit>(64, 8);
- TestPrepare<AVX512_8bit>(256, 32);
- TestPrepare<AVX512_16bit>(64, 8);
- TestPrepare<AVX512_16bit>(256, 32);
+ TestPrepare<avx512bw::Kernels8>(64, 8);
+ TestPrepare<avx512bw::Kernels8>(256, 32);
+ TestPrepare<avx512bw::Kernels16>(64, 8);
+ TestPrepare<avx512bw::Kernels16>(256, 32);
#endif
}
TEST_CASE("Prepare AVX2", "[prepare]") {
if (kCPU < CPUType::AVX2) return;
- TestPrepare<AVX2_8bit>(64, 32);
- TestPrepare<AVX2_16bit>(64, 32);
+ TestPrepare<avx2::Kernels8>(64, 32);
+ TestPrepare<avx2::Kernels16>(64, 32);
}
TEST_CASE("Prepare SSSE3", "[prepare]") {
if (kCPU < CPUType::SSSE3) return;
- TestPrepare<SSSE3_8bit>(16, 8);
- TestPrepare<SSSE3_8bit>(32, 16);
- TestPrepare<SSSE3_8bit>(32, 32);
+ TestPrepare<ssse3::Kernels8>(16, 8);
+ TestPrepare<ssse3::Kernels8>(32, 16);
+ TestPrepare<ssse3::Kernels8>(32, 32);
}
TEST_CASE("Prepare SSE2", "[prepare]") {
if (kCPU < CPUType::SSE2) return;
- TestPrepare<SSE2_16bit>(8, 8);
- TestPrepare<SSE2_16bit>(32, 32);
+ TestPrepare<sse2::Kernels16>(8, 8);
+ TestPrepare<sse2::Kernels16>(32, 32);
}
template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 16) {
@@ -119,7 +119,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
for (auto& it : input) {
it = dist(gen);
}
- typedef typename Routine::Integer Integer;
+ using Integer = typename Routine::Integer;
AlignedVector<Integer> prepared(input.size());
Routine::PrepareB(input.begin(), prepared.begin(), 1, rows, cols);
@@ -150,27 +150,27 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
TEST_CASE("SelectColumnsB AVX512", "[select]") {
if (kCPU < CPUType::AVX512BW) return;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
- TestSelectColumnsB<AVX512_8bit>();
- TestSelectColumnsB<AVX512_16bit>(256, 256);
+ TestSelectColumnsB<avx512bw::Kernels8>();
+ TestSelectColumnsB<avx512bw::Kernels16>(256, 256);
#endif
}
TEST_CASE("SelectColumnsB AVX2", "[select]") {
if (kCPU < CPUType::AVX2) return;
- TestSelectColumnsB<AVX2_8bit>(256, 256);
- TestSelectColumnsB<AVX2_16bit>(256, 256);
+ TestSelectColumnsB<avx2::Kernels8>(256, 256);
+ TestSelectColumnsB<avx2::Kernels16>(256, 256);
}
TEST_CASE("SelectColumnsB SSSE3", "[select]") {
if (kCPU < CPUType::SSSE3) return;
- TestSelectColumnsB<SSSE3_8bit>();
- TestSelectColumnsB<SSSE3_8bit>(256, 256);
+ TestSelectColumnsB<ssse3::Kernels8>();
+ TestSelectColumnsB<ssse3::Kernels8>(256, 256);
}
TEST_CASE("SelectColumnsB SSE2", "[select]") {
if (kCPU < CPUType::SSE2) return;
- TestSelectColumnsB<SSE2_16bit>();
- TestSelectColumnsB<SSE2_16bit>(256, 256);
+ TestSelectColumnsB<sse2::Kernels16>();
+ TestSelectColumnsB<sse2::Kernels16>(256, 256);
}
template <class Register> void TestMax() {
@@ -187,8 +187,8 @@ TEST_CASE("Max", "[max]") {
}
void CompareMaxAbs(const float *begin, const float *end, float test, std::size_t offset) {
- float largest = fabs(*std::max_element(begin, end));
- float smallest = fabs(*std::min_element(begin, end));
+ float largest = std::fabs(*std::max_element(begin, end));
+ float smallest = std::fabs(*std::min_element(begin, end));
largest = std::max(largest, smallest);
CHECK_MESSAGE(largest == test, "Error: " << largest << " versus " << test << " in length " << (end - begin) << " offset " << offset);
}
@@ -255,7 +255,7 @@ TEST_CASE("MaxAbsolute AVX512BW", "[max]") {
template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
- typedef typename Routine::Integer Integer;
+ using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
@@ -307,7 +307,7 @@ template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_co
//Require different number of arguments. I don't think the refactoring is worth it.
template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index B_cols,
float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) {
- typedef typename Routine::Integer Integer;
+ using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
@@ -358,145 +358,145 @@ template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index
TEST_CASE ("Multiply SSE2 16bit", "[multiply]") {
if (kCPU < CPUType::SSE2) return;
- TestMultiply<SSE2_16bit>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiply<SSE2_16bit>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiply<SSE2_16bit>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiply<SSE2_16bit>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiply<SSE2_16bit>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiply<SSE2_16bit>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiply<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSE2) return;
- TestMultiplyBias<SSE2_16bit>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<SSE2_16bit>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiplyBias<SSE2_16bit>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<SSE2_16bit>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<SSE2_16bit>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<SSE2_16bit>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiplyBias<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiply<SSSE3_8bit>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
- TestMultiply<SSSE3_8bit>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
- TestMultiply<SSSE3_8bit>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
- TestMultiply<SSSE3_8bit>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
- TestMultiply<SSSE3_8bit>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
- TestMultiply<SSSE3_8bit>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
+ TestMultiply<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
+ TestMultiply<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
+ TestMultiply<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
+ TestMultiply<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
+ TestMultiply<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
+ TestMultiply<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSSE3) return;
- TestMultiplyBias<SSSE3_8bit>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
- TestMultiplyBias<SSSE3_8bit>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
- TestMultiplyBias<SSSE3_8bit>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
- TestMultiplyBias<SSSE3_8bit>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
- TestMultiplyBias<SSSE3_8bit>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
- TestMultiplyBias<SSSE3_8bit>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
+ TestMultiplyBias<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
+ TestMultiplyBias<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
+ TestMultiplyBias<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
+ TestMultiplyBias<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
+ TestMultiplyBias<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
+ TestMultiplyBias<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply AVX2 8bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiply<AVX2_8bit>(8, 256, 256, .1f, 1, 0.1f);
- TestMultiply<AVX2_8bit>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
- TestMultiply<AVX2_8bit>(320, 256, 256, .1f, 1, 0.1f);
- TestMultiply<AVX2_8bit>(472, 256, 256, .1f, 1, 0.1f);
- TestMultiply<AVX2_8bit>(248, 256, 256, .1f, 1, 0.1f);
- TestMultiply<AVX2_8bit>(200, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
+ TestMultiply<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
+ TestMultiply<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyBias<AVX2_8bit>(8, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<AVX2_8bit>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
- TestMultiplyBias<AVX2_8bit>(320, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<AVX2_8bit>(472, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<AVX2_8bit>(248, 256, 256, .1f, 1, 0.1f);
- TestMultiplyBias<AVX2_8bit>(200, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
+ TestMultiplyBias<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
+ TestMultiplyBias<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiply<AVX2_16bit>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX2_16bit>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiply<AVX2_16bit>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX2_16bit>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX2_16bit>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX2_16bit>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiply<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
- TestMultiplyBias<AVX2_16bit>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX2_16bit>(8, 2048, 256, .1f, 1, 0.02f);
- TestMultiplyBias<AVX2_16bit>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX2_16bit>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX2_16bit>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX2_16bit>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
+ TestMultiplyBias<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiply<AVX512_8bit>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiply<AVX512_8bit>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
- TestMultiply<AVX512_8bit>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiply<AVX512_8bit>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<AVX512_8bit>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<AVX512_8bit>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiply<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiply<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
+ TestMultiply<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiply<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyBias<AVX512_8bit>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiplyBias<AVX512_8bit>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
- TestMultiplyBias<AVX512_8bit>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiplyBias<AVX512_8bit>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<AVX512_8bit>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<AVX512_8bit>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiplyBias<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiplyBias<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
+ TestMultiplyBias<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiplyBias<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiply<AVX512VNNI_8bit>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiply<AVX512VNNI_8bit>(8, 2048, 256, 0, 0.55f, 0.25f);
- TestMultiply<AVX512VNNI_8bit>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiply<AVX512VNNI_8bit>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<AVX512VNNI_8bit>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiply<AVX512VNNI_8bit>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiply<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiply<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
+ TestMultiply<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiply<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiply<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
- TestMultiplyBias<AVX512VNNI_8bit>(8, 256, 256, 0, 0.25f, 0.062f);
- TestMultiplyBias<AVX512VNNI_8bit>(8, 2048, 256, 0, 0.55f, 0.25f);
- TestMultiplyBias<AVX512VNNI_8bit>(320, 256, 256, 0, 0.26f, 0.059f);
- TestMultiplyBias<AVX512VNNI_8bit>(472, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<AVX512VNNI_8bit>(248, 256, 256, 0, 0.29f, 0.059f);
- TestMultiplyBias<AVX512VNNI_8bit>(200, 256, 256, 0, 0.28f, 0.06f);
+ TestMultiplyBias<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
+ TestMultiplyBias<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
+ TestMultiplyBias<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
+ TestMultiplyBias<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
+ TestMultiplyBias<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#endif
TEST_CASE ("Multiply AVX512 16bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiply<AVX512_16bit>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX512_16bit>(8, 2048, 256, .1f, 1, 0.011f);
- TestMultiply<AVX512_16bit>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX512_16bit>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX512_16bit>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiply<AVX512_16bit>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
+ TestMultiply<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiply<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMultiplyBias<AVX512_16bit>(8, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX512_16bit>(8, 2048, 256, .1f, 1, 0.011f);
- TestMultiplyBias<AVX512_16bit>(320, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX512_16bit>(472, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX512_16bit>(248, 256, 256, .1f, 1, 0.01f);
- TestMultiplyBias<AVX512_16bit>(200, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
+ TestMultiplyBias<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
+ TestMultiplyBias<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc
index 938cc28..1437e0a 100644
--- a/test/prepare_b_quantized_transposed.cc
+++ b/test/prepare_b_quantized_transposed.cc
@@ -1,13 +1,13 @@
#include "test.h"
-#include "../aligned.h"
-#include "../avx2_gemm.h"
-#include "../avx512_gemm.h"
-#include "../sse2_gemm.h"
-#include "../ssse3_gemm.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include <cmath>
#include <cstring>
#include <iostream>
-#include <math.h>
namespace intgemm {
namespace {
@@ -62,22 +62,22 @@ TEST_CASE("PrepareBQuantizedTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
- CHECK(TestMany<SSE2_16bit>(32, 128));
+ CHECK(TestMany<sse2::Kernels16>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
- CHECK(TestMany<SSSE3_8bit>(32, 128));
+ CHECK(TestMany<ssse3::Kernels8>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
- CHECK(TestMany<AVX2_8bit>(32, 128));
- CHECK(TestMany<AVX2_16bit>(32, 128));
+ CHECK(TestMany<avx2::Kernels8>(32, 128));
+ CHECK(TestMany<avx2::Kernels16>(32, 128));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
@@ -85,8 +85,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX512BW)
return;
- CHECK(TestMany<AVX512_8bit>(64, 128));
- CHECK(TestMany<AVX512_16bit>(64, 128));
+ CHECK(TestMany<avx512bw::Kernels8>(64, 128));
+ CHECK(TestMany<avx512bw::Kernels16>(64, 128));
}
#endif
diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc
index 5969724..bc35138 100644
--- a/test/prepare_b_transposed.cc
+++ b/test/prepare_b_transposed.cc
@@ -1,13 +1,13 @@
#include "test.h"
-#include "../aligned.h"
-#include "../avx2_gemm.h"
-#include "../avx512_gemm.h"
-#include "../sse2_gemm.h"
-#include "../ssse3_gemm.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include <cmath>
#include <cstring>
#include <iostream>
-#include <math.h>
namespace intgemm {
namespace {
@@ -63,22 +63,22 @@ TEST_CASE("PrepareBTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
- CHECK(TestMany<SSE2_16bit>(4, 128, 2.0f));
+ CHECK(TestMany<sse2::Kernels16>(4, 128, 2.0f));
}
TEST_CASE("PrepareBTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
- CHECK(TestMany<SSSE3_8bit>(4, 128, 2.0f));
+ CHECK(TestMany<ssse3::Kernels8>(4, 128, 2.0f));
}
TEST_CASE("PrepareBTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
- CHECK(TestMany<AVX2_8bit>(8, 128, 2.0f));
- CHECK(TestMany<AVX2_16bit>(8, 128, 2.0f));
+ CHECK(TestMany<avx2::Kernels8>(8, 128, 2.0f));
+ CHECK(TestMany<avx2::Kernels16>(8, 128, 2.0f));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
@@ -86,8 +86,8 @@ TEST_CASE("PrepareBTransposed AVX2", "") {
if (kCPU < CPUType::AVX512BW)
return;
- CHECK(TestMany<AVX512_8bit>(16, 128, 2.0f));
- CHECK(TestMany<AVX512_16bit>(16, 128, 2.0f));
+ CHECK(TestMany<avx512bw::Kernels8>(16, 128, 2.0f));
+ CHECK(TestMany<avx512bw::Kernels16>(16, 128, 2.0f));
}
#endif
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index d2f6304..550ec66 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -1,14 +1,14 @@
#include "test.h"
-#include "../aligned.h"
-#include "../avx2_gemm.h"
-#include "../avx512_gemm.h"
-#include "../sse2_gemm.h"
-#include "../ssse3_gemm.h"
-#include "../stats.h"
-
+#include "../intgemm/aligned.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/stats.h"
+
+#include <cmath>
#include <cstring>
#include <iostream>
-#include <math.h>
namespace intgemm {
namespace {
@@ -61,8 +61,8 @@ void testVectorMeanStd(int num_items, bool absolute=false) {
MeanStd reference = VectorMeanStd(inputVec, num_items, absolute);
MeanStd fast = Backend(inputVec.begin(), inputVec.end(), absolute);
- float meanDifference = fabsf(reference.mean - fast.mean);
- float stdDifference = fabsf(reference.stddev - fast.stddev);
+ float meanDifference = std::fabs(reference.mean - fast.mean);
+ float stdDifference = std::fabs(reference.stddev - fast.stddev);
float eps = 0.00002f; //Accumulating horizontal sums can lead to errors.
CHECK_MESSAGE(meanDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.mean << " actual: " << fast.mean);
@@ -73,15 +73,15 @@ void testVectorMeanStd(int num_items, bool absolute=false) {
template <class I> bool IsOff(float from, I ref, I test) {
if (ref == test) return false;
if (ref - test > 1 && test - ref > 1) return true;
- float off_test = fabs((float)test - from);
- float off_ref = fabs((float)ref - from);
+ float off_test = std::fabs(static_cast<float>(test) - from);
+ float off_ref = std::fabs(static_cast<float>(ref) - from);
// Allow 0.5 to round either way.
if (off_test > 0.49 && off_test < 0.51 && off_ref > 0.49 && off_ref < 0.51) return false;
return true;
}
template <class Backend> bool Test(const float *input_unaligned, float quant_mult, std::size_t size) {
- typedef typename Backend::Integer Integer;
+ using Integer = typename Backend::Integer;
bool success = true;
AlignedVector<float> input(size);
std::memcpy(input.begin(), input_unaligned, sizeof(float) * size);
@@ -120,24 +120,24 @@ template <class Backend> void TestMany(std::size_t grow) {
TEST_CASE ("Quantize SSE2", "[quantize]") {
if (kCPU < CPUType::SSE2) return;
- TestMany<SSE2_16bit>(8);
+ TestMany<sse2::Kernels16>(8);
}
TEST_CASE ("Quantize SSSE3", "[quantize]") {
if (kCPU < CPUType::SSSE3) return;
- TestMany<SSSE3_8bit>(1);
+ TestMany<ssse3::Kernels8>(1);
}
TEST_CASE ("Quantize AVX2", "[quantize]") {
if (kCPU < CPUType::AVX2) return;
- TestMany<AVX2_8bit>(1);
- TestMany<AVX2_16bit>(16);
+ TestMany<avx2::Kernels8>(1);
+ TestMany<avx2::Kernels16>(16);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Quantize AVX512", "[quantize]") {
if (kCPU < CPUType::AVX512BW) return;
- TestMany<AVX512_8bit>(1);
- TestMany<AVX512_16bit>(16);
+ TestMany<avx512bw::Kernels8>(1);
+ TestMany<avx512bw::Kernels16>(16);
}
#endif
diff --git a/test/test.cc b/test/test.cc
index 3559738..45c27ad 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -1,6 +1,8 @@
#define CATCH_CONFIG_RUNNER
#include "test.h"
+#include <cmath>
+
int main(int argc, char ** argv) {
return Catch::Session().run(argc, argv);
}
@@ -13,13 +15,13 @@ void CompareMSE(const float *float_ref, const float *int_ref, const float *int_t
for (std::size_t i = 0; i < size; ++i) {
float int_diff = int_ref[i] - int_test[i];
float float_diff = float_ref[i] - int_test[i];
- CHECK_MESSAGE(fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]);
- CHECK_MESSAGE(fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]);
+ CHECK_MESSAGE(std::fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]);
+ CHECK_MESSAGE(std::fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]);
int_sum += int_diff * int_diff;
float_sum += float_diff * float_diff;
}
- CHECK_MESSAGE(fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size));
- CHECK_MESSAGE(fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size));
+ CHECK_MESSAGE(std::fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size));
+ CHECK_MESSAGE(std::fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size));
}
} // namespace intgemm
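CompareMSE above enforces per-element tolerances and then bounds the quantity it prints as "MSE", which is computed as the root-mean-square error of the integer result against each reference. A minimal standalone restatement of that aggregate check follows; the function name is hypothetical.

// Sketch of the aggregate check in CompareMSE: the root-mean-square error
// between a test output and a reference must stay under a tolerance.
// Accumulation in float mirrors the test code.
#include <cmath>
#include <cstddef>

inline float RootMeanSquareError(const float *reference, const float *test, std::size_t size) {
  float sum = 0.0f;
  for (std::size_t i = 0; i < size; ++i) {
    float diff = reference[i] - test[i];
    sum += diff * diff;
  }
  return std::sqrt(sum / size);
}
// Used as, for example: CHECK(RootMeanSquareError(float_ref, int_test, size) <= MSE_float_tolerance);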
diff --git a/test/test.h b/test/test.h
index f145681..1f884c5 100644
--- a/test/test.h
+++ b/test/test.h
@@ -1,12 +1,12 @@
#pragma once
-#include "intgemm_config.h"
+#include "intgemm/intgemm_config.h"
-#include "../3rd_party/catch.hpp"
-#include "../intgemm.h"
-#include "../aligned.h"
+#include "3rd_party/catch.hpp"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
-#include <math.h>
+#include <cmath>
#include <sstream>
#include <iostream>
#include <iomanip>
@@ -18,7 +18,7 @@
#define CHECK_EPS(actual, expected, epsilon) \
do { \
- if (fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \
+ if (std::fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \
else { CHECK((actual) == (expected)); } \
} while(0)
@@ -39,8 +39,8 @@ void CompareEps(const Type* reference, const Type* actual, Index size, Type epsi
for (Index i = 0; i < size; ++i) {
INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]);
// Ratio to maximum value.
- float threshold = epsilon * std::max<float>(0.01f, fabsf(reference[i]));
- CHECK(fabsf(reference[i] - actual[i]) < threshold);
+ float threshold = epsilon * std::max<float>(0.01f, std::fabs(reference[i]));
+ CHECK(std::fabs(reference[i] - actual[i]) < threshold);
}
}
diff --git a/test/utils_test.cc b/test/utils_test.cc
index 00b5277..e7d07e8 100644
--- a/test/utils_test.cc
+++ b/test/utils_test.cc
@@ -1,5 +1,5 @@
#include "test.h"
-#include "../utils.h"
+#include "../intgemm/utils.h"
namespace intgemm {
namespace {
diff --git a/test_mull.cpp b/test_mull.cpp
deleted file mode 100644
index 42924a6..0000000
--- a/test_mull.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-#include "intgemm.cc"
-#include "aligned.h"
-#include <iostream>
-#include <random>
-#include <string>
-#include <algorithm>
-#include <fstream>
-#include <sstream>
-
-
-/*Adapted from https://www.bfilipek.com/2018/07/string-view-perf-followup.html . We should probably go string_view way
-inline void tokenizeLine(std::string& str, std::vector<std::string>& output,
- std::string delimeter = " ") {
- auto first = std::begin(str);
-
- while (first != str.end()) {
- const auto second = std::find_first_of(first, std::end(str), std::begin(delimeter), std::end(delimeter));
-
- if (first != second) {
- output.emplace_back(str.substr(std::distance(std::begin(str), first), std::distance(first, second)));
- }
-
- if (second == str.end())
- break;
-
- first = std::next(second);
- }
-}
-
-//This is a different parsing method, without stringStream
-template<class StringType>
-void ReadInFile(StringType infile) {
- std::ifstream in(infile);
- std::string line;
-
- //First line, Info about the matrix
- std::getline(in, line);
- std::istringstream iss(line);
- std::string temp1, temp2, temp3, temp4;
- int RowsA, ColsA, RowsB, ColsB;
- if (!(iss >> temp1 >> RowsA >> temp2 >> ColsA >> temp3 >> RowsB >> temp4 >> ColsB)) {
- std::cerr << "Error parsing line 1 " << std::endl;
- exit(1);
- }
-
- //Second line, get QuantMult
- std::getline(in, line);
- std::istringstream iss2(line);
- float quantMultA, quantMultB;
- if (!(iss2 >> temp1 >> quantMultA >> temp2 >> quantMultA)) {
- std::cerr << "Error parsing line 2 " << std::endl;
- exit(1);
- }
- std::getline(in, line); //Just some text
- //Fourth line, AQuant
- std::vector<int> AQuant;
- std::getline(in, line);
- std::vector<std::string> tmp_container;
- tokenizeLine(line, tmp_container);
- if (tmp_container.size() != RowsA*ColsA) {
- std::cerr << "Error parsing matrix A. Size mismatch. Expected " << RowsA*ColsA << " got " << tmp_container.size() << std::endl;
- }
- for (auto&& num : tmp_container) {
- AQuant.push_back(std::stoi(num));
- }
- tmp_container.resize(0);
-
- std::getline(in, line); //Just some text
- //Sixth line, B_raw
- std::vector<float> B_raw;
- std::getline(in, line);
- tokenizeLine(line, tmp_container);
- if (tmp_container.size() != RowsB*ColsB) {
- std::cerr << "Error parsing matrix B. Size mismatch. Expected " << RowsB*ColsB << " got " << tmp_container.size() << std::endl;
- }
- for (auto&& num : tmp_container) {
- B_raw.push_back(std::stof(num));
- }
- tmp_container.resize(0);
-
- std::getline(in, line); //Just some text
- //Eight line, Bias
- std::vector<float> Bias;
- std::getline(in, line);
- tokenizeLine(line, tmp_container);
- if (tmp_container.size() != ColsB) {
- std::cerr << "Error parsing bias. Size mismatch. Expected " << ColsB << " got " << tmp_container.size() << std::endl;
- }
- for (auto&& num : tmp_container) {
- Bias.push_back(std::stof(num));
- }
- tmp_container.resize(0);
-
-}
-
-*/
-template<class StringType>
-void ReadInFile(StringType infile) {
- std::ifstream in(infile);
- std::string line;
-
- //First line, Info about the matrix
- std::getline(in, line);
- std::istringstream iss(line);
- std::string temp1, temp2, temp3, temp4;
- int RowsA, ColsA, RowsB, ColsB;
- if (!(iss >> temp1 >> RowsA >> temp2 >> ColsA >> temp3 >> RowsB >> temp4 >> ColsB)) {
- std::cerr << "Error parsing line 1 " << std::endl;
- exit(1);
- }
-
- //Second line, get QuantMult
- std::getline(in, line);
- std::istringstream iss2(line);
- float quantMultA, quantMultB;
- if (!(iss2 >> temp1 >> quantMultA >> temp2 >> quantMultA)) {
- std::cerr << "Error parsing line 2 " << std::endl;
- exit(1);
- }
- std::getline(in, line); //Just some text for human readability
-
- //4th line, AQuant
- std::vector<int> AQuant;
- std::getline(in, line);
- std::istringstream iss3(line);
- for (int i = 0; i < RowsA*ColsA; i++) {
- int num;
- if (!(iss3 >> num)) {
- std::cerr << "Error parsing matrix A at " << i << std::endl;;
- }
- AQuant.push_back(num);
- }
-
- std::getline(in, line); //Just some text for human readability
- //6th line, B_raw
- std::vector<float> B_raw;
- std::getline(in, line);
- std::istringstream iss4(line);
- for (int i = 0; i < RowsB*ColsB; i++) {
- float num;
- if (!(iss4 >> num)) {
- std::cerr << "Error parsing matrix B " << std::endl;
- }
- B_raw.push_back(num);
- }
-
- std::getline(in, line); //Just some text for human readability
- //8th line, Bias
- std::vector<float> Bias;
- std::getline(in, line);
- std::istringstream iss5(line);
- for (int i = 0; i < ColsB; i++) {
- float num;
- if (!(iss5 >> num)) {
- std::cerr << "Error parsing matrix bias " << std::endl;
- }
- Bias.push_back(num);
- }
-}
-
-using namespace intgemm;
-template<class T>
-void printMatrix(T* data, Index rows, Index cols) {
- std::cout << "[";
- for (int i = 0; i<rows; i++) {
- std::cout << "[";
- for (int j =0; j<cols; j++) {
- std::cout << (float)data[i*cols + j];
- if (j != cols - 1) {
- std::cout << ", ";
- }
- }
- std::cout << "]";
- if (i != rows -1) {
- std::cout << ',' << std::endl;
- }
- }
- std::cout << "]" << std::endl;
-}
-
-void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias) {
- for (Index r = 0; r < A_rows; ++r) {
- for (Index c = 0; c < B_cols; ++c) {
- float sum = 0.0f;
- for (Index k = 0; k < width; ++k) {
- sum += A[r * width + k] * B[k * B_cols + c];
- }
- if (bias) {
- C[r * B_cols + c] = sum + bias[c];
- } else {
- C[r * B_cols + c] = sum;
- }
- }
- }
-}
-
-// Compute A*B slowly from integers.
-template <class Integer>
-void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias) {
- for (Index r = 0; r < A_rows; ++r) {
- for (Index c = 0; c < B_cols; ++c) {
- int32_t sum = 0;
- for (Index k = 0; k < width; ++k) {
- sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]);
- }
- if (bias) {
- C[r * B_cols + c] = sum * unquant_mult + bias[c];
- } else {
- C[r * B_cols + c] = sum * unquant_mult;
- }
- }
- }
-}
-
-int main() {
-
- const Index A_rows = 1;
- const Index width = 2048;
- const Index B_cols = 8;
-
- AlignedVector<float> A(A_rows * width);
- AlignedVector<float> B(width * B_cols);
- AlignedVector<float> bias(B_cols);
-
- float alpha = 2.0f;
- float quant_mult = 127/alpha;
- float unquant_mult = 1.0 / (quant_mult * quant_mult);
-
- std::mt19937 gen;
- std::uniform_real_distribution<float> dist(-2.0f, 2.0f);
-
- for (auto& it : A) {
- it = dist(gen);
- }
- for (auto& it : B) {
- it = dist(gen);
- }
- for (auto& it : bias) {
- it = dist(gen);
- }
-
- AlignedVector<float> bias_orig(B_cols);
- for (int i = 0; i < bias.size(); i++) {
- bias_orig[i] = bias[i];
- }
-
- AlignedVector<int8_t> A_prep(A.size());
- AlignedVector<int8_t> B_prep(B.size());
-
- AVX2_8bit::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
- AVX2_8bit::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
- /*
- std::cout << "A:" << std::endl;
- printMatrix(A.begin(), A_rows, width);
- std::cout << "B:" << std::endl;
- printMatrix(B.begin(), width, B_cols);
- std::cout << "bias:" << std::endl;
- printMatrix(bias.begin(), 1, B_cols);*/
-
-
- AlignedVector<float> test_C(A_rows * B_cols);
-
- AVX2_8bit::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
- //AVX2_8bit::Multiply(A_prep.begin(), B_prep.begin(), JustUnquantizeC(test_C.begin(), unquant_mult), A_rows, width, B_cols);
- std::cout << "Old multiply:" << std::endl;
- printMatrix(test_C.begin(), A_rows, B_cols);
-
- //NEEEXT
- AlignedVector<uint8_t> A_prep2(A.size());
- AVX2_8bit::PrepareA(A.begin(), A_prep2.begin(), quant_mult, A_rows, width);
-
- AVX2_8bit::PrepareBiasFor8(B.begin(), bias.begin(), alpha, width, B_cols);
-
- //printMatrix(bias.begin(), 1, B_cols); //Print bias
-
- AVX2_8bit::Multiply8Shift(reinterpret_cast<uint8_t*>(A_prep2.begin()), B_prep.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
- //AVX2_8bit::Multiply8Shift(reinterpret_cast<uint8_t*>(A_prep.begin()), B_prep.begin(), JustUnquantizeC(test_C.begin(), unquant_mult), A_rows, width, B_cols);
-
- AlignedVector<int16_t> A_prep3(A.size());
- AlignedVector<int16_t> B_prep3(B.size());
- std::cout << "New multiply:" << std::endl;
- printMatrix(test_C.begin(), A_rows, B_cols);
- for (int i = 0; i < A_prep2.size(); i++) {
- A_prep3[i] = A_prep2[i];
- }
- AVX2_16bit::PrepareB(B.begin(), B_prep3.begin(), quant_mult, width, B_cols);
- AVX2_16bit::Multiply(A_prep3.begin(), B_prep3.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
-
- std::cout << "New multiply, 16 bit:" << std::endl;
- printMatrix(test_C.begin(), A_rows, B_cols);
-
- //FULL INTS
- AlignedVector<float> C_slowint(A_rows * B_cols);
- AlignedVector<int8_t> B_quant(width * B_cols);
- AVX2_8bit::Quantize(B.begin(), B_quant.begin(), quant_mult, B.size());
-
- SlowRefInt(A_prep.begin(), B_quant.begin(), C_slowint.begin(),
- unquant_mult, A_rows, width, B_cols, bias_orig.begin());
-
-
- std::cout << "Reference int8:" << std::endl;
- printMatrix(C_slowint.begin(), A_rows, B_cols);
-
- //FULL INT16
- AlignedVector<int16_t> A_prep4(A.size());
- for (int i = 0; i < A_prep2.size(); i++) {
- A_prep4[i] = A_prep[i];
- }
-
- AlignedVector<float> C_slowint2(A_rows * B_cols);
- AlignedVector<int16_t> B_quant2(width * B_cols);
- AVX2_16bit::Quantize(B.begin(), B_quant2.begin(), quant_mult, B.size());
-
- SlowRefInt(A_prep4.begin(), B_quant2.begin(), C_slowint2.begin(),
- unquant_mult, A_rows, width, B_cols, bias_orig.begin());
-
-
- std::cout << "Reference int16:" << std::endl;
- printMatrix(C_slowint2.begin(), A_rows, B_cols);
-
- //FLOATS
- AlignedVector<float> C(A_rows * B_cols);
-
- SlowRefFloat(A.begin(), B.begin(), C.begin(), A_rows, width, B_cols, bias_orig.begin());
- std::cout << "Reference float:" << std::endl;
- printMatrix(C.begin(), A_rows, B_cols);
-
-}
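The deleted test_mull.cpp above also served as an informal usage example written against the old class names. For reference, here is a minimal sketch of its plain 8-bit path under the renamed API: the static PrepareA/PrepareB/Multiply signatures are taken from the deleted file, the kernel name follows the rename applied in this patch, and the callbacks:: qualification of UnquantizeAndAddBiasAndWrite is an assumption. Running it requires an AVX2-capable CPU.

// Sketch only: the 8-bit multiply from the deleted test_mull.cpp, rewritten
// against avx2::Kernels8 as renamed in this patch. Signatures are copied from
// the deleted file; the callbacks:: namespace for the output callback is an
// assumption. Requires an AVX2-capable CPU at run time.
#include "intgemm/intgemm.h"
#include "intgemm/aligned.h"
#include <random>

int main() {
  using namespace intgemm;
  const Index A_rows = 1, width = 2048, B_cols = 8;

  AlignedVector<float> A(A_rows * width), B(width * B_cols), bias(B_cols), C(A_rows * B_cols);
  std::mt19937 gen;
  std::uniform_real_distribution<float> dist(-2.0f, 2.0f);
  for (auto &it : A) it = dist(gen);
  for (auto &it : B) it = dist(gen);
  for (auto &it : bias) it = dist(gen);

  // Quantization multipliers as in the deleted example: map [-2, 2] onto int8.
  const float alpha = 2.0f;
  const float quant_mult = 127.0f / alpha;
  const float unquant_mult = 1.0f / (quant_mult * quant_mult);

  AlignedVector<int8_t> A_prep(A.size()), B_prep(B.size());
  avx2::Kernels8::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
  avx2::Kernels8::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
  avx2::Kernels8::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols,
      callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), C.begin()));
}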