author	Nikolay Bogoychev <nheart@gmail.com>	2020-11-15 19:50:02 +0300
committer	GitHub <noreply@github.com>	2020-11-15 19:50:02 +0300
commit	8abde25b13c3ab210c0dec8e23f4944e3953812d
tree	90b591ee994252ddd44d593276b4ef895bbcb5aa
parent	874ceebbf53a85691b326495100b6361a2166cec
parent	8f28282c3bd854922da638024d2659be52e892e9
Merge pull request #2 from kpu/master
Merge with latest intgemm master
-rw-r--r--  .github/workflows/mac.yml (renamed from .github/workflows/release.yml)  |    7
-rw-r--r--  .github/workflows/ubuntu-gcc5-debug.yml                                 |   27
-rw-r--r--  .github/workflows/ubuntu.yml                                            |   25
-rw-r--r--  .github/workflows/windows.yml                                           |   25
-rw-r--r--  CMakeLists.txt                                                          |   11
-rw-r--r--  LICENSE                                                                 |   30
-rw-r--r--  README.md                                                               |    9
-rw-r--r--  benchmarks/benchmark.cc                                                 |   48
-rw-r--r--  benchmarks/benchmark_quantizer.cc                                       |   18
-rw-r--r--  benchmarks/biasmultiply.cc                                              |  172
-rw-r--r--  example.cc                                                              |   13
-rw-r--r--  intgemm.cc                                                              |   71
-rw-r--r--  intgemm/aligned.h (renamed from aligned.h)                              |    1
-rw-r--r--  intgemm/avx2_gemm.h (renamed from avx2_gemm.h)                          |  101
-rw-r--r--  intgemm/avx512_gemm.h (renamed from avx512_gemm.h)                      |  106
-rw-r--r--  intgemm/avx512vnni_gemm.h (renamed from avx512vnni_gemm.h)              |   15
-rw-r--r--  intgemm/callbacks.h (renamed from callbacks.h)                          |    2
-rw-r--r--  intgemm/callbacks/configs.h (renamed from callbacks/configs.h)          |    0
-rw-r--r--  intgemm/callbacks/implementations.inl (renamed from callbacks/implementations.inl)  |   18
-rw-r--r--  intgemm/callbacks/output_buffer_info.h (renamed from callbacks/output_buffer_info.h)  |    0
-rw-r--r--  intgemm/interleave.h (renamed from interleave.h)                        |   49
-rw-r--r--  intgemm/intgemm.cc                                                      |   71
-rw-r--r--  intgemm/intgemm.h (renamed from intgemm.h)                              |   28
-rw-r--r--  intgemm/intgemm_config.h.in (renamed from intgemm_config.h.in)          |    0
-rw-r--r--  intgemm/intrinsics.h (renamed from intrinsics.h)                        |   51
-rw-r--r--  intgemm/kernels.h (renamed from kernels.h)                              |    2
-rw-r--r--  intgemm/kernels/implementations.inl (renamed from kernels/implementations.inl)  |   34
-rw-r--r--  intgemm/multiply.h (renamed from multiply.h)                            |   28
-rw-r--r--  intgemm/sse2_gemm.h (renamed from sse2_gemm.h)                          |   44
-rw-r--r--  intgemm/ssse3_gemm.h (renamed from ssse3_gemm.h)                        |   64
-rw-r--r--  intgemm/stats.h (renamed from stats.h)                                  |    0
-rw-r--r--  intgemm/stats.inl (renamed from stats.inl)                              |    2
-rw-r--r--  intgemm/types.h (renamed from types.h)                                  |    0
-rw-r--r--  intgemm/utils.h (renamed from utils.h)                                  |    0
-rw-r--r--  intgemm/vec_traits.h (renamed from vec_traits.h)                        |    0
-rw-r--r--  test/3rd_party/LICENSE_1_0.txt                                          |   24
-rw-r--r--  test/3rd_party/catch.hpp (renamed from 3rd_party/catch.hpp)             |    0
-rw-r--r--  test/add127_test.cc                                                     |  210
-rw-r--r--  test/kernels/add_bias_test.cc                                           |    4
-rw-r--r--  test/kernels/bitwise_not_test.cc                                        |    4
-rw-r--r--  test/kernels/downcast_test.cc                                           |    4
-rw-r--r--  test/kernels/exp_test.cc                                                |    4
-rw-r--r--  test/kernels/floor_test.cc                                              |    4
-rw-r--r--  test/kernels/multiply_sat_test.cc                                       |   54
-rw-r--r--  test/kernels/multiply_test.cc                                           |    5
-rw-r--r--  test/kernels/quantize_test.cc                                           |    4
-rw-r--r--  test/kernels/relu_test.cc                                               |    5
-rw-r--r--  test/kernels/rescale_test.cc                                            |    5
-rw-r--r--  test/kernels/sigmoid_test.cc                                            |    4
-rw-r--r--  test/kernels/tanh_test.cc                                               |    4
-rw-r--r--  test/kernels/unquantize_test.cc                                         |    4
-rw-r--r--  test/kernels/upcast_test.cc                                             |    9
-rw-r--r--  test/kernels/write_test.cc                                              |    4
-rw-r--r--  test/multiply_test.cc                                                   |  230
-rw-r--r--  test/prepare_b_quantized_transposed.cc                                  |   24
-rw-r--r--  test/prepare_b_transposed.cc                                            |   24
-rw-r--r--  test/quantize_test.cc                                                   |   38
-rw-r--r--  test/test.cc                                                            |   10
-rw-r--r--  test/test.h                                                             |   16
-rw-r--r--  test/utils_test.cc                                                      |    2
-rw-r--r--  test_mull.cpp                                                           |  328
61 files changed, 885 insertions, 1211 deletions
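Most of the diff below is a mechanical migration: headers moved into an intgemm/ subdirectory and the per-ISA structs (SSSE3_8bit, AVX2_16bit, AVX512_8bit, AVX512VNNI_8bit, ...) became Kernels8/Kernels16 inside namespaces (ssse3, avx2, avx512bw, avx512vnni). A minimal sketch of what that looks like from a caller's side; the function and buffers here are illustrative, not from this diff:

```C++
#include "intgemm/intgemm.h"  // was: #include "intgemm.h"

#include <cstdint>

void QuantizeExample(const float *in, std::int8_t *out, intgemm::Index size) {
  // The runtime-dispatched entry points keep their old names; size must
  // satisfy the backend's multiple-of-16 and alignment asserts.
  intgemm::Int8::Quantize(in, out, 127.0f / 2.0f, size);
  // Direct references to a specific ISA now use the namespaced names, e.g.
  // intgemm::avx2::Kernels8 instead of intgemm::AVX2_8bit (guard such calls
  // with INTGEMM_COMPILER_SUPPORTS_* and a kCPU check).
}
```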
diff --git a/.github/workflows/release.yml b/.github/workflows/mac.yml
index 4fb9b3f..767cf1a 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/mac.yml
@@ -1,4 +1,4 @@
-name: Release
+name: Mac
 
 on:
   push:
@@ -8,10 +8,7 @@ on:
 
 jobs:
   build:
-    strategy:
-      matrix:
-        runs-on: [ubuntu-latest, macOS-latest, windows-latest]
-    runs-on: ${{ matrix.runs-on }}
+    runs-on: macOS-latest
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/ubuntu-gcc5-debug.yml b/.github/workflows/ubuntu-gcc5-debug.yml
new file mode 100644
index 0000000..1323828
--- /dev/null
+++ b/.github/workflows/ubuntu-gcc5-debug.yml
@@ -0,0 +1,27 @@
+name: Ubuntu gcc5 debug
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: ubuntu-18.04
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: install
+      run: sudo apt-get install -y gcc-5 g++-5
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake -DCMAKE_C_COMPILER=gcc-5 -DCMAKE_CXX_COMPILER=g++-5 -DCMAKE_BUILD_TYPE=Debug ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
new file mode 100644
index 0000000..f0e457b
--- /dev/null
+++ b/.github/workflows/ubuntu.yml
@@ -0,0 +1,25 @@
+name: Ubuntu
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
new file mode 100644
index 0000000..d64fefd
--- /dev/null
+++ b/.github/workflows/windows.yml
@@ -0,0 +1,25 @@
+name: Windows
+
+on:
+  push:
+    branches: [master, static]
+  pull_request:
+    branches: [master, static]
+
+jobs:
+  build:
+    runs-on: windows-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: cmake
+      run: |
+        cmake -E make_directory build
+        cd build
+        cmake ..
+    - name: Compile
+      working-directory: build
+      run: cmake --build . -j2
+    - name: Test
+      working-directory: build
+      run: ctest -j2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c675315..d1885f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,12 +33,16 @@ if(NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
   message(WARNING "${Orange}Not building AVX512VNNI-based multiplication because your compiler is too old.\nFor details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
 endif()
 
-# Generate configure file
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/intgemm_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/intgemm_config.h)
+add_library(intgemm STATIC intgemm/intgemm.cc)
+
+# Generate configure file
+configure_file(intgemm/intgemm_config.h.in intgemm/intgemm_config.h)
+# Ensure it is included by users.
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(intgemm PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 
-add_library(intgemm STATIC intgemm.cc)
+# This isn't necessary since intgemm uses entirely relative paths, but source code depending on it may want to #include <intgemm/intgemm.h>.
+target_include_directories(intgemm INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
 option(USE_OPENMP "Use OpenMP" OFF)
 if (USE_OPENMP)
@@ -80,7 +84,6 @@ add_executable(tests
   test/kernels/downcast_test.cc
   test/kernels/exp_test.cc
   test/kernels/floor_test.cc
-  test/kernels/multiply_sat_test.cc
   test/kernels/multiply_test.cc
   test/kernels/quantize_test.cc
   test/kernels/relu_test.cc
diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -10,6 +10,36 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
+
+test/3rd_party/catch.hpp
+Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved.
+Distributed under the Boost Software License, Version 1.0. (See accompanying
+file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+
 The original 16-bit SSE2 code came from:
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
 [![Build SSE](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-SSE.svg?label=SSE)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-SSE/)
 [![Build AVX2](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX2.svg?label=AVX2)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX2/)
 [![Build AVX512BW](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX512BW.svg?label=AVX512BW)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX512BW/)
+![Build Ubuntu](https://github.com/kpu/intgemm/workflows/Ubuntu/badge.svg)
+![Build Ubuntu debug](https://github.com/kpu/intgemm/workflows/Ubuntu%20debug/badge.svg)
+![Build Ubuntu OpenMP](https://github.com/kpu/intgemm/workflows/Ubuntu%20OpenMP/badge.svg)
+![Build Windows](https://github.com/kpu/intgemm/workflows/Windows/badge.svg)
+![Build Mac](https://github.com/kpu/intgemm/workflows/Mac/badge.svg)
 
 # Integer Matrix Multiplication
 
@@ -25,7 +30,7 @@ A full example appears in [example.cc](example.cc).
 Both A and B should be prepared before multiplication.
 
 ```C++
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
 
 /* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
  * A is A_rows x width.
@@ -51,7 +56,7 @@ The last argument of `Multiply` is a callback which is usually used to perform
 For 8-bit, you can make use of a slightly faster implementation, assuming you can determine the quantization multipliers and prepare the biases offline:
 
 ```C++
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
 
 /* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
  * A is A_rows x width.
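Condensing the README snippets above into one end-to-end 16-bit example (a minimal sketch assembled from those snippets and the example.cc changes later in this diff; the sizes are arbitrary but respect the tile-multiple requirements):

```C++
#include "intgemm/intgemm.h"
#include "intgemm/aligned.h"
#include "intgemm/callbacks.h"

#include <random>

int main() {
  using intgemm::Index;
  const Index A_rows = 4, width = 64, B_cols = 8;  // width % 32 == 0 and B_cols % 8 == 0 for 16-bit.
  intgemm::AlignedVector<float> A(A_rows * width), B(width * B_cols);
  std::mt19937 gen;
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  for (float &f : A) f = dist(gen);
  for (float &f : B) f = dist(gen);
  const float quant_mult = 1024.0f;  // 16-bit: leave headroom for accumulation.
  intgemm::AlignedVector<int16_t> A_prepared(A.size()), B_prepared(B.size());
  // Quantize A to row-major int16; quantize and rearrange B into the CPU-dependent layout.
  intgemm::Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
  intgemm::Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
  intgemm::AlignedVector<float> C(A_rows * B_cols);
  // The callback unquantizes while writing the row-major result.
  intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols,
      intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
  return 0;
}
```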
diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc
index ebd0920..c6133bf 100644
--- a/benchmarks/benchmark.cc
+++ b/benchmarks/benchmark.cc
@@ -1,12 +1,12 @@
-#include "../aligned.h"
-#include "intgemm_config.h"
-#include "../avx512_gemm.h"
-#include "../sse2_gemm.h"
-#include "../avx2_gemm.h"
-#include "../ssse3_gemm.h"
-#include "../intgemm.h"
-#include "../stats.h"
-#include "../callbacks.h"
+#include "../intgemm/aligned.h"
+#include "intgemm/intgemm_config.h"
+#include "../intgemm/avx512_gemm.h"
+#include "../intgemm/sse2_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/stats.h"
+#include "../intgemm/callbacks.h"
 
 #include <algorithm>
 #include <cassert>
@@ -43,7 +43,7 @@ struct RandomMatrices {
 };
 
 template <class Backend> double Run(const RandomMatrices &m) {
-  typedef typename Backend::Integer Integer;
+  using Integer = typename Backend::Integer;
   float quant_mult = 127.0f / 2.0f;
   float unquant_mult = 1.0f / (quant_mult * quant_mult);
   AlignedVector<Integer> A_prepared(m.A_rows * m.width);
@@ -145,45 +145,45 @@ int main(int, char ** argv) {
   std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<SSSE3_8bit>(matrices, end, stats.ssse3_8bit);
+    RunAll<ssse3::Kernels8>(matrices, end, stats.ssse3_8bit);
   }
 
   std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<SSE2_16bit>(matrices, end, stats.sse2_16bit);
+    RunAll<sse2::Kernels16>(matrices, end, stats.sse2_16bit);
   }
 
   std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX2_8bit>(matrices, end, stats.avx2_8bit);
+    RunAll<avx2::Kernels8>(matrices, end, stats.avx2_8bit);
   }
 
   std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX2_16bit>(matrices, end, stats.avx2_16bit);
+    RunAll<avx2::Kernels16>(matrices, end, stats.avx2_16bit);
   }
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX512_8bit>(matrices, end, stats.avx512_8bit);
+    RunAll<avx512bw::Kernels8>(matrices, end, stats.avx512_8bit);
   }
 
   std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX512_16bit>(matrices, end, stats.avx512_16bit);
+    RunAll<avx512bw::Kernels16>(matrices, end, stats.avx512_16bit);
   }
 #endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
   std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
   for (int samples = 0; samples < kSamples; ++samples) {
     RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
-    RunAll<AVX512VNNI_8bit>(matrices, end, stats.avx512vnni_8bit);
+    RunAll<avx512vnni::Kernels8>(matrices, end, stats.avx512vnni_8bit);
   }
 #endif
 
@@ -193,18 +193,18 @@ int main(int, char ** argv) {
   }
   for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
     std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
-    Print<SSSE3_8bit>(stats.ssse3_8bit, i);
-    Print<AVX2_8bit>(stats.avx2_8bit, i);
+    Print<ssse3::Kernels8>(stats.ssse3_8bit, i);
+    Print<avx2::Kernels8>(stats.avx2_8bit, i);
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-    Print<AVX512_8bit>(stats.avx512_8bit, i);
+    Print<avx512bw::Kernels8>(stats.avx512_8bit, i);
 #endif
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-    Print<AVX512VNNI_8bit>(stats.avx512vnni_8bit, i);
+    Print<avx512vnni::Kernels8>(stats.avx512vnni_8bit, i);
 #endif
-    Print<SSE2_16bit>(stats.sse2_16bit, i);
-    Print<AVX2_16bit>(stats.avx2_16bit, i);
+    Print<sse2::Kernels16>(stats.sse2_16bit, i);
+    Print<avx2::Kernels16>(stats.avx2_16bit, i);
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-    Print<AVX512_16bit>(stats.avx512_16bit, i);
+    Print<avx512bw::Kernels16>(stats.avx512_16bit, i);
 #endif
   }
   return 0;
diff --git a/benchmarks/benchmark_quantizer.cc b/benchmarks/benchmark_quantizer.cc
index 16bf67f..5f36bd7 100644
--- a/benchmarks/benchmark_quantizer.cc
+++ b/benchmarks/benchmark_quantizer.cc
@@ -1,8 +1,8 @@
-#include "../intgemm.h"
-#include "../aligned.h"
-#include "../ssse3_gemm.h"
-#include "../avx2_gemm.h"
-#include "../avx512_gemm.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
+#include "../intgemm/ssse3_gemm.h"
+#include "../intgemm/avx2_gemm.h"
+#include "../intgemm/avx512_gemm.h"
 
 #include <chrono>
 #include <iomanip>
@@ -14,7 +14,7 @@ namespace {
 
 float MaxAbsoluteBaseline(const float *begin, const float *end) {
   auto res = std::minmax_element(begin, end);
-  return std::max(fabsf(*res.first), fabsf(*res.second));
+  return std::max(std::fabs(*res.first), std::fabs(*res.second));
 }
 
 void BenchmarkMaxAbsolute() {
@@ -63,10 +63,10 @@ int main() {
   for (float &element : in) {
     element = dist(gen);
   }
-  QuantizerBench<intgemm::SSSE3_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
-  QuantizerBench<intgemm::AVX2_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+  QuantizerBench<intgemm::ssse3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+  QuantizerBench<intgemm::avx2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-  QuantizerBench<intgemm::AVX512_8bit>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
+  QuantizerBench<intgemm::avx512bw::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
 #endif
   }
 }
diff --git a/benchmarks/biasmultiply.cc b/benchmarks/biasmultiply.cc
index da46bb9..490bf3b 100644
--- a/benchmarks/biasmultiply.cc
+++ b/benchmarks/biasmultiply.cc
@@ -1,5 +1,5 @@
-#include "../intgemm.h"
-#include "../aligned.h"
+#include "../intgemm/intgemm.h"
+#include "../intgemm/aligned.h"
 
 #include <chrono>
 #include <random>
 #include <iostream>
@@ -125,149 +125,149 @@ int main(int argc, char ** argv) {
     repeat = atoi(argv[1]);
   }
 
-  std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<ssse3::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(8, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(8, 2048, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(320, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(472, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(248, 256, 256);
-    oldSSSE3_nobias += testOld_nobias<SSSE3_8bit>(200, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(8, 2048, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(320, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(472, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(248, 256, 256);
+    oldSSSE3_nobias += testOld_nobias<ssse3::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldSSSE3 = testOld<SSSE3_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldSSSE3 += testOld<SSSE3_8bit>(8, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(8, 2048, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(320, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(472, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(248, 256, 256);
-    oldSSSE3 += testOld<SSSE3_8bit>(200, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(8, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(8, 2048, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(320, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(472, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(248, 256, 256);
+    oldSSSE3 += testOld<ssse3::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeSSSE3 = testOld<SSSE3_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeSSSE3 = testOld<ssse3::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeSSSE3 += testNew<SSSE3_8bit>(8, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(8, 2048, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(320, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(472, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(248, 256, 256);
-    newTimeSSSE3 += testNew<SSSE3_8bit>(200, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(8, 2048, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(320, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(472, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(248, 256, 256);
+    newTimeSSSE3 += testNew<ssse3::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<avx2::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(8, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(8, 2048, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(320, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(472, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(248, 256, 256);
-    oldAVX2_nobias += testOld_nobias<AVX2_8bit>(200, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(8, 2048, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(320, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(472, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(248, 256, 256);
+    oldAVX2_nobias += testOld_nobias<avx2::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX2 = testOld<AVX2_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX2 += testOld<AVX2_8bit>(8, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(8, 2048, 256);
-    oldAVX2 += testOld<AVX2_8bit>(320, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(472, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(248, 256, 256);
-    oldAVX2 += testOld<AVX2_8bit>(200, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(8, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(8, 2048, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(320, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(472, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(248, 256, 256);
+    oldAVX2 += testOld<avx2::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeAVX2 = testOld<AVX2_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeAVX2 = testOld<avx2::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeAVX2 += testNew<AVX2_8bit>(8, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(8, 2048, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(320, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(472, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(248, 256, 256);
-    newTimeAVX2 += testNew<AVX2_8bit>(200, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(8, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(8, 2048, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(320, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(472, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(248, 256, 256);
+    newTimeAVX2 += testNew<avx2::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
   if (kCPU < CPUType::AVX512BW) return 0;
-  std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(8, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(8, 2048, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(320, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(472, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(248, 256, 256);
-    oldAVX512_nobias += testOld_nobias<AVX512_8bit>(200, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(8, 2048, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(320, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(472, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(248, 256, 256);
+    oldAVX512_nobias += testOld_nobias<avx512bw::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX512 = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512 += testOld<AVX512_8bit>(8, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(8, 2048, 256);
-    oldAVX512 += testOld<AVX512_8bit>(320, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(472, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(248, 256, 256);
-    oldAVX512 += testOld<AVX512_8bit>(200, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(8, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(8, 2048, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(320, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(472, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(248, 256, 256);
+    oldAVX512 += testOld<avx512bw::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeAVX512 = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeAVX512 = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeAVX512 += testNew<AVX512_8bit>(8, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(8, 2048, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(320, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(472, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(248, 256, 256);
-    newTimeAVX512 += testNew<AVX512_8bit>(200, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(8, 2048, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(320, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(472, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(248, 256, 256);
+    newTimeAVX512 += testNew<avx512bw::Kernels8>(200, 256, 256);
  }
 
   std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
 #endif
 
 #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
   if (kCPU < CPUType::AVX512VNNI) return 0;
-  std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(8, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(8, 2048, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(320, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(472, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(248, 256, 256);
-    oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI_8bit>(200, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(8, 2048, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(320, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(472, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(248, 256, 256);
+    oldAVX512VNNI_nobias += testOld_nobias<avx512vnni::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> oldAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(8, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(8, 2048, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(320, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(472, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(248, 256, 256);
-    oldAVX512VNNI += testOld<AVX512VNNI_8bit>(200, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(8, 2048, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(320, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(472, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(248, 256, 256);
+    oldAVX512VNNI += testOld<avx512vnni::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
 
-  std::chrono::duration<double> newTimeAVX512VNNI = testOld<AVX512_8bit>(1, 64, 8);
+  std::chrono::duration<double> newTimeAVX512VNNI = testOld<avx512bw::Kernels8>(1, 64, 8);
   for (int i = 0; i<repeat; i++) {
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(8, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(8, 2048, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(320, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(472, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(248, 256, 256);
-    newTimeAVX512VNNI += testNew<AVX512VNNI_8bit>(200, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(8, 2048, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(320, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(472, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(248, 256, 256);
+    newTimeAVX512VNNI += testNew<avx512vnni::Kernels8>(200, 256, 256);
   }
 
   std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
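Every block in biasmultiply.cc above follows the same pattern: warm up once on a tiny problem, then accumulate wall-clock time over `repeat` rounds of fixed shapes before printing one total. In miniature (an illustrative sketch, not library code):

```C++
#include <chrono>
#include <iostream>

// Time one call and return the elapsed wall-clock duration.
template <class F> std::chrono::duration<double> TimeOnce(F &&f) {
  auto start = std::chrono::steady_clock::now();
  f();
  return std::chrono::steady_clock::now() - start;
}

int main() {
  std::chrono::duration<double> total = TimeOnce([] { /* warm-up shape, e.g. (1, 64, 8) */ });
  for (int i = 0; i < 100; ++i) {
    total += TimeOnce([] { /* e.g. testNew<avx2::Kernels8>(8, 256, 256) */ });
  }
  std::cout << "100 iterations took: " << total.count() << " seconds." << std::endl;
  return 0;
}
```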
diff --git a/example.cc b/example.cc
--- a/example.cc
+++ b/example.cc
@@ -1,11 +1,11 @@
-#include "intgemm.h"
+#include "intgemm/intgemm.h"
 
 // This is just for AlignedVector, which helps manage 64-byte aligned memory.
 // Feel free to manage memory yourself.
-#include "aligned.h"
-#include "callbacks.h"
+#include "intgemm/aligned.h"
+#include "intgemm/callbacks.h"
 
 #include <cassert>
-#include <math.h>
+#include <cmath>
 #include <random>
 
 int main() {
@@ -54,7 +54,7 @@ int main() {
   // Do the actual multiply.
   intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
   // Sanity check.  C will be row major.
-  assert(fabsf(C[0] - top_left_reference) < 0.05f);
+  assert(std::fabs(C[0] - top_left_reference) < 0.05f);
 }
 
 // 8-bit multiplication.
@@ -73,6 +73,7 @@ int main() {
   // Do the actual multiply.
   intgemm::Int8::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
   // Sanity check.  C will be row major.
-  assert(fabsf(C[0] - top_left_reference) < 0.05f);
+  assert(std::fabs(C[0] - top_left_reference) < 0.05f);
 }
+  return 0;
 }
diff --git a/intgemm.cc b/intgemm.cc
deleted file mode 100644
index 2708952..0000000
--- a/intgemm.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-#include "intgemm.h"
-#include "stats.h"
-
-namespace intgemm {
-
-float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
-  throw UnsupportedCPU();
-}
-
-MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
-  throw UnsupportedCPU();
-}
-
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
-
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
-
-void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBQuantizedTransposed, AVX512_16bit::PrepareBQuantizedTransposed, AVX2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, SSE2_16bit::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
-
-void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_16bit::PrepareBTransposed, AVX512_16bit::PrepareBTransposed, AVX2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, SSE2_16bit::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
-
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-
-const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
-
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::Quantize, AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-
-void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::QuantizeU, AVX512_8bit::QuantizeU, AVX2_8bit::QuantizeU, SSSE3_8bit::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI_8bit::PrepareB, AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-
-void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBQuantizedTransposed, AVX512_8bit::PrepareBQuantizedTransposed, AVX2_8bit::PrepareBQuantizedTransposed, SSSE3_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
-
-void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512_8bit::PrepareBTransposed, AVX512_8bit::PrepareBTransposed, AVX2_8bit::PrepareBTransposed, SSSE3_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
-
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI_8bit::SelectColumnsB, AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-
-const char *const Int8::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI_8bit::QuantizeU, AVX512_8bit::QuantizeU, AVX2_8bit::QuantizeU, SSSE3_8bit::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
-
-const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI_8bit::kName, AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED);
-
-#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
-namespace avx512bw {
-using avx2::MaxAbsolute;
-using avx2::VectorMeanStd;
-} // namespace avx512bw
-#endif
-
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute);
-
-MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd);
-
-constexpr const char *const Unsupported_16bit::kName;
-constexpr const char *const Unsupported_8bit::kName;
-constexpr const char *const SSE2_16bit::kName;
-constexpr const char *const SSSE3_8bit::kName;
-constexpr const char *const AVX2_8bit::kName;
-constexpr const char *const AVX2_16bit::kName;
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
-constexpr const char *const AVX512_8bit::kName;
-constexpr const char *const AVX512_16bit::kName;
-#endif
-#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
-constexpr const char *const AVX512VNNI_8bit::kName;
-#endif
-
-}
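The deleted intgemm.cc above (now relocated to intgemm/intgemm.cc) picks implementations once, at static initialization: each entry point is a function pointer assigned by ChooseCPU from candidates ordered AVX512VNNI, AVX512BW, AVX2, SSSE3, SSE2, unsupported. A simplified sketch of that dispatch pattern; DetectCPU and the kernels here are hypothetical stand-ins, not intgemm's API:

```C++
#include <cstdint>

using QuantizeFn = void (*)(const float *, std::int8_t *, float, int);

// Hypothetical per-ISA kernels; intgemm's are the Kernels8/Kernels16 statics.
void QuantizeAVX2(const float *, std::int8_t *, float, int) { /* ... */ }
void QuantizeSSSE3(const float *, std::int8_t *, float, int) { /* ... */ }

enum class Tier { AVX2, SSSE3, Unsupported };
// Stub: real code queries cpuid once and caches the result.
Tier DetectCPU() { return Tier::SSSE3; }

QuantizeFn ChooseQuantize() {
  switch (DetectCPU()) {
    case Tier::AVX2: return QuantizeAVX2;
    case Tier::SSSE3: return QuantizeSSSE3;
    default: return nullptr;  // intgemm instead installs a thrower (UnsupportedCPU).
  }
}

// Resolved once at load time; every later call is a branch-free indirect call.
QuantizeFn Quantize = ChooseQuantize();
```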
diff --git a/aligned.h b/intgemm/aligned.h
index 8ad7242..7500a8c 100644
--- a/aligned.h
+++ b/intgemm/aligned.h
@@ -1,7 +1,6 @@
 #pragma once
 #include <cstdlib>
 #include <new>
-#include <stdlib.h>
 
 #ifdef _MSC_VER
 #include <malloc.h>
 #endif
diff --git a/avx2_gemm.h b/intgemm/avx2_gemm.h
index a929361..5e81475 100644
--- a/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -6,57 +6,46 @@
 #include "types.h"
 
 #include <cstdint>
-#include <stdint.h>
 #include <cstring>
 
 namespace intgemm {
-
 namespace avx2 {
 
-INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
-  return kernels::quantize(loadu_ps<__m256>(input), quant_mult_reg);
+INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+  return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
 }
 
 INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
  public:
-  typedef __m256i Register;
-
-  INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
-  INTGEMM_AVX2 Register Consecutive(const float *input) const {
-    return Tile(input, input + 8);
+  INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+    return Tile(mult_reg, input, input + 8);
   }
 
-  INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
-    return Tile(
+  INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+    return Tile(mult_reg,
       input,
       input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
   }
 
-  INTGEMM_AVX2 Register ForReshape(const float *input, Index cols) const {
+  INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
     // 8 rows in the first 128-bit register, 8 in the second register.
-    return Tile(input, input + 8 * cols);
+    return Tile(mult_reg, input, input + 8 * cols);
   }
 
  private:
-  INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
-    __m256i g0 = QuantizerGrab(input0, mult_);
-    __m256i g1 = QuantizerGrab(input1, mult_);
-    __m256i packed = _mm256_packs_epi32(g0, g1);
+  INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
+    Register g0 = QuantizerGrab(input0, mult_reg);
+    Register g1 = QuantizerGrab(input1, mult_reg);
+    Register packed = _mm256_packs_epi32(g0, g1);
     // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
     // Technically this could be removed if the PrepareB did the same reordering internally.
     return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
   }
-
-  const __m256 mult_;
 };
 
-} // namespace
-
-
-struct AVX2_16bit {
+struct Kernels16 {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
@@ -68,10 +57,10 @@ struct AVX2_16bit {
   INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile16 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 16, output += 16) {
-      *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
     }
   }
 
@@ -83,7 +72,7 @@ struct AVX2_16bit {
     PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
   }*/
   INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
-  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int16_t)
+  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
   INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile16, int16_t)
 
   INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
@@ -97,26 +86,21 @@ struct AVX2_16bit {
   static const CPUType kUses = CPUType::AVX2;
 };
 
-namespace avx2 {
 /* Read 8 floats at a time from input0, input1, input2, and input3.  Quantize
  * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
  * the result into one register and return it. */
 class QuantizeTile8 {
  public:
-  typedef __m256i Register;
-
-  INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
-  INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
-    return Tile(input, input + 8, input + 16, input + 24);
+  INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
+    return Tile(quant_mult, input, input + 8, input + 16, input + 24);
   }
 
-  INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
-    return TileU(input, input + 8, input + 16, input + 24);
+  INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
+    return TileU(quant_mult, input, input + 8, input + 16, input + 24);
   }
 
-  INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+  INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
     const float* inputs[4];
     for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
       while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -127,24 +111,24 @@ class QuantizeTile8 {
       input += sizeof(Register) / sizeof(float);
       cols_left -= sizeof(Register) / sizeof(float);
     }
-    return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+    return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
   }
 
-  INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
+  INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
     // Put higher rows in the second half of the register.  These will jumble
     // around in the same way then conveniently land in the right place.
-    return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+    return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
   }
 
-  INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+  INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
     // Looking at the assembly, gcc has pulled this outside the loops calling this.
     const __m256i neg127 = _mm256_set1_epi8(-127);
     const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     // Grab 4 registers at a time in 32-bit format.
-    __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-    __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-    __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-    __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+    __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+    __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+    __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+    __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
     // Pack 32-bit to 16-bit.
     __m256i packed0 = _mm256_packs_epi32(g0, g1);
     __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -161,16 +145,16 @@ class QuantizeTile8 {
 
  private:
   //A version that produces uint8_ts
-  INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+  INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
     // Looking at the assembly, gcc has pulled this outside the loops calling this.
     const __m256i neg127 = _mm256_set1_epi8(-127);
     const __m256i pos127 = _mm256_set1_epi8(127);
     const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
     // Grab 4 registers at a time in 32-bit format.
-    __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-    __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-    __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-    __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+    __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+    __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+    __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+    __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
     // Pack 32-bit to 16-bit.
     __m256i packed0 = _mm256_packs_epi32(g0, g1);
     __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -185,13 +169,9 @@ class QuantizeTile8 {
     // and the values are only used for GEMM.
     return _mm256_permutevar8x32_epi32(packed, shuffle_param);
   }
-
-  const __m256 mult_;
 };
 
-} // namespace
-
-struct AVX2_8bit {
+struct Kernels8 {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
@@ -199,9 +179,9 @@ struct AVX2_8bit {
     Quantize(input, output, quant_mult, rows * cols);
   }
  private:
-  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
  public:
-  INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE(INTGEMM_AVX2)
 
   // Currently A is prepared by quantization but this could theoretically change.
   INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
@@ -212,10 +192,10 @@ struct AVX2_8bit {
   INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
     assert(size % 32 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile8 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 32, output += 32) {
-      *reinterpret_cast<__m256i*>(output) = q.ConsecutiveU(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
     }
   }
 
@@ -224,7 +204,7 @@ struct AVX2_8bit {
   static const Index kBTileCol = 8;
 
   INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
-  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, CPUType::AVX2, int8_t)
+  INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
   INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, avx2::QuantizeTile8, int8_t)
 
   INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
@@ -242,4 +222,5 @@ struct AVX2_8bit {
   static const CPUType kUses = CPUType::AVX2;
 };
 
+} // namespace avx2
 } // namespace intgemm
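For reference, a scalar model of what QuantizeTile8 above computes per element (an illustrative sketch, not library code): scale, round, saturate via the pack instructions, and clamp -128 up to -127 (the role of the neg127 constant) so every magnitude fits in ±127. The SIMD path rounds with _mm256_cvtps_epi32, i.e. round-to-nearest-even, which std::nearbyint matches under the default rounding mode; the vector code additionally interleaves outputs into the GEMM layout, which a scalar version has no need for.

```C++
#include <algorithm>
#include <cmath>
#include <cstdint>

inline std::int8_t QuantizeScalar(float value, float quant_mult) {
  // Scale and round to nearest (ties to even, matching cvtps under default FP state).
  int rounded = static_cast<int>(std::nearbyint(value * quant_mult));
  // Saturate; the lower bound is -127 rather than -128 per the neg127 clamp.
  rounded = std::min(127, std::max(-127, rounded));
  return static_cast<std::int8_t>(rounded);
}
```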
class QuantizeTile16 { public: - typedef __m512i Register; - - /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - - INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { + INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) { auto input0 = input; auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0); - auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_); - auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_); + auto g0 = QuantizerGrabHalves(input0, input1, quant_mult); + auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult); auto packed = packs_epi32(g0, g1); return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); } - INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const { - __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_); - __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_); + INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) { + __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult); + __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult); __m512i packed = packs_epi32(g0, g1); // Permute within 256-bit lanes, so same as INTGEMM_AVX2 return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); } - - private: - const __m512 mult_reg_; }; class QuantizeTile8 { public: - typedef __m512i Register; - - /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - - INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { + INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) { static const __m512i neg127 = _mm512_set1_epi8(-127); static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); @@ -118,10 +101,10 @@ class QuantizeTile8 { cols_left -= sizeof(Register) / sizeof(float); } - auto g0 = QuantizerGrab(inputs[0], mult_reg_); - auto g1 = QuantizerGrab(inputs[1], mult_reg_); - auto g2 = QuantizerGrab(inputs[2], mult_reg_); - auto g3 = QuantizerGrab(inputs[3], mult_reg_); + auto g0 = QuantizerGrab(inputs[0], quant_mult); + auto g1 = QuantizerGrab(inputs[1], quant_mult); + auto g2 = QuantizerGrab(inputs[2], quant_mult); + auto g3 = QuantizerGrab(inputs[3], quant_mult); auto packed0 = packs_epi32(g0, g1); auto packed1 = packs_epi32(g2, g3); @@ -130,17 +113,17 @@ class QuantizeTile8 { return _mm512_permutexvar_epi32(shuffle_param, packed); } - INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const { + INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) { // TODO: try alternative: _mm512_cvtsepi32_epi8 ? const __m512i neg127 = _mm512_set1_epi8(-127); // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc. 
const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // 32-bit format. - __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_); - __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_); - __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_); - __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_); + __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult); + __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult); + __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult); + __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult); // Pack 32-bit to 16-bit. __m512i packed0 = packs_epi32(g0, g1); __m512i packed1 = packs_epi32(g2, g3); @@ -151,14 +134,9 @@ class QuantizeTile8 { // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63 return _mm512_permutexvar_epi32(shuffle_param, packed); } - - private: - const __m512 mult_reg_; }; -} // namespace - -struct AVX512_16bit { +struct Kernels16 { typedef int16_t Integer; // Currently A is prepared by quantization but this could theoretically change. @@ -181,7 +159,7 @@ struct AVX512_16bit { const float *end = input + size; for (; input != end; input += 16, output += 16) { // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg)); + _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg)); } } @@ -189,19 +167,15 @@ struct AVX512_16bit { // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 32; static const Index kBTileCol = 8; -/* - INTGEMM_AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols); - } -*/ + /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int16_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile16, int16_t) + INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, QuantizeTile16) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile16, int16_t) /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); + SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end); } /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ @@ -212,7 +186,7 @@ struct AVX512_16bit { static const CPUType kUses = CPUType::AVX512BW; }; -struct AVX512_8bit { +struct Kernels8 { typedef int8_t Integer; // Currently A is prepared by quantization but this could theoretically change. 
@@ -237,7 +211,7 @@ struct AVX512_8bit { const std::size_t kBatch = sizeof(__m512i) / sizeof(float); #pragma omp for for (std::size_t i = 0; i < count; i += kBatch) { - __m512i asint = avx512f::QuantizerGrab(input + i, quant_mult_reg); + __m512i asint = QuantizerGrab(input + i, quant_mult_reg); asint = _mm512_max_epi32(asint, neg127); // There doesn't seem to be an unmasked version. _mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint); @@ -263,7 +237,7 @@ struct AVX512_8bit { if (!overhang) return; // We needed a branch anyway for the empty case. const __m512i neg127 = _mm512_set1_epi32(-127); const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); - __m512i asint = avx512f::QuantizerGrab(fast_input_end, quant_mult_reg); + __m512i asint = QuantizerGrab(fast_input_end, quant_mult_reg); asint = _mm512_max_epi32(asint, neg127); _mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint); } @@ -287,7 +261,7 @@ struct AVX512_8bit { const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); const float *end = input + size; for (; input < end; input += 16, output += 16) { - __m512i asint = avx512f::QuantizerGrab(input, quant_mult_reg); + __m512i asint = QuantizerGrab(input, quant_mult_reg); asint = _mm512_min_epi32(asint, pos127); asint = _mm512_add_epi32(asint, pos127); asint = _mm512_max_epi32(asint, zero); @@ -298,26 +272,21 @@ struct AVX512_8bit { // Tile size for B; B must be a multiple of this block size. static const Index kBTileRow = 64; static const Index kBTileCol = 8; -/* - INTGEMM_AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { - PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols); - }*/ + /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ - INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, CPUType::AVX512BW, int8_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, avx512f::QuantizeTile8, int8_t) + INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, QuantizeTile8) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile8, int8_t) /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { - avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); + SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end); } // Special AVX512 implementation due to having 32 registers (so I don't have to // allocate registers manually) and no sign instruction. template <typename Callback> INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; - //typedef __m256 Float; // For quantization we only do 8 at a time. // This is copy-paste from Multiply8_SSE2OrAVX2. assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); @@ -325,7 +294,7 @@ struct AVX512_8bit { assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); // There's 8 results for INTGEMM_AVX2 to handle. auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + const Index simd_width = width / sizeof(Register); // Added for AVX512. 
Register zeros = setzero_si<Register>(); // Go over 8 columns of B at a time. @@ -436,6 +405,7 @@ struct AVX512_8bit { static const CPUType kUses = CPUType::AVX512BW; }; +} // namespace avx512bw } // namespace intgemm #endif diff --git a/avx512vnni_gemm.h b/intgemm/avx512vnni_gemm.h index 22c5c4e..c660168 100644 --- a/avx512vnni_gemm.h +++ b/intgemm/avx512vnni_gemm.h @@ -1,12 +1,13 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI #include "avx512_gemm.h" #include "types.h" namespace intgemm { +namespace avx512vnni { // Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663 INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { @@ -17,16 +18,15 @@ INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { #endif } -struct AVX512VNNI_8bit : public AVX512_8bit { +struct Kernels8 : public avx512bw::Kernels8 { template <typename Callback> INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + const Index simd_width = width / sizeof(Register); Register zeros = setzero_si<Register>(); // Go over 8 columns of B at a time. #pragma omp for @@ -82,13 +82,12 @@ struct AVX512VNNI_8bit : public AVX512_8bit { template <typename Callback> INTGEMM_AVX512VNNI static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + const Index simd_width = width / sizeof(Register); Register zeros = setzero_si<Register>(); // Go over 8 columns of B at a time. #pragma omp for @@ -124,12 +123,11 @@ struct AVX512VNNI_8bit : public AVX512_8bit { template <typename Callback> INTGEMM_AVX512VNNI static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { - typedef __m512i Register; assert(width % sizeof(Register) == 0); assert(B_cols % 8 == 0); assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback); - const int simd_width = width / sizeof(Register); + Index simd_width = width / sizeof(Register); Register zeros = setzero_si<Register>(); const Register a = set1_epi8<Register>(1); // Go over 8 columns of B at a time. 
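[Editor's note] The VNNI8 helper above wraps _mm512_dpbusds_epi32, spelled as inline asm on affected GCC versions to dodge the extra vmovdqa64 from GCC bug 94663. Per 32-bit lane, that instruction accumulates four unsigned-byte-times-signed-byte products with signed saturation; a scalar sketch of one lane, with an illustrative name:

#include <cstdint>
#include <limits>

// One 32-bit lane of vpdpbusds: c += a0*b0 + a1*b1 + a2*b2 + a3*b3, where a
// holds uint8_t, b holds int8_t, and the accumulation saturates to int32_t.
int32_t DpbusdsLane(int32_t c, const uint8_t a[4], const int8_t b[4]) {
  int64_t sum = c;
  for (int k = 0; k < 4; ++k) sum += static_cast<int64_t>(a[k]) * b[k];
  if (sum > std::numeric_limits<int32_t>::max()) return std::numeric_limits<int32_t>::max();
  if (sum < std::numeric_limits<int32_t>::min()) return std::numeric_limits<int32_t>::min();
  return static_cast<int32_t>(sum);
}

This fused multiply-add is why the VNNI Multiply loop needs neither the madd_epi16 pairing nor the sign trick of the older 8-bit kernels.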
@@ -164,6 +162,7 @@ struct AVX512VNNI_8bit : public AVX512_8bit { static const CPUType kUses = CPUType::AVX512VNNI; }; +} // namespace avx512vnni } // namespace intgemm #endif diff --git a/callbacks.h b/intgemm/callbacks.h index 24f9009..23d3be1 100644 --- a/callbacks.h +++ b/intgemm/callbacks.h @@ -3,7 +3,7 @@ #include "callbacks/configs.h" #include "callbacks/output_buffer_info.h" -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "intrinsics.h" #include "kernels.h" #include "types.h" diff --git a/callbacks/configs.h b/intgemm/callbacks/configs.h index 1222448..1222448 100644 --- a/callbacks/configs.h +++ b/intgemm/callbacks/configs.h diff --git a/callbacks/implementations.inl b/intgemm/callbacks/implementations.inl index d2b7d95..47d2aa4 100644 --- a/callbacks/implementations.inl +++ b/intgemm/callbacks/implementations.inl @@ -129,7 +129,14 @@ public: } CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) { - auto result = kernels::unquantize(input, unquant_mult); + // Workaround gcc 5 internal compiler error that can't read register members in debug. + vf mult_reg; +#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER) + asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult)); +#else + mult_reg = unquant_mult; +#endif + auto result = kernels::unquantize(input, mult_reg); kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx); } @@ -164,7 +171,14 @@ public: } CPU_ATTR void operator()(vi input, const OutputBufferInfo& info) { - auto result = kernels::unquantize(input, unquant_mult); + // Workaround gcc 5 internal compiler error that can't read register members in debug. + vf mult_reg; +#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER) + asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult)); +#else + mult_reg = unquant_mult; +#endif + auto result = kernels::unquantize(input, mult_reg); result = kernels::add_bias(result, config.bias_addr, info.col_idx); kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx); } diff --git a/callbacks/output_buffer_info.h b/intgemm/callbacks/output_buffer_info.h index 213aef4..213aef4 100644 --- a/callbacks/output_buffer_info.h +++ b/intgemm/callbacks/output_buffer_info.h diff --git a/interleave.h b/intgemm/interleave.h index 231be46..1ec686b 100644 --- a/interleave.h +++ b/intgemm/interleave.h @@ -1,12 +1,11 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "intrinsics.h" #include "types.h" #include <algorithm> #include <cassert> -#include <stdint.h> namespace intgemm { @@ -180,11 +179,9 @@ template <class Register> static inline void Transpose8InLane( // ... ... 
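[Editor's note] The callbacks change above works around a GCC 5 internal compiler error: debug (unoptimized) builds crash when a vector-register-typed class member is read directly, so the member is loaded through inline asm on that compiler only. A minimal sketch of the pattern, with illustrative member and method names and __m128 standing in for the width-generic vf:

#include <immintrin.h>

struct UnquantizeState {
  __m128 unquant_mult;  // vector-typed member that trips the GCC 5 ICE

  __m128 LoadMult() const {
    __m128 mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
    // Only unoptimized GCC 5 takes this path; the load goes through memory.
    asm("movaps %1, %0" : "=x"(mult_reg) : "m"(unquant_mult));
#else
    mult_reg = unquant_mult;  // every other compiler reads the member directly
#endif
    return mult_reg;
  }
};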
#define INTGEMM_PREPARE_B_8(target, QuantClass) \ target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \ - typedef typename QuantClass Quantizer; \ - typedef typename Quantizer::Register Register; \ - Quantizer q = Quantizer(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ /* Currently all multipliers have a stride of 8 columns.*/ \ - const int kColStride = 8; \ + const Index kColStride = 8; \ assert(cols % kColStride == 0); \ assert(rows % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ @@ -196,14 +193,14 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl This isn't quite Transpose8InLane because it's half the number of columns, \ so each register starts with two rows instead of being one row. \ The quantizers know to skip a row.*/ \ - output[0] = q.ForReshape(input + cols * (r ) + c, cols); \ - output[1] = q.ForReshape(input + cols * (r + 1) + c, cols); \ - output[2] = q.ForReshape(input + cols * (r + 4) + c, cols); \ - output[3] = q.ForReshape(input + cols * (r + 5) + c, cols); \ - output[4] = q.ForReshape(input + cols * (r + 8) + c, cols); \ - output[5] = q.ForReshape(input + cols * (r + 9) + c, cols); \ - output[6] = q.ForReshape(input + cols * (r + 12) + c, cols); \ - output[7] = q.ForReshape(input + cols * (r + 13) + c, cols); \ + output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \ + output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \ + output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \ + output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \ + output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \ + output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \ + output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \ + output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \ Interleave8(output[0], output[1]); \ Interleave8(output[2], output[3]); \ Interleave8(output[4], output[5]); \ @@ -215,9 +212,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl #define INTGEMM_PREPARE_B_16(target, QuantClass) \ target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \ - typedef typename QuantClass Quantizer; \ - typedef typename Quantizer::Register Register; \ - Quantizer q = Quantizer(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ assert(cols % 8 == 0); \ assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ @@ -226,8 +221,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f for (Index c = 0; c < cols; c += 8) { \ for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \ /* gcc unrolls this loop and uses registers for output[k]*/ \ - for (int k = 0; k < 8; ++k) { \ - output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \ + for (Index k = 0; k < 8; ++k) { \ + output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \ } \ Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \ } \ @@ -241,9 +236,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f * * cols and rows describe size of transposed B. 
*/ -#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, cpu_type, Integer) \ +#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, Integer) \ target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \ - using Register = vector_t<cpu_type, Integer>; \ const Index RegisterElems = sizeof(Register) / sizeof(Integer); \ const Index kColStride = 8; \ \ @@ -268,7 +262,6 @@ target static inline void PrepareBQuantizedTransposed(const Integer* input, Inte */ #define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, Integer) \ target static inline void PrepareBTransposed(const float* input, Integer* output, float quant_mult, Index cols, Index rows) { \ - using Register = typename Quantizer::Register; \ const Index RegisterElemsInt = sizeof(Register) / sizeof(Integer); \ const Index kColStride = 8; \ \ @@ -277,13 +270,13 @@ target static inline void PrepareBTransposed(const float* input, Integer* output assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ \ - Quantizer quantizer(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ Register* output_it = reinterpret_cast<Register*>(output); \ Index r = 0; \ Index c = 0; \ while (r < rows) { \ for (Index ri = 0; ri < 8; ++ri) \ - *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \ + *output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \ c += RegisterElemsInt; \ while (c >= cols) { \ r += kColStride; \ @@ -299,14 +292,14 @@ target static inline void SelectColumnsOfB(const Register *input, Register *outp assert(rows_bytes % sizeof(Register) == 0); \ assert((cols_end - cols_begin) % 8 == 0); \ /* Do columns for multiples of 8.*/ \ - int register_rows = rows_bytes / sizeof(Register); \ + Index register_rows = rows_bytes / sizeof(Register); \ const Register *starts[8]; \ for (; cols_begin != cols_end; cols_begin += 8) { \ - for (int k = 0; k < 8; ++k) { \ + for (Index k = 0; k < 8; ++k) { \ starts[k] = input + (cols_begin[k] & 7) + (cols_begin[k] & ~7) * register_rows; \ } \ - for (int r = 0; r < register_rows; ++r) { \ - for (int k = 0; k < 8; ++k) { \ + for (Index r = 0; r < register_rows; ++r) { \ + for (Index k = 0; k < 8; ++k) { \ *(output++) = *starts[k]; \ starts[k] += 8; \ } \ diff --git a/intgemm/intgemm.cc b/intgemm/intgemm.cc new file mode 100644 index 0000000..f859b9a --- /dev/null +++ b/intgemm/intgemm.cc @@ -0,0 +1,71 @@ +#include "intgemm.h" +#include "stats.h" + +namespace intgemm { + +float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) { + throw UnsupportedCPU(); +} + +MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) { + throw UnsupportedCPU(); +} + +void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(avx512bw::Kernels16::Quantize, avx512bw::Kernels16::Quantize, avx2::Kernels16::Quantize, sse2::Kernels16::Quantize, sse2::Kernels16::Quantize, Unsupported_16bit::Quantize); + +void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512bw::Kernels16::PrepareB, avx512bw::Kernels16::PrepareB, avx2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, sse2::Kernels16::PrepareB, Unsupported_16bit::PrepareB); + +void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index 
B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBQuantizedTransposed, avx512bw::Kernels16::PrepareBQuantizedTransposed, avx2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, sse2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed); + +void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels16::PrepareBTransposed, avx512bw::Kernels16::PrepareBTransposed, avx2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, sse2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed); + +void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512bw::Kernels16::SelectColumnsB, avx512bw::Kernels16::SelectColumnsB, avx2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, sse2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB); + +const char *const Int16::kName = ChooseCPU(avx512bw::Kernels16::kName, avx512bw::Kernels16::kName, avx2::Kernels16::kName, sse2::Kernels16::kName, sse2::Kernels16::kName, Unsupported_16bit::kName); + +void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::Quantize, avx512bw::Kernels8::Quantize, avx2::Kernels8::Quantize, ssse3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize); + +void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); + +void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(avx512vnni::Kernels8::PrepareB, avx512bw::Kernels8::PrepareB, avx2::Kernels8::PrepareB, ssse3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB); + +void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBQuantizedTransposed, avx512bw::Kernels8::PrepareBQuantizedTransposed, avx2::Kernels8::PrepareBQuantizedTransposed, ssse3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed); + +void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(avx512bw::Kernels8::PrepareBTransposed, avx512bw::Kernels8::PrepareBTransposed, avx2::Kernels8::PrepareBTransposed, ssse3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed); + +void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(avx512vnni::Kernels8::SelectColumnsB, avx512bw::Kernels8::SelectColumnsB, avx2::Kernels8::SelectColumnsB, ssse3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB); + +const char *const Int8::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); + +void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = 
ChooseCPU(avx512vnni::Kernels8::QuantizeU, avx512bw::Kernels8::QuantizeU, avx2::Kernels8::QuantizeU, ssse3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU); + +const char *const Int8Shift::kName = ChooseCPU(avx512vnni::Kernels8::kName, avx512bw::Kernels8::kName, avx2::Kernels8::kName, ssse3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName); + +const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType::AVX2, CPUType::SSSE3, CPUType::SSE2, CPUType::UNSUPPORTED); + +#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW) +namespace avx512bw { +using avx2::MaxAbsolute; +using avx2::VectorMeanStd; +} // namespace avx512bw +#endif + +float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512bw::MaxAbsolute, avx512bw::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute); + +MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512bw::VectorMeanStd, avx512bw::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, Unsupported_VectorMeanStd); + +constexpr const char *const Unsupported_16bit::kName; +constexpr const char *const Unsupported_8bit::kName; +constexpr const char *const sse2::Kernels16::kName; +constexpr const char *const ssse3::Kernels8::kName; +constexpr const char *const avx2::Kernels8::kName; +constexpr const char *const avx2::Kernels16::kName; +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW +constexpr const char *const avx512bw::Kernels8::kName; +constexpr const char *const avx512bw::Kernels16::kName; +#endif +#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI +constexpr const char *const avx512vnni::Kernels8::kName; +#endif + +} diff --git a/intgemm.h b/intgemm/intgemm.h index db7d2fe..8e2da02 100644 --- a/intgemm.h +++ b/intgemm/intgemm.h @@ -39,11 +39,9 @@ * passing unquant_mult = \lambda / (A_quant_mult * B_quant_mult). */ -// Yes, both headers due to the debacle about int32_t #include <cstdint> -#include <stdint.h> -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "types.h" #include "sse2_gemm.h" #include "ssse3_gemm.h" @@ -126,7 +124,15 @@ struct Unsupported_8bit { #ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI // These won't ever be called in this capacity, but it does let the code below compile. 
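[Editor's note] All the function pointers above are bound once, at static initialization, by the ChooseCPU idiom: one candidate per CPU tier, newest first, and the first tier the host supports wins. A self-contained sketch of the idiom follows; real intgemm detects the CPU at runtime via CPUID, whereas this stand-in uses compile-time macros purely to stay compilable on its own:

enum class CPUType { UNSUPPORTED, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

inline CPUType DetectCPU() {
#if defined(__AVX512VNNI__)
  return CPUType::AVX512VNNI;
#elif defined(__AVX512BW__)
  return CPUType::AVX512BW;
#elif defined(__AVX2__)
  return CPUType::AVX2;
#elif defined(__SSSE3__)
  return CPUType::SSSE3;
#elif defined(__SSE2__)
  return CPUType::SSE2;
#else
  return CPUType::UNSUPPORTED;
#endif
}

// One argument per tier, newest first; returns the best supported candidate.
template <class T>
T ChooseCPU(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
  switch (DetectCPU()) {
    case CPUType::AVX512VNNI: return avx512vnni;
    case CPUType::AVX512BW:   return avx512bw;
    case CPUType::AVX2:       return avx2;
    case CPUType::SSSE3:      return ssse3;
    case CPUType::SSE2:       return sse2;
    default:                  return unsupported;
  }
}

Note how the tables above exploit the argument order: 16-bit kernels reuse the sse2 implementation for both SSE2 and SSSE3 slots, and 8-bit kernels fall back to Unsupported_8bit below SSSE3 because pmaddubsw is unavailable there.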
-typedef Unsupported_8bit AVX512VNNI_8bit; +namespace avx512vnni { +typedef Unsupported_8bit Kernels8; +} // namespace avx512vnni +#endif +#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW +namespace avx512bw { +typedef Unsupported_8bit Kernels8; +typedef Unsupported_16bit Kernels16; +} // namespace avx512bw #endif /* Returns: @@ -274,7 +280,7 @@ private: }; template <typename Callback> -void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI_8bit>, OMPParallelWrap<Callback, AVX512_8bit>, OMPParallelWrap<Callback, AVX2_8bit>, OMPParallelWrap<Callback, SSSE3_8bit>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>); +void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512vnni::Kernels8>, OMPParallelWrap<Callback, avx512bw::Kernels8>, OMPParallelWrap<Callback, avx2::Kernels8>, OMPParallelWrap<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>); /* * 8-bit matrix multiplication with shifting A by 127 @@ -338,14 +344,14 @@ private: template <class Callback> void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU( - OMPParallelWrap8Shift<Callback, AVX512VNNI_8bit>, - OMPParallelWrap8Shift<Callback, AVX512_8bit>, - OMPParallelWrap8Shift<Callback, AVX2_8bit>, - OMPParallelWrap8Shift<Callback, SSSE3_8bit>, + OMPParallelWrap8Shift<Callback, avx512vnni::Kernels8>, + OMPParallelWrap8Shift<Callback, avx512bw::Kernels8>, + OMPParallelWrap8Shift<Callback, avx2::Kernels8>, + OMPParallelWrap8Shift<Callback, ssse3::Kernels8>, Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>); template <class Callback> -void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); +void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(avx512vnni::Kernels8::PrepareBias<Callback>, avx512bw::Kernels8::PrepareBias<Callback>, avx2::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, ssse3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); /* * 16-bit matrix multiplication @@ -401,7 +407,7 @@ private: }; template <typename Callback> -void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512_16bit> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512_16bit>, OMPParallelWrap<Callback, AVX2_16bit>, OMPParallelWrap<Callback, SSE2_16bit>, OMPParallelWrap<Callback, SSE2_16bit>, Unsupported_16bit::Multiply<Callback>); +void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, avx512bw::Kernels16> /*TODO VNNI 16-bit. 
*/, OMPParallelWrap<Callback, avx512bw::Kernels16>, OMPParallelWrap<Callback, avx2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, OMPParallelWrap<Callback, sse2::Kernels16>, Unsupported_16bit::Multiply<Callback>); extern const CPUType kCPU; diff --git a/intgemm_config.h.in b/intgemm/intgemm_config.h.in index 920e9ae..920e9ae 100644 --- a/intgemm_config.h.in +++ b/intgemm/intgemm_config.h.in diff --git a/intrinsics.h b/intgemm/intrinsics.h index bf79e43..480f421 100644 --- a/intrinsics.h +++ b/intgemm/intrinsics.h @@ -1,6 +1,6 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "types.h" #include <tmmintrin.h> @@ -9,7 +9,6 @@ #include <xmmintrin.h> #include <cstdint> -#include <stdint.h> /* * NOTE: Please keep intrinsics in alphabetical order. @@ -162,17 +161,17 @@ template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() { INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) { return _mm_sign_epi8(first, second); } -INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a, int8_t b) { - return _mm_slli_epi16(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a) { + return _mm_slli_epi16(a, imm8); } -INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a, int8_t b) { - return _mm_srai_epi16(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a) { + return _mm_srai_epi16(a, imm8); } -INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a, int8_t b) { - return _mm_srai_epi32(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a) { + return _mm_srai_epi32(a, imm8); } -INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a, int8_t b) { - return _mm_srli_epi16(a, b); +template <int imm8> INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a) { + return _mm_srli_epi16(a, imm8); } INTGEMM_SSE2 static inline void storeu_ps(float* mem_addr, __m128 a) { _mm_storeu_ps(mem_addr, a); @@ -343,17 +342,17 @@ template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() { INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) { return _mm256_sign_epi8(first, second); } -INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a, int8_t b) { - return _mm256_slli_epi16(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a) { + return _mm256_slli_epi16(a, imm8); } -INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a, int8_t b) { - return _mm256_srai_epi16(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a) { + return _mm256_srai_epi16(a, imm8); } -INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a, int8_t b) { - return _mm256_srai_epi32(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a) { + return _mm256_srai_epi32(a, imm8); } -INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a, int8_t b) { - return _mm256_srli_epi16(a, b); +template <int imm8> INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a) { + return _mm256_srli_epi16(a, imm8); } INTGEMM_AVX2 static inline void storeu_ps(float* mem_addr, __m256 a) { _mm256_storeu_ps(mem_addr, a); @@ -540,17 +539,17 @@ template <> INTGEMM_AVX512BW inline __m512 load_ps<__m512>(const float* from) { /* * Missing sign_epi8 */ -INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a, int8_t b) { - return _mm512_slli_epi16(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a) { + return _mm512_slli_epi16(a, imm8); } -INTGEMM_AVX512BW 
static inline __m512i srai_epi16(__m512i a, int8_t b) { - return _mm512_srai_epi16(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi16(__m512i a) { + return _mm512_srai_epi16(a, imm8); } -INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a, int8_t b) { - return _mm512_srai_epi32(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a) { + return _mm512_srai_epi32(a, imm8); } -INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a, int8_t b) { - return _mm512_srli_epi16(a, b); +template <int imm8> INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a) { + return _mm512_srli_epi16(a, imm8); } INTGEMM_AVX512BW static inline void storeu_ps(float* mem_addr, __m512 a) { _mm512_storeu_ps(mem_addr, a); diff --git a/kernels.h b/intgemm/kernels.h index 84631b5..ee35966 100644 --- a/kernels.h +++ b/intgemm/kernels.h @@ -1,6 +1,6 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "intrinsics.h" #include "types.h" #include "utils.h" diff --git a/kernels/implementations.inl b/intgemm/kernels/implementations.inl index 2ec9f1f..4f1b39f 100644 --- a/kernels/implementations.inl +++ b/intgemm/kernels/implementations.inl @@ -145,8 +145,8 @@ CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply(vector_t<CPUTy template <> CPU_ATTR inline vi multiply<int8_t>(vi a, vi b) { auto even = mullo_epi16(a, b); - auto odd = mullo_epi16(srli_epi16(a, 8), srli_epi16(b, 8)); - return or_si(slli_epi16(odd, 8), srli_epi16(slli_epi16(even, 8), 8)); + auto odd = mullo_epi16(srli_epi16<8>(a), srli_epi16<8>(b)); + return or_si(slli_epi16<8>(odd), srli_epi16<8>(slli_epi16<8>(even))); } template <> @@ -235,7 +235,7 @@ CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int16_t> upcast8to16(vi inpu static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0); input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input))); - auto negatives = _mm512_cmp_epi8_mask(input, vzero, _MM_CMPINT_LT); + auto negatives = _mm512_cmp_epi8_mask(input, vzero, 1 /* _MM_CMPINT_LT */); auto higher_byte = _mm512_mask_blend_epi8(negatives, vzero, vmax_negative); #endif @@ -258,7 +258,7 @@ CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int> upcast16to32(vi input) static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0); input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input))); - auto negatives = _mm512_cmp_epi16_mask(input, vzero, _MM_CMPINT_LT); + auto negatives = _mm512_cmp_epi16_mask(input, vzero, 1 /* _MM_CMPINT_LT */); auto higher_byte = _mm512_mask_blend_epi16(negatives, vzero, vmax_negative); #endif @@ -296,32 +296,6 @@ CPU_ATTR static inline vi bitwise_not(vi v) { } /* - * Multiply with saturation (elemwise) - */ -template <typename Type> -CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply_sat(vector_t<CPUType::CPU_NAME, Type> a, vector_t<CPUType::CPU_NAME, Type> b, uint8_t right_shift); - -template <> -CPU_ATTR inline vi multiply_sat<int8_t>(vi a, vi b, uint8_t right_shift) { - auto upcasted_a = upcast8to16(a); - auto upcasted_b = upcast8to16(b); - auto low = srai_epi16(multiply<int16_t>(upcasted_a.first, upcasted_b.first), right_shift); - auto hi = srai_epi16(multiply<int16_t>(upcasted_a.second, upcasted_b.second), right_shift); - - return downcast16to8(low, hi); -} - -template <> -CPU_ATTR inline vi multiply_sat<int16_t>(vi a, vi b, uint8_t right_shift) { - auto upcasted_a = 
upcast16to32(a); - auto upcasted_b = upcast16to32(b); - auto low = srai_epi32(multiply<int32_t>(upcasted_a.first, upcasted_b.first), right_shift); - auto hi = srai_epi32(multiply<int32_t>(upcasted_a.second, upcasted_b.second), right_shift); - - return downcast32to16(low, hi); -} - -/* * Floor */ CPU_ATTR static inline vf floor(vf input) { diff --git a/multiply.h b/intgemm/multiply.h index 80940d4..e201e09 100644 --- a/multiply.h +++ b/intgemm/multiply.h @@ -1,6 +1,6 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" #include "interleave.h" #include "intrinsics.h" #include "vec_traits.h" @@ -47,16 +47,16 @@ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i p // Quantize function used for SSSE3 and AVX2. // Separate function for thread to work around gcc 7 bug that doesn't imbue // target attributes across #pragma omp parallel. -#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \ +#define INTGEMM_QUANTIZE_THREAD(target) \ target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \ - name::QuantizeTile8 q(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ INTGEMM_OMP_FOR \ for (std::size_t i = 0; i < count; i += sizeof(Register)) { \ - *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \ + *reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \ } \ } -#define INTGEMM_QUANTIZE(target, Register, name) \ +#define INTGEMM_QUANTIZE(target) \ target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ @@ -68,7 +68,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa } \ std::size_t overhang = size & (kBatch - 1); \ if (!overhang) return; \ - name::QuantizeTile8 q(quant_mult); \ + FRegister q = set1_ps<FRegister>(quant_mult); \ /* Each does size(Register) / 32 == kBatch / 4 floats at a time. * If we're allowed to read one of them, then we can read the whole register. */ \ const float *inputs[4]; \ @@ -80,7 +80,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa for (; i < 4; ++i) { \ inputs[i] = &input[fast_end]; \ } \ - Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \ + Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \ std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \ } @@ -159,7 +159,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / (sizeof(Register) / sizeof(int16_t)); \ + const Index simd_width = width / (sizeof(Register) / sizeof(int16_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ INTGEMM_OMP_FOR \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \ @@ -169,7 +169,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \ /* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. 
\ Iterate over shared (inner) dimension.*/ \ - int k = 0; \ + Index k = 0; \ Register a = *(A_row + k); \ Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \ Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \ @@ -216,7 +216,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \ assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ + const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ const Register a = set1_epi8<Register>(1); \ INTGEMM_OMP_FOR \ @@ -225,7 +225,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const /*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \ /* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \ Iterate over shared (inner) dimension.*/ \ - int k = 0; \ + Index k = 0; \ Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ @@ -291,7 +291,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ + const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ INTGEMM_OMP_FOR \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \ @@ -301,7 +301,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \ /* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. 
\ Iterate over shared (inner) dimension.*/ \ - int k = 0; \ + Index k = 0; \ Register a = *(A_row + k); \ Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ @@ -538,7 +538,7 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( assert(B_cols % 8 == 0); \ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ - const int simd_width = width / sizeof(Register); \ + const Index simd_width = width / sizeof(Register); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ INTGEMM_OMP_FOR \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \ diff --git a/sse2_gemm.h b/intgemm/sse2_gemm.h index 69f6741..cd49efe 100644 --- a/sse2_gemm.h +++ b/intgemm/sse2_gemm.h @@ -5,12 +5,10 @@ #include "types.h" #include <cstdint> -#include <stdint.h> // 8 bit is in ssse3_gemm.h namespace intgemm { - namespace sse2 { INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { @@ -21,37 +19,30 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i) class QuantizeTile16 { public: - typedef __m128i Register; - - INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - - INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const { - return Tile(input, input + 4); + INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) { + return Tile(mult_reg, input, input + 4); } - INTGEMM_SSE2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { - return Tile( + INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) { + return Tile(mult_reg, input, input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0)); } - INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const { - return Consecutive(input); + INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) { + return Consecutive(mult_reg, input); } private: - INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const { - __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input1, mult_reg_); + INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) { + __m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg); + __m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg); return _mm_packs_epi32(g0, g1); } - - const __m128 mult_reg_; }; -} //namespace -// This should be pure INTGEMM_SSE2 (and below). -struct SSE2_16bit { +// This should be pure SSE2 (and below). +struct Kernels16 { typedef int16_t Integer; // Currently A is prepared by quantization but this could theoretically change. 
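[Editor's note] The 16-bit Multiply loops above are built on madd_epi16, which multiplies adjacent pairs of 16-bit lanes and sums each pair into a 32-bit lane. A scalar model of one output lane, with an illustrative name:

#include <cstdint>

// One 32-bit lane of madd_epi16: the sum of two adjacent 16-bit products.
// Computed in 64 bits here because the single corner case (both pairs being
// -32768 * -32768) overflows int32_t; the hardware produces INT32_MIN there,
// which the unsigned cast below reproduces.
int32_t MaddEpi16Lane(int16_t a0, int16_t b0, int16_t a1, int16_t b1) {
  int64_t sum = static_cast<int64_t>(a0) * b0 + static_cast<int64_t>(a1) * b1;
  return static_cast<int32_t>(static_cast<uint32_t>(sum));
}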
@@ -63,10 +54,10 @@ struct SSE2_16bit { assert(size % 8 == 0); assert(reinterpret_cast<uintptr_t>(input) % 16 == 0); assert(reinterpret_cast<uintptr_t>(output) % 16 == 0); - sse2::QuantizeTile16 q(quant_mult); + FRegister q = set1_ps<FRegister>(quant_mult); const float *end = input + size; for (; input != end; input += 8, output += 8) { - *reinterpret_cast<__m128i*>(output) = q.Consecutive(input); + *reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input); } } @@ -74,13 +65,13 @@ struct SSE2_16bit { static const Index kBTileRow = 8; static const Index kBTileCol = 8; - INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, CPUType::SSE2, int16_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, sse2::QuantizeTile16, int16_t) + INTGEMM_PREPARE_B_16(INTGEMM_SSE2, QuantizeTile16) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, int16_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, QuantizeTile16, int16_t) INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { //TODO #DEFINE - sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end); + SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end); } INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2) @@ -89,4 +80,5 @@ struct SSE2_16bit { static const CPUType kUses = CPUType::SSE2; }; +} // namespace sse2 } // namespace intgemm diff --git a/ssse3_gemm.h b/intgemm/ssse3_gemm.h index fd3ab8c..865fe12 100644 --- a/ssse3_gemm.h +++ b/intgemm/ssse3_gemm.h @@ -6,13 +6,11 @@ #include "types.h" #include <cstdint> -#include <stdint.h> #include <cstring> // 16-bit is in sse2_gemm.h namespace intgemm { - namespace ssse3 { INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) { @@ -23,24 +21,20 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i) class QuantizeTile8 { public: - typedef __m128i Register; - - INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - - INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const { + INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) { // Skip a row. 
- return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4); + return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4); } - INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const { - return Tile(input, input + 4, input + 8, input + 12); + INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) { + return Tile(mult_reg, input, input + 4, input + 8, input + 12); } - INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const { - return TileU(input, input + 4, input + 8, input + 12); + INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) { + return TileU(mult_reg, input, input + 4, input + 8, input + 12); } - INTGEMM_SSSE3 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { + INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) { const float* inputs[4]; for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { while (cols_left < sizeof(Register) / sizeof(float)) { @@ -51,16 +45,16 @@ class QuantizeTile8 { input += sizeof(Register) / sizeof(float); cols_left -= sizeof(Register) / sizeof(float); } - return Tile(inputs[0], inputs[1], inputs[2], inputs[3]); + return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]); } // Quantize 16xfloat into 16xint8_t - INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const { + INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) { const __m128i neg128 = _mm_set1_epi8(-128); - __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input1, mult_reg_); - __m128i g2 = QuantizerGrab(input2, mult_reg_); - __m128i g3 = QuantizerGrab(input3, mult_reg_); + __m128i g0 = QuantizerGrab(input0, mult_reg); + __m128i g1 = QuantizerGrab(input1, mult_reg); + __m128i g2 = QuantizerGrab(input2, mult_reg); + __m128i g3 = QuantizerGrab(input3, mult_reg); __m128i packed0 = _mm_packs_epi32(g0, g1); __m128i packed1 = _mm_packs_epi32(g2, g3); __m128i packed = _mm_packs_epi16(packed0, packed1); @@ -78,13 +72,13 @@ class QuantizeTile8 { } private: - INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const { + INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) { const __m128i neg128 = _mm_set1_epi8(-128); const __m128i pos127 = _mm_set1_epi8(127); - __m128i g0 = QuantizerGrab(input0, mult_reg_); - __m128i g1 = QuantizerGrab(input1, mult_reg_); - __m128i g2 = QuantizerGrab(input2, mult_reg_); - __m128i g3 = QuantizerGrab(input3, mult_reg_); + __m128i g0 = QuantizerGrab(input0, mult_reg); + __m128i g1 = QuantizerGrab(input1, mult_reg); + __m128i g2 = QuantizerGrab(input2, mult_reg); + __m128i g3 = QuantizerGrab(input3, mult_reg); __m128i packed0 = _mm_packs_epi32(g0, g1); __m128i packed1 = _mm_packs_epi32(g2, g3); __m128i packed = _mm_packs_epi16(packed0, packed1); @@ -100,15 +94,10 @@ class QuantizeTile8 { return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127); // No permute needed. packs is in order for SSE. } - - private: - const __m128 mult_reg_; }; -} // namespace - -// pmaddubsw (the 8-bit multiply) is INTGEMM_SSSE3, so pedantically that's the version we need. 
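[Editor's note] As the comment above says, pmaddubsw (maddubs_epi16) is what pins the 8-bit path to SSSE3: it multiplies an unsigned byte by a signed byte pairwise and sums each pair into a 16-bit lane with signed saturation. A scalar model of one lane, with an illustrative name:

#include <cstdint>

// One 16-bit lane of pmaddubsw: two unsigned-times-signed byte products,
// summed with saturation to the int16_t range. The unsigned operand is why
// the Shift variant biases A by 127 into unsigned range (QuantizeU above).
int16_t MaddubsLane(uint8_t a0, int8_t b0, uint8_t a1, int8_t b1) {
  int32_t sum = static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
  if (sum > 32767) return 32767;
  if (sum < -32768) return -32768;
  return static_cast<int16_t>(sum);
}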
-struct SSSE3_8bit { +// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need. +struct Kernels8 { typedef int8_t Integer; // Currently A is prepared by quantization but this could theoretically change. @@ -117,9 +106,9 @@ struct SSSE3_8bit { } private: - INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3) + INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3) public: - INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3) + INTGEMM_QUANTIZE(INTGEMM_SSSE3) // Version with unsigned int + 127 // Currently A is prepared by quantization but this could theoretically change. @@ -131,10 +120,10 @@ struct SSSE3_8bit { assert(size % 16 == 0); assert(reinterpret_cast<uintptr_t>(input) % 16 == 0); assert(reinterpret_cast<uintptr_t>(output) % 16 == 0); - ssse3::QuantizeTile8 q(quant_mult); + FRegister q = set1_ps<FRegister>(quant_mult); const float *end = input + size; for (; input != end; input += 16, output += 16) { - *reinterpret_cast<__m128i*>(output) = q.ConsecutiveU(input); + *reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input); } } @@ -143,8 +132,8 @@ struct SSSE3_8bit { static const Index kBTileCol = 8; INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8) - INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, CPUType::SSE2, int8_t) - INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t) + INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t) + INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t) INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) { ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end); @@ -161,4 +150,5 @@ struct SSSE3_8bit { static const CPUType kUses = CPUType::SSSE3; }; +} // namespace ssse3 } // namespace intgemm diff --git a/stats.h b/intgemm/stats.h index 6f9eda2..6f9eda2 100644 --- a/stats.h +++ b/intgemm/stats.h diff --git a/stats.inl b/intgemm/stats.inl index 7fc7afb..d6a850e 100644 --- a/stats.inl +++ b/intgemm/stats.inl @@ -54,7 +54,7 @@ INTGEMM_TARGET static inline float MaxAbsolute(const float *begin_float, const f } #else for (const float *i = end_reg; i < end_float; ++i) { - ret = std::max(ret, fabsf(*i)); + ret = std::max(ret, std::fabs(*i)); } #endif return ret; diff --git a/types.h b/intgemm/types.h index da0429f..da0429f 100644 --- a/types.h +++ b/intgemm/types.h diff --git a/utils.h b/intgemm/utils.h index a520ea0..a520ea0 100644 --- a/utils.h +++ b/intgemm/utils.h diff --git a/vec_traits.h b/intgemm/vec_traits.h index 86265b2..86265b2 100644 --- a/vec_traits.h +++ b/intgemm/vec_traits.h diff --git a/test/3rd_party/LICENSE_1_0.txt b/test/3rd_party/LICENSE_1_0.txt new file mode 100644 index 0000000..7925d62 --- /dev/null +++ b/test/3rd_party/LICENSE_1_0.txt @@ -0,0 +1,24 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, 
and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + diff --git a/3rd_party/catch.hpp b/test/3rd_party/catch.hpp index 1850fff..1850fff 100644 --- a/3rd_party/catch.hpp +++ b/test/3rd_party/catch.hpp diff --git a/test/add127_test.cc b/test/add127_test.cc index 28c241f..b7ce49b 100644 --- a/test/add127_test.cc +++ b/test/add127_test.cc @@ -282,51 +282,51 @@ template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, In // Bias TEST_CASE("PrepareBias SSSE3", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestPrepareBias<SSSE3_8bit>(256,256); - TestPrepareBias<SSSE3_8bit>(2048,256); - TestPrepareBias<SSSE3_8bit>(512,512); + TestPrepareBias<ssse3::Kernels8>(256,256); + TestPrepareBias<ssse3::Kernels8>(2048,256); + TestPrepareBias<ssse3::Kernels8>(512,512); } TEST_CASE("PrepareBias AVX2", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestPrepareBias<AVX2_8bit>(256,256); - TestPrepareBias<AVX2_8bit>(2048,256); - TestPrepareBias<AVX2_8bit>(512,512); + TestPrepareBias<avx2::Kernels8>(256,256); + TestPrepareBias<avx2::Kernels8>(2048,256); + TestPrepareBias<avx2::Kernels8>(512,512); } TEST_CASE("PrepareBias AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestPrepareBias<AVX512_8bit>(256,256); - TestPrepareBias<AVX512_8bit>(2048,256); - TestPrepareBias<AVX512_8bit>(512,512); + TestPrepareBias<avx512bw::Kernels8>(256,256); + TestPrepareBias<avx512bw::Kernels8>(2048,256); + TestPrepareBias<avx512bw::Kernels8>(512,512); #endif } //A TEST_CASE("PrepareA SSSE3", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestPrepareA<SSSE3_8bit>(64,64); - TestPrepareA<SSSE3_8bit>(256,256); - TestPrepareA<SSSE3_8bit>(512,512); - TestPrepareA<SSSE3_8bit>(2048,256); + TestPrepareA<ssse3::Kernels8>(64,64); + TestPrepareA<ssse3::Kernels8>(256,256); + TestPrepareA<ssse3::Kernels8>(512,512); + TestPrepareA<ssse3::Kernels8>(2048,256); } TEST_CASE("PrepareA AVX2", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestPrepareA<AVX2_8bit>(64,64); - TestPrepareA<AVX2_8bit>(256,256); - TestPrepareA<AVX2_8bit>(512,512); - TestPrepareA<AVX2_8bit>(2048,256); + TestPrepareA<avx2::Kernels8>(64,64); + TestPrepareA<avx2::Kernels8>(256,256); + TestPrepareA<avx2::Kernels8>(512,512); + TestPrepareA<avx2::Kernels8>(2048,256); } TEST_CASE("PrepareA AVX512F", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestPrepareA<AVX512_8bit>(64,64); - TestPrepareA<AVX512_8bit>(256,256); - TestPrepareA<AVX512_8bit>(512,512); - TestPrepareA<AVX512_8bit>(2048,256); + TestPrepareA<avx512bw::Kernels8>(64,64); + TestPrepareA<avx512bw::Kernels8>(256,256); + TestPrepareA<avx512bw::Kernels8>(512,512); + TestPrepareA<avx512bw::Kernels8>(2048,256); #endif } @@ -334,144 +334,144 @@ TEST_CASE("PrepareA AVX512F", "[Add127]") { TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - 
TestMultiplyBiasNew<SSSE3_8bit>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f); - TestMultiplyBiasNew<SSSE3_8bit>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f); - TestMultiplyBiasNew<SSSE3_8bit>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f); - TestMultiplyBiasNew<SSSE3_8bit>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<SSSE3_8bit>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); - TestMultiplyBiasNew<SSSE3_8bit>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<SSSE3_8bit>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); + TestMultiplyBiasNew<ssse3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f); + TestMultiplyBiasNew<ssse3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f); + TestMultiplyBiasNew<ssse3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f); + TestMultiplyBiasNew<ssse3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<ssse3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); + TestMultiplyBiasNew<ssse3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<ssse3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); } TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBiasNew<AVX2_8bit>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f); - TestMultiplyBiasNew<AVX2_8bit>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f); - TestMultiplyBiasNew<AVX2_8bit>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f); - TestMultiplyBiasNew<AVX2_8bit>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<AVX2_8bit>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); - TestMultiplyBiasNew<AVX2_8bit>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); - TestMultiplyBiasNew<AVX2_8bit>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); + TestMultiplyBiasNew<avx2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f); + TestMultiplyBiasNew<avx2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f); + TestMultiplyBiasNew<avx2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f); + TestMultiplyBiasNew<avx2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<avx2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f); + TestMultiplyBiasNew<avx2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f); + TestMultiplyBiasNew<avx2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBiasNew<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + 
TestMultiplyBiasNew<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyBiasNew<AVX512VNNI_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyBiasNew<AVX512VNNI_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyBiasNew<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif //Multiply old vs new TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyShiftNonShift<SSSE3_8bit>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<SSSE3_8bit>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); - TestMultiplyShiftNonShift<SSSE3_8bit>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad - TestMultiplyShiftNonShift<SSSE3_8bit>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f); - TestMultiplyShiftNonShift<SSSE3_8bit>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f); - TestMultiplyShiftNonShift<SSSE3_8bit>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f); - TestMultiplyShiftNonShift<SSSE3_8bit>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad + TestMultiplyShiftNonShift<ssse3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f); + TestMultiplyShiftNonShift<ssse3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f); } TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyShiftNonShift<AVX2_8bit>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX2_8bit>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); - TestMultiplyShiftNonShift<AVX2_8bit>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad - TestMultiplyShiftNonShift<AVX2_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftNonShift<AVX2_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - 
TestMultiplyShiftNonShift<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftNonShift<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad + TestMultiplyShiftNonShift<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftNonShift<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyShiftNonShift<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f); - TestMultiplyShiftNonShift<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); - TestMultiplyShiftNonShift<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f); + TestMultiplyShiftNonShift<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); - TestMultiplyShiftNonShift<AVX512VNNI_8bit>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(472, 256, 256, 
0.00001f, 0.33f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f); + TestMultiplyShiftNonShift<avx512vnni::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f); } #endif //Multiply Shift vs int shift implementation TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyShiftInt<SSSE3_8bit>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<SSSE3_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<ssse3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyShiftInt<AVX2_8bit>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); - TestMultiplyShiftInt<AVX2_8bit>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f); + TestMultiplyShiftInt<avx2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyShiftInt<AVX512_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); + 
TestMultiplyShiftInt<avx512bw::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512bw::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); } #endif #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyShiftInt<AVX512VNNI_8bit>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); - TestMultiplyShiftInt<AVX512VNNI_8bit>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f); + TestMultiplyShiftInt<avx512vnni::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f); } #endif diff --git a/test/kernels/add_bias_test.cc b/test/kernels/add_bias_test.cc index 7c299f0..492c669 100644 --- a/test/kernels/add_bias_test.cc +++ b/test/kernels/add_bias_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <numeric> diff --git a/test/kernels/bitwise_not_test.cc b/test/kernels/bitwise_not_test.cc index 1408db3..e908c43 100644 --- a/test/kernels/bitwise_not_test.cc +++ b/test/kernels/bitwise_not_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdlib> #include <numeric> diff --git a/test/kernels/downcast_test.cc b/test/kernels/downcast_test.cc index 6c1f3ab..5f9db66 100644 --- a/test/kernels/downcast_test.cc +++ b/test/kernels/downcast_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/exp_test.cc b/test/kernels/exp_test.cc index b76a2e1..838e228 100644 --- a/test/kernels/exp_test.cc +++ b/test/kernels/exp_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git 
a/test/kernels/floor_test.cc b/test/kernels/floor_test.cc index 01b607f..2659c3f 100644 --- a/test/kernels/floor_test.cc +++ b/test/kernels/floor_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/multiply_sat_test.cc b/test/kernels/multiply_sat_test.cc deleted file mode 100644 index 444eae4..0000000 --- a/test/kernels/multiply_sat_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" - -#include <stdint.h> -#include <cstdint> -#include <cstddef> -#include <numeric> - -namespace intgemm { - -template <CPUType CPUType_, typename Type_> -void kernel_multiply_sat_test() { - if (kCPU < CPUType_) - return; - - using vec_t = vector_t<CPUType_, Type_>; - constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(Type_); - - AlignedVector<Type_> input1(VECTOR_LENGTH); - AlignedVector<Type_> input2(VECTOR_LENGTH); - AlignedVector<Type_> output(VECTOR_LENGTH); - - std::iota(input1.begin(), input1.end(), static_cast<Type_>(-VECTOR_LENGTH / 2)); - std::iota(input2.begin(), input2.end(), static_cast<Type_>(-VECTOR_LENGTH / 3)); - - // TODO: try all shifts. The shift must be an immediate. - int8_t shift = 1; - *output.template as<vec_t>() = kernels::multiply_sat<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>(), shift); - for (std::size_t i = 0; i < output.size(); ++i) { - auto ref = (int64_t(input1[i]) * input2[i]) >> shift; - auto ref_sat = Type_(std::min<int64_t>(std::numeric_limits<Type_>::max(), std::max<int64_t>(std::numeric_limits<Type_>::min(), ref))); - CHECK(output[i] == ref_sat); - } -} - -template INTGEMM_SSE2 void kernel_multiply_sat_test<CPUType::SSE2, int8_t>(); -template INTGEMM_SSE2 void kernel_multiply_sat_test<CPUType::SSE2, int16_t>(); -KERNEL_TEST_CASE("multiply_sat/int8 SSE2") { return kernel_multiply_sat_test<CPUType::SSE2, int8_t>(); } -KERNEL_TEST_CASE("multiply_sat/int16 SSE2") { return kernel_multiply_sat_test<CPUType::SSE2, int16_t>(); } - -template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int8_t>(); -template INTGEMM_AVX2 void kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); -KERNEL_TEST_CASE("multiply_sat/int8 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int8_t>(); } -KERNEL_TEST_CASE("multiply_sat/int16 AVX2") { return kernel_multiply_sat_test<CPUType::AVX2, int16_t>(); } - -#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); -template INTGEMM_AVX512BW void kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>(); -KERNEL_TEST_CASE("multiply_sat/int8 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int8_t>(); } -KERNEL_TEST_CASE("multiply_sat/int16 AVX512BW") { return kernel_multiply_sat_test<CPUType::AVX512BW, int16_t>(); } -#endif - -} diff --git a/test/kernels/multiply_test.cc b/test/kernels/multiply_test.cc index ca8c54c..029e3ac 100644 --- a/test/kernels/multiply_test.cc +++ b/test/kernels/multiply_test.cc @@ -1,9 +1,8 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdint> -#include <stdint.h> #include <numeric> namespace intgemm { diff --git a/test/kernels/quantize_test.cc b/test/kernels/quantize_test.cc index f163a45..ae3c068 100644 --- a/test/kernels/quantize_test.cc +++ 
b/test/kernels/quantize_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <numeric> diff --git a/test/kernels/relu_test.cc b/test/kernels/relu_test.cc index 3ad6dd8..6fcef98 100644 --- a/test/kernels/relu_test.cc +++ b/test/kernels/relu_test.cc @@ -1,8 +1,7 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" -#include <stdint.h> #include <cstdint> #include <numeric> diff --git a/test/kernels/rescale_test.cc b/test/kernels/rescale_test.cc index ae13984..280b513 100644 --- a/test/kernels/rescale_test.cc +++ b/test/kernels/rescale_test.cc @@ -1,9 +1,8 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdint> -#include <stdint.h> #include <numeric> namespace intgemm { diff --git a/test/kernels/sigmoid_test.cc b/test/kernels/sigmoid_test.cc index 7f7392d..af9dad1 100644 --- a/test/kernels/sigmoid_test.cc +++ b/test/kernels/sigmoid_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/tanh_test.cc b/test/kernels/tanh_test.cc index 4ba099c..e2c36f5 100644 --- a/test/kernels/tanh_test.cc +++ b/test/kernels/tanh_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/kernels/unquantize_test.cc b/test/kernels/unquantize_test.cc index 20c3d6a..ee4bc80 100644 --- a/test/kernels/unquantize_test.cc +++ b/test/kernels/unquantize_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <numeric> diff --git a/test/kernels/upcast_test.cc b/test/kernels/upcast_test.cc index cc782b5..92be1bd 100644 --- a/test/kernels/upcast_test.cc +++ b/test/kernels/upcast_test.cc @@ -1,9 +1,10 @@ +// This test triggers an internal compiler error in gcc 5. 
+#if defined(__OPTIMIZE__) || defined(__clang__) || defined(__INTEL_COMPILER) || !defined(__GNUC__) || (__GNUC__ != 5) #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstdint> -#include <stdint.h> #include <numeric> namespace intgemm { @@ -72,6 +73,7 @@ template INTGEMM_AVX512BW void kernel_upcast16to32_test<CPUType::AVX512BW>(); KERNEL_TEST_CASE("upcast16to32 AVX512BW") { return kernel_upcast16to32_test<CPUType::AVX512BW>(); } #endif + template <CPUType CPUType_> void kernel_upcast8to32_test() { if (kCPU < CPUType_) @@ -107,3 +109,4 @@ KERNEL_TEST_CASE("upcast8to32 AVX512BW") { return kernel_upcast8to32_test<CPUTyp #endif } +#endif diff --git a/test/kernels/write_test.cc b/test/kernels/write_test.cc index a0189fe..c263fca 100644 --- a/test/kernels/write_test.cc +++ b/test/kernels/write_test.cc @@ -1,6 +1,6 @@ #include "../test.h" -#include "../../aligned.h" -#include "../../kernels.h" +#include "../../intgemm/aligned.h" +#include "../../intgemm/kernels.h" #include <cstddef> #include <numeric> diff --git a/test/multiply_test.cc b/test/multiply_test.cc index a1138a0..6c16edd 100644 --- a/test/multiply_test.cc +++ b/test/multiply_test.cc @@ -1,10 +1,10 @@ #include "test.h" -#include "../aligned.h" -#include "../callbacks.h" -#include "../interleave.h" -#include "../intgemm.h" -#include "../multiply.h" -#include "../stats.h" +#include "../intgemm/aligned.h" +#include "../intgemm/callbacks.h" +#include "../intgemm/interleave.h" +#include "../intgemm/intgemm.h" +#include "../intgemm/multiply.h" +#include "../intgemm/stats.h" #include <algorithm> #include <cassert> @@ -66,7 +66,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) { it = dist(gen); } - typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; // Call Prepare AlignedVector<Integer> test(input.size()); Routine::PrepareB(input.begin(), test.begin(), 1, rows, cols); @@ -85,30 +85,30 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) { TEST_CASE("Prepare AVX512", "[prepare]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestPrepare<AVX512_8bit>(64, 8); - TestPrepare<AVX512_8bit>(256, 32); - TestPrepare<AVX512_16bit>(64, 8); - TestPrepare<AVX512_16bit>(256, 32); + TestPrepare<avx512bw::Kernels8>(64, 8); + TestPrepare<avx512bw::Kernels8>(256, 32); + TestPrepare<avx512bw::Kernels16>(64, 8); + TestPrepare<avx512bw::Kernels16>(256, 32); #endif } TEST_CASE("Prepare AVX2", "[prepare]") { if (kCPU < CPUType::AVX2) return; - TestPrepare<AVX2_8bit>(64, 32); - TestPrepare<AVX2_16bit>(64, 32); + TestPrepare<avx2::Kernels8>(64, 32); + TestPrepare<avx2::Kernels16>(64, 32); } TEST_CASE("Prepare SSSE3", "[prepare]") { if (kCPU < CPUType::SSSE3) return; - TestPrepare<SSSE3_8bit>(16, 8); - TestPrepare<SSSE3_8bit>(32, 16); - TestPrepare<SSSE3_8bit>(32, 32); + TestPrepare<ssse3::Kernels8>(16, 8); + TestPrepare<ssse3::Kernels8>(32, 16); + TestPrepare<ssse3::Kernels8>(32, 32); } TEST_CASE("Prepare SSE2", "[prepare]") { if (kCPU < CPUType::SSE2) return; - TestPrepare<SSE2_16bit>(8, 8); - TestPrepare<SSE2_16bit>(32, 32); + TestPrepare<sse2::Kernels16>(8, 8); + TestPrepare<sse2::Kernels16>(32, 32); } template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 16) { @@ -119,7 +119,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 for (auto& it : input) { it = dist(gen); } - 
typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; AlignedVector<Integer> prepared(input.size()); Routine::PrepareB(input.begin(), prepared.begin(), 1, rows, cols); @@ -150,27 +150,27 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 TEST_CASE("SelectColumnsB AVX512", "[select]") { if (kCPU < CPUType::AVX512BW) return; #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW - TestSelectColumnsB<AVX512_8bit>(); - TestSelectColumnsB<AVX512_16bit>(256, 256); + TestSelectColumnsB<avx512bw::Kernels8>(); + TestSelectColumnsB<avx512bw::Kernels16>(256, 256); #endif } TEST_CASE("SelectColumnsB AVX2", "[select]") { if (kCPU < CPUType::AVX2) return; - TestSelectColumnsB<AVX2_8bit>(256, 256); - TestSelectColumnsB<AVX2_16bit>(256, 256); + TestSelectColumnsB<avx2::Kernels8>(256, 256); + TestSelectColumnsB<avx2::Kernels16>(256, 256); } TEST_CASE("SelectColumnsB SSSE3", "[select]") { if (kCPU < CPUType::SSSE3) return; - TestSelectColumnsB<SSSE3_8bit>(); - TestSelectColumnsB<SSSE3_8bit>(256, 256); + TestSelectColumnsB<ssse3::Kernels8>(); + TestSelectColumnsB<ssse3::Kernels8>(256, 256); } TEST_CASE("SelectColumnsB SSE2", "[select]") { if (kCPU < CPUType::SSE2) return; - TestSelectColumnsB<SSE2_16bit>(); - TestSelectColumnsB<SSE2_16bit>(256, 256); + TestSelectColumnsB<sse2::Kernels16>(); + TestSelectColumnsB<sse2::Kernels16>(256, 256); } template <class Register> void TestMax() { @@ -187,8 +187,8 @@ TEST_CASE("Max", "[max]") { } void CompareMaxAbs(const float *begin, const float *end, float test, std::size_t offset) { - float largest = fabs(*std::max_element(begin, end)); - float smallest = fabs(*std::min_element(begin, end)); + float largest = std::fabs(*std::max_element(begin, end)); + float smallest = std::fabs(*std::min_element(begin, end)); largest = std::max(largest, smallest); CHECK_MESSAGE(largest == test, "Error: " << largest << " versus " << test << " in length " << (end - begin) << " offset " << offset); } @@ -255,7 +255,7 @@ TEST_CASE("MaxAbsolute AVX512BW", "[max]") { template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_cols, float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) { - typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; std::ostringstream info; info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n'; @@ -307,7 +307,7 @@ template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_co //Require different number of arguments. I don't think the refactoring is worth it. 
template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index B_cols, float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) { - typedef typename Routine::Integer Integer; + using Integer = typename Routine::Integer; std::ostringstream info; info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n'; @@ -358,145 +358,145 @@ template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index TEST_CASE ("Multiply SSE2 16bit", "[multiply]") { if (kCPU < CPUType::SSE2) return; - TestMultiply<SSE2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiply<SSE2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiply<SSE2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiply<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiply<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::SSE2) return; - TestMultiplyBias<SSE2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiplyBias<SSE2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<SSE2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiplyBias<sse2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<sse2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") { if (kCPU < CPUType::SSSE3) return; - TestMultiply<SSSE3_8bit>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); - TestMultiply<SSSE3_8bit>(8, 2048, 256, 33, 33, 4.4f, 4.4f); - TestMultiply<SSSE3_8bit>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); - TestMultiply<SSSE3_8bit>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); - TestMultiply<SSSE3_8bit>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); - TestMultiply<SSSE3_8bit>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); + TestMultiply<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); + TestMultiply<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f); + TestMultiply<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); + TestMultiply<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); + TestMultiply<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); + TestMultiply<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); } TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::SSSE3) return; - TestMultiplyBias<SSSE3_8bit>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); - TestMultiplyBias<SSSE3_8bit>(8, 2048, 256, 33, 33, 4.4f, 4.4f); - TestMultiplyBias<SSSE3_8bit>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); - TestMultiplyBias<SSSE3_8bit>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); - TestMultiplyBias<SSSE3_8bit>(248, 256, 
256, 1.7f, 1.7f, 0.1f, 0.012f); - TestMultiplyBias<SSSE3_8bit>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); + TestMultiplyBias<ssse3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f); + TestMultiplyBias<ssse3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f); + TestMultiplyBias<ssse3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f); + TestMultiplyBias<ssse3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f); + TestMultiplyBias<ssse3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f); + TestMultiplyBias<ssse3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f); } TEST_CASE ("Multiply AVX2 8bit", "[multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiply<AVX2_8bit>(8, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(8, 2048, 256, 19, 19, 1.8f, 1.8f); - TestMultiply<AVX2_8bit>(320, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(472, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(248, 256, 256, .1f, 1, 0.1f); - TestMultiply<AVX2_8bit>(200, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f); + TestMultiply<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f); + TestMultiply<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f); } TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBias<AVX2_8bit>(8, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(8, 2048, 256, 19, 19, 1.8f, 1.8f); - TestMultiplyBias<AVX2_8bit>(320, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(472, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(248, 256, 256, .1f, 1, 0.1f); - TestMultiplyBias<AVX2_8bit>(200, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(8, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f); + TestMultiplyBias<avx2::Kernels8>(320, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(472, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(248, 256, 256, .1f, 1, 0.1f); + TestMultiplyBias<avx2::Kernels8>(200, 256, 256, .1f, 1, 0.1f); } TEST_CASE ("Multiply AVX2 16bit", "[multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiply<AVX2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiply<AVX2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiply<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX2) return; - TestMultiplyBias<AVX2_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(8, 2048, 256, .1f, 1, 0.02f); - TestMultiplyBias<AVX2_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX2_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(8, 256, 256, .1f, 1, 
0.01f); + TestMultiplyBias<avx2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f); + TestMultiplyBias<avx2::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx2::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Multiply AVX512 8bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiply<AVX512_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiply<AVX512_8bit>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); - TestMultiply<AVX512_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiply<AVX512_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiply<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiply<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); + TestMultiply<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiply<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBias<AVX512_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiplyBias<AVX512_8bit>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); - TestMultiplyBias<AVX512_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiplyBias<AVX512_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias<AVX512_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias<AVX512_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiplyBias<avx512bw::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiplyBias<avx512bw::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f); + TestMultiplyBias<avx512bw::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiplyBias<avx512bw::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512bw::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512bw::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiply<AVX512VNNI_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiply<AVX512VNNI_8bit>(8, 2048, 256, 0, 0.55f, 0.25f); - TestMultiply<AVX512VNNI_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiply<AVX512VNNI_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512VNNI_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiply<AVX512VNNI_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiply<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiply<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f); + TestMultiply<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiply<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiply<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512VNNI) return; - TestMultiplyBias<AVX512VNNI_8bit>(8, 256, 256, 0, 0.25f, 0.062f); - TestMultiplyBias<AVX512VNNI_8bit>(8, 2048, 256, 0, 0.55f, 0.25f); - TestMultiplyBias<AVX512VNNI_8bit>(320, 256, 256, 0, 0.26f, 0.059f); - TestMultiplyBias<AVX512VNNI_8bit>(472, 256, 256, 0, 0.29f, 0.059f); - 
TestMultiplyBias<AVX512VNNI_8bit>(248, 256, 256, 0, 0.29f, 0.059f); - TestMultiplyBias<AVX512VNNI_8bit>(200, 256, 256, 0, 0.28f, 0.06f); + TestMultiplyBias<avx512vnni::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f); + TestMultiplyBias<avx512vnni::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f); + TestMultiplyBias<avx512vnni::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f); + TestMultiplyBias<avx512vnni::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512vnni::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f); + TestMultiplyBias<avx512vnni::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f); } #endif TEST_CASE ("Multiply AVX512 16bit", "[multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiply<AVX512_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(8, 2048, 256, .1f, 1, 0.011f); - TestMultiply<AVX512_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiply<AVX512_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f); + TestMultiply<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiply<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") { if (kCPU < CPUType::AVX512BW) return; - TestMultiplyBias<AVX512_16bit>(8, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(8, 2048, 256, .1f, 1, 0.011f); - TestMultiplyBias<AVX512_16bit>(320, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(472, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(248, 256, 256, .1f, 1, 0.01f); - TestMultiplyBias<AVX512_16bit>(200, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(8, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(8, 2048, 256, .1f, 1, 0.011f); + TestMultiplyBias<avx512bw::Kernels16>(320, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(472, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(248, 256, 256, .1f, 1, 0.01f); + TestMultiplyBias<avx512bw::Kernels16>(200, 256, 256, .1f, 1, 0.01f); } #endif diff --git a/test/prepare_b_quantized_transposed.cc b/test/prepare_b_quantized_transposed.cc index 938cc28..1437e0a 100644 --- a/test/prepare_b_quantized_transposed.cc +++ b/test/prepare_b_quantized_transposed.cc @@ -1,13 +1,13 @@ #include "test.h" -#include "../aligned.h" -#include "../avx2_gemm.h" -#include "../avx512_gemm.h" -#include "../sse2_gemm.h" -#include "../ssse3_gemm.h" +#include "../intgemm/aligned.h" +#include "../intgemm/avx2_gemm.h" +#include "../intgemm/avx512_gemm.h" +#include "../intgemm/sse2_gemm.h" +#include "../intgemm/ssse3_gemm.h" +#include <cmath> #include <cstring> #include <iostream> -#include <math.h> namespace intgemm { namespace { @@ -62,22 +62,22 @@ TEST_CASE("PrepareBQuantizedTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany<SSE2_16bit>(32, 128)); + CHECK(TestMany<sse2::Kernels16>(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany<SSSE3_8bit>(32, 128)); + CHECK(TestMany<ssse3::Kernels8>(32, 128)); } TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany<AVX2_8bit>(32, 128)); - 
CHECK(TestMany<AVX2_16bit>(32, 128)); + CHECK(TestMany<avx2::Kernels8>(32, 128)); + CHECK(TestMany<avx2::Kernels16>(32, 128)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW @@ -85,8 +85,8 @@ TEST_CASE("PrepareBQuantizedTransposed AVX2", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany<AVX512_8bit>(64, 128)); - CHECK(TestMany<AVX512_16bit>(64, 128)); + CHECK(TestMany<avx512bw::Kernels8>(64, 128)); + CHECK(TestMany<avx512bw::Kernels16>(64, 128)); } #endif diff --git a/test/prepare_b_transposed.cc b/test/prepare_b_transposed.cc index 5969724..bc35138 100644 --- a/test/prepare_b_transposed.cc +++ b/test/prepare_b_transposed.cc @@ -1,13 +1,13 @@ #include "test.h" -#include "../aligned.h" -#include "../avx2_gemm.h" -#include "../avx512_gemm.h" -#include "../sse2_gemm.h" -#include "../ssse3_gemm.h" +#include "../intgemm/aligned.h" +#include "../intgemm/avx2_gemm.h" +#include "../intgemm/avx512_gemm.h" +#include "../intgemm/sse2_gemm.h" +#include "../intgemm/ssse3_gemm.h" +#include <cmath> #include <cstring> #include <iostream> -#include <math.h> namespace intgemm { namespace { @@ -63,22 +63,22 @@ TEST_CASE("PrepareBTransposed SSE2", "") { if (kCPU < CPUType::SSE2) return; - CHECK(TestMany<SSE2_16bit>(4, 128, 2.0f)); + CHECK(TestMany<sse2::Kernels16>(4, 128, 2.0f)); } TEST_CASE("PrepareBTransposed SSSE3", "") { if (kCPU < CPUType::SSSE3) return; - CHECK(TestMany<SSSE3_8bit>(4, 128, 2.0f)); + CHECK(TestMany<ssse3::Kernels8>(4, 128, 2.0f)); } TEST_CASE("PrepareBTransposed AVX2", "") { if (kCPU < CPUType::AVX2) return; - CHECK(TestMany<AVX2_8bit>(8, 128, 2.0f)); - CHECK(TestMany<AVX2_16bit>(8, 128, 2.0f)); + CHECK(TestMany<avx2::Kernels8>(8, 128, 2.0f)); + CHECK(TestMany<avx2::Kernels16>(8, 128, 2.0f)); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW @@ -86,8 +86,8 @@ TEST_CASE("PrepareBTransposed AVX2", "") { if (kCPU < CPUType::AVX512BW) return; - CHECK(TestMany<AVX512_8bit>(16, 128, 2.0f)); - CHECK(TestMany<AVX512_16bit>(16, 128, 2.0f)); + CHECK(TestMany<avx512bw::Kernels8>(16, 128, 2.0f)); + CHECK(TestMany<avx512bw::Kernels16>(16, 128, 2.0f)); } #endif diff --git a/test/quantize_test.cc b/test/quantize_test.cc index d2f6304..550ec66 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -1,14 +1,14 @@ #include "test.h" -#include "../aligned.h" -#include "../avx2_gemm.h" -#include "../avx512_gemm.h" -#include "../sse2_gemm.h" -#include "../ssse3_gemm.h" -#include "../stats.h" - +#include "../intgemm/aligned.h" +#include "../intgemm/avx2_gemm.h" +#include "../intgemm/avx512_gemm.h" +#include "../intgemm/sse2_gemm.h" +#include "../intgemm/ssse3_gemm.h" +#include "../intgemm/stats.h" + +#include <cmath> #include <cstring> #include <iostream> -#include <math.h> namespace intgemm { namespace { @@ -61,8 +61,8 @@ void testVectorMeanStd(int num_items, bool absolute=false) { MeanStd reference = VectorMeanStd(inputVec, num_items, absolute); MeanStd fast = Backend(inputVec.begin(), inputVec.end(), absolute); - float meanDifference = fabsf(reference.mean - fast.mean); - float stdDifference = fabsf(reference.stddev - fast.stddev); + float meanDifference = std::fabs(reference.mean - fast.mean); + float stdDifference = std::fabs(reference.stddev - fast.stddev); float eps = 0.00002f; //Accumulating horizontal sums can lead to errors. 
CHECK_MESSAGE(meanDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.mean << " actual: " << fast.mean); @@ -73,15 +73,15 @@ void testVectorMeanStd(int num_items, bool absolute=false) { template <class I> bool IsOff(float from, I ref, I test) { if (ref == test) return false; if (ref - test > 1 && test - ref > 1) return true; - float off_test = fabs((float)test - from); - float off_ref = fabs((float)ref - from); + float off_test = std::fabs(static_cast<float>(test) - from); + float off_ref = std::fabs(static_cast<float>(ref) - from); // Allow 0.5 to round either way. if (off_test > 0.49 && off_test < 0.51 && off_ref > 0.49 && off_ref < 0.51) return false; return true; } template <class Backend> bool Test(const float *input_unaligned, float quant_mult, std::size_t size) { - typedef typename Backend::Integer Integer; + using Integer = typename Backend::Integer; bool success = true; AlignedVector<float> input(size); std::memcpy(input.begin(), input_unaligned, sizeof(float) * size); @@ -120,24 +120,24 @@ template <class Backend> void TestMany(std::size_t grow) { TEST_CASE ("Quantize SSE2", "[quantize]") { if (kCPU < CPUType::SSE2) return; - TestMany<SSE2_16bit>(8); + TestMany<sse2::Kernels16>(8); } TEST_CASE ("Quantize SSSE3", "[quantize]") { if (kCPU < CPUType::SSSE3) return; - TestMany<SSSE3_8bit>(1); + TestMany<ssse3::Kernels8>(1); } TEST_CASE ("Quantize AVX2", "[quantize]") { if (kCPU < CPUType::AVX2) return; - TestMany<AVX2_8bit>(1); - TestMany<AVX2_16bit>(16); + TestMany<avx2::Kernels8>(1); + TestMany<avx2::Kernels16>(16); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW TEST_CASE ("Quantize AVX512", "[quantize]") { if (kCPU < CPUType::AVX512BW) return; - TestMany<AVX512_8bit>(1); - TestMany<AVX512_16bit>(16); + TestMany<avx512bw::Kernels8>(1); + TestMany<avx512bw::Kernels16>(16); } #endif diff --git a/test/test.cc b/test/test.cc index 3559738..45c27ad 100644 --- a/test/test.cc +++ b/test/test.cc @@ -1,6 +1,8 @@ #define CATCH_CONFIG_RUNNER #include "test.h" +#include <cmath> + int main(int argc, char ** argv) { return Catch::Session().run(argc, argv); } @@ -13,13 +15,13 @@ void CompareMSE(const float *float_ref, const float *int_ref, const float *int_t for (std::size_t i = 0; i < size; ++i) { float int_diff = int_ref[i] - int_test[i]; float float_diff = float_ref[i] - int_test[i]; - CHECK_MESSAGE(fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]); - CHECK_MESSAGE(fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]); + CHECK_MESSAGE(std::fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]); + CHECK_MESSAGE(std::fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]); int_sum += int_diff * int_diff; float_sum += float_diff * float_diff; } - CHECK_MESSAGE(fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size)); - CHECK_MESSAGE(fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size)); + CHECK_MESSAGE(std::fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size)); + CHECK_MESSAGE(std::fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE 
= " << sqrt(int_sum / size)); } } // namespace intgemm diff --git a/test/test.h b/test/test.h index f145681..1f884c5 100644 --- a/test/test.h +++ b/test/test.h @@ -1,12 +1,12 @@ #pragma once -#include "intgemm_config.h" +#include "intgemm/intgemm_config.h" -#include "../3rd_party/catch.hpp" -#include "../intgemm.h" -#include "../aligned.h" +#include "3rd_party/catch.hpp" +#include "../intgemm/intgemm.h" +#include "../intgemm/aligned.h" -#include <math.h> +#include <cmath> #include <sstream> #include <iostream> #include <iomanip> @@ -18,7 +18,7 @@ #define CHECK_EPS(actual, expected, epsilon) \ do { \ - if (fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \ + if (std::fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \ else { CHECK((actual) == (expected)); } \ } while(0) @@ -39,8 +39,8 @@ void CompareEps(const Type* reference, const Type* actual, Index size, Type epsi for (Index i = 0; i < size; ++i) { INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]); // Ratio to maximum value. - float threshold = epsilon * std::max<float>(0.01f, fabsf(reference[i])); - CHECK(fabsf(reference[i] - actual[i]) < threshold); + float threshold = epsilon * std::max<float>(0.01f, std::fabs(reference[i])); + CHECK(std::fabs(reference[i] - actual[i]) < threshold); } } diff --git a/test/utils_test.cc b/test/utils_test.cc index 00b5277..e7d07e8 100644 --- a/test/utils_test.cc +++ b/test/utils_test.cc @@ -1,5 +1,5 @@ #include "test.h" -#include "../utils.h" +#include "../intgemm/utils.h" namespace intgemm { namespace { diff --git a/test_mull.cpp b/test_mull.cpp deleted file mode 100644 index 42924a6..0000000 --- a/test_mull.cpp +++ /dev/null @@ -1,328 +0,0 @@ -#include "intgemm.cc" -#include "aligned.h" -#include <iostream> -#include <random> -#include <string> -#include <algorithm> -#include <fstream> -#include <sstream> - - -/*Adapted from https://www.bfilipek.com/2018/07/string-view-perf-followup.html . We should probably go string_view way -inline void tokenizeLine(std::string& str, std::vector<std::string>& output, - std::string delimeter = " ") { - auto first = std::begin(str); - - while (first != str.end()) { - const auto second = std::find_first_of(first, std::end(str), std::begin(delimeter), std::end(delimeter)); - - if (first != second) { - output.emplace_back(str.substr(std::distance(std::begin(str), first), std::distance(first, second))); - } - - if (second == str.end()) - break; - - first = std::next(second); - } -} - -//This is a different parsing method, without stringStream -template<class StringType> -void ReadInFile(StringType infile) { - std::ifstream in(infile); - std::string line; - - //First line, Info about the matrix - std::getline(in, line); - std::istringstream iss(line); - std::string temp1, temp2, temp3, temp4; - int RowsA, ColsA, RowsB, ColsB; - if (!(iss >> temp1 >> RowsA >> temp2 >> ColsA >> temp3 >> RowsB >> temp4 >> ColsB)) { - std::cerr << "Error parsing line 1 " << std::endl; - exit(1); - } - - //Second line, get QuantMult - std::getline(in, line); - std::istringstream iss2(line); - float quantMultA, quantMultB; - if (!(iss2 >> temp1 >> quantMultA >> temp2 >> quantMultA)) { - std::cerr << "Error parsing line 2 " << std::endl; - exit(1); - } - std::getline(in, line); //Just some text - //Fourth line, AQuant - std::vector<int> AQuant; - std::getline(in, line); - std::vector<std::string> tmp_container; - tokenizeLine(line, tmp_container); - if (tmp_container.size() != RowsA*ColsA) { - std::cerr << "Error parsing matrix A. Size mismatch. 
Expected " << RowsA*ColsA << " got " << tmp_container.size() << std::endl; - } - for (auto&& num : tmp_container) { - AQuant.push_back(std::stoi(num)); - } - tmp_container.resize(0); - - std::getline(in, line); //Just some text - //Sixth line, B_raw - std::vector<float> B_raw; - std::getline(in, line); - tokenizeLine(line, tmp_container); - if (tmp_container.size() != RowsB*ColsB) { - std::cerr << "Error parsing matrix B. Size mismatch. Expected " << RowsB*ColsB << " got " << tmp_container.size() << std::endl; - } - for (auto&& num : tmp_container) { - B_raw.push_back(std::stof(num)); - } - tmp_container.resize(0); - - std::getline(in, line); //Just some text - //Eight line, Bias - std::vector<float> Bias; - std::getline(in, line); - tokenizeLine(line, tmp_container); - if (tmp_container.size() != ColsB) { - std::cerr << "Error parsing bias. Size mismatch. Expected " << ColsB << " got " << tmp_container.size() << std::endl; - } - for (auto&& num : tmp_container) { - Bias.push_back(std::stof(num)); - } - tmp_container.resize(0); - -} - -*/ -template<class StringType> -void ReadInFile(StringType infile) { - std::ifstream in(infile); - std::string line; - - //First line, Info about the matrix - std::getline(in, line); - std::istringstream iss(line); - std::string temp1, temp2, temp3, temp4; - int RowsA, ColsA, RowsB, ColsB; - if (!(iss >> temp1 >> RowsA >> temp2 >> ColsA >> temp3 >> RowsB >> temp4 >> ColsB)) { - std::cerr << "Error parsing line 1 " << std::endl; - exit(1); - } - - //Second line, get QuantMult - std::getline(in, line); - std::istringstream iss2(line); - float quantMultA, quantMultB; - if (!(iss2 >> temp1 >> quantMultA >> temp2 >> quantMultA)) { - std::cerr << "Error parsing line 2 " << std::endl; - exit(1); - } - std::getline(in, line); //Just some text for human readability - - //4th line, AQuant - std::vector<int> AQuant; - std::getline(in, line); - std::istringstream iss3(line); - for (int i = 0; i < RowsA*ColsA; i++) { - int num; - if (!(iss3 >> num)) { - std::cerr << "Error parsing matrix A at " << i << std::endl;; - } - AQuant.push_back(num); - } - - std::getline(in, line); //Just some text for human readability - //6th line, B_raw - std::vector<float> B_raw; - std::getline(in, line); - std::istringstream iss4(line); - for (int i = 0; i < RowsB*ColsB; i++) { - float num; - if (!(iss4 >> num)) { - std::cerr << "Error parsing matrix B " << std::endl; - } - B_raw.push_back(num); - } - - std::getline(in, line); //Just some text for human readability - //8th line, Bias - std::vector<float> Bias; - std::getline(in, line); - std::istringstream iss5(line); - for (int i = 0; i < ColsB; i++) { - float num; - if (!(iss5 >> num)) { - std::cerr << "Error parsing matrix bias " << std::endl; - } - Bias.push_back(num); - } -} - -using namespace intgemm; -template<class T> -void printMatrix(T* data, Index rows, Index cols) { - std::cout << "["; - for (int i = 0; i<rows; i++) { - std::cout << "["; - for (int j =0; j<cols; j++) { - std::cout << (float)data[i*cols + j]; - if (j != cols - 1) { - std::cout << ", "; - } - } - std::cout << "]"; - if (i != rows -1) { - std::cout << ',' << std::endl; - } - } - std::cout << "]" << std::endl; -} - -void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias) { - for (Index r = 0; r < A_rows; ++r) { - for (Index c = 0; c < B_cols; ++c) { - float sum = 0.0f; - for (Index k = 0; k < width; ++k) { - sum += A[r * width + k] * B[k * B_cols + c]; - } - if (bias) { - C[r * B_cols + c] = sum + 
bias[c]; - } else { - C[r * B_cols + c] = sum; - } - } - } -} - -// Compute A*B slowly from integers. -template <class Integer> -void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias) { - for (Index r = 0; r < A_rows; ++r) { - for (Index c = 0; c < B_cols; ++c) { - int32_t sum = 0; - for (Index k = 0; k < width; ++k) { - sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]); - } - if (bias) { - C[r * B_cols + c] = sum * unquant_mult + bias[c]; - } else { - C[r * B_cols + c] = sum * unquant_mult; - } - } - } -} - -int main() { - - const Index A_rows = 1; - const Index width = 2048; - const Index B_cols = 8; - - AlignedVector<float> A(A_rows * width); - AlignedVector<float> B(width * B_cols); - AlignedVector<float> bias(B_cols); - - float alpha = 2.0f; - float quant_mult = 127/alpha; - float unquant_mult = 1.0 / (quant_mult * quant_mult); - - std::mt19937 gen; - std::uniform_real_distribution<float> dist(-2.0f, 2.0f); - - for (auto& it : A) { - it = dist(gen); - } - for (auto& it : B) { - it = dist(gen); - } - for (auto& it : bias) { - it = dist(gen); - } - - AlignedVector<float> bias_orig(B_cols); - for (int i = 0; i < bias.size(); i++) { - bias_orig[i] = bias[i]; - } - - AlignedVector<int8_t> A_prep(A.size()); - AlignedVector<int8_t> B_prep(B.size()); - - AVX2_8bit::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width); - AVX2_8bit::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols); - /* - std::cout << "A:" << std::endl; - printMatrix(A.begin(), A_rows, width); - std::cout << "B:" << std::endl; - printMatrix(B.begin(), width, B_cols); - std::cout << "bias:" << std::endl; - printMatrix(bias.begin(), 1, B_cols);*/ - - - AlignedVector<float> test_C(A_rows * B_cols); - - AVX2_8bit::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin())); - //AVX2_8bit::Multiply(A_prep.begin(), B_prep.begin(), JustUnquantizeC(test_C.begin(), unquant_mult), A_rows, width, B_cols); - std::cout << "Old multiply:" << std::endl; - printMatrix(test_C.begin(), A_rows, B_cols); - - //NEEEXT - AlignedVector<uint8_t> A_prep2(A.size()); - AVX2_8bit::PrepareA(A.begin(), A_prep2.begin(), quant_mult, A_rows, width); - - AVX2_8bit::PrepareBiasFor8(B.begin(), bias.begin(), alpha, width, B_cols); - - //printMatrix(bias.begin(), 1, B_cols); //Print bias - - AVX2_8bit::Multiply8Shift(reinterpret_cast<uint8_t*>(A_prep2.begin()), B_prep.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin())); - //AVX2_8bit::Multiply8Shift(reinterpret_cast<uint8_t*>(A_prep.begin()), B_prep.begin(), JustUnquantizeC(test_C.begin(), unquant_mult), A_rows, width, B_cols); - - AlignedVector<int16_t> A_prep3(A.size()); - AlignedVector<int16_t> B_prep3(B.size()); - std::cout << "New multiply:" << std::endl; - printMatrix(test_C.begin(), A_rows, B_cols); - for (int i = 0; i < A_prep2.size(); i++) { - A_prep3[i] = A_prep2[i]; - } - AVX2_16bit::PrepareB(B.begin(), B_prep3.begin(), quant_mult, width, B_cols); - AVX2_16bit::Multiply(A_prep3.begin(), B_prep3.begin(), A_rows, width, B_cols, UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin())); - - std::cout << "New multiply, 16 bit:" << std::endl; - printMatrix(test_C.begin(), A_rows, B_cols); - - //FULL INTS - AlignedVector<float> C_slowint(A_rows * B_cols); - AlignedVector<int8_t> B_quant(width * B_cols); - 
AVX2_8bit::Quantize(B.begin(), B_quant.begin(), quant_mult, B.size()); - - SlowRefInt(A_prep.begin(), B_quant.begin(), C_slowint.begin(), - unquant_mult, A_rows, width, B_cols, bias_orig.begin()); - - - std::cout << "Reference int8:" << std::endl; - printMatrix(C_slowint.begin(), A_rows, B_cols); - - //FULL INT16 - AlignedVector<int16_t> A_prep4(A.size()); - for (int i = 0; i < A_prep2.size(); i++) { - A_prep4[i] = A_prep[i]; - } - - AlignedVector<float> C_slowint2(A_rows * B_cols); - AlignedVector<int16_t> B_quant2(width * B_cols); - AVX2_16bit::Quantize(B.begin(), B_quant2.begin(), quant_mult, B.size()); - - SlowRefInt(A_prep4.begin(), B_quant2.begin(), C_slowint2.begin(), - unquant_mult, A_rows, width, B_cols, bias_orig.begin()); - - - std::cout << "Reference int16:" << std::endl; - printMatrix(C_slowint2.begin(), A_rows, B_cols); - - //FLOATS - AlignedVector<float> C(A_rows * B_cols); - - SlowRefFloat(A.begin(), B.begin(), C.begin(), A_rows, width, B_cols, bias_orig.begin()); - std::cout << "Reference float:" << std::endl; - printMatrix(C.begin(), A_rows, B_cols); - -}
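
Note on the shift-based 8-bit path exercised by the [Add127] tests and by the deleted test_mull.cpp above: for the shifted multiply, A is prepared as unsigned 8-bit values, i.e. the signed quantized value offset by +127. That offset contributes an extra 127 * (column sum of B) term to every output, so the bias is pre-compensated once per B matrix and the shifted multiply plus prepared bias reproduces the unshifted result. Below is a minimal sketch of the flow, reusing the pre-rename AVX2_8bit calls exactly as the deleted file used them (after this merge the same kernels live under avx2::Kernels8); the sizes, alpha, and fill step are illustrative assumptions, not part of the commit.

#include "intgemm.h"  // pre-rename layout; intgemm/intgemm.h after this merge
#include "aligned.h"  // pre-rename layout; intgemm/aligned.h after this merge

using namespace intgemm;

void ShiftedMultiplySketch() {
  const Index A_rows = 1, width = 2048, B_cols = 8;  // assumed shapes
  AlignedVector<float> A(A_rows * width), B(width * B_cols), bias(B_cols);
  // ... fill A, B, and bias with values in [-alpha, alpha] ...
  const float alpha = 2.0f;                     // assumed range bound on the inputs
  const float quant_mult = 127.0f / alpha;
  const float unquant_mult = 1.0f / (quant_mult * quant_mult);

  // A is prepared as uint8 (its signed quantized value shifted by +127);
  // B is prepared as int8 as usual.
  AlignedVector<uint8_t> A_prep(A.size());
  AlignedVector<int8_t> B_prep(B.size());
  AVX2_8bit::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
  AVX2_8bit::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);

  // Fold the 127 * colsum(B) term introduced by the shift into the bias,
  // so the shifted product plus bias matches the unshifted result.
  AVX2_8bit::PrepareBiasFor8(B.begin(), bias.begin(), alpha, width, B_cols);

  AlignedVector<float> C(A_rows * B_cols);
  AVX2_8bit::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols,
      UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), C.begin()));
}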