github.com/marian-nmt/intgemm.git
 CMakeLists.txt        |  14
 avx2_gemm.cc          | 154
 avx2_gemm.h           | 151
 avx512_gemm.cc        | 281
 avx512_gemm.h         | 274
 cops.h                |  19
 intgemm.cc            | 117
 intgemm.h             | 103
 intrinsics.h          |   7
 multiply.h            |  10
 sse2_gemm.cc          |  75
 sse2_gemm.h           |  64
 ssse3_gemm.cc         |  88
 ssse3_gemm.h          |  83
 test/multiply_test.cc |  15
 types.h               |  15
 16 files changed, 667 insertions(+), 803 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d9aa674..05e395a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,15 +29,15 @@ else()
set_source_files_properties(intgemm.cc test/quantize_test.cc test/multiply_test.cc benchmark.cc PROPERTIES COMPILE_DEFINITIONS "INTGEMM_NO_AVX512")
endif()
-add_library(intgemm STATIC ${GEMMS} intgemm.cc)
-foreach(exe example benchmark)
- add_executable(${exe} ${exe}.cc)
- target_link_libraries(${exe} intgemm)
-endforeach()
+#add_library(intgemm STATIC ${GEMMS} intgemm.cc)
+#foreach(exe example benchmark)
+# add_executable(${exe} ${exe}.cc)
+# target_link_libraries(${exe} intgemm)
+#endforeach()
include_directories(.)
-add_executable(tests test/multiply_test.cc test/quantize_test.cc)
-target_link_libraries(tests intgemm)
+add_executable(tests test/multiply_test.cc test/quantize_test.cc avx2_gemm.cc) # avx512_gemm.cc)
+#target_link_libraries(tests intgemm)
#CTest integration with Catch2
include(CMake/Catch.cmake)
diff --git a/avx2_gemm.cc b/avx2_gemm.cc
index 955f64c..541b5c2 100644
--- a/avx2_gemm.cc
+++ b/avx2_gemm.cc
@@ -1,155 +1 @@
#include "avx2_gemm.h"
-#include "cops.h"
-#include "interleave.h"
-#include "multiply.h"
-
-#include <cassert>
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
-#include <stdint.h>
-
-namespace intgemm {
-
-// PREPARE A: just quantization in the same memory order.
-
-namespace {
-// Read a vector of floats, multiply them, and cast to 32-bit integer.
-inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
- return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
-}
-
-class QuantizeTile16 {
- public:
- typedef __m256i Integer;
-
- explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
- Integer Consecutive(const float *input) {
- return Tile(input, input + 8);
- }
-
- Integer ForReshape(const float *input, Index cols) {
- // 8 rows in the first 128-bit register, 8 in the second register.
- return Tile(input, input + 8 * cols);
- }
-
- private:
- __m256i Tile(const float *input0, const float *input1) {
- __m256i g0 = QuantizerGrab(input0, mult_);
- __m256i g1 = QuantizerGrab(input1, mult_);
- __m256i packed = _mm256_packs_epi32(g0, g1);
- // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
- // Technically this could be removed if the PrepareB did the same reordering internally.
- return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
- }
-
- const __m256 mult_;
-};
-
-} // namespace
-
-// Just quantize everything in order.
-void AVX2_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- QuantizeTile16 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
- }
-}
-
-namespace {
-/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
- * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
- * the result into one register and return it.
- */
-class QuantizeTile8 {
- public:
- typedef __m256i Integer;
-
- explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
- inline __m256i Consecutive(const float *input) {
- return Tile(input, input + 8, input + 16, input + 24);
- }
-
- inline __m256i ForReshape(const float *input, Index cols) {
- // Put higher rows in the second half of the register. These will jumble
- // around in the same way then conveniently land in the right place.
- return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
- }
-
- private:
- inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
- // Looking at the assembly, gcc has pulled this outside the loops calling this.
- const __m256i neg127 = _mm256_set1_epi8(-127);
- const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- // Grab 4 registers at a time in 32-bit format.
- __m256i g0 = QuantizerGrab(input0, mult_);
- __m256i g1 = QuantizerGrab(input1, mult_);
- __m256i g2 = QuantizerGrab(input2, mult_);
- __m256i g3 = QuantizerGrab(input3, mult_);
- // Pack 32-bit to 16-bit.
- __m256i packed0 = _mm256_packs_epi32(g0, g1);
- __m256i packed1 = _mm256_packs_epi32(g2, g3);
- // Pack 16-bit to 8-bit.
- __m256i packed = _mm256_packs_epi16(packed0, packed1);
- // Ban -128.
- packed = _mm256_max_epi8(packed, neg127);
- // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
- // Or as 32-bit integers 0 2 4 6 1 3 5 7
- // Technically this could be removed so long as the rows are bigger than 16
- // and the values are only used for GEMM.
- return _mm256_permutevar8x32_epi32(packed, shuffle_param);
- }
-
- const __m256 mult_;
-};
-} // namespace
-
-// Just quantize everything in order.
-void AVX2_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
- assert(size % 32 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- QuantizeTile8 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 32, output += 32) {
- *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
- }
-}
-
-void AVX2_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void AVX2_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void AVX2_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void AVX2_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
-}
-
-void AVX2_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply16<__m256i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-void AVX2_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
-}
-
-const char *const AVX2_16bit::kName = "16-bit AVX2";
-const char *const AVX2_8bit::kName = "8-bit AVX2";
-
-float AVX2_MaxAbsolute(const float *begin, const float *end) {
- return MaxAbsoluteBackend<__m256>(begin, end);
-}
-
-} // namespace intgemm
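
For reference, the deleted 8-bit quantizer above (now inlined into avx2_gemm.h below) scales, rounds, and saturates each float. A minimal scalar sketch of the per-element effect, not part of the commit, assuming the default round-to-nearest mode that _mm256_cvtps_epi32 uses:

// Scalar sketch of the vectorized 8-bit Quantize: scale, round, saturate.
// The clamp to -127 mirrors _mm256_max_epi8(packed, -127), which bans -128
// so that abs() in the 8-bit Multiply cannot overflow.
#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t QuantizeOneReference(float value, float quant_mult) {
  // _mm256_cvtps_epi32 rounds to nearest under the default MXCSR mode;
  // std::nearbyint mirrors that here.
  int32_t rounded = static_cast<int32_t>(std::nearbyint(value * quant_mult));
  // _mm256_packs_epi32 / _mm256_packs_epi16 saturate to the int8 range.
  rounded = std::max<int32_t>(rounded, -127);
  rounded = std::min<int32_t>(rounded, 127);
  return static_cast<int8_t>(rounded);
}
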
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 4b0b001..32b1a5e 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -3,59 +3,184 @@
#include <cstdint>
#include <stdint.h>
+#include "cops.h"
+#include "interleave.h"
+#include "multiply.h"
+
namespace intgemm {
+// PREPARE A: just quantization in the same memory order.
+
+namespace avx2 {
+// Read a vector of floats, multiply them, and cast to 32-bit integer.
+// EVIL EVIL CODE DUPLICATION, FIX
+AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+ return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
+}
+
+class QuantizeTile16 {
+ public:
+ typedef __m256i Integer;
+
+ explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
+
+ AVX2 Integer Consecutive(const float *input) {
+ return Tile(input, input + 8);
+ }
+
+ AVX2 Integer ForReshape(const float *input, Index cols) {
+ // 8 rows in the first 128-bit register, 8 in the second register.
+ return Tile(input, input + 8 * cols);
+ }
+
+ private:
+ AVX2 __m256i Tile(const float *input0, const float *input1) {
+ __m256i g0 = QuantizerGrab(input0, mult_);
+ __m256i g1 = QuantizerGrab(input1, mult_);
+ __m256i packed = _mm256_packs_epi32(g0, g1);
+ // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
+ // Technically this could be removed if the PrepareB did the same reordering internally.
+ return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
+ }
+
+ const __m256 mult_;
+};
+
+} // namespace
+
+
struct AVX2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+ // Just quantize everything in order.
+ AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
+ avx2::QuantizeTile16 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 16, output += 16) {
+ *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+ AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
+ }
- static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ AVX2 static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ Multiply16<__m256i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "16-bit AVX2";
static const CPUType kUses = CPU_AVX2;
};
+namespace avx2 {
+/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
+ * them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
+ * the result into one register and return it.
+ */
+class QuantizeTile8 {
+ public:
+ typedef __m256i Integer;
+
+ explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
+
+ AVX2 inline __m256i Consecutive(const float *input) {
+ return Tile(input, input + 8, input + 16, input + 24);
+ }
+
+ AVX2 inline __m256i ForReshape(const float *input, Index cols) {
+ // Put higher rows in the second half of the register. These will jumble
+ // around in the same way then conveniently land in the right place.
+ return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+ }
+
+ private:
+ AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+ // Looking at the assembly, gcc has pulled this outside the loops calling this.
+ const __m256i neg127 = _mm256_set1_epi8(-127);
+ const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ // Grab 4 registers at a time in 32-bit format.
+ __m256i g0 = avx2::QuantizerGrab(input0, mult_);
+ __m256i g1 = avx2::QuantizerGrab(input1, mult_);
+ __m256i g2 = avx2::QuantizerGrab(input2, mult_);
+ __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+ // Pack 32-bit to 16-bit.
+ __m256i packed0 = _mm256_packs_epi32(g0, g1);
+ __m256i packed1 = _mm256_packs_epi32(g2, g3);
+ // Pack 16-bit to 8-bit.
+ __m256i packed = _mm256_packs_epi16(packed0, packed1);
+ // Ban -128.
+ packed = _mm256_max_epi8(packed, neg127);
+ // Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
+ // Or as 32-bit integers 0 2 4 6 1 3 5 7
+ // Technically this could be removed so long as the rows are bigger than 16
+ // and the values are only used for GEMM.
+ return _mm256_permutevar8x32_epi32(packed, shuffle_param);
+ }
+
+ const __m256 mult_;
+};
+} // namespace
+
struct AVX2_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+ // Just quantize everything in order.
+ AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ assert(size % 32 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
+ avx2::QuantizeTile8 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 32, output += 32) {
+ *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+ AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor8(input, output, avx2::QuantizeTile8(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
+ }
- static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "8-bit AVX2";
static const CPUType kUses = CPU_AVX2;
};
// Technically only requires AVX
-float AVX2_MaxAbsolute(const float *begin, const float *end);
+AVX2 float AVX2_MaxAbsolute(const float *begin, const float *end) {
+ return MaxAbsoluteBackend<__m256>(begin, end);
+}
} // namespace intgemm
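
The AVX2 tokens prefixed to every function above (and AVX512F in the next file) are not defined in this diff; they read as per-function target-attribute macros so one translation unit can carry code for several instruction sets. A plausible definition, purely an assumption about what types.h provides, might look like:

// Assumption: AVX2 / AVX512F are defined elsewhere in the repository (likely
// types.h); the real definitions may differ. On GCC/Clang, per-function
// target attributes let a single TU hold AVX2 and AVX512 code paths.
#if defined(__GNUC__) || defined(__clang__)
  #define AVX2    __attribute__((target("avx2")))
  #define AVX512F __attribute__((target("avx512f")))
#else
  #define AVX2
  #define AVX512F
#endif
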
diff --git a/avx512_gemm.cc b/avx512_gemm.cc
index ff200bf..59da3e4 100644
--- a/avx512_gemm.cc
+++ b/avx512_gemm.cc
@@ -1,282 +1 @@
#include "avx512_gemm.h"
-#include "interleave.h"
-#include "multiply.h"
-#include "cops.h"
-
-#include <cassert>
-#include <cstddef>
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
-
-namespace intgemm {
-
-namespace {
-
-// Load from memory, multiply, and convert to int32_t.
-inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
- // Multiply each by the quantization factor.
- __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
- // Cast to 32-bit int
- return _mm512_cvtps_epi32(val);
-}
-
-} // namespace
-
-
-// AVX512 has combined collapse and store instructions:
-// _mm512_mask_cvtsepi32_storeu_epi16
-// _mm512_mask_cvtsepi32_storeu_epi8
-// So conversion in memory uses these, but I also implement a wider version for
-// rearranging B.
-//
-// Convert to 16-bit signed integers.
-void AVX512_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
- // Fill with the quantization multiplier.
- const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 16, output += 16) {
- // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg));
- }
-}
-
-// Convert to 8-bit signed integers.
-void AVX512_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
- const __m512i neg127 = _mm512_set1_epi32(-127);
- const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
- const float *end = input + size;
- for (; input < end; input += 16, output += 16) {
- __m512i asint = QuantizerGrab(input, quant_mult_reg);
- asint = _mm512_max_epi32(asint, neg127);
- // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint);
- }
-}
-
-namespace {
-
-// For PrepareB we want to read 8 columns at a time. When converting 32-bit
-// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
-// wide so it reads off the edge of the tile. We could expand the tile size
-// but then the memory written to won't be contiguous anyway so we'd be doing a
-// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits
-// concatenate.
-inline __m512 Concat(const __m256 first, const __m256 second) {
- // AVX512DQ but that goes with AVX512BW anyway.
- return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
-}
-
-// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
-inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
- __m512 appended = Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
- appended = _mm512_mul_ps(appended, quant_mult_reg);
- return _mm512_cvtps_epi32(appended);
-}
-
-// These are only used for reshaping due to the AVX512 instructions
-// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8
-// being used for the quantizer.
-class QuantizeTile16 {
- public:
- typedef __m512i Integer;
-
- explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- inline __m512i ForReshape(const float *input, Index cols) {
- __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
- __m512i packed = _mm512_packs_epi32(g0, g1);
- // Permute within 256-bit lanes, so same as AVX2
- return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
- }
-
- private:
- const __m512 mult_reg_;
-};
-
-class QuantizeTile8 {
- public:
- typedef __m512i Integer;
-
- explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- inline __m512i ForReshape(const float *input, Index cols) {
- // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
- const __m512i neg127 = _mm512_set1_epi8(-127);
- // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
- const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
-
- // 32-bit format.
- __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
- __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
- __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
- // Pack 32-bit to 16-bit.
- __m512i packed0 = _mm512_packs_epi32(g0, g1);
- __m512i packed1 = _mm512_packs_epi32(g2, g3);
- // Pack 16-bit to 8-bit.
- __m512i packed = _mm512_packs_epi16(packed0, packed1);
- // Ban -128.
- packed = _mm512_max_epi8(packed, neg127);
- // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
- return _mm512_permutexvar_epi32(shuffle_param, packed);
- }
-
- private:
- const __m512 mult_reg_;
-};
-
-} // namespace
-
-void AVX512_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void AVX512_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void AVX512_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void AVX512_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
-}
-
-void AVX512_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- // The unquantization is only 256-bit wide because there are 8 results.
- Multiply16<__m512i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-// Special AVX512 implementation due to having 32 registers (so I don't have to
-// allocate registers manually) and no sign instruction.
-void AVX512_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- typedef __m512i Integer;
- typedef __m256 Float; // For quantization we only do 8 at a time.
- // This is copy-paste from Multiply8_SSE2OrAVX2.
- assert(width % sizeof(Integer) == 0);
- assert(B_cols % 8 == 0);
- assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
- assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
- assert(reinterpret_cast<uintptr_t>(C) % sizeof(Integer) == 0);
- Float unquant_reg = set1_ps<Float>(unquant_mult);
- const int simd_width = width / sizeof(Integer);
- const Integer *B0_col = reinterpret_cast<const Integer*>(B);
- // Added for AVX512.
- Integer zeros = setzero_si<Integer>();
- // Go over 8 columns of B at a time.
- for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
- // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
- for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
- // Iterate over shared (inner) dimension.
- const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width);
- const Integer *A_end = A_live + simd_width;
- const Integer *B_live = B0_col;
-
- // Do the first iteration to initialize the sums.
- __m512i a = *A_live;
- __mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
- __m512i a_positive = _mm512_abs_epi8(a);
- // These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.
- Integer sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0]));
- Integer sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1]));
- Integer sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2]));
- Integer sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3]));
- Integer sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4]));
- Integer sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5]));
- Integer sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6]));
- Integer sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7]));
-
- ++A_live;
- B_live += 8;
-
- // Use A as the loop variable so the add can be done where gcc likes it
- // for branch prediction.
- for (; A_live != A_end; ++A_live, B_live += 8) {
- // Unique code here: can we do an inline function?
- // Retrieve a. We will use this as the unsigned part.
- a = *A_live;
- // Retrieve the conveniently consecutive values of B.
- __m512i b0 = *B_live;
- __m512i b1 = *(B_live + 1);
- __m512i b2 = *(B_live + 2);
- __m512i b3 = *(B_live + 3);
- __m512i b4 = *(B_live + 4);
- __m512i b5 = *(B_live + 5);
- __m512i b6 = *(B_live + 6);
- __m512i b7 = *(B_live + 7);
-
- // Get a mask where a is negative.
- // Didn't seem to make a difference defining sign bits here vs at top
- neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
- a_positive = _mm512_abs_epi8(a);
-
- // Negate by subtracting from zero with a mask.
- b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
- b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
- b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
- b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
- b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
- b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
- b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
- b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
- // The magic 8-bit multiply then horizontal sum into 16-bit.
- b0 = _mm512_maddubs_epi16(a_positive, b0);
- b1 = _mm512_maddubs_epi16(a_positive, b1);
- b2 = _mm512_maddubs_epi16(a_positive, b2);
- b3 = _mm512_maddubs_epi16(a_positive, b3);
- b4 = _mm512_maddubs_epi16(a_positive, b4);
- b5 = _mm512_maddubs_epi16(a_positive, b5);
- b6 = _mm512_maddubs_epi16(a_positive, b6);
- b7 = _mm512_maddubs_epi16(a_positive, b7);
- // Now we have 16-bit results that are the sum of two multiplies.
- // Choosing to approximate and do adds.
- // Perhaps every so often we could accumulate by upcasting.
- sum0 = _mm512_adds_epi16(sum0, b0);
- sum1 = _mm512_adds_epi16(sum1, b1);
- sum2 = _mm512_adds_epi16(sum2, b2);
- sum3 = _mm512_adds_epi16(sum3, b3);
- sum4 = _mm512_adds_epi16(sum4, b4);
- sum5 = _mm512_adds_epi16(sum5, b5);
- sum6 = _mm512_adds_epi16(sum6, b6);
- sum7 = _mm512_adds_epi16(sum7, b7);
- // Unique code ends: can we do an inline function?
- }
- // Upcast to 32-bit and horizontally add.
- Integer ones = set1_epi16<Integer>(1);
- sum0 = madd_epi16(sum0, ones);
- sum1 = madd_epi16(sum1, ones);
- sum2 = madd_epi16(sum2, ones);
- sum3 = madd_epi16(sum3, ones);
- sum4 = madd_epi16(sum4, ones);
- sum5 = madd_epi16(sum5, ones);
- sum6 = madd_epi16(sum6, ones);
- sum7 = madd_epi16(sum7, ones);
- Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3);
- Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7);
-
- auto total = PermuteSummer(pack0123, pack4567);
- WriteC(C + A_rowidx * B_cols + B0_colidx, total, unquant_reg);
- }
- }
-}
-
-const char *const AVX512_16bit::kName = "16-bit AVX512";
-const char *const AVX512_8bit::kName = "8-bit AVX512";
-
-float AVX512_MaxAbsolute(const float *begin, const float *end) {
- return MaxAbsoluteBackend<__m512>(begin, end);
-}
-
-} // namespace intgemm
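
The 8-bit Multiply above works around VPMADDUBSW taking an unsigned first operand and AVX512 lacking a sign instruction: it takes |a| and, wherever a byte of a was negative, negates the corresponding byte of b with a masked subtract from zero, leaving each product unchanged. A scalar illustration (not from the commit) of that identity:

// a*b == |a| * (a < 0 ? -b : b). Quantize clamps to -127, so neither the
// abs() nor the negation can overflow an int8_t here.
#include <cstdint>
#include <cstdlib>

inline int32_t SignedProductViaUnsigned(int8_t a, int8_t b) {
  uint8_t a_positive = static_cast<uint8_t>(std::abs(static_cast<int>(a)));
  int8_t b_flipped = (a < 0) ? static_cast<int8_t>(-b) : b;
  return static_cast<int32_t>(a_positive) * static_cast<int32_t>(b_flipped);
}
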
diff --git a/avx512_gemm.h b/avx512_gemm.h
index f9b0f81..e226686 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -1,6 +1,15 @@
#pragma once
#include <stdint.h>
#include <cstdint>
+#include <cassert>
+#include <cstddef>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "interleave.h"
+#include "multiply.h"
+#include "cops.h"
#include "types.h"
@@ -15,31 +24,140 @@
namespace intgemm {
+// AVX512 has combined collapse and store instructions:
+// _mm512_mask_cvtsepi32_storeu_epi16
+// _mm512_mask_cvtsepi32_storeu_epi8
+// So conversion in memory uses these, but I also implement a wider version for
+// rearranging B.
+
+// Convert to 16-bit signed integers.
+namespace avx512f {
+
+// Load from memory, multiply, and convert to int32_t.
+AVX512F inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
+ // Multiply each by the quantization factor.
+ __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
+ // Cast to 32-bit int
+ return _mm512_cvtps_epi32(val);
+}
+
+// For PrepareB we want to read 8 columns at a time. When converting 32-bit
+// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
+// wide so it reads off the edge of the tile. We could expand the tile size
+// but then the memory written to won't be contiguous anyway so we'd be doing a
+// scatter anyway. Easier to just read the 8 columns we wanted as 256 bits
+// concatenate.
+AVX512F inline __m512 Concat(const __m256 first, const __m256 second) {
+ // AVX512DQ but that goes with AVX512BW anyway.
+ return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
+}
+
+// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
+AVX512F inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
+ __m512 appended = avx512f::Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
+ appended = _mm512_mul_ps(appended, quant_mult_reg);
+ return _mm512_cvtps_epi32(appended);
+}
+
+// These are only used for reshaping due to the AVX512 instructions
+// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8
+// being used for the quantizer.
+class QuantizeTile16 {
+ public:
+ typedef __m512i Integer;
+
+ explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+
+ AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+ __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
+ __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
+ __m512i packed = _mm512_packs_epi32(g0, g1);
+ // Permute within 256-bit lanes, so same as AVX2
+ return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
+ }
+
+ private:
+ const __m512 mult_reg_;
+};
+
+class QuantizeTile8 {
+ public:
+ typedef __m512i Integer;
+
+ explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+
+ AVX512F inline __m512i ForReshape(const float *input, Index cols) {
+ // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
+ const __m512i neg127 = _mm512_set1_epi8(-127);
+ // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
+ const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+ // 32-bit format.
+ __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
+ __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
+ __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
+ __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
+ // Pack 32-bit to 16-bit.
+ __m512i packed0 = _mm512_packs_epi32(g0, g1);
+ __m512i packed1 = _mm512_packs_epi32(g2, g3);
+ // Pack 16-bit to 8-bit.
+ __m512i packed = _mm512_packs_epi16(packed0, packed1);
+ // Ban -128.
+ packed = _mm512_max_epi8(packed, neg127);
+ // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
+ return _mm512_permutexvar_epi32(shuffle_param, packed);
+ }
+
+ private:
+ const __m512 mult_reg_;
+};
+
+} // namespace
+
struct AVX512_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
// rows * cols must be a multiple of 16.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ AVX512F static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// size must be a multiple of 16.
- static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+ // Convert to 16-bit signed integers.
+ AVX512F static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
+ // Fill with the quantization multiplier.
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 16, output += 16) {
+ // There doesn't seem to be an unmasked version.
+ _mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, avx512f::QuantizerGrab(input, quant_mult_reg));
+ }
+ }
+
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+ AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ AVX512F static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
+ }
- static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ AVX512F static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ // The unquantization is only 256-bit wide because there are 8 results.
+ Multiply16<__m512i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "16-bit AVX512";
static const CPUType kUses = CPU_AVX512BW;
};
@@ -48,29 +166,159 @@ struct AVX512_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ AVX512F static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
- static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+ // Convert to 8-bit signed integers.
+ AVX512F static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
+ const __m512i neg127 = _mm512_set1_epi32(-127);
+ const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
+ const float *end = input + size;
+ for (; input < end; input += 16, output += 16) {
+ __m512i asint = avx512f::QuantizerGrab(input, quant_mult_reg);
+ asint = _mm512_max_epi32(asint, neg127);
+ // There doesn't seem to be an unmasked version.
+ _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 64;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+ AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols);
+ }
+
+ AVX512F static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
+ }
+
+ // Special AVX512 implementation due to having 32 registers (so I don't have to
+ // allocate registers manually) and no sign instruction.
+ AVX512F static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ typedef __m512i Integer;
+ typedef __m256 Float; // For quantization we only do 8 at a time.
+ // This is copy-paste from Multiply8_SSE2OrAVX2.
+ assert(width % sizeof(Integer) == 0);
+ assert(B_cols % 8 == 0);
+ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
+ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
+ assert(reinterpret_cast<uintptr_t>(C) % sizeof(Integer) == 0);
+ Float unquant_reg = set1_ps<Float>(unquant_mult);
+ const int simd_width = width / sizeof(Integer);
+ const Integer *B0_col = reinterpret_cast<const Integer*>(B);
+ // Added for AVX512.
+ Integer zeros = setzero_si<Integer>();
+ // Go over 8 columns of B at a time.
+ for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
+ // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
+ for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
+ // Iterate over shared (inner) dimension.
+ const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width);
+ const Integer *A_end = A_live + simd_width;
+ const Integer *B_live = B0_col;
+
+ // Do the first iteration to initialize the sums.
+ __m512i a = *A_live;
+ __mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
+ __m512i a_positive = _mm512_abs_epi8(a);
+ // These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.
+ Integer sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0]));
+ Integer sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1]));
+ Integer sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2]));
+ Integer sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3]));
+ Integer sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4]));
+ Integer sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5]));
+ Integer sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6]));
+ Integer sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7]));
- static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ ++A_live;
+ B_live += 8;
- static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ // Use A as the loop variable so the add can be done where gcc likes it
+ // for branch prediction.
+ for (; A_live != A_end; ++A_live, B_live += 8) {
+ // Unique code here: can we do an inline function?
+ // Retrieve a. We will use this as the unsigned part.
+ a = *A_live;
+ // Retrieve the conveniently consecutive values of B.
+ __m512i b0 = *B_live;
+ __m512i b1 = *(B_live + 1);
+ __m512i b2 = *(B_live + 2);
+ __m512i b3 = *(B_live + 3);
+ __m512i b4 = *(B_live + 4);
+ __m512i b5 = *(B_live + 5);
+ __m512i b6 = *(B_live + 6);
+ __m512i b7 = *(B_live + 7);
+
+ // Get a mask where a is negative.
+ // Didn't seem to make a difference defining sign bits here vs at top
+ neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
+ a_positive = _mm512_abs_epi8(a);
+
+ // Negate by subtracting from zero with a mask.
+ b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
+ b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
+ b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
+ b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
+ b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
+ b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
+ b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
+ b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
+ // The magic 8-bit multiply then horizontal sum into 16-bit.
+ b0 = _mm512_maddubs_epi16(a_positive, b0);
+ b1 = _mm512_maddubs_epi16(a_positive, b1);
+ b2 = _mm512_maddubs_epi16(a_positive, b2);
+ b3 = _mm512_maddubs_epi16(a_positive, b3);
+ b4 = _mm512_maddubs_epi16(a_positive, b4);
+ b5 = _mm512_maddubs_epi16(a_positive, b5);
+ b6 = _mm512_maddubs_epi16(a_positive, b6);
+ b7 = _mm512_maddubs_epi16(a_positive, b7);
+ // Now we have 16-bit results that are the sum of two multiplies.
+ // Choosing to approximate and do adds.
+ // Perhaps every so often we could accumulate by upcasting.
+ sum0 = _mm512_adds_epi16(sum0, b0);
+ sum1 = _mm512_adds_epi16(sum1, b1);
+ sum2 = _mm512_adds_epi16(sum2, b2);
+ sum3 = _mm512_adds_epi16(sum3, b3);
+ sum4 = _mm512_adds_epi16(sum4, b4);
+ sum5 = _mm512_adds_epi16(sum5, b5);
+ sum6 = _mm512_adds_epi16(sum6, b6);
+ sum7 = _mm512_adds_epi16(sum7, b7);
+ // Unique code ends: can we do an inline function?
+ }
+ // Upcast to 32-bit and horizontally add.
+ Integer ones = set1_epi16<Integer>(1);
+ sum0 = madd_epi16(sum0, ones);
+ sum1 = madd_epi16(sum1, ones);
+ sum2 = madd_epi16(sum2, ones);
+ sum3 = madd_epi16(sum3, ones);
+ sum4 = madd_epi16(sum4, ones);
+ sum5 = madd_epi16(sum5, ones);
+ sum6 = madd_epi16(sum6, ones);
+ sum7 = madd_epi16(sum7, ones);
+ Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3);
+ Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7);
+
+ auto total = PermuteSummer(pack0123, pack4567);
+ WriteC(C + A_rowidx * B_cols + B0_colidx, total, unquant_reg);
+ }
+ }
+}
- static const char *const kName;
+ constexpr static const char *const kName = "8-bit AVX512";
static const CPUType kUses = CPU_AVX512BW;
};
-float AVX512_MaxAbsolute(const float *begin_float, const float *end_float);
+AVX512F float AVX512_MaxAbsolute(const float *begin, const float *end) {
+ return MaxAbsoluteBackend<__m512>(begin, end);
+}
} // namespace intgemm
diff --git a/cops.h b/cops.h
index 5fb67bd..b93f4e9 100644
--- a/cops.h
+++ b/cops.h
@@ -1,27 +1,10 @@
+#pragma once
#include "intrinsics.h"
#include <exception>
namespace intgemm {
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
-class UnsupportedCPU : public std::exception {
- public:
- UnsupportedCPU();
-
- ~UnsupportedCPU() throw();
-
- const char *what() const throw() override;
-};
-
-UnsupportedCPU::UnsupportedCPU() {}
-
-UnsupportedCPU::~UnsupportedCPU() throw() {}
-
-const char *UnsupportedCPU::what() const throw() {
- return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
-}
-
class JustUnquantizeC {
public:
JustUnquantizeC(float *C, float unquant_mult);
diff --git a/intgemm.cc b/intgemm.cc
index d286c1a..f50a957 100644
--- a/intgemm.cc
+++ b/intgemm.cc
@@ -1,118 +1 @@
#include "intgemm.h"
-
-#include "types.h"
-#include "sse2_gemm.h"
-#include "ssse3_gemm.h"
-#include "avx2_gemm.h"
-#ifndef INTGEMM_NO_AVX512
-#include "avx512_gemm.h"
-#endif
-
-namespace intgemm {
-
-UnsupportedCPU::UnsupportedCPU() {}
-
-UnsupportedCPU::~UnsupportedCPU() throw() {}
-
-const char *UnsupportedCPU::what() const throw() {
- return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
-}
-
-namespace {
-
-struct Unsupported_16bit {
- static void Quantize(const float *, int16_t *, float, Index) {
- throw UnsupportedCPU();
- }
- static void PrepareB(const float *, int16_t *, float, Index, Index) {
- throw UnsupportedCPU();
- }
- static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
- throw UnsupportedCPU();
- }
- static void Multiply(const int16_t *, const int16_t *, float *, float, Index, Index, Index) {
- throw UnsupportedCPU();
- }
- static const char *const kName;
-};
-const char *const Unsupported_16bit::kName = "16-bit Unsupported";
-
-struct Unsupported_8bit {
- static void Quantize(const float *, int8_t *, float, Index) {
- throw UnsupportedCPU();
- }
- static void PrepareB(const float *, int8_t *, float, Index, Index) {
- throw UnsupportedCPU();
- }
- static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
- throw UnsupportedCPU();
- }
- static void Multiply(const int8_t *, const int8_t *, float *, float, Index, Index, Index) {
- throw UnsupportedCPU();
- }
- static const char *const kName;
-};
-const char *const Unsupported_8bit::kName = "8-bit Unsupported";
-
-float Unsupported_MaxAbsolute(const float *begin, const float *end) {
- throw UnsupportedCPU();
-}
-
-/* Returns:
- * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
- * cloud providers lie). TODO: don't catch Knights processors with this.
- *
- * avx2 if the CPU supports AVX2
- *
- * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
- *
- * sse2 if the CPU supports SSE2
- *
- * unsupported otherwise
- */
-template <class T> T ChooseCPU(T avx512, T avx2, T ssse3, T sse2, T unsupported) {
- // TODO: don't catch Knights processors here!
-#ifndef INTGEMM_NO_AVX512
- if (__builtin_cpu_supports("avx512f")) {
- return avx512;
- }
-#endif
- if (__builtin_cpu_supports("avx2")) {
- return avx2;
- } else if (__builtin_cpu_supports("ssse3")) {
- return ssse3;
- } else if (__builtin_cpu_supports("sse2")) {
- return sse2;
- } else {
- return unsupported;
- }
-}
-
-#ifdef INTGEMM_NO_AVX512
-// These won't ever be called in this capacity, but it does let the code below compile.
-typedef Unsupported_16bit AVX512_16bit;
-typedef Unsupported_8bit AVX512_8bit;
-float AVX512_MaxAbsolute(const float *begin, const float *end) {
- throw UnsupportedCPU();
-}
-#endif
-
-} // namespace
-
-void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
-void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
-void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
-void (*Int16::Multiply)(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_16bit::Multiply, AVX2_16bit::Multiply, SSE2_16bit::Multiply, SSE2_16bit::Multiply, Unsupported_16bit::Multiply);
-const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
-
-void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
-void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
-void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
-void (*Int8::Multiply)(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_8bit::Multiply, AVX2_8bit::Multiply, SSSE3_8bit::Multiply, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply);
-const char *const Int8::kName = ChooseCPU(AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
-
-const CPUType kCPU = ChooseCPU(CPU_AVX512BW, CPU_AVX2, CPU_SSSE3, CPU_SSE2, CPU_UNSUPPORTED);
-
-float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512_MaxAbsolute, AVX2_MaxAbsolute, SSE2_MaxAbsolute, SSE2_MaxAbsolute, Unsupported_MaxAbsolute);
-
-} // namespace intgemm
diff --git a/intgemm.h b/intgemm.h
index 58fa8cc..5818ce8 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -43,24 +43,94 @@
// Yes, both headers due to the debacle about int32_t
#include <cstdint>
#include <stdint.h>
-#include <exception>
#include "types.h"
+#include "sse2_gemm.h"
+#include "ssse3_gemm.h"
+#include "avx2_gemm.h"
+#ifndef INTGEMM_NO_AVX512
+#include "avx512_gemm.h"
+#endif
/* Dispatch to functions based on runtime CPUID. This adds one call-by-variable to each call. */
namespace intgemm {
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
-class UnsupportedCPU : public std::exception {
- public:
- UnsupportedCPU();
-
- ~UnsupportedCPU() throw();
+struct Unsupported_16bit {
+ static void Quantize(const float *, int16_t *, float, Index) {
+ throw UnsupportedCPU();
+ }
+ static void PrepareB(const float *, int16_t *, float, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
+ throw UnsupportedCPU();
+ }
+ static void Multiply(const int16_t *, const int16_t *, float *, float, Index, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ constexpr static const char *const kName = "16-bit Unsupported";
+};
- const char *what() const throw() override;
+struct Unsupported_8bit {
+ static void Quantize(const float *, int8_t *, float, Index) {
+ throw UnsupportedCPU();
+ }
+ static void PrepareB(const float *, int8_t *, float, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
+ throw UnsupportedCPU();
+ }
+ static void Multiply(const int8_t *, const int8_t *, float *, float, Index, Index, Index) {
+ throw UnsupportedCPU();
+ }
+ constexpr static const char *const kName = "8-bit Unsupported";
};
+float Unsupported_MaxAbsolute(const float *begin, const float *end) {
+ throw UnsupportedCPU();
+}
+
+#ifdef INTGEMM_NO_AVX512
+// These won't ever be called in this capacity, but it does let the code below compile.
+typedef Unsupported_16bit AVX512_16bit;
+typedef Unsupported_8bit AVX512_8bit;
+float AVX512_MaxAbsolute(const float *begin, const float *end) {
+ throw UnsupportedCPU();
+}
+#endif
+
+/* Returns:
+ * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
+ * cloud providers lie). TODO: don't catch Knights processors with this.
+ *
+ * avx2 if the CPU supports AVX2
+ *
+ * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+ *
+ * sse2 if the CPU supports SSE2
+ *
+ * unsupported otherwise
+ */
+template <class T> T ChooseCPU(T avx512, T avx2, T ssse3, T sse2, T unsupported) {
+ // TODO: don't catch Knights processors here!
+#ifndef INTGEMM_NO_AVX512
+ if (__builtin_cpu_supports("avx512f")) {
+ return avx512;
+ }
+#endif
+ if (__builtin_cpu_supports("avx2")) {
+ return avx2;
+ } else if (__builtin_cpu_supports("ssse3")) {
+ return ssse3;
+ } else if (__builtin_cpu_supports("sse2")) {
+ return sse2;
+ } else {
+ return unsupported;
+ }
+}
+
/* 16-bit matrix multiplication. */
struct Int16 {
typedef int16_t Integer;
@@ -96,6 +166,13 @@ struct Int16 {
static const char *const kName;
};
+void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_16bit::Quantize, AVX2_16bit::Quantize, SSE2_16bit::Quantize, SSE2_16bit::Quantize, Unsupported_16bit::Quantize);
+void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_16bit::PrepareB, AVX2_16bit::PrepareB, SSE2_16bit::PrepareB, SSE2_16bit::PrepareB, Unsupported_16bit::PrepareB);
+void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_16bit::SelectColumnsB, AVX2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, SSE2_16bit::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
+void (*Int16::Multiply)(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_16bit::Multiply, AVX2_16bit::Multiply, SSE2_16bit::Multiply, SSE2_16bit::Multiply, Unsupported_16bit::Multiply);
+const char *const Int16::kName = ChooseCPU(AVX512_16bit::kName, AVX2_16bit::kName, SSE2_16bit::kName, SSE2_16bit::kName, Unsupported_16bit::kName);
+
+
/* 8-bit matrix multiplication */
struct Int8 {
typedef int8_t Integer;
@@ -130,7 +207,17 @@ struct Int8 {
static const char *const kName;
};
+void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512_8bit::Quantize, AVX2_8bit::Quantize, SSSE3_8bit::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
+void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512_8bit::PrepareB, AVX2_8bit::PrepareB, SSSE3_8bit::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
+void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512_8bit::SelectColumnsB, AVX2_8bit::SelectColumnsB, SSSE3_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
+void (*Int8::Multiply)(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) = ChooseCPU(AVX512_8bit::Multiply, AVX2_8bit::Multiply, SSSE3_8bit::Multiply, Unsupported_8bit::Multiply, Unsupported_8bit::Multiply);
+const char *const Int8::kName = ChooseCPU(AVX512_8bit::kName, AVX2_8bit::kName, SSSE3_8bit::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
+
+const CPUType kCPU = ChooseCPU(CPU_AVX512BW, CPU_AVX2, CPU_SSSE3, CPU_SSE2, CPU_UNSUPPORTED);
+
// Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and 64-byte aligned.
extern float (*MaxAbsolute)(const float *begin, const float *end);
+float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512_MaxAbsolute, AVX2_MaxAbsolute, SSE2_MaxAbsolute, SSE2_MaxAbsolute, Unsupported_MaxAbsolute);
+
} // namespace intgemm
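
With the dispatch now in the header, calling code only needs intgemm.h. A hypothetical end-to-end sketch using the 16-bit path; the buffer handling and the 64-byte alignment choice are this example's, not the commit's, and the asserts in the kernels require aligned inputs with sizes that are multiples of the tile dimensions:

// Hypothetical usage sketch. Quantize, PrepareB and Multiply are the
// dispatched function pointers defined above. A and B are row-major floats,
// assumed 64-byte aligned; allocation sizes are assumed to be multiples of 64
// so std::aligned_alloc is valid.
#include <cstdint>
#include <cstdlib>
#include "intgemm.h"

void Example(const float *A, const float *B, float *C,
             intgemm::Index A_rows, intgemm::Index width,
             intgemm::Index B_cols, float quant_mult) {
  int16_t *A_q = static_cast<int16_t*>(
      std::aligned_alloc(64, A_rows * width * sizeof(int16_t)));
  int16_t *B_q = static_cast<int16_t*>(
      std::aligned_alloc(64, width * B_cols * sizeof(int16_t)));
  intgemm::Int16::Quantize(A, A_q, quant_mult, A_rows * width);
  intgemm::Int16::PrepareB(B, B_q, quant_mult, width, B_cols);
  // Both operands were scaled by quant_mult, so unquantize by 1/quant_mult^2.
  intgemm::Int16::Multiply(A_q, B_q, C, 1.0f / (quant_mult * quant_mult),
                           A_rows, width, B_cols);
  std::free(A_q);
  std::free(B_q);
}
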
diff --git a/intrinsics.h b/intrinsics.h
index 7c2cf57..fcc4752 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -33,13 +33,14 @@ template <> SSE2 inline __m128 set1_ps<__m128>(float to) {
SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
return _mm_madd_epi16(first, second);
}
-SSE2 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
+SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
return _mm_maddubs_epi16(first, second);
}
-SSE2 static inline __m128i sign_epi8(__m128i first, __m128i second) {
+SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
return _mm_sign_epi8(first, second);
}
-SSE2 static inline __m128i abs_epi8(__m128i arg) {
+
+SSSE3 static inline __m128i abs_epi8(__m128i arg) {
return _mm_abs_epi8(arg);
}
SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
diff --git a/multiply.h b/multiply.h
index 3e19bc8..9b7bf61 100644
--- a/multiply.h
+++ b/multiply.h
@@ -17,7 +17,7 @@ static inline float MaxFloat32(__m128 a) {
return *reinterpret_cast<float*>(&a);
}
-static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
+SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
// No op for 128 bits: already reduced fully.
MultiplyResult128 ret;
ret.pack0123 = pack0123;
@@ -38,7 +38,7 @@ static inline float MaxFloat32(__m256 a) {
return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
-static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
+AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
// This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
__m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
// This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
@@ -54,7 +54,7 @@ static inline void WriteC(float *to, __m256i total, __m256 unquant_reg) {
#endif
#ifdef __AVX512BW__
-static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
+AVX512F static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
  // Form [0th 128-bit register of pack0123, 0th 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
__m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
// Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
@@ -123,8 +123,8 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1,
// A_rows can be anything non-negative.
// width must be a multiple of the register size.
// B_cols must be a multiple of 8.
-//#define Multiply16(Integer, Annotate) \
- template <class WriteC> Annotate inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
+//#define Multiply16(Integer, Annotate) \ //fd
+// template <class WriteC> Annotate inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
//
template <class Integer, class WriteC> inline void Multiply16(const int16_t *A, const int16_t *B, WriteC functor, Index A_rows, Index width, Index B_cols) {
assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0);
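
The assert on the last line above is the concrete form of the shape rule in the comment: width must be a multiple of the number of int16 lanes that fit in one register. A quick check of that arithmetic, assuming the AVX2 instantiation (Integer = __m256i); the sizes are only illustrative values, not taken from the library:

#include <immintrin.h>
#include <cassert>
#include <cstdint>
#include <cstddef>

int main() {
  using Integer = __m256i;
  const std::size_t lanes = sizeof(Integer) / sizeof(int16_t);  // 32 / 2 = 16 lanes
  const std::size_t width = 256, B_cols = 8;                    // A_rows may be anything non-negative
  assert(width % lanes == 0);  // width must be a multiple of the register size
  assert(B_cols % 8 == 0);     // B_cols must be a multiple of 8
  return 0;
}
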
diff --git a/sse2_gemm.cc b/sse2_gemm.cc
index 0ef353a..2fa5596 100644
--- a/sse2_gemm.cc
+++ b/sse2_gemm.cc
@@ -1,77 +1,2 @@
// This is only 16-bit. 8-bit is in ssse3_gemm.cc since it requires SSSE3.
#include "sse2_gemm.h"
-#include "cops.h"
-
-#include "interleave.h"
-#include "multiply.h"
-
-#include <stdint.h>
-#include <cassert>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-namespace intgemm {
-
-namespace {
-// Same implementation as AVX512, just width. Grabs 4 32-bit values.
-inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
- return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
-}
-
-class QuantizeTile16 {
- public:
- typedef __m128i Integer;
-
- explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- // Quantize 8xfloat into 8xint16_t
- inline __m128i Consecutive(const float *input) {
- __m128i g0 = QuantizerGrab(input, mult_reg_);
- __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
- return _mm_packs_epi32(g0, g1);
- }
-
- inline __m128i ForReshape(const float *input, int) {
- return Consecutive(input);
- }
-
- private:
- const __m128 mult_reg_;
-};
-} // namespace
-
-/* I also tried an implementation based on _mm_cvtps_pi16 but it was slower:
- * For size 1048576, run 10x in seconds on i7-6700:
- * This code: 0.00228409, 0.00204906
- * With _mm_cvtps_pi16 basis: 0.00391884, 0.00390869
- */
-void SSE2_16bit::Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
- assert(size % 8 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- QuantizeTile16 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 8, output += 8) {
- *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
- }
-}
-
-void SSE2_16bit::PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor16(input, output, QuantizeTile16(quant_mult), rows, cols);
-}
-
-void SSE2_16bit::SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
-}
-
-void SSE2_16bit::Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply16<__m128i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
-}
-
-const char *const SSE2_16bit::kName = "16-bit SSE2";
-
-float SSE2_MaxAbsolute(const float *begin, const float *end) {
- return MaxAbsoluteBackend<__m128>(begin, end);
-}
-
-} // namespace intgemm
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 0f26362..7a5e5ce 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -2,37 +2,87 @@
#include "types.h"
#include <cstdint>
#include <stdint.h>
+#include "cops.h"
+#include "multiply.h"
// 8 bit is in ssse3_gemm.h
namespace intgemm {
+namespace sse2 {
+// Same implementation as AVX512, just a narrower width. Grabs 4 32-bit values.
+// TODO: this function is duplicated; de-duplicating it requires removing the anonymous namespace.
+SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+ return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
+}
+
+class QuantizeTile16 {
+ public:
+ typedef __m128i Integer;
+
+ explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+
+ // Quantize 8xfloat into 8xint16_t
+ SSE2 inline __m128i Consecutive(const float *input) {
+ __m128i g0 = QuantizerGrab(input, mult_reg_);
+ __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
+ return _mm_packs_epi32(g0, g1);
+ }
+
+ SSE2 inline __m128i ForReshape(const float *input, int) {
+ return Consecutive(input);
+ }
+
+ private:
+ const __m128 mult_reg_;
+};
+} // namespace sse2
// This should be pure SSE2 (and below).
struct SSE2_16bit {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int16_t *output, float quant_mult, Index size);
+ SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+ assert(size % 8 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
+ sse2::QuantizeTile16 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 8, output += 8) {
+ *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+ SSE2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+ //TODO #DEFINE
+ PrepareBFor16(input, output, sse2::QuantizeTile16(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ //TODO #DEFINE
+ SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
+ }
- static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ SSE2 static void Multiply(const int16_t *A, const int16_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ //TODO #DEFINE
+ Multiply16<__m128i, JustUnquantizeC> (A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "16-bit SSE2";
static const CPUType kUses = CPU_SSE2;
};
// Technically only requires SSE
-float SSE2_MaxAbsolute(const float *begin, const float *end);
+SSE2 float SSE2_MaxAbsolute(const float *begin, const float *end) {
+ return MaxAbsoluteBackend<__m128>(begin, end);
+}
} // namespace intgemm
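
Per element, the Quantize loop moved into this header does what the following scalar sketch spells out: scale by quant_mult, round to the nearest integer (cvtps_epi32 under the default round-to-nearest-even mode), and saturate to int16 (packs_epi32). The helper name is illustrative, not an intgemm symbol:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar model of one lane of SSE2_16bit::Quantize, as a sketch.
static int16_t QuantizeOne(float value, float quant_mult) {
  long rounded = std::lrintf(value * quant_mult);           // cvtps_epi32: round to nearest, ties to even
  rounded = std::max(-32768L, std::min(32767L, rounded));   // packs_epi32: saturate to int16
  return static_cast<int16_t>(rounded);
}

int main() {
  return QuantizeOne(0.75f, 1024.0f) == 768 ? 0 : 1;  // 0.75 * 1024 = 768 exactly
}
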
diff --git a/ssse3_gemm.cc b/ssse3_gemm.cc
index d5de13d..32e5bd2 100644
--- a/ssse3_gemm.cc
+++ b/ssse3_gemm.cc
@@ -1,89 +1 @@
#include "ssse3_gemm.h"
-
-#include "interleave.h"
-#include "multiply.h"
-
-#include <stdint.h>
-#include <cassert>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-namespace intgemm {
-
-namespace {
-// Same implementation as AVX512, just width. Grabs 4 32-bit values.
-inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
- return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
-}
-
-class QuantizeTile8 {
- public:
- typedef __m128i Integer;
-
- explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- inline __m128i ForReshape(const float *input, Index cols) {
- // Skip a row.
- return Tile(input, input + 2 * cols);
- }
-
- inline __m128i Consecutive(const float *input) {
- return Tile(input, input + 8);
- }
-
- private:
- // Quantize 16xfloat into 16xint8_t
- inline __m128i Tile(const float *input0, const float *input1) {
- const __m128i neg128 = _mm_set1_epi8(-128);
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
- __m128i g2 = QuantizerGrab(input1, mult_reg_);
- __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
- __m128i packed0 = _mm_packs_epi32(g0, g1);
- __m128i packed1 = _mm_packs_epi32(g2, g3);
- __m128i packed = _mm_packs_epi16(packed0, packed1);
- /* Ban -128.
- * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
- * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
- * The first generates 0xff for fields -128.
- * The second subtracts 0xff from -128 which has the effect of converting
- * to -127.
- */
- // packed = _mm_max_epi8(packed, neg127);
- __m128i evils = _mm_cmpeq_epi8(packed, neg128);
- return _mm_sub_epi8(packed, evils);
- // No permute needed. packs is in order for SSE.
- }
-
- private:
- const __m128 mult_reg_;
-};
-
-} // namespace
-
-void SSSE3_8bit::Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
- assert(size % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
- assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- QuantizeTile8 q(quant_mult);
- const float *end = input + size;
- for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
- }
-}
-
-void SSSE3_8bit::PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
- PrepareBFor8(input, output, QuantizeTile8(quant_mult), rows, cols);
-}
-
-void SSSE3_8bit::SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
- SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
-}
-
-void SSSE3_8bit::Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
- Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
-}
-
-const char *const SSSE3_8bit::kName = "8-bit SSSE3";
-
-} // namespace intgemm
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 4993ef6..69ac298 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -3,32 +3,103 @@
#include <cstdint>
#include <stdint.h>
+#include "interleave.h"
+#include "multiply.h"
+
// 16-bit is in sse2_gemm.h
namespace intgemm {
+namespace ssse3 {
+// Same implementation as AVX512, just a narrower width. Grabs 4 32-bit values.
+// TODO: this function is duplicated; de-duplicating it requires removing the anonymous namespace.
+SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+ return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
+}
+
+class QuantizeTile8 {
+ public:
+ typedef __m128i Integer;
+
+ SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+
+ SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+ // Skip a row.
+ return Tile(input, input + 2 * cols);
+ }
+
+ SSSE3 inline __m128i Consecutive(const float *input) {
+ return Tile(input, input + 8);
+ }
+
+ private:
+ // Quantize 16xfloat into 16xint8_t
+ SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+ const __m128i neg128 = _mm_set1_epi8(-128);
+ __m128i g0 = QuantizerGrab(input0, mult_reg_);
+ __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
+ __m128i g2 = QuantizerGrab(input1, mult_reg_);
+ __m128i g3 = QuantizerGrab(input1 + 4, mult_reg_);
+ __m128i packed0 = _mm_packs_epi32(g0, g1);
+ __m128i packed1 = _mm_packs_epi32(g2, g3);
+ __m128i packed = _mm_packs_epi16(packed0, packed1);
+ /* Ban -128.
+ * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
+ * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
+     * The first generates 0xff for fields that are -128.
+     * The second subtracts 0xff (i.e. -1) from those fields, which has the
+     * effect of converting -128 to -127.
+ */
+ // packed = _mm_max_epi8(packed, neg127);
+ __m128i evils = _mm_cmpeq_epi8(packed, neg128);
+ return _mm_sub_epi8(packed, evils);
+ // No permute needed. packs is in order for SSE.
+ }
+
+ private:
+ const __m128 mult_reg_;
+};
+
+} // namespace ssse3
+
+
// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
struct SSSE3_8bit {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
- static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
- static void Quantize(const float *input, int8_t *output, float quant_mult, Index size);
+ SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+ assert(size % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
+ assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
+ ssse3::QuantizeTile8 q(quant_mult);
+ const float *end = input + size;
+ for (; input != end; input += 16, output += 16) {
+ *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+ }
+ }
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
- static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
+ SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+ PrepareBFor8(input, output, ssse3::QuantizeTile8(quant_mult), rows, cols);
+ }
- static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+ SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+ SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
+ }
- static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols);
+ SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+ Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
+ }
- static const char *const kName;
+ constexpr static const char *const kName = "8-bit SSSE3";
static const CPUType kUses = CPU_SSSE3;
};
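
The Tile() comment above describes the SSE2-only way of banning -128, and the byte-wise arithmetic it relies on is easy to check in scalar form. A per-lane sketch (the function name is illustrative): cmpeq yields 0xff (-1) only where a byte equals -128, and subtracting -1 bumps exactly those bytes to -127 while leaving every other value untouched.

#include <cstdint>
#include <cstdio>

// Per-byte model of the _mm_cmpeq_epi8 / _mm_sub_epi8 pair in Tile().
static int8_t BanNeg128(int8_t packed) {
  const int8_t evils = (packed == -128) ? -1 : 0;  // _mm_cmpeq_epi8, one lane
  return static_cast<int8_t>(packed - evils);      // _mm_sub_epi8, one lane
}

int main() {
  std::printf("%d %d %d\n", BanNeg128(-128), BanNeg128(-127), BanNeg128(42));  // -127 -127 42
}
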
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 301254a..8d2f50d 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -1,4 +1,4 @@
-#include "avx512_gemm.h"
+//#include "avx512_gemm.h"
#include "avx2_gemm.h"
#include "ssse3_gemm.h"
#include "sse2_gemm.h"
@@ -73,7 +73,7 @@ TEST_CASE("Transpose 16", "[transpose]") {
}
}
-TEST_CASE("Transpose 8", "[transpose]") {
+SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
if (kCPU < CPU_SSSE3) return;
AlignedVector<int8_t> input(16 * 16);
for (int i = 0; i < 16 * 16; ++i) {
@@ -127,7 +127,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
"Quantized Input" << '\n' << PrintMatrix(quantized.get(), rows, cols) << "Reference" << '\n' <<
PrintMatrix(reference.get(), rows, cols) << "Routine" << '\n' << PrintMatrix(test.get(), rows, cols));
}
-
+/*
TEST_CASE("Prepare AVX512", "[prepare]") {
if (kCPU < CPU_AVX512BW) return;
#ifndef INTGEMM_NO_AVX512
@@ -137,7 +137,7 @@ TEST_CASE("Prepare AVX512", "[prepare]") {
TestPrepare<AVX512_16bit>(256, 32);
#endif
}
-
+*/
TEST_CASE("Prepare AVX2", "[prepare]") {
if (kCPU < CPU_AVX2) return;
TestPrepare<AVX2_8bit>(64, 32);
@@ -192,7 +192,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
CHECK_MESSAGE(memcmp(ref.get(), test.get(), sizeof(Integer) * rows * kSelectCols) == 0, "Reference:\n" <<
PrintMatrix(ref.get(), rows, kSelectCols) << PrintMatrix(test.get(), rows, kSelectCols));
}
-
+/*
TEST_CASE("SelectColumnsB AVX512", "[select]") {
if (kCPU < CPU_AVX512BW) return;
#ifndef INTGEMM_NO_AVX512
@@ -200,7 +200,7 @@ TEST_CASE("SelectColumnsB AVX512", "[select]") {
TestSelectColumnsB<AVX512_16bit>(256, 256);
#endif
}
-
+*/
TEST_CASE("SelectColumnsB AVX2", "[select]") {
if (kCPU < CPU_AVX2) return;
TestSelectColumnsB<AVX2_8bit>(256, 256);
@@ -411,7 +411,7 @@ TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
TestMultiply<AVX2_16bit>(248, 256, 256, .1, 1, 0.01);
TestMultiply<AVX2_16bit>(200, 256, 256, .1, 1, 0.01);
}
-
+/*
#ifndef INTGEMM_NO_AVX512
TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
if (kCPU < CPU_AVX512BW) return;
@@ -433,6 +433,7 @@ TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
TestMultiply<AVX512_16bit>(200, 256, 256, .1, 1, 0.01);
}
#endif
+ */
} // namespace intgemm
int main(int argc, char ** argv) {
diff --git a/types.h b/types.h
index a38d3c3..6ec5085 100644
--- a/types.h
+++ b/types.h
@@ -1,4 +1,5 @@
#pragma once
+#include <exception>
#define DEFAULT __attribute__ ((target ("default")))
#define SSE2 __attribute__ ((target ("sse2")))
@@ -6,10 +7,22 @@
#define SSSE3 __attribute__ ((target ("ssse3")))
#define AVX2 __attribute__ ((target ("avx2")))
//#define AVX2_512F __attribute__ ((target ("avx2"), target("avx512f"))) //Not supported by clang
-#define AVX512F __attribute__ ((target ("avx512f")))
+#define AVX512F __attribute__ ((target ("avx512bw")))
namespace intgemm {
+// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
+class UnsupportedCPU : public std::exception {
+ public:
+ UnsupportedCPU() {}
+
+ ~UnsupportedCPU() throw() {}
+
+ const char *what() const throw() override {
+ return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
+ }
+};
+
typedef unsigned int Index;
// If you want to detect the CPU and dispatch yourself, here's what to use: