-rw-r--r--   avx2_gemm.h             52
-rw-r--r--   avx512_gemm.h           80
-rw-r--r--   cops.h                  16
-rw-r--r--   interleave.h            44
-rw-r--r--   intgemm.h                8
-rw-r--r--   intrinsics.h            64
-rw-r--r--   multiply.h              48
-rw-r--r--   sse2_gemm.h             26
-rw-r--r--   ssse3_gemm.h            32
-rw-r--r--   test/multiply_test.cc    4
-rw-r--r--   types.h                 20
11 files changed, 197 insertions(+), 197 deletions(-)
diff --git a/avx2_gemm.h b/avx2_gemm.h
index bdc60e4..8958920 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -11,29 +11,29 @@
 namespace intgemm {
 namespace avx2 {
 
 // Read a vector of floats, multiply them, and cast to 32-bit integer.
 // EVIL EVIL CODE DUPLICATION, FIX
-AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
+INTGEMM_AVX2 inline __m256i QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
   return _mm256_cvtps_epi32(_mm256_mul_ps(*reinterpret_cast<const __m256*>(input), quant_mult_reg));
 }
 
-SELECT_COL_B_DEFINE(AVX2, __m256i)
+INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
   public:
     typedef __m256i Integer;
 
-    AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
+    INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
 
-    AVX2 Integer Consecutive(const float *input) {
+    INTGEMM_AVX2 Integer Consecutive(const float *input) {
       return Tile(input, input + 8);
     }
 
-    AVX2 Integer ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) {
       // 8 rows in the first 128-bit register, 8 in the second register.
       return Tile(input, input + 8 * cols);
     }
 
   private:
-    AVX2 __m256i Tile(const float *input0, const float *input1) {
+    INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) {
       __m256i g0 = QuantizerGrab(input0, mult_);
       __m256i g1 = QuantizerGrab(input1, mult_);
       __m256i packed = _mm256_packs_epi32(g0, g1);
@@ -52,12 +52,12 @@ struct AVX2_16bit {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
   // Just quantize everything in order.
-  AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+  INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
     avx2::QuantizeTile16 q(quant_mult);
@@ -71,18 +71,18 @@ struct AVX2_16bit {
   static const Index kBTileRow = 16;
   static const Index kBTileCol = 8;
 /*
-  AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor16(input, output, avx2::QuantizeTile16(quant_mult), rows, cols);
   }*/
-  PREPARE_B_16_DEFINE(AVX2, avx2::QuantizeTile16)
+  INTGEMM_PREPARE_B_16(INTGEMM_AVX2, avx2::QuantizeTile16)
 
-  AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
   }
 
-  MULTIPLY16_DEFINE(__m256i, AVX2, OnAVX2)
+  INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, OnAVX2)
 
-  constexpr static const char *const kName = "16-bit AVX2";
+  constexpr static const char *const kName = "16-bit INTGEMM_AVX2";
 
   static const CPUType kUses = CPU_AVX2;
 };
@@ -96,20 +96,20 @@ class QuantizeTile8 {
   public:
     typedef __m256i Integer;
 
-    AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
+    INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
 
-    AVX2 inline __m256i Consecutive(const float *input) {
+    INTGEMM_AVX2 inline __m256i Consecutive(const float *input) {
      return Tile(input, input + 8, input + 16, input + 24);
     }
 
-    AVX2 inline __m256i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) {
       // Put higher rows in the second half of the register.  These will jumble
       // around in the same way then conveniently land in the right place.
       return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
     }
 
   private:
-    AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+    INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
       // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
@@ -136,7 +136,7 @@ class QuantizeTile8 {
 };
 
 // Technically only requires AVX
-MAXABSOLUTE_DEFINE(__m256, AVX2)
+INTGEMM_MAXABSOLUTE(__m256, INTGEMM_AVX2)
 
 } // namespace
 
@@ -144,12 +144,12 @@ struct AVX2_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
   // Just quantize everything in order.
-  AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+  INTGEMM_AVX2 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(size % 32 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
     avx2::QuantizeTile8 q(quant_mult);
@@ -164,23 +164,23 @@ struct AVX2_8bit {
   static const Index kBTileCol = 8;
 /*
-  AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX2 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor8(input, output, avx2::QuantizeTile8(quant_mult), rows, cols);
   }*/
-  PREPARE_B_8_DEFINE(AVX2, avx2::QuantizeTile8)
+  INTGEMM_PREPARE_B_8(INTGEMM_AVX2, avx2::QuantizeTile8)
 
-  AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
   }
 /*
-  AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+  INTGEMM_AVX2 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
     //Multiply8_SSE2OrAVX2<Multiply8_AVXAVX2, __m256i, __m256>(A, B, C, unquant_mult, A_rows, width, B_cols);
     Multiply8_SSE2OrAVX2__m256i<JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
   }*/
-  MULTIPLY8_DEFINE(__m256i, AVX2, OnAVX2)
+  INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, OnAVX2)
 
-  constexpr static const char *const kName = "8-bit AVX2";
+  constexpr static const char *const kName = "8-bit INTGEMM_AVX2";
 
   static const CPUType kUses = CPU_AVX2;
 };
diff --git a/avx512_gemm.h b/avx512_gemm.h
index b92ba0d..e20eb24 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -13,7 +13,7 @@
 #include "types.h"
 
 /* AVX512 implementation.
- * This uses AVX512BW, AVX512DQ, and might use AVX512VL
+ * This uses INTGEMM_AVX512BW, INTGEMM_AVX512DQ, and might use AVX512VL
  * That means it supports mainstream CPUs with AVX512, starting with Skylake
  * Xeons.
  * It does not support any Knights / Xeon Phi processors.
@@ -33,16 +33,16 @@
 namespace intgemm {
 namespace avx512f {
 
 // Load from memory, multiply, and convert to int32_t.
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
   // Multiply each by the quantization factor.
   __m512 val = _mm512_mul_ps(*reinterpret_cast<const __m512*>(input), quant_mult_reg);
   // Cast to 32-bit int
   return _mm512_cvtps_epi32(val);
 }
 
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-SELECT_COL_B_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
 
 // For PrepareB we want to read 8 columns at a time.  When converting 32-bit
 // floats to 8-bit values, that's 32 bytes of floats.  But AVX512 is 64 bytes
@@ -50,14 +50,14 @@ SELECT_COL_B_DEFINE(AVX512BW, __m512i)
 // but then the memory written to won't be contiguous anyway so we'd be doing a
 // scatter anyway.  Easier to just read the 8 columns we wanted as 256 bits
 // concatenate.
-AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
-  // AVX512DQ but that goes with AVX512BW anyway.
+INTGEMM_AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
+  // INTGEMM_AVX512DQ but that goes with INTGEMM_AVX512BW anyway.
   return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
 }
 
 // Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
   __m512 appended = avx512f::Concat(*reinterpret_cast<const __m256*>(input0), *reinterpret_cast<const __m256*>(input1));
   appended = _mm512_mul_ps(appended, quant_mult_reg);
   return _mm512_cvtps_epi32(appended);
@@ -70,14 +70,14 @@ class QuantizeTile16 {
   public:
     typedef __m512i Integer;
 
-    /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-    AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+    INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
       __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
       __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
       __m512i packed = _mm512_packs_epi32(g0, g1);
-      // Permute within 256-bit lanes, so same as AVX2
+      // Permute within 256-bit lanes, so same as INTGEMM_AVX2
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
 
@@ -89,10 +89,10 @@ class QuantizeTile8 {
   public:
     typedef __m512i Integer;
 
-    /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-    AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
+    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+    INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
       // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
       const __m512i neg127 = _mm512_set1_epi8(-127);
       // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
@@ -118,8 +118,8 @@ class QuantizeTile8 {
     const __m512 mult_reg_;
 };
 
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-MAXABSOLUTE_DEFINE(__m512, AVX512BW)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_MAXABSOLUTE(__m512, INTGEMM_AVX512BW)
 
 } // namespace
 
@@ -128,8 +128,8 @@ struct AVX512_16bit {
   // Currently A is prepared by quantization but this could theoretically change.
   // rows * cols must be a multiple of 16.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
@@ -137,8 +137,8 @@ struct AVX512_16bit {
   // But then it will need to be aligned for Multiply.
   // size must be a multiple of 16.
   // Convert to 16-bit signed integers.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
     // Fill with the quantization multiplier.
@@ -155,20 +155,20 @@ struct AVX512_16bit {
   static const Index kBTileRow = 32;
   static const Index kBTileCol = 8;
 /*
-  AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX512F static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor16(input, output, avx512f::QuantizeTile16(quant_mult), rows, cols);
   } */
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  PREPARE_B_16_DEFINE(AVX512BW, avx512f::QuantizeTile16)
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, avx512f::QuantizeTile16)
 
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
   }
 
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  MULTIPLY16_DEFINE(__m512i, AVX512BW, OnAVX2)
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, OnAVX2)
 
   constexpr static const char *const kName = "16-bit AVX512";
 
@@ -179,16 +179,16 @@ struct AVX512_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
   // Technically output can be unaligned in Quantize.
   // But then it will need to be aligned for Multiply.
   // Convert to 8-bit signed integers.
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
     const __m512i neg127 = _mm512_set1_epi32(-127);
@@ -206,21 +206,21 @@ struct AVX512_8bit {
   static const Index kBTileRow = 64;
   static const Index kBTileCol = 8;
 /*
-  AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_AVX512F static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor8(input, output, avx512f::QuantizeTile8(quant_mult), rows, cols);
   }*/
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  PREPARE_B_8_DEFINE(AVX512BW, avx512f::QuantizeTile8)
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, avx512f::QuantizeTile8)
 
-  /* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-  AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+  INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     avx512f::SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
   }
 
   // Special AVX512 implementation due to having 32 registers (so I don't have to
   // allocate registers manually) and no sign instruction.
   template <class WriteC>
-  AVX512BW static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) {
+  INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) {
     typedef __m512i Integer;
     //typedef __m256 Float; // For quantization we only do 8 at a time.
     // This is copy-paste from Multiply8_SSE2OrAVX2.
@@ -228,7 +228,7 @@ struct AVX512_8bit {
     assert(B_cols % 8 == 0);
     assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0);
     assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0);
-    // There's 8 results for AVX2 to handle.
+    // There's 8 results for INTGEMM_AVX2 to handle.
     typename WriteC::OnAVX2 write_C(C);
     const int simd_width = width / sizeof(Integer);
     const Integer *B0_col = reinterpret_cast<const Integer*>(B);
diff --git a/cops.h b/cops.h
--- a/cops.h
+++ b/cops.h
@@ -11,12 +11,12 @@ class JustUnquantizeC {
     class OnSSE2 {
       public:
-        SSE2 explicit OnSSE2(const JustUnquantizeC &from)
+        INTGEMM_SSE2 explicit OnSSE2(const JustUnquantizeC &from)
           : C_(from.C_), unquant_mult_(_mm_set1_ps(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
         }
 
-        SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+        INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
           *reinterpret_cast<__m128*>(C_ + rowIDX*cols + colIDX) = mul_ps(cvtepi32_ps(result.pack0123), unquant_mult_);
           *reinterpret_cast<__m128*>(C_ + rowIDX*cols + colIDX + 4) = mul_ps(cvtepi32_ps(result.pack4567), unquant_mult_);
         }
@@ -27,12 +27,12 @@ class JustUnquantizeC {
     class OnAVX2 {
       public:
-        AVX2 explicit OnAVX2(const JustUnquantizeC &from)
+        INTGEMM_AVX2 explicit OnAVX2(const JustUnquantizeC &from)
           : C_(from.C_), unquant_mult_(_mm256_set1_ps(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
         }
 
-        AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+        INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
           *reinterpret_cast<__m256*>(C_ + rowIDX*cols + colIDX) = mul_ps(cvtepi32_ps(result), unquant_mult_);
         }
@@ -52,12 +52,12 @@ class Identity {
     class OnSSE2 {
       public:
-        SSE2 explicit OnSSE2(const Identity &from)
+        INTGEMM_SSE2 explicit OnSSE2(const Identity &from)
           : C_(from.C_) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
         }
 
-        SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+        INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
           _mm_storeu_si128(reinterpret_cast<__m128i*>(C_ + rowIDX*cols + colIDX), result.pack0123);
           _mm_storeu_si128(reinterpret_cast<__m128i*>(C_ + rowIDX*cols + colIDX + 4), result.pack4567);
         }
@@ -67,12 +67,12 @@ class Identity {
     class OnAVX2 {
       public:
-        AVX2 explicit OnAVX2(const Identity &from)
+        INTGEMM_AVX2 explicit OnAVX2(const Identity &from)
           : C_(from.C_) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
         }
 
-        AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+        INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
           _mm256_storeu_si256(reinterpret_cast<__m256i*>(C_ + rowIDX*cols + colIDX), result);
         }
diff --git a/interleave.h b/interleave.h
index e7a62f5..8b7484e 100644
--- a/interleave.h
+++ b/interleave.h
@@ -49,41 +49,41 @@ target static inline void Interleave64(type &first, type &second) { \
 template <class Register> static inline Register setzero_si() __attribute__((always_inline));;
 
-INTGEMM_INTERLEAVE(SSE2, __m128i, )
-template <> SSE2 inline __m128i setzero_si<__m128i>() {
+INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i, )
+template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() {
   return _mm_setzero_si128();
 }
 
-INTGEMM_INTERLEAVE(AVX2, __m256i, 256)
-template <> AVX2 inline __m256i setzero_si<__m256i>() {
+INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i, 256)
+template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() {
   return _mm256_setzero_si256();
 }
 
 #ifndef INTGEMM_NO_AVX512
-INTGEMM_INTERLEAVE(AVX512BW, __m512i, 512)
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-template <> AVX512BW inline __m512i setzero_si<__m512i>() {
+INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i, 512)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+template <> INTGEMM_AVX512BW inline __m512i setzero_si<__m512i>() {
   return _mm512_setzero_si512();
 }
 #endif
 
-#define SWAP_DEFINE(target, Register) \
+#define INTGEMM_SWAP(target, Register) \
 target static inline void Swap(Register &a, Register &b) { \
   Register tmp = a; \
   a = b; \
   b = tmp; \
 } \
 
-SWAP_DEFINE(SSE2, __m128i)
-SWAP_DEFINE(AVX2, __m256i)
+INTGEMM_SWAP(INTGEMM_SSE2, __m128i)
+INTGEMM_SWAP(INTGEMM_AVX2, __m256i)
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-SWAP_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i)
 #endif
 
 /* Transpose registers containing 8 packed 16-bit integers.
  * Each 128-bit lane is handled independently.
  */
-#define TRANSPOSE16_DEFINE(target, Register) \
+#define INTGEMM_TRANSPOSE16(target, Register) \
 target static inline void Transpose16InLane(Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7) { \
   /* r0: columns 0 1 2 3 4 5 6 7 from row 0
      r1: columns 0 1 2 3 4 5 6 7 from row 1*/ \
@@ -126,11 +126,11 @@ target static inline void Transpose16InLane(Register &r0, Register &r1, Register
   Swap(r3, r6); \
 } \
 
-TRANSPOSE16_DEFINE(SSE2, __m128i)
-TRANSPOSE16_DEFINE(AVX2, __m256i)
+INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i)
+INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i)
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-TRANSPOSE16_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i)
 #endif
 
 /* Tranpose registers containing 16 packed 8-bit integers.
@@ -180,7 +180,7 @@ template <class Register> static inline void Transpose8InLane(
 //
 // We presume B starts in row-major order.
 //
-// In AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
+// In INTGEMM_AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
 // that many values from the same column in the register.
 //
 // The multiplier reads 8 rows at a time and we want these reads to be
@@ -189,7 +189,7 @@ template <class Register> static inline void Transpose8InLane(
 // Each 8x32 (for 8-bit) or 8x16 (for 16-bit) tile of B is transposed.
 // The tiles are stored in column major order.
 //
-// For AVX2, this matrix shows what index each value of B will be stored at:
+// For INTGEMM_AVX2, this matrix shows what index each value of B will be stored at:
 //   0  16 ... 240
 //   1  17 ... 241
 //   2  18 ... 242
@@ -209,7 +209,7 @@ template <class Register> static inline void Transpose8InLane(
 //   256 272
 //   257 273
 //   ... ...
-#define PREPARE_B_8_DEFINE(target, QuantClass) \
+#define INTGEMM_PREPARE_B_8(target, QuantClass) \
 target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
   typedef typename QuantClass Quantizer; \
   typedef typename Quantizer::Integer Register; \
@@ -244,7 +244,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
   } \
 } \
 
-#define PREPARE_B_16_DEFINE(target, QuantClass) \
+#define INTGEMM_PREPARE_B_16(target, QuantClass) \
 target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
   typedef typename QuantClass Quantizer; \
   typedef typename Quantizer::Integer Register; \
@@ -267,7 +267,7 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
 
 /* Select columns of B from PrepareB format to PrepareB format.
 */
-#define SELECT_COL_B_DEFINE(target, Register) \
+#define INTGEMM_SELECT_COL_B(target, Register) \
 target static inline void SelectColumnsOfB(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
   assert(rows_bytes % sizeof(Register) == 0); \
   assert((cols_end - cols_begin) % 8 == 0); \
diff --git a/intgemm.h b/intgemm.h
--- a/intgemm.h
+++ b/intgemm.h
@@ -107,14 +107,14 @@ float MaxAbsolute(const float *begin, const float *end) {
 #endif
 
 /* Returns:
- * avx512 if the CPU supports AVX512F (though really it should be AVX512BW, but
+ * avx512 if the CPU supports INTGEMM_AVX512F (though really it should be INTGEMM_AVX512BW, but
  * cloud providers lie).  TODO: don't catch Knights processors with this.
 *
- * avx2 if the CPU supports AVX2
+ * avx2 if the CPU supports INTGEMM_AVX2
 *
- * ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
+ * ssse3 if the CPU supports INTGEMM_SSSE3 (this distinction from INTGEMM_SSE2 matters for 8-bit)
 *
- * sse2 if the CPU supports SSE2
+ * sse2 if the CPU supports INTGEMM_SSE2
 *
 * unsupported otherwise
 */
diff --git a/intrinsics.h b/intrinsics.h
index fa3a3c2..9548a81 100644
--- a/intrinsics.h
+++ b/intrinsics.h
@@ -18,105 +18,105 @@ template <class Register> static inline Register set1_ps(float to);
 
 struct MultiplyResult128 {
   __m128i pack0123, pack4567;
 };
 
-SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
   return _mm_add_epi32(first, second);
 }
-SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
   return _mm_adds_epi16(first, second);
 }
-template <> SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
+template <> INTGEMM_SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
   return _mm_set1_epi16(to);
 }
-template <> SSE2 inline __m128 set1_ps<__m128>(float to) {
+template <> INTGEMM_SSE2 inline __m128 set1_ps<__m128>(float to) {
   return _mm_set1_ps(to);
 }
-SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
+INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
   return _mm_madd_epi16(first, second);
 }
-SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
+INTGEMM_SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
   return _mm_maddubs_epi16(first, second);
 }
-SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
+INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
   return _mm_sign_epi8(first, second);
 }
-SSSE3 static inline __m128i abs_epi8(__m128i arg) {
+INTGEMM_SSSE3 static inline __m128i abs_epi8(__m128i arg) {
   return _mm_abs_epi8(arg);
 }
-SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
+INTGEMM_SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
   return _mm_max_ps(first, second);
 }
-SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
+INTGEMM_SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
   return _mm_and_ps(first, second);
 }
-SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
+INTGEMM_SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
   return _mm_cvtepi32_ps(arg);
 }
-SSE2 static inline __m128 mul_ps (__m128 a, __m128 b) {
+INTGEMM_SSE2 static inline __m128 mul_ps (__m128 a, __m128 b) {
   return _mm_mul_ps(a, b);
 }
 
-AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
   return _mm256_add_epi32(first, second);
 }
-AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
   return _mm256_adds_epi16(first, second);
 }
-template <> AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
+template <> INTGEMM_AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
   return _mm256_set1_epi16(to);
 }
-template <> AVX2 inline __m256 set1_ps<__m256>(float to) {
+template <> INTGEMM_AVX2 inline __m256 set1_ps<__m256>(float to) {
   return _mm256_set1_ps(to);
 }
-AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
   return _mm256_madd_epi16(first, second);
 }
-AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
   return _mm256_maddubs_epi16(first, second);
 }
-AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
+INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
   return _mm256_sign_epi8(first, second);
 }
-AVX2 static inline __m256i abs_epi8(__m256i arg) {
+INTGEMM_AVX2 static inline __m256i abs_epi8(__m256i arg) {
   return _mm256_abs_epi8(arg);
 }
-AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
+INTGEMM_AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
   return _mm256_max_ps(first, second);
 }
-AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
+INTGEMM_AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
   return _mm256_and_ps(first, second);
 }
-AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
+INTGEMM_AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
   return _mm256_cvtepi32_ps(arg);
 }
-AVX2 static inline __m256 mul_ps (__m256 a, __m256 b) {
+INTGEMM_AVX2 static inline __m256 mul_ps (__m256 a, __m256 b) {
   return _mm256_mul_ps(a, b);
 }
 
 #ifndef INTGEMM_NO_AVX512
-AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
   return _mm512_add_epi32(first, second);
 }
-template <> inline AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
+template <> inline INTGEMM_AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
   return _mm512_set1_epi16(to);
 }
-template <> inline AVX512BW __m512 set1_ps<__m512>(float to) {
+template <> inline INTGEMM_AVX512BW __m512 set1_ps<__m512>(float to) {
   return _mm512_set1_ps(to);
 }
-AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
   return _mm512_madd_epi16(first, second);
 }
-AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
+INTGEMM_AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
   return _mm512_maddubs_epi16(first, second);
 }
-AVX512BW static inline __m512i abs_epi8(__m512i arg) {
+INTGEMM_AVX512BW static inline __m512i abs_epi8(__m512i arg) {
   return _mm512_abs_epi8(arg);
 }
-AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
+INTGEMM_AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
   return _mm512_max_ps(first, second);
 }
 // Technically __AVX512DQ__
-AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
+INTGEMM_AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
   return _mm512_and_ps(first, second);
 }
 #endif
diff --git a/multiply.h b/multiply.h
--- a/multiply.h
+++ b/multiply.h
@@ -4,7 +4,7 @@
 namespace intgemm {
 
-SSE2 static inline float MaxFloat32(__m128 a) {
+INTGEMM_SSE2 static inline float MaxFloat32(__m128 a) {
   // Fold to just using the first 64 bits.
   __m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
   a = _mm_max_ps(a, second_half);
@@ -15,7 +15,7 @@ SSE2 static inline float MaxFloat32(__m128 a) {
   return *reinterpret_cast<float*>(&a);
 }
 
-SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
+INTGEMM_SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pack4567) {
   // No op for 128 bits: already reduced fully.
   MultiplyResult128 ret;
   ret.pack0123 = pack0123;
@@ -23,11 +23,11 @@ SSE2 static inline MultiplyResult128 PermuteSummer(__m128i pack0123, __m128i pac
   return ret;
 }
 
-AVX2 static inline float MaxFloat32(__m256 a) {
+INTGEMM_AVX2 static inline float MaxFloat32(__m256 a) {
   return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
 }
 
-AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
+INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
   // This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
   __m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
   // This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
@@ -36,8 +36,8 @@ AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
 }
 
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
   // Form [0th 128-bit register of pack0123, 0st 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
   __m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
   // Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
@@ -49,7 +49,7 @@ AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567)
 }
 
 // Find the maximum float.
-static inline AVX512DQ float MaxFloat32(__m512 a) {
+static inline INTGEMM_AVX512DQ float MaxFloat32(__m512 a) {
   return MaxFloat32(max_ps(_mm512_castps512_ps256(a), _mm512_extractf32x8_ps(a, 1)));
 }
 
@@ -70,7 +70,7 @@ template <class Register> inline Register Pack0123(Register sum0, Register sum1,
   return add_epi32(pack01, pack23);
 } */
-#define PACK0123_DEFINE(target, Register) \
+#define INTGEMM_PACK0123(target, Register) \
 target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \
   Interleave32(sum0, sum1); \
   Register pack01 = add_epi32(sum0, sum1); \
@@ -80,14 +80,14 @@ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Reg
   return add_epi32(pack01, pack23); \
 } \
 
-PACK0123_DEFINE(SSE2, __m128i)
-PACK0123_DEFINE(AVX2, __m256i)
+INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
+INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
 #ifndef INTGEMM_NO_AVX512
-/* Only AVX512F is necessary but due to GCC 5.4 bug we have to set AVX512BW */
-PACK0123_DEFINE(AVX512BW, __m512i)
+/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
+INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
 #endif
 
-// 16-bit multiplier for SSE2, AVX2, and AVX512.
+// 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
 // C = A * B * unquant_mult
 //
 // This has been substantially revised from Jacob Devlin's SSE code which is:
@@ -116,15 +116,15 @@ PACK0123_DEFINE(AVX512BW, __m512i)
 // C is output in row-major form.
 //
 // All of A, B, and C must be in aligned to a multiple of the register size:
-// SSE2: 16 bytes
-// AVX2: 32 bytes
+// INTGEMM_SSE2: 16 bytes
+// INTGEMM_AVX2: 32 bytes
 // AVX512: 64 bytes.
 //
 // A_rows can be anything non-negative.
 // width must be a multiple of the register size.
 // B_cols must be a multiple of 8.
 // Multiply16
-#define MULTIPLY16_DEFINE(Integer, target, WriteCSubType) \
+#define INTGEMM_MULTIPLY16(Integer, target, WriteCSubType) \
 template <class WriteC> target static void Multiply(const int16_t *A, const int16_t *B, WriteC C, Index A_rows, Index width, Index B_cols) { \
   assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0); \
   assert(B_cols % 8 == 0); \
@@ -180,7 +180,7 @@ PACK0123_DEFINE(AVX512BW, __m512i)
   } \
 } \
 
-/* 8-bit matrix multiply used by AVX and AVX2.
+/* 8-bit matrix multiply used by AVX and INTGEMM_AVX2.
 * These have two peculiar properties:
 * 1. The sign instructions don't exist in AVX512.
 * 2. 16 registers means gcc's register allocation failed so I wrote it in my
@@ -189,11 +189,11 @@ PACK0123_DEFINE(AVX512BW, __m512i)
 *
 * Fun fact: AVX introduced the three-argument vpsignb and vpmaddubsw but only
 * for 128-bit, despite the primary change in AVX being the addition of
- * 256-bit.  We had to wait for AVX2 to get 256-bit versions of vpsignb and
+ * 256-bit.  We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
 * vpmaddubsw.  That's why this code is generic over 128-bit or 256-bit.
 */
-AVX2 inline static void InnerAVX2(
+INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
     __m256i a, const __m256i *b,
     __m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
     __m256i &sum4, __m256i &sum5, __m256i &sum6, __m256i &sum7) {
@@ -312,8 +312,8 @@ AVX2 inline static void InnerAVX2(
 }
 
-// For SSSE3 without AVX
-SSSE3 inline static void InnerSSSE3(
+// For INTGEMM_SSSE3 without AVX
+INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
     __m128i a, const __m128i *b,
     __m128i &sum0, __m128i &sum1, __m128i &sum2, __m128i &sum3,
     __m128i &sum4, __m128i &sum5, __m128i &sum6, __m128i &sum7) {
@@ -327,8 +327,8 @@ SSSE3 inline static void InnerSSSE3(
   sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
   sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
 }
-//AVX2 or SSSE3 multiply
-#define MULTIPLY8_DEFINE(Integer, target, WriteCSubType) \
+//INTGEMM_AVX2 or INTGEMM_SSSE3 multiply
+#define INTGEMM_MULTIPLY8(Integer, target, WriteCSubType) \
 template <class WriteC> target static void Multiply(const int8_t *A, const int8_t *B, WriteC C, Index A_rows, Index width, Index B_cols) { \
   assert(width % sizeof(Integer) == 0); \
   assert(B_cols % 8 == 0); \
@@ -417,7 +417,7 @@ template <class Register> inline static float MaxAbsoluteBackend(const float *be
   return MaxFloat32(highest);
 }*/
 
-#define MAXABSOLUTE_DEFINE(Register, target) \
+#define INTGEMM_MAXABSOLUTE(Register, target) \
 target static float MaxAbsolute(const float *begin_float, const float *end_float) { \
   assert(end_float > begin_float); \
   assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); \
diff --git a/sse2_gemm.h b/sse2_gemm.h
index 882e006..9605a68 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -10,26 +10,26 @@
 namespace intgemm {
 
 namespace sse2 {
 
 // Same implementation as AVX512, just width.  Grabs 4 32-bit values.
 // TODO duplicated function requires the removal of the annonymous namespace
-SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
   return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
 }
 
-SELECT_COL_B_DEFINE(SSE2, __m128i)
+INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
 
 class QuantizeTile16 {
   public:
     typedef __m128i Integer;
 
-    SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+    INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
 
     // Quantize 8xfloat into 8xint16_t
-    SSE2 inline __m128i Consecutive(const float *input) {
+    INTGEMM_SSE2 inline __m128i Consecutive(const float *input) {
       __m128i g0 = QuantizerGrab(input, mult_reg_);
       __m128i g1 = QuantizerGrab(input + 4, mult_reg_);
       return _mm_packs_epi32(g0, g1);
     }
 
-    SSE2 inline __m128i ForReshape(const float *input, int) {
+    INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) {
       return Consecutive(input);
     }
 
@@ -39,19 +39,19 @@ class QuantizeTile16 {
 
 // Technically only requires SSE
-MAXABSOLUTE_DEFINE(__m128, SSE2)
+INTGEMM_MAXABSOLUTE(__m128, INTGEMM_SSE2)
 
 } //namespace
 
-// This should be pure SSE2 (and below).
+// This should be pure INTGEMM_SSE2 (and below).
 struct SSE2_16bit {
   typedef int16_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
-  SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
+  INTGEMM_SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 8 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
@@ -66,15 +66,15 @@ struct SSE2_16bit {
   static const Index kBTileRow = 8;
   static const Index kBTileCol = 8;
 
-  PREPARE_B_16_DEFINE(SSE2, sse2::QuantizeTile16)
+  INTGEMM_PREPARE_B_16(INTGEMM_SSE2, sse2::QuantizeTile16)
 
-  SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     //TODO #DEFINE
     sse2::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
   }
 
-  MULTIPLY16_DEFINE(__m128i, SSE2, OnSSE2)
+  INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, OnSSE2)
 
-  constexpr static const char *const kName = "16-bit SSE2";
+  constexpr static const char *const kName = "16-bit INTGEMM_SSE2";
 
   static const CPUType kUses = CPU_SSE2;
 };
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index dccc730..cf5d2c1 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -13,30 +13,30 @@
 namespace intgemm {
 
 namespace ssse3 {
 
 // Same implementation as AVX512, just width.  Grabs 4 32-bit values.
 //TODO duplicated function requires the removal of the annonymous namespace
-SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
+INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
   return _mm_cvtps_epi32(_mm_mul_ps(*reinterpret_cast<const __m128*>(input), quant_mult_reg));
 }
 
-SELECT_COL_B_DEFINE(SSSE3, __m128i)
+INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
 
 class QuantizeTile8 {
   public:
     typedef __m128i Integer;
 
-    SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
+    INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
 
-    SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+    INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
      // Skip a row.
       return Tile(input, input + 2 * cols);
     }
 
-    SSSE3 inline __m128i Consecutive(const float *input) {
+    INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) {
       return Tile(input, input + 8);
     }
 
   private:
     // Quantize 16xfloat into 16xint8_t
-    SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
+    INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1) {
      const __m128i neg128 = _mm_set1_epi8(-128);
       __m128i g0 = QuantizerGrab(input0, mult_reg_);
       __m128i g1 = QuantizerGrab(input0 + 4, mult_reg_);
@@ -47,7 +47,7 @@ class QuantizeTile8 {
       __m128i packed = _mm_packs_epi16(packed0, packed1);
       /* Ban -128.
       * Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127).  Instead,
-       * use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
+       * use INTGEMM_SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
       * The first generates 0xff for fields -128.
       * The second subtracts 0xff from -128 which has the effect of converting
       * to -127.
@@ -65,16 +65,16 @@ class QuantizeTile8 {
 
 } // namespace
 
-// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
+// pmaddubsw (the 8-bit multiply) is INTGEMM_SSSE3, so pedantically that's the version we need.
 struct SSSE3_8bit {
   typedef int8_t Integer;
 
   // Currently A is prepared by quantization but this could theoretically change.
-  SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     Quantize(input, output, quant_mult, rows * cols);
   }
 
-  SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
+  INTGEMM_SSSE3 static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
@@ -89,22 +89,22 @@ struct SSSE3_8bit {
   static const Index kBTileRow = 16;
   static const Index kBTileCol = 8;
 /*
-  SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
+  INTGEMM_SSSE3 static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
     PrepareBFor8(input, output, ssse3::QuantizeTile8(quant_mult), rows, cols);
   }*/
-  PREPARE_B_8_DEFINE(SSSE3, ssse3::QuantizeTile8)
+  INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
 
-  SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
+  INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
   }
 /*
-  SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
+  INTGEMM_SSSE3 static void Multiply(const int8_t *A, const int8_t *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols) {
     //Multiply8_SSE2OrAVX2<Multiply8_C, __m128i, __m128>(A, B, C, unquant_mult, A_rows, width, B_cols);
     Multiply8_SSE2OrAVX2__m128i<JustUnquantizeC>(A, B, JustUnquantizeC(C, unquant_mult), A_rows, width, B_cols);
   }*/
-  MULTIPLY8_DEFINE(__m128i, SSSE3, OnSSE2)
+  INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, OnSSE2)
 
-  constexpr static const char *const kName = "8-bit SSSE3";
+  constexpr static const char *const kName = "8-bit INTGEMM_SSSE3";
 
   static const CPUType kUses = CPU_SSSE3;
 };
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 6c16dec..6deb259 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -51,7 +51,7 @@ template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index co
 }
 
-TEST_CASE("Transpose 16", "[transpose]") {
+INTGEMM_SSE2 TEST_CASE("Transpose 16", "[transpose]") {
   if (kCPU < CPU_SSE2) return;
   AlignedVector<int16_t> input(8 * 8);
   for (int16_t i = 0; i < 64; ++i) {
@@ -69,7 +69,7 @@ TEST_CASE("Transpose 16", "[transpose]") {
   }
 }
 
-SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
+INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
   if (kCPU < CPU_SSSE3) return;
   AlignedVector<int8_t> input(16 * 16);
   for (int i = 0; i < 16 * 16; ++i) {
diff --git a/types.h b/types.h
--- a/types.h
+++ b/types.h
@@ -2,23 +2,23 @@
 #include <exception>
 
 #define DEFAULT __attribute__ ((target ("default")))
-#define SSE2 __attribute__ ((target ("sse2")))
+#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
 //#define SSE2_3 __attribute__ ((target ("ssse3"), target("sse2"))) //Not supported by clang
-#define SSSE3 __attribute__ ((target ("ssse3")))
-#define AVX2 __attribute__ ((target ("avx2")))
+#define INTGEMM_SSSE3 __attribute__ ((target ("ssse3")))
+#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
 //#define AVX2_512F __attribute__ ((target ("avx2"), target("avx512f"))) //Not supported by clang
 #if defined __INTEL_COMPILER
-#define AVX512F __attribute__ ((target ("avx512f")))
-#define AVX512BW __attribute__ ((target ("avx512f")))
-#define AVX512DQ __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512f")))
 #else
-#define AVX512F __attribute__ ((target ("avx512f")))
-#define AVX512BW __attribute__ ((target ("avx512bw")))
-#define AVX512DQ __attribute__ ((target ("avx512dq")))
+#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
+#define INTGEMM_AVX512BW __attribute__ ((target ("avx512bw")))
+#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512dq")))
 #endif
 namespace intgemm {
 
-// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
+// This will be thrown if a CPU isn't supported by the routines (16-bit without INTGEMM_SSE2 or 8-bit without INTGEMM_SSSE3).
 class UnsupportedCPU : public std::exception {
   public:
     UnsupportedCPU() {}
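A note on the mechanics (not part of the commit): the macros renamed in types.h above are function-level target attributes. GCC and Clang compile each annotated function with the named ISA enabled, independent of the -m flags for the rest of the translation unit, which is how one header-only library carries SSE2 through AVX512 code paths in a single binary. A minimal standalone sketch using the post-commit macro names; the function names AddFour and AddEight are illustrative only:

#include <immintrin.h>

// Same pattern as types.h after this commit: library-prefixed macro names
// avoid colliding with user code that may already define AVX2 or SSE2.
#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))

// Compiled with SSE2 enabled for this function only.
INTGEMM_SSE2 static inline __m128i AddFour(__m128i a, __m128i b) {
  return _mm_add_epi32(a, b);
}

// Compiled with AVX2 enabled for this function only.
INTGEMM_AVX2 static inline __m256i AddEight(__m256i a, __m256i b) {
  return _mm256_add_epi32(a, b);
}

The caller remains responsible for runtime dispatch (the tests above guard with checks like if (kCPU < CPU_SSE2) return;), since executing an AVX2-attributed function on a CPU without AVX2 raises an illegal-instruction fault.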
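Several of the hunks touch the quantizers, whose core step is the QuantizerGrab pattern visible in the diff: scale floats by quant_mult, convert to int32, then pack with signed saturation. A hedged SSE2-only sketch of the same idea (Quantize8 is an illustrative name, not a library function):

#include <immintrin.h>
#include <cstdint>

#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))

// Quantize 8 floats to 8 saturated int16_t values, mirroring
// sse2::QuantizeTile16::Consecutive from sse2_gemm.h above.
INTGEMM_SSE2 static inline __m128i Quantize8(const float *input, __m128 quant_mult_reg) {
  // Scale, then round-convert each float to a 32-bit integer.
  __m128i g0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), quant_mult_reg));
  __m128i g1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input + 4), quant_mult_reg));
  // Pack the two int32 vectors into one int16 vector with signed saturation.
  return _mm_packs_epi32(g0, g1);
}

The library itself loads with an aligned dereference (*reinterpret_cast<const __m128*>(input)) and asserts 16-byte alignment; _mm_loadu_ps is used here only to keep the sketch self-contained.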