author     Kenneth Heafield <github@kheafield.com>    2020-09-15 23:53:06 +0300
committer  Kenneth Heafield <github@kheafield.com>    2020-09-15 23:53:06 +0300
commit     ada32a77770a98d5983647a8c02b5b65e18bb754 (patch)
tree       c439d2c7ac7765b17d3d0a8b17e4bb70c3c354c9
parent     fdbd834e92da7d32c0363e8eb5200ec301d0f146 (diff)
Change quant_mult from member variable to static parameter
Might fix gcc 5.4
-rw-r--r--   intgemm/avx2_gemm.h    |  74
-rw-r--r--   intgemm/avx512_gemm.h  |  44
-rw-r--r--   intgemm/interleave.h   |  26
-rw-r--r--   intgemm/multiply.h     |  12
-rw-r--r--   intgemm/sse2_gemm.h    |  26
-rw-r--r--   intgemm/ssse3_gemm.h   |  51
6 files changed, 102 insertions, 131 deletions
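The refactor follows one pattern across all files: each QuantizeTile* class used to broadcast the float multiplier into a SIMD register in its constructor and keep it as a `mult_` member; after this commit the methods are static and take the already-broadcast multiplier as their first parameter, with callers building it once via `set1_ps<FRegister>(quant_mult)`. The sketch below illustrates that pattern on a stripped-down SSE2 quantizer; the class names `QuantizeTileBefore`/`QuantizeTileAfter` and the `QuantizeAfter` helper are invented for illustration and are not intgemm's actual API.

```cpp
// Minimal sketch of the member-variable -> static-parameter refactor.
#include <emmintrin.h>
#include <cstdint>

// Before: the broadcast multiplier is captured as class state.
class QuantizeTileBefore {
 public:
  explicit QuantizeTileBefore(float mult) : mult_(_mm_set1_ps(mult)) {}
  __m128i Consecutive(const float *input) const {
    // Scale four floats and round-convert them to int32.
    return _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult_));
  }
 private:
  const __m128 mult_;
};

// After: stateless static method; the multiplier register is a parameter.
struct QuantizeTileAfter {
  static inline __m128i Consecutive(__m128 mult_reg, const float *input) {
    return _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult_reg));
  }
};

void QuantizeAfter(const float *input, int32_t *output, float quant_mult, int size) {
  __m128 q = _mm_set1_ps(quant_mult);  // broadcast once at the call site
  for (int i = 0; i < size; i += 4) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i),
                     QuantizeTileAfter::Consecutive(q, input + i));
  }
}
```

Dropping the member also removes the constructor, so the classes become plain bundles of static functions, which is presumably what sidesteps the GCC 5.4 issue mentioned in the commit message.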
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h
index d111b32..5e81475 100644
--- a/intgemm/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -19,34 +19,30 @@ INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
   public:
-    INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
-    INTGEMM_AVX2 Register Consecutive(const float *input) const {
-      return Tile(input, input + 8);
+    INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+      return Tile(mult_reg, input, input + 8);
     }
 
-    INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
-      return Tile(
+    INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+      return Tile(mult_reg,
         input,
         input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
     }
 
-    INTGEMM_AVX2 Register ForReshape(const float *input, Index cols) const {
+    INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
       // 8 rows in the first 128-bit register, 8 in the second register.
-      return Tile(input, input + 8 * cols);
+      return Tile(mult_reg, input, input + 8 * cols);
     }
 
   private:
-    INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
-      Register g0 = QuantizerGrab(input0, mult_);
-      Register g1 = QuantizerGrab(input1, mult_);
+    INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
+      Register g0 = QuantizerGrab(input0, mult_reg);
+      Register g1 = QuantizerGrab(input1, mult_reg);
       Register packed = _mm256_packs_epi32(g0, g1);
       // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
       // Technically this could be removed if the PrepareB did the same reordering internally.
      return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
-
-    const FRegister mult_;
 };
 
 struct Kernels16 {
@@ -61,10 +57,10 @@ struct Kernels16 {
   INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile16 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 16, output += 16) {
-      *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
     }
   }
 
@@ -96,17 +92,15 @@ struct Kernels16 {
  */
 class QuantizeTile8 {
   public:
-    INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
-    INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
-      return Tile(input, input + 8, input + 16, input + 24);
+    INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
+      return Tile(quant_mult, input, input + 8, input + 16, input + 24);
     }
 
-    INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
-      return TileU(input, input + 8, input + 16, input + 24);
+    INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
+      return TileU(quant_mult, input, input + 8, input + 16, input + 24);
     }
 
-    INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
       const float* inputs[4];
       for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
         while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -117,24 +111,24 @@ class QuantizeTile8 {
         input += sizeof(Register) / sizeof(float);
         cols_left -= sizeof(Register) / sizeof(float);
       }
-      return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+      return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
     }
 
-    INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
+    INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
       // Put higher rows in the second half of the register.  These will jumble
       // around in the same way then conveniently land in the right place.
-      return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+      return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
     }
 
-    INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
       // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
       // Grab 4 registers at a time in 32-bit format.
-      __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-      __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-      __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-      __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+      __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+      __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+      __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+      __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
       // Pack 32-bit to 16-bit.
       __m256i packed0 = _mm256_packs_epi32(g0, g1);
       __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -151,16 +145,16 @@ class QuantizeTile8 {
 
   private:
     //A version that produces uint8_ts
-    INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
      // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i pos127 = _mm256_set1_epi8(127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
       // Grab 4 registers at a time in 32-bit format.
-      __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-      __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-      __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-      __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+      __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+      __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+      __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+      __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
       // Pack 32-bit to 16-bit.
       __m256i packed0 = _mm256_packs_epi32(g0, g1);
       __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -175,8 +169,6 @@ class QuantizeTile8 {
       // and the values are only used for GEMM.
       return _mm256_permutevar8x32_epi32(packed, shuffle_param);
     }
-
-    const __m256 mult_;
 };
 
 struct Kernels8 {
@@ -187,9 +179,9 @@ struct Kernels8 {
     Quantize(input, output, quant_mult, rows * cols);
   }
  private:
-  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
  public:
-  INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE(INTGEMM_AVX2)
 
   // Currently A is prepared by quantization but this could theoretically change.
   INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
@@ -200,10 +192,10 @@ struct Kernels8 {
   INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
     assert(size % 32 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile8 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
    const float *end = input + size;
     for (; input != end; input += 32, output += 32) {
-      *reinterpret_cast<__m256i*>(output) = q.ConsecutiveU(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
     }
   }
 
diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h
index 6108289..f9fb1eb 100644
--- a/intgemm/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -66,36 +66,27 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const f
 // being used for the quantizer.
 class QuantizeTile16 {
   public:
-    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
-    INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
-    INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
       auto input0 = input;
       auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
-      auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_);
-      auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_);
+      auto g0 = QuantizerGrabHalves(input0, input1, quant_mult);
+      auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult);
       auto packed = packs_epi32(g0, g1);
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
 
-    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
-      __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
-      __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
+    INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
+      __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult);
+      __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult);
       __m512i packed = packs_epi32(g0, g1);
       // Permute within 256-bit lanes, so same as INTGEMM_AVX2
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
-
-  private:
-    const __m512 mult_reg_;
 };
 
 class QuantizeTile8 {
   public:
-    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
-    INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
-    INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
       static const __m512i neg127 = _mm512_set1_epi8(-127);
       static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
 
@@ -110,10 +101,10 @@ class QuantizeTile8 {
         cols_left -= sizeof(Register) / sizeof(float);
       }
 
-      auto g0 = QuantizerGrab(inputs[0], mult_reg_);
-      auto g1 = QuantizerGrab(inputs[1], mult_reg_);
-      auto g2 = QuantizerGrab(inputs[2], mult_reg_);
-      auto g3 = QuantizerGrab(inputs[3], mult_reg_);
+      auto g0 = QuantizerGrab(inputs[0], quant_mult);
+      auto g1 = QuantizerGrab(inputs[1], quant_mult);
+      auto g2 = QuantizerGrab(inputs[2], quant_mult);
+      auto g3 = QuantizerGrab(inputs[3], quant_mult);
 
       auto packed0 = packs_epi32(g0, g1);
       auto packed1 = packs_epi32(g2, g3);
@@ -122,17 +113,17 @@ class QuantizeTile8 {
       return _mm512_permutexvar_epi32(shuffle_param, packed);
     }
 
-    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
+    INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) {
       // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
       const __m512i neg127 = _mm512_set1_epi8(-127);
       // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
       const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
 
       // 32-bit format.
-      __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
-      __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
-      __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
-      __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
+      __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult);
+      __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
+      __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
+      __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
       // Pack 32-bit to 16-bit.
       __m512i packed0 = packs_epi32(g0, g1);
       __m512i packed1 = packs_epi32(g2, g3);
@@ -143,9 +134,6 @@ class QuantizeTile8 {
       // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
       return _mm512_permutexvar_epi32(shuffle_param, packed);
     }
-
-  private:
-    const __m512 mult_reg_;
 };
 
 struct Kernels16 {
diff --git a/intgemm/interleave.h b/intgemm/interleave.h
index 1ac14b7..1ec686b 100644
--- a/intgemm/interleave.h
+++ b/intgemm/interleave.h
@@ -179,7 +179,7 @@ template <class Register> static inline void Transpose8InLane(
 // ... ...
 #define INTGEMM_PREPARE_B_8(target, QuantClass) \
 target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
-  QuantClass q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   /* Currently all multipliers have a stride of 8 columns.*/ \
   const Index kColStride = 8; \
   assert(cols % kColStride == 0); \
@@ -193,14 +193,14 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
         This isn't quite Transpose8InLane because it's half the number of columns, \
         so each register starts with two rows instead of being one row. \
         The quantizers know to skip a row.*/ \
-      output[0] = q.ForReshape(input + cols * (r ) + c, cols); \
-      output[1] = q.ForReshape(input + cols * (r + 1) + c, cols); \
-      output[2] = q.ForReshape(input + cols * (r + 4) + c, cols); \
-      output[3] = q.ForReshape(input + cols * (r + 5) + c, cols); \
-      output[4] = q.ForReshape(input + cols * (r + 8) + c, cols); \
-      output[5] = q.ForReshape(input + cols * (r + 9) + c, cols); \
-      output[6] = q.ForReshape(input + cols * (r + 12) + c, cols); \
-      output[7] = q.ForReshape(input + cols * (r + 13) + c, cols); \
+      output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \
+      output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \
+      output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \
+      output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \
+      output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \
+      output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \
+      output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \
+      output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \
       Interleave8(output[0], output[1]); \
       Interleave8(output[2], output[3]); \
       Interleave8(output[4], output[5]); \
@@ -212,7 +212,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
 
 #define INTGEMM_PREPARE_B_16(target, QuantClass) \
 target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
-  QuantClass q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   assert(cols % 8 == 0); \
   assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
@@ -222,7 +222,7 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
   for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
     /* gcc unrolls this loop and uses registers for output[k]*/ \
     for (Index k = 0; k < 8; ++k) { \
-      output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \
+      output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \
     } \
     Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
   } \
@@ -270,13 +270,13 @@ target static inline void PrepareBTransposed(const float* input, Integer* output
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
   \
-  Quantizer quantizer(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   Register* output_it = reinterpret_cast<Register*>(output); \
   Index r = 0; \
   Index c = 0; \
   while (r < rows) { \
     for (Index ri = 0; ri < 8; ++ri) \
-      *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \
+      *output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \
     c += RegisterElemsInt; \
     while (c >= cols) { \
       r += kColStride; \
diff --git a/intgemm/multiply.h b/intgemm/multiply.h
index 4b7558d..e201e09 100644
--- a/intgemm/multiply.h
+++ b/intgemm/multiply.h
@@ -47,16 +47,16 @@ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i p
 // Quantize function used for SSSE3 and AVX2.
 // Separate function for thread to work around gcc 7 bug that doesn't imbue
 // target attributes across #pragma omp parallel.
-#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+#define INTGEMM_QUANTIZE_THREAD(target) \
 target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
-  name::QuantizeTile8 q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   INTGEMM_OMP_FOR \
   for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
-    *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+    *reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \
   } \
 }
 
-#define INTGEMM_QUANTIZE(target, Register, name) \
+#define INTGEMM_QUANTIZE(target) \
 target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
@@ -68,7 +68,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
   } \
   std::size_t overhang = size & (kBatch - 1); \
   if (!overhang) return; \
-  name::QuantizeTile8 q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   /* Each does size(Register) / 32 == kBatch / 4 floats at a time.
    * If we're allowed to read one of them, then we can read the whole register. */ \
   const float *inputs[4]; \
@@ -80,7 +80,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
   for (; i < 4; ++i) { \
     inputs[i] = &input[fast_end]; \
   } \
-  Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \
+  Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \
   std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \
 }
diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h
index dc3fa60..cd49efe 100644
--- a/intgemm/sse2_gemm.h
+++ b/intgemm/sse2_gemm.h
@@ -19,30 +19,26 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
 
 class QuantizeTile16 {
   public:
-    INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
-    INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const {
-      return Tile(input, input + 4);
+    INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) {
+      return Tile(mult_reg, input, input + 4);
     }
 
-    INTGEMM_SSE2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
-      return Tile(
+    INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+      return Tile(mult_reg,
        input,
        input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
     }
 
-    INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const {
-      return Consecutive(input);
+    INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) {
+      return Consecutive(mult_reg, input);
     }
 
   private:
-    INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const {
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input1, mult_reg_);
+    INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) {
+      __m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg);
+      __m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg);
       return _mm_packs_epi32(g0, g1);
     }
-
-    const __m128 mult_reg_;
 };
 
 // This should be pure SSE2 (and below).
@@ -58,10 +54,10 @@ struct Kernels16 {
     assert(size % 8 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
-    sse2::QuantizeTile16 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 8, output += 8) {
-      *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+      *reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input);
     }
   }
diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h
index beaf4b1..865fe12 100644
--- a/intgemm/ssse3_gemm.h
+++ b/intgemm/ssse3_gemm.h
@@ -21,22 +21,20 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
 
 class QuantizeTile8 {
   public:
-    INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
-    INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const {
+    INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
       // Skip a row.
-      return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4);
+      return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4);
     }
 
-    INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const {
-      return Tile(input, input + 4, input + 8, input + 12);
+    INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+      return Tile(mult_reg, input, input + 4, input + 8, input + 12);
     }
 
-    INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const {
-      return TileU(input, input + 4, input + 8, input + 12);
+    INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) {
+      return TileU(mult_reg, input, input + 4, input + 8, input + 12);
     }
 
-    INTGEMM_SSSE3 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
       const float* inputs[4];
       for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
         while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -47,16 +45,16 @@ class QuantizeTile8 {
         input += sizeof(Register) / sizeof(float);
         cols_left -= sizeof(Register) / sizeof(float);
       }
-      return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+      return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]);
     }
 
     // Quantize 16xfloat into 16xint8_t
-    INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
      const __m128i neg128 = _mm_set1_epi8(-128);
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input1, mult_reg_);
-      __m128i g2 = QuantizerGrab(input2, mult_reg_);
-      __m128i g3 = QuantizerGrab(input3, mult_reg_);
+      __m128i g0 = QuantizerGrab(input0, mult_reg);
+      __m128i g1 = QuantizerGrab(input1, mult_reg);
+      __m128i g2 = QuantizerGrab(input2, mult_reg);
+      __m128i g3 = QuantizerGrab(input3, mult_reg);
       __m128i packed0 = _mm_packs_epi32(g0, g1);
       __m128i packed1 = _mm_packs_epi32(g2, g3);
       __m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -74,13 +72,13 @@ class QuantizeTile8 {
     }
 
   private:
-    INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
       const __m128i neg128 = _mm_set1_epi8(-128);
       const __m128i pos127 = _mm_set1_epi8(127);
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input1, mult_reg_);
-      __m128i g2 = QuantizerGrab(input2, mult_reg_);
-      __m128i g3 = QuantizerGrab(input3, mult_reg_);
+      __m128i g0 = QuantizerGrab(input0, mult_reg);
+      __m128i g1 = QuantizerGrab(input1, mult_reg);
+      __m128i g2 = QuantizerGrab(input2, mult_reg);
+      __m128i g3 = QuantizerGrab(input3, mult_reg);
       __m128i packed0 = _mm_packs_epi32(g0, g1);
       __m128i packed1 = _mm_packs_epi32(g2, g3);
       __m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -96,9 +94,6 @@ class QuantizeTile8 {
       return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127);
       // No permute needed.  packs is in order for SSE.
     }
-
-  private:
-    const FRegister mult_reg_;
 };
 
 // pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
@@ -111,9 +106,9 @@ struct Kernels8 {
   }
  private:
-  INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3)
  public:
-  INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
+  INTGEMM_QUANTIZE(INTGEMM_SSSE3)
 
   // Version with unsigned int + 127
   // Currently A is prepared by quantization but this could theoretically change.
@@ -125,10 +120,10 @@ struct Kernels8 {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
-    ssse3::QuantizeTile8 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 16, output += 16) {
-      *reinterpret_cast<__m128i*>(output) = q.ConsecutiveU(input);
+      *reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
     }
   }
 
@@ -138,7 +133,7 @@ struct Kernels8 {
   INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
   INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
-  INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t)
+  INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
 
   INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);