Merge pull request #66 from kpu/refactoring

Refactoring
author: Kenneth Heafield <kpu@users.noreply.github.com> 2020-02-07 18:25:34 +0300
committer: GitHub <noreply@github.com> 2020-02-07 18:25:34 +0300
commit: f1d5804bd1f04f114889ca7399e3abd93eb7d128 (patch)
tree: 8f4db81db093eab678d2e104792aa0637e4c28ba
parent: ca66ffeb5b8a412ae9307715cd94d3350930a12e (diff)
parent: d96738e3978b248bf190cf62aceb91da2667071c (diff)
6 files changed, 119 insertions, 130 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h
index 85319a8..93709e4 100644
--- a/avx2_gemm.h
+++ b/avx2_gemm.h
@@ -24,23 +24,23 @@ class QuantizeTile16 {
 
     INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
 
-    INTGEMM_AVX2 Integer Consecutive(const float *input) {
+    INTGEMM_AVX2 Integer Consecutive(const float *input) const {
       return Tile(input, input + 8);
     }
 
-    INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+    INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
       return Tile(
         input,
         input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
     }
 
-    INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) const {
       // 8 rows in the first 128-bit register, 8 in the second register.
       return Tile(input, input + 8 * cols);
     }
 
   private:
-    INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) {
+    INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
       __m256i g0 = QuantizerGrab(input0, mult_);
       __m256i g1 = QuantizerGrab(input1, mult_);
       __m256i packed = _mm256_packs_epi32(g0, g1);
@@ -107,15 +107,15 @@ class QuantizeTile8 {
 
     INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
 
-    INTGEMM_AVX2 inline __m256i Consecutive(const float *input) {
+    INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
       return Tile(input, input + 8, input + 16, input + 24);
     }
 
-    INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) {
+    INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
       return TileU(input, input + 8, input + 16, input + 24);
     }
 
-    INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+    INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
       const float* inputs[4];
       for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
         while (cols_left < sizeof(Integer) / sizeof(float)) {
@@ -129,14 +129,14 @@ class QuantizeTile8 {
       return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
     }
 
-    INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
       // Put higher rows in the second half of the register.  These will jumble
       // around in the same way then conveniently land in the right place.
       return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
     }
 
   private:
-    INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+    INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
       // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
@@ -160,7 +160,7 @@ class QuantizeTile8 {
     }
 
     //A version that produces uint8_ts
-    INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) {
+    INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
       // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i pos127 = _mm256_set1_epi8(127);
diff --git a/avx512_gemm.h b/avx512_gemm.h
index efddd7a..91fdd8a 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -75,7 +75,7 @@ class QuantizeTile16 {
     /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
     INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+    INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
       auto input0 = input;
       auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
       auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_);
@@ -84,7 +84,7 @@ class QuantizeTile16 {
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
 
-    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
       __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
       __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
       __m512i packed = _mm512_packs_epi32(g0, g1);
@@ -103,7 +103,7 @@ class QuantizeTile8 {
     /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
     INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
 
-    INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+    INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
       static const __m512i neg127 = _mm512_set1_epi8(-127);
       static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
 
@@ -130,7 +130,7 @@ class QuantizeTile8 {
       return _mm512_permutexvar_epi32(shuffle_param, packed);
     }
 
-    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) {
+    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
       // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
       const __m512i neg127 = _mm512_set1_epi8(-127);
       // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
diff --git a/intgemm.h b/intgemm.h
index 8a63fb8..baba935 100644
--- a/intgemm.h
+++ b/intgemm.h
@@ -204,84 +204,13 @@ struct TileInfo {
   const Index b_cols;
 };
 
-/* 16-bit matrix multiplication. */
-template <typename Callback>
-struct Int16Mult {
-  // Multiply C = A * B, presuming A and B have been prepared.
-  static void (*Multiply)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
-};
-
-template <typename Callback>
-void (*Int16Mult<Callback>::Multiply)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512_16bit::Multiply<Callback> /*TODO VNNI 16-bit. */, AVX512_16bit::Multiply<Callback>, AVX2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, Unsupported_16bit::Multiply);
-
-struct Int16 {
-  using Integer = int16_t;
-
-  // A's size must be a multiple of 1x32.
-  // B's size must be a multiple of 32x8.
-  static constexpr TileInfo tile_info{1, 32, 32, 8};
-
-  // Currently A is prepared by quantization but this could theoretically change.
-  // A's columns must be a multiple of 8.
-  // The number of rows is anything.
-  static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
-    Quantize(input, output, quant_mult, rows * cols);
-  }
-
-  // Multiply floats by quant_mult then convert to 16-bit integers with saturation.
-  // input
-  static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
-
-  // Warning: the output of PrepareB depends on the CPU.
-  // It will match the Multiply function on the same CPU though.
-  static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
-
-  // Convert from a B that was already transposed (routine not provided) and
-  // quantized (e.g. with Quantize) to the CPU-dependent format used for
-  // Multiply.  This is useful for storing a quantized model on disk then in a
-  // CPU-independent fashion.
-  static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
-
-  // Convert from a B that was already transposed (routine not provided) to
-  // the CPU-dependent format used for Multiply.  This is useful for storing
-  // a quantized model on disk then in a CPU-independent fashion.
-  static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
-
-  // Select columns from a prepared B matrix.  The number of selected columns must be a multiple of 8. 
-  static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
-
-  // Multiply C = A * B, presuming A and B have been prepared.
-  template <typename Callback>
-  static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
-    Int16Mult<Callback>::Multiply(A, B, A_rows, width, B_cols, callback);
-  }
-
-  static const char *const kName;
-};
-
-/* 8-bit matrix multiplication */
-template <typename Callback>
-struct Int8Mult {
-  // Multiply C = A * B, presuming A and B have been prepared.
-  static void (*Multiply)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
-  static void (*Multiply8Shift)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
-  static void (*PrepareBias)(const int8_t *B, Index width, Index B_cols, Callback callback);
-};
-
-template <typename Callback>
-void (*Int8Mult<Callback>::Multiply)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply<Callback>, AVX512_8bit::Multiply<Callback>, AVX2_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, Unsupported_8bit::Multiply);
-
-template <class Callback>
-void (*Int8Mult<Callback>::Multiply8Shift)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply8Shift<Callback>, AVX512_8bit::Multiply8Shift<Callback>, AVX2_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift);
-
-template <class Callback>
-void (*Int8Mult<Callback>::PrepareBias)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
-
+/*
+ * 8-bit matrix multiplication
+ */
 struct Int8 {
   using Integer = int8_t;
 
-  // A's size must be a multiple of 1x64.
-  // B's size must be a multiple of 64x8.
+  // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
   static constexpr TileInfo tile_info{1, 64, 64, 8};
 
   // Currently A is prepared by quantization but this could theoretically change.
@@ -319,22 +248,29 @@ struct Int8 {
   // Multiply C = A * B, presuming A and B have been prepared.
   template <typename Callback>
   static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
-    Int8Mult<Callback>::Multiply(A, B, A_rows, width, B_cols, callback);
+    MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
   }
   
   static const char *const kName;
+
+private:
+  template <typename Callback>
+  struct MultiplyImpl {
+    static void (*run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+  };
 };
 
-// Shifting A by 127 version of the above code
+template <typename Callback>
+void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply<Callback>, AVX512_8bit::Multiply<Callback>, AVX2_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, Unsupported_8bit::Multiply);
+
+/*
+ * 8-bit matrix multiplication with shifting A by 127
+ */
 struct Int8Shift {
-  typedef int8_t Integer;
+  using Integer = int8_t;
 
-  // A's size must be a multiple of 1x64.
-  static const Index kATileRow = 1;
-  static const Index kATileCol = 64;
-  // B's size must be a multiple of 64x8.
-  static const Index kBTileRow = 64;
-  static const Index kBTileCol = 8;
+  // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
+  static constexpr TileInfo tile_info{1, 64, 64, 8};
 
   // Identical to the Int8 Version, except it adds 127 to each number, making sure that all numbers are positive.
   static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
@@ -360,7 +296,7 @@ struct Int8Shift {
   // Multiply C = A * B + Bias, presuming A, B and Bias have all been prepared (for A, PrepareAnew should be used
   template<class Callback>
   static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
-    Int8Mult<Callback>::Multiply8Shift((const uint8_t *)A, B, A_rows, width, B_cols, callback);
+    MultiplyImpl<Callback>::run((const uint8_t *)A, B, A_rows, width, B_cols, callback);
   }
 
   // This function prepares the bias for the Multiply routine that does unsigned * signed multiplication.
@@ -370,12 +306,85 @@ struct Int8Shift {
   // unquant_mult is computed by (-1)*(alpha)*(alpha)/(127.0f);
   template<class Callback>
   static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
-    Int8Mult<Callback>::PrepareBias(B, width, B_cols, callback);
+    PrepareBiasImpl<Callback>::run(B, width, B_cols, callback);
   }
   
   static const char *const kName;
+
+private:
+  template <typename Callback>
+  struct MultiplyImpl {
+    static void (*run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+  };
+
+  template <typename Callback>
+  struct PrepareBiasImpl {
+    static void (*run)(const int8_t *B, Index width, Index B_cols, Callback callback);
+  };
+};
+
+template <class Callback>
+void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply8Shift<Callback>, AVX512_8bit::Multiply8Shift<Callback>, AVX2_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift);
+
+template <class Callback>
+void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
+
+/*
+ * 16-bit matrix multiplication
+ */
+struct Int16 {
+  using Integer = int16_t;
+
+  // A's size must be a multiple of 1x32, B's size must be a multiple of 32x8.
+  static constexpr TileInfo tile_info{1, 32, 32, 8};
+
+  // Currently A is prepared by quantization but this could theoretically change.
+  // A's columns must be a multiple of 8.
+  // The number of rows is anything.
+  static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
+    Quantize(input, output, quant_mult, rows * cols);
+  }
+
+  // Multiply floats by quant_mult then convert to 16-bit integers with saturation.
+  // input
+  static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
+
+  // Warning: the output of PrepareB depends on the CPU.
+  // It will match the Multiply function on the same CPU though.
+  static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
+
+  // Convert from a B that was already transposed (routine not provided) and
+  // quantized (e.g. with Quantize) to the CPU-dependent format used for
+  // Multiply.  This is useful for storing a quantized model on disk then in a
+  // CPU-independent fashion.
+  static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
+
+  // Convert from a B that was already transposed (routine not provided) to
+  // the CPU-dependent format used for Multiply.  This is useful for storing
+  // a quantized model on disk then in a CPU-independent fashion.
+  static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
+
+  // Select columns from a prepared B matrix.  The number of selected columns must be a multiple of 8. 
+  static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
+
+  // Multiply C = A * B, presuming A and B have been prepared.
+  template <typename Callback>
+  static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
+    MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
+  }
+
+  static const char *const kName;
+
+private:
+  template <typename Callback>
+  struct MultiplyImpl {
+    static void (*run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
+  };
 };
 
+template <typename Callback>
+void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512_16bit::Multiply<Callback> /*TODO VNNI 16-bit. */, AVX512_16bit::Multiply<Callback>, AVX2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, Unsupported_16bit::Multiply);
+
 extern const CPUType kCPU;
 
 // Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and 64-byte aligned.
diff --git a/multiply.h b/multiply.h
index 1fc5ed5..abc224c 100644
--- a/multiply.h
+++ b/multiply.h
@@ -561,26 +561,6 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
   } \
 } \
 
-
-// Find the maximum absolute value of packed float32s.
-/*
-template <class Register> inline static float MaxAbsoluteBackend(const float *begin_float, const float *end_float) {
-  assert(end_float > begin_float);
-  assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0);
-  const Register *begin = reinterpret_cast<const Register*>(begin_float);
-  const Register *end = reinterpret_cast<const Register*>(end_float);
-  // Get the sign bit.
-  union {float f; int32_t i;} float_convert;
-  float_convert.i = 0x7fffffff;
-  Register and_me = set1_ps<Register>(float_convert.f);
-  Register highest = and_ps(and_me, *begin);
-  for (++begin; begin != end; ++begin) {
-    Register reg = and_ps(and_me, *begin);
-    highest = max_ps(highest, reg);
-  }
-
-  return MaxFloat32(highest);
-}*/
 #define INTGEMM_MAXABSOLUTE(Register, target) \
 target static float MaxAbsolute(const float *begin_float, const float *end_float) { \
   assert(end_float > begin_float); \
diff --git a/sse2_gemm.h b/sse2_gemm.h
index a27b358..a2bae35 100644
--- a/sse2_gemm.h
+++ b/sse2_gemm.h
@@ -25,22 +25,22 @@ class QuantizeTile16 {
 
     INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
 
-    INTGEMM_SSE2 inline __m128i Consecutive(const float *input) {
+    INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const {
       return Tile(input, input + 4);
     }
 
-    INTGEMM_SSE2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+    INTGEMM_SSE2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
       return Tile(
         input,
         input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
     }
 
-    INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) {
+    INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const {
       return Consecutive(input);
     }
 
   private:
-    INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) {
+    INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const {
       __m128i g0 = QuantizerGrab(input0, mult_reg_);
       __m128i g1 = QuantizerGrab(input1, mult_reg_);
       return _mm_packs_epi32(g0, g1);
diff --git a/ssse3_gemm.h b/ssse3_gemm.h
index 18bf14b..40a26f4 100644
--- a/ssse3_gemm.h
+++ b/ssse3_gemm.h
@@ -26,20 +26,20 @@ class QuantizeTile8 {
 
     INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
 
-    INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) {
+    INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const {
       // Skip a row.
       return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4);
     }
 
-    INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) {
+    INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const {
       return Tile(input, input + 4, input + 8, input + 12);
     }
 
-    INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) {
+    INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const {
       return TileU(input, input + 4, input + 8, input + 12);
     }
 
-    INTGEMM_SSSE3 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) {
+    INTGEMM_SSSE3 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
       const float* inputs[4];
       for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
         while (cols_left < sizeof(Integer) / sizeof(float)) {
@@ -55,7 +55,7 @@ class QuantizeTile8 {
 
   private:
     // Quantize 16xfloat into 16xint8_t
-    INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) {
+    INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
       const __m128i neg128 = _mm_set1_epi8(-128);
       __m128i g0 = QuantizerGrab(input0, mult_reg_);
       __m128i g1 = QuantizerGrab(input1, mult_reg_);
@@ -77,7 +77,7 @@ class QuantizeTile8 {
       // No permute needed.  packs is in order for SSE.
     }
 
-    INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) {
+    INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
       const __m128i neg128 = _mm_set1_epi8(-128);
       const __m128i pos127 = _mm_set1_epi8(127);
       __m128i g0 = QuantizerGrab(input0, mult_reg_);
author	Kenneth Heafield <kpu@users.noreply.github.com>	2020-02-07 18:25:34 +0300
committer	GitHub <noreply@github.com>	2020-02-07 18:25:34 +0300
commit	f1d5804bd1f04f114889ca7399e3abd93eb7d128 (patch)
tree	8f4db81db093eab678d2e104792aa0637e4c28ba
parent	ca66ffeb5b8a412ae9307715cd94d3350930a12e (diff)
parent	d96738e3978b248bf190cf62aceb91da2667071c (diff)