diff options
author | Kenneth Heafield <kpu@users.noreply.github.com> | 2020-02-07 18:25:34 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-07 18:25:34 +0300 |
commit | f1d5804bd1f04f114889ca7399e3abd93eb7d128 (patch) | |
tree | 8f4db81db093eab678d2e104792aa0637e4c28ba | |
parent | ca66ffeb5b8a412ae9307715cd94d3350930a12e (diff) | |
parent | d96738e3978b248bf190cf62aceb91da2667071c (diff) |
Merge pull request #66 from kpu/refactoring
Refactoring
-rw-r--r-- | avx2_gemm.h | 20 | ||||
-rw-r--r-- | avx512_gemm.h | 8 | ||||
-rw-r--r-- | intgemm.h | 181 | ||||
-rw-r--r-- | multiply.h | 20 | ||||
-rw-r--r-- | sse2_gemm.h | 8 | ||||
-rw-r--r-- | ssse3_gemm.h | 12 |
6 files changed, 119 insertions, 130 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h index 85319a8..93709e4 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -24,23 +24,23 @@ class QuantizeTile16 { INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {} - INTGEMM_AVX2 Integer Consecutive(const float *input) { + INTGEMM_AVX2 Integer Consecutive(const float *input) const { return Tile(input, input + 8); } - INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { return Tile( input, input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0)); } - INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) { + INTGEMM_AVX2 Integer ForReshape(const float *input, Index cols) const { // 8 rows in the first 128-bit register, 8 in the second register. return Tile(input, input + 8 * cols); } private: - INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) { + INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const { __m256i g0 = QuantizerGrab(input0, mult_); __m256i g1 = QuantizerGrab(input1, mult_); __m256i packed = _mm256_packs_epi32(g0, g1); @@ -107,15 +107,15 @@ class QuantizeTile8 { INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {} - INTGEMM_AVX2 inline __m256i Consecutive(const float *input) { + INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const { return Tile(input, input + 8, input + 16, input + 24); } - INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) { + INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const { return TileU(input, input + 8, input + 16, input + 24); } - INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + INTGEMM_AVX2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { const float* inputs[4]; for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { while (cols_left < sizeof(Integer) / sizeof(float)) { @@ -129,14 +129,14 @@ class QuantizeTile8 { return Tile(inputs[0], inputs[1], inputs[2], inputs[3]); } - INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) { + INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const { // Put higher rows in the second half of the register. These will jumble // around in the same way then conveniently land in the right place. return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols); } private: - INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) { + INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const { // Looking at the assembly, gcc has pulled this outside the loops calling this. const __m256i neg127 = _mm256_set1_epi8(-127); const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); @@ -160,7 +160,7 @@ class QuantizeTile8 { } //A version that produces uint8_ts - INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) { + INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const { // Looking at the assembly, gcc has pulled this outside the loops calling this. const __m256i neg127 = _mm256_set1_epi8(-127); const __m256i pos127 = _mm256_set1_epi8(127); diff --git a/avx512_gemm.h b/avx512_gemm.h index efddd7a..91fdd8a 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -75,7 +75,7 @@ class QuantizeTile16 { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { auto input0 = input; auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0); auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_); @@ -84,7 +84,7 @@ class QuantizeTile16 { return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */); } - INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) { + INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const { __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_); __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_); __m512i packed = _mm512_packs_epi32(g0, g1); @@ -103,7 +103,7 @@ class QuantizeTile8 { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {} - INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + INTGEMM_AVX512BW Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { static const __m512i neg127 = _mm512_set1_epi8(-127); static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); @@ -130,7 +130,7 @@ class QuantizeTile8 { return _mm512_permutexvar_epi32(shuffle_param, packed); } - INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) { + INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const { // TODO: try alternative: _mm512_cvtsepi32_epi8 ? const __m512i neg127 = _mm512_set1_epi8(-127); // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc. @@ -204,84 +204,13 @@ struct TileInfo { const Index b_cols; }; -/* 16-bit matrix multiplication. */ -template <typename Callback> -struct Int16Mult { - // Multiply C = A * B, presuming A and B have been prepared. - static void (*Multiply)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback); -}; - -template <typename Callback> -void (*Int16Mult<Callback>::Multiply)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512_16bit::Multiply<Callback> /*TODO VNNI 16-bit. */, AVX512_16bit::Multiply<Callback>, AVX2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, Unsupported_16bit::Multiply); - -struct Int16 { - using Integer = int16_t; - - // A's size must be a multiple of 1x32. - // B's size must be a multiple of 32x8. - static constexpr TileInfo tile_info{1, 32, 32, 8}; - - // Currently A is prepared by quantization but this could theoretically change. - // A's columns must be a multiple of 8. - // The number of rows is anything. - static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { - Quantize(input, output, quant_mult, rows * cols); - } - - // Multiply floats by quant_mult then convert to 16-bit integers with saturation. - // input - static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size); - - // Warning: the output of PrepareB depends on the CPU. - // It will match the Multiply function on the same CPU though. - static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols); - - // Convert from a B that was already transposed (routine not provided) and - // quantized (e.g. with Quantize) to the CPU-dependent format used for - // Multiply. This is useful for storing a quantized model on disk then in a - // CPU-independent fashion. - static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols); - - // Convert from a B that was already transposed (routine not provided) to - // the CPU-dependent format used for Multiply. This is useful for storing - // a quantized model on disk then in a CPU-independent fashion. - static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols); - - // Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8. - static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end); - - // Multiply C = A * B, presuming A and B have been prepared. - template <typename Callback> - static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - Int16Mult<Callback>::Multiply(A, B, A_rows, width, B_cols, callback); - } - - static const char *const kName; -}; - -/* 8-bit matrix multiplication */ -template <typename Callback> -struct Int8Mult { - // Multiply C = A * B, presuming A and B have been prepared. - static void (*Multiply)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback); - static void (*Multiply8Shift)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback); - static void (*PrepareBias)(const int8_t *B, Index width, Index B_cols, Callback callback); -}; - -template <typename Callback> -void (*Int8Mult<Callback>::Multiply)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply<Callback>, AVX512_8bit::Multiply<Callback>, AVX2_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, Unsupported_8bit::Multiply); - -template <class Callback> -void (*Int8Mult<Callback>::Multiply8Shift)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply8Shift<Callback>, AVX512_8bit::Multiply8Shift<Callback>, AVX2_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift); - -template <class Callback> -void (*Int8Mult<Callback>::PrepareBias)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); - +/* + * 8-bit matrix multiplication + */ struct Int8 { using Integer = int8_t; - // A's size must be a multiple of 1x64. - // B's size must be a multiple of 64x8. + // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8. static constexpr TileInfo tile_info{1, 64, 64, 8}; // Currently A is prepared by quantization but this could theoretically change. @@ -319,22 +248,29 @@ struct Int8 { // Multiply C = A * B, presuming A and B have been prepared. template <typename Callback> static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - Int8Mult<Callback>::Multiply(A, B, A_rows, width, B_cols, callback); + MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback); } static const char *const kName; + +private: + template <typename Callback> + struct MultiplyImpl { + static void (*run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback); + }; }; -// Shifting A by 127 version of the above code +template <typename Callback> +void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply<Callback>, AVX512_8bit::Multiply<Callback>, AVX2_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, SSSE3_8bit::Multiply<Callback>, Unsupported_8bit::Multiply); + +/* + * 8-bit matrix multiplication with shifting A by 127 + */ struct Int8Shift { - typedef int8_t Integer; + using Integer = int8_t; - // A's size must be a multiple of 1x64. - static const Index kATileRow = 1; - static const Index kATileCol = 64; - // B's size must be a multiple of 64x8. - static const Index kBTileRow = 64; - static const Index kBTileCol = 8; + // A's size must be a multiple of 1x64, B's size must be a multiple of 64x8. + static constexpr TileInfo tile_info{1, 64, 64, 8}; // Identical to the Int8 Version, except it adds 127 to each number, making sure that all numbers are positive. static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) { @@ -360,7 +296,7 @@ struct Int8Shift { // Multiply C = A * B + Bias, presuming A, B and Bias have all been prepared (for A, PrepareAnew should be used template<class Callback> static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { - Int8Mult<Callback>::Multiply8Shift((const uint8_t *)A, B, A_rows, width, B_cols, callback); + MultiplyImpl<Callback>::run((const uint8_t *)A, B, A_rows, width, B_cols, callback); } // This function prepares the bias for the Multiply routine that does unsigned * signed multiplication. @@ -370,12 +306,85 @@ struct Int8Shift { // unquant_mult is computed by (-1)*(alpha)*(alpha)/(127.0f); template<class Callback> static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { - Int8Mult<Callback>::PrepareBias(B, width, B_cols, callback); + PrepareBiasImpl<Callback>::run(B, width, B_cols, callback); } static const char *const kName; + +private: + template <typename Callback> + struct MultiplyImpl { + static void (*run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback); + }; + + template <typename Callback> + struct PrepareBiasImpl { + static void (*run)(const int8_t *B, Index width, Index B_cols, Callback callback); + }; +}; + +template <class Callback> +void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::Multiply8Shift<Callback>, AVX512_8bit::Multiply8Shift<Callback>, AVX2_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, SSSE3_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift); + +template <class Callback> +void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI_8bit::PrepareBias<Callback>, AVX512_8bit::PrepareBias<Callback>, AVX2_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, SSSE3_8bit::PrepareBias<Callback>, Unsupported_8bit::PrepareBias); + +/* + * 16-bit matrix multiplication + */ +struct Int16 { + using Integer = int16_t; + + // A's size must be a multiple of 1x32, B's size must be a multiple of 32x8. + static constexpr TileInfo tile_info{1, 32, 32, 8}; + + // Currently A is prepared by quantization but this could theoretically change. + // A's columns must be a multiple of 8. + // The number of rows is anything. + static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) { + Quantize(input, output, quant_mult, rows * cols); + } + + // Multiply floats by quant_mult then convert to 16-bit integers with saturation. + // input + static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size); + + // Warning: the output of PrepareB depends on the CPU. + // It will match the Multiply function on the same CPU though. + static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols); + + // Convert from a B that was already transposed (routine not provided) and + // quantized (e.g. with Quantize) to the CPU-dependent format used for + // Multiply. This is useful for storing a quantized model on disk then in a + // CPU-independent fashion. + static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols); + + // Convert from a B that was already transposed (routine not provided) to + // the CPU-dependent format used for Multiply. This is useful for storing + // a quantized model on disk then in a CPU-independent fashion. + static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols); + + // Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8. + static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end); + + // Multiply C = A * B, presuming A and B have been prepared. + template <typename Callback> + static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { + MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback); + } + + static const char *const kName; + +private: + template <typename Callback> + struct MultiplyImpl { + static void (*run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback); + }; }; +template <typename Callback> +void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512_16bit::Multiply<Callback> /*TODO VNNI 16-bit. */, AVX512_16bit::Multiply<Callback>, AVX2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, SSE2_16bit::Multiply<Callback>, Unsupported_16bit::Multiply); + extern const CPUType kCPU; // Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and 64-byte aligned. @@ -561,26 +561,6 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( } \ } \ - -// Find the maximum absolute value of packed float32s. -/* -template <class Register> inline static float MaxAbsoluteBackend(const float *begin_float, const float *end_float) { - assert(end_float > begin_float); - assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); - const Register *begin = reinterpret_cast<const Register*>(begin_float); - const Register *end = reinterpret_cast<const Register*>(end_float); - // Get the sign bit. - union {float f; int32_t i;} float_convert; - float_convert.i = 0x7fffffff; - Register and_me = set1_ps<Register>(float_convert.f); - Register highest = and_ps(and_me, *begin); - for (++begin; begin != end; ++begin) { - Register reg = and_ps(and_me, *begin); - highest = max_ps(highest, reg); - } - - return MaxFloat32(highest); -}*/ #define INTGEMM_MAXABSOLUTE(Register, target) \ target static float MaxAbsolute(const float *begin_float, const float *end_float) { \ assert(end_float > begin_float); \ diff --git a/sse2_gemm.h b/sse2_gemm.h index a27b358..a2bae35 100644 --- a/sse2_gemm.h +++ b/sse2_gemm.h @@ -25,22 +25,22 @@ class QuantizeTile16 { INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - INTGEMM_SSE2 inline __m128i Consecutive(const float *input) { + INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const { return Tile(input, input + 4); } - INTGEMM_SSE2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + INTGEMM_SSE2 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { return Tile( input, input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0)); } - INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) { + INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const { return Consecutive(input); } private: - INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) { + INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const { __m128i g0 = QuantizerGrab(input0, mult_reg_); __m128i g1 = QuantizerGrab(input1, mult_reg_); return _mm_packs_epi32(g0, g1); diff --git a/ssse3_gemm.h b/ssse3_gemm.h index 18bf14b..40a26f4 100644 --- a/ssse3_gemm.h +++ b/ssse3_gemm.h @@ -26,20 +26,20 @@ class QuantizeTile8 { INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {} - INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) { + INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const { // Skip a row. return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4); } - INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) { + INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const { return Tile(input, input + 4, input + 8, input + 12); } - INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) { + INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const { return TileU(input, input + 4, input + 8, input + 12); } - INTGEMM_SSSE3 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) { + INTGEMM_SSSE3 Integer ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const { const float* inputs[4]; for (int i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) { while (cols_left < sizeof(Integer) / sizeof(float)) { @@ -55,7 +55,7 @@ class QuantizeTile8 { private: // Quantize 16xfloat into 16xint8_t - INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) { + INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const { const __m128i neg128 = _mm_set1_epi8(-128); __m128i g0 = QuantizerGrab(input0, mult_reg_); __m128i g1 = QuantizerGrab(input1, mult_reg_); @@ -77,7 +77,7 @@ class QuantizeTile8 { // No permute needed. packs is in order for SSE. } - INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) { + INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const { const __m128i neg128 = _mm_set1_epi8(-128); const __m128i pos127 = _mm_set1_epi8(127); __m128i g0 = QuantizerGrab(input0, mult_reg_); |