author     Kenneth Heafield <github@kheafield.com>    2020-09-15 23:53:06 +0300
committer  Kenneth Heafield <github@kheafield.com>    2020-09-15 23:53:06 +0300
commit     ada32a77770a98d5983647a8c02b5b65e18bb754 (patch)
tree       c439d2c7ac7765b17d3d0a8b17e4bb70c3c354c9
parent     fdbd834e92da7d32c0363e8eb5200ec301d0f146 (diff)
Change quant_mult from member variable to static parameter
Might fix gcc 5.4
-rw-r--r--   intgemm/avx2_gemm.h    |  74
-rw-r--r--   intgemm/avx512_gemm.h  |  44
-rw-r--r--   intgemm/interleave.h   |  26
-rw-r--r--   intgemm/multiply.h     |  12
-rw-r--r--   intgemm/sse2_gemm.h    |  26
-rw-r--r--   intgemm/ssse3_gemm.h   |  51
6 files changed, 102 insertions, 131 deletions
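The refactor follows one pattern across all files: each QuantizeTile* class used to broadcast the float multiplier into a SIMD register in its constructor and keep it as a `mult_` member; after this commit the methods are static and take the already-broadcast multiplier as their first parameter, with callers building it once via `set1_ps<FRegister>(quant_mult)`. The sketch below illustrates that pattern on a stripped-down SSE2 quantizer; the class names `QuantizeTileBefore`/`QuantizeTileAfter` and the `QuantizeAfter` helper are invented for illustration and are not intgemm's actual API.

```cpp
// Minimal sketch of the member-variable -> static-parameter refactor.
#include <emmintrin.h>
#include <cstdint>

// Before: the broadcast multiplier is captured as class state.
class QuantizeTileBefore {
 public:
  explicit QuantizeTileBefore(float mult) : mult_(_mm_set1_ps(mult)) {}
  __m128i Consecutive(const float *input) const {
    // Scale four floats and round-convert them to int32.
    return _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult_));
  }
 private:
  const __m128 mult_;
};

// After: stateless static method; the multiplier register is a parameter.
struct QuantizeTileAfter {
  static inline __m128i Consecutive(__m128 mult_reg, const float *input) {
    return _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult_reg));
  }
};

void QuantizeAfter(const float *input, int32_t *output, float quant_mult, int size) {
  __m128 q = _mm_set1_ps(quant_mult);  // broadcast once at the call site
  for (int i = 0; i < size; i += 4) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(output + i),
                     QuantizeTileAfter::Consecutive(q, input + i));
  }
}
```

Dropping the member also removes the constructor, so the classes become plain bundles of static functions, which is presumably what sidesteps the GCC 5.4 issue mentioned in the commit message.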
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h
index d111b32..5e81475 100644
--- a/intgemm/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -19,34 +19,30 @@ INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
 
 class QuantizeTile16 {
   public:
-    INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
-    INTGEMM_AVX2 Register Consecutive(const float *input) const {
-      return Tile(input, input + 8);
+    INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+      return Tile(mult_reg, input, input + 8);
     }
 
-    INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
-      return Tile(
+    INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+      return Tile(mult_reg,
         input,
         input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
     }
 
-    INTGEMM_AVX2 Register ForReshape(const float *input, Index cols) const {
+    INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
       // 8 rows in the first 128-bit register, 8 in the second register.
-      return Tile(input, input + 8 * cols);
+      return Tile(mult_reg, input, input + 8 * cols);
     }
 
   private:
-    INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
-      Register g0 = QuantizerGrab(input0, mult_);
-      Register g1 = QuantizerGrab(input1, mult_);
+    INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
+      Register g0 = QuantizerGrab(input0, mult_reg);
+      Register g1 = QuantizerGrab(input1, mult_reg);
       Register packed = _mm256_packs_epi32(g0, g1);
       // Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
       // Technically this could be removed if the PrepareB did the same reordering internally.
      return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
-
-    const FRegister mult_;
 };
 
 struct Kernels16 {
@@ -61,10 +57,10 @@ struct Kernels16 {
   INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile16 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 16, output += 16) {
-      *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
     }
   }
 
@@ -96,17 +92,15 @@ struct Kernels16 {
  */
 class QuantizeTile8 {
   public:
-    INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
-    INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
-      return Tile(input, input + 8, input + 16, input + 24);
+    INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
+      return Tile(quant_mult, input, input + 8, input + 16, input + 24);
     }
 
-    INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
-      return TileU(input, input + 8, input + 16, input + 24);
+    INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
+      return TileU(quant_mult, input, input + 8, input + 16, input + 24);
     }
 
-    INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
       const float* inputs[4];
       for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
         while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -117,24 +111,24 @@ class QuantizeTile8 {
         input += sizeof(Register) / sizeof(float);
         cols_left -= sizeof(Register) / sizeof(float);
       }
-      return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+      return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
     }
 
-    INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
+    INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
       // Put higher rows in the second half of the register.  These will jumble
       // around in the same way then conveniently land in the right place.
-      return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+      return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
     }
 
-    INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
       // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
       // Grab 4 registers at a time in 32-bit format.
-      __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-      __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-      __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-      __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+      __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+      __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+      __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+      __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
       // Pack 32-bit to 16-bit.
       __m256i packed0 = _mm256_packs_epi32(g0, g1);
       __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -151,16 +145,16 @@ class QuantizeTile8 {
 
   private:
     //A version that produces uint8_ts
-    INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
      // Looking at the assembly, gcc has pulled this outside the loops calling this.
       const __m256i neg127 = _mm256_set1_epi8(-127);
       const __m256i pos127 = _mm256_set1_epi8(127);
       const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
       // Grab 4 registers at a time in 32-bit format.
-      __m256i g0 = avx2::QuantizerGrab(input0, mult_);
-      __m256i g1 = avx2::QuantizerGrab(input1, mult_);
-      __m256i g2 = avx2::QuantizerGrab(input2, mult_);
-      __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+      __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+      __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+      __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+      __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
       // Pack 32-bit to 16-bit.
       __m256i packed0 = _mm256_packs_epi32(g0, g1);
       __m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -175,8 +169,6 @@ class QuantizeTile8 {
       // and the values are only used for GEMM.
       return _mm256_permutevar8x32_epi32(packed, shuffle_param);
     }
-
-    const __m256 mult_;
 };
 
 struct Kernels8 {
@@ -187,9 +179,9 @@ struct Kernels8 {
     Quantize(input, output, quant_mult, rows * cols);
   }
  private:
-  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
  public:
-  INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
+  INTGEMM_QUANTIZE(INTGEMM_AVX2)
 
   // Currently A is prepared by quantization but this could theoretically change.
   INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
@@ -200,10 +192,10 @@ struct Kernels8 {
   INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
     assert(size % 32 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
-    avx2::QuantizeTile8 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
    const float *end = input + size;
     for (; input != end; input += 32, output += 32) {
-      *reinterpret_cast<__m256i*>(output) = q.ConsecutiveU(input);
+      *reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
     }
   }
 
diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h
index 6108289..f9fb1eb 100644
--- a/intgemm/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -66,36 +66,27 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const f
 // being used for the quantizer.
 class QuantizeTile16 {
   public:
-    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
-    INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
-    INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
       auto input0 = input;
       auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
-      auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_);
-      auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_);
+      auto g0 = QuantizerGrabHalves(input0, input1, quant_mult);
+      auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult);
       auto packed = packs_epi32(g0, g1);
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
 
-    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
-      __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
-      __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
+    INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
+      __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult);
+      __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult);
       __m512i packed = packs_epi32(g0, g1);
       // Permute within 256-bit lanes, so same as INTGEMM_AVX2
       return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
     }
-
-  private:
-    const __m512 mult_reg_;
 };
 
 class QuantizeTile8 {
   public:
-    /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
-    INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
-    INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
       static const __m512i neg127 = _mm512_set1_epi8(-127);
       static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
 
@@ -110,10 +101,10 @@ class QuantizeTile8 {
         cols_left -= sizeof(Register) / sizeof(float);
       }
 
-      auto g0 = QuantizerGrab(inputs[0], mult_reg_);
-      auto g1 = QuantizerGrab(inputs[1], mult_reg_);
-      auto g2 = QuantizerGrab(inputs[2], mult_reg_);
-      auto g3 = QuantizerGrab(inputs[3], mult_reg_);
+      auto g0 = QuantizerGrab(inputs[0], quant_mult);
+      auto g1 = QuantizerGrab(inputs[1], quant_mult);
+      auto g2 = QuantizerGrab(inputs[2], quant_mult);
+      auto g3 = QuantizerGrab(inputs[3], quant_mult);
 
       auto packed0 = packs_epi32(g0, g1);
       auto packed1 = packs_epi32(g2, g3);
@@ -122,17 +113,17 @@ class QuantizeTile8 {
       return _mm512_permutexvar_epi32(shuffle_param, packed);
     }
 
-    INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
+    INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) {
       // TODO: try alternative: _mm512_cvtsepi32_epi8 ?
       const __m512i neg127 = _mm512_set1_epi8(-127);
       // In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
       const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
 
       // 32-bit format.
-      __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
-      __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
-      __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
-      __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
+      __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult);
+      __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
+      __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
+      __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
       // Pack 32-bit to 16-bit.
       __m512i packed0 = packs_epi32(g0, g1);
       __m512i packed1 = packs_epi32(g2, g3);
@@ -143,9 +134,6 @@ class QuantizeTile8 {
       // 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
       return _mm512_permutexvar_epi32(shuffle_param, packed);
     }
-
-  private:
-    const __m512 mult_reg_;
 };
 
 struct Kernels16 {
diff --git a/intgemm/interleave.h b/intgemm/interleave.h
index 1ac14b7..1ec686b 100644
--- a/intgemm/interleave.h
+++ b/intgemm/interleave.h
@@ -179,7 +179,7 @@ template <class Register> static inline void Transpose8InLane(
 // ... ...
 #define INTGEMM_PREPARE_B_8(target, QuantClass) \
 target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
-  QuantClass q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   /* Currently all multipliers have a stride of 8 columns.*/ \
   const Index kColStride = 8; \
   assert(cols % kColStride == 0); \
@@ -193,14 +193,14 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
         This isn't quite Transpose8InLane because it's half the number of columns, \
         so each register starts with two rows instead of being one row. \
         The quantizers know to skip a row.*/ \
-      output[0] = q.ForReshape(input + cols * (r ) + c, cols); \
-      output[1] = q.ForReshape(input + cols * (r + 1) + c, cols); \
-      output[2] = q.ForReshape(input + cols * (r + 4) + c, cols); \
-      output[3] = q.ForReshape(input + cols * (r + 5) + c, cols); \
-      output[4] = q.ForReshape(input + cols * (r + 8) + c, cols); \
-      output[5] = q.ForReshape(input + cols * (r + 9) + c, cols); \
-      output[6] = q.ForReshape(input + cols * (r + 12) + c, cols); \
-      output[7] = q.ForReshape(input + cols * (r + 13) + c, cols); \
+      output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \
+      output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \
+      output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \
+      output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \
+      output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \
+      output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \
+      output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \
+      output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \
       Interleave8(output[0], output[1]); \
       Interleave8(output[2], output[3]); \
       Interleave8(output[4], output[5]); \
@@ -212,7 +212,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
 
 #define INTGEMM_PREPARE_B_16(target, QuantClass) \
 target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
-  QuantClass q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   assert(cols % 8 == 0); \
   assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
@@ -222,7 +222,7 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
   for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
     /* gcc unrolls this loop and uses registers for output[k]*/ \
     for (Index k = 0; k < 8; ++k) { \
-      output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \
+      output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \
     } \
     Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
   } \
@@ -270,13 +270,13 @@ target static inline void PrepareBTransposed(const float* input, Integer* output
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
   \
-  Quantizer quantizer(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   Register* output_it = reinterpret_cast<Register*>(output); \
   Index r = 0; \
   Index c = 0; \
   while (r < rows) { \
     for (Index ri = 0; ri < 8; ++ri) \
-      *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \
+      *output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \
     c += RegisterElemsInt; \
     while (c >= cols) { \
       r += kColStride; \
diff --git a/intgemm/multiply.h b/intgemm/multiply.h
index 4b7558d..e201e09 100644
--- a/intgemm/multiply.h
+++ b/intgemm/multiply.h
@@ -47,16 +47,16 @@ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i p
 // Quantize function used for SSSE3 and AVX2.
 // Separate function for thread to work around gcc 7 bug that doesn't imbue
 // target attributes across #pragma omp parallel.
-#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+#define INTGEMM_QUANTIZE_THREAD(target) \
 target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
-  name::QuantizeTile8 q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   INTGEMM_OMP_FOR \
   for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
-    *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+    *reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \
   } \
 }
 
-#define INTGEMM_QUANTIZE(target, Register, name) \
+#define INTGEMM_QUANTIZE(target) \
 target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
@@ -68,7 +68,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
   } \
   std::size_t overhang = size & (kBatch - 1); \
   if (!overhang) return; \
-  name::QuantizeTile8 q(quant_mult); \
+  FRegister q = set1_ps<FRegister>(quant_mult); \
   /* Each does size(Register) / 32 == kBatch / 4 floats at a time.
    * If we're allowed to read one of them, then we can read the whole register. */ \
   const float *inputs[4]; \
@@ -80,7 +80,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
   for (; i < 4; ++i) { \
     inputs[i] = &input[fast_end]; \
   } \
-  Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \
+  Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \
   std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \
 }
diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h
index dc3fa60..cd49efe 100644
--- a/intgemm/sse2_gemm.h
+++ b/intgemm/sse2_gemm.h
@@ -19,30 +19,26 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
 
 class QuantizeTile16 {
   public:
-    INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
-    INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const {
-      return Tile(input, input + 4);
+    INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) {
+      return Tile(mult_reg, input, input + 4);
     }
 
-    INTGEMM_SSE2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
-      return Tile(
+    INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+      return Tile(mult_reg,
        input,
        input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
     }
 
-    INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const {
-      return Consecutive(input);
+    INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) {
+      return Consecutive(mult_reg, input);
     }
 
   private:
-    INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const {
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input1, mult_reg_);
+    INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) {
+      __m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg);
+      __m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg);
       return _mm_packs_epi32(g0, g1);
     }
-
-    const __m128 mult_reg_;
 };
 
 // This should be pure SSE2 (and below).
@@ -58,10 +54,10 @@ struct Kernels16 {
     assert(size % 8 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
-    sse2::QuantizeTile16 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 8, output += 8) {
-      *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+      *reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input);
     }
   }
diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h
index beaf4b1..865fe12 100644
--- a/intgemm/ssse3_gemm.h
+++ b/intgemm/ssse3_gemm.h
@@ -21,22 +21,20 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
 
 class QuantizeTile8 {
   public:
-    INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
-    INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const {
+    INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
       // Skip a row.
-      return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4);
+      return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4);
     }
 
-    INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const {
-      return Tile(input, input + 4, input + 8, input + 12);
+    INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+      return Tile(mult_reg, input, input + 4, input + 8, input + 12);
     }
 
-    INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const {
-      return TileU(input, input + 4, input + 8, input + 12);
+    INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) {
+      return TileU(mult_reg, input, input + 4, input + 8, input + 12);
     }
 
-    INTGEMM_SSSE3 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+    INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
       const float* inputs[4];
       for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
         while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -47,16 +45,16 @@ class QuantizeTile8 {
         input += sizeof(Register) / sizeof(float);
         cols_left -= sizeof(Register) / sizeof(float);
       }
-      return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+      return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]);
     }
 
     // Quantize 16xfloat into 16xint8_t
-    INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
      const __m128i neg128 = _mm_set1_epi8(-128);
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input1, mult_reg_);
-      __m128i g2 = QuantizerGrab(input2, mult_reg_);
-      __m128i g3 = QuantizerGrab(input3, mult_reg_);
+      __m128i g0 = QuantizerGrab(input0, mult_reg);
+      __m128i g1 = QuantizerGrab(input1, mult_reg);
+      __m128i g2 = QuantizerGrab(input2, mult_reg);
+      __m128i g3 = QuantizerGrab(input3, mult_reg);
       __m128i packed0 = _mm_packs_epi32(g0, g1);
       __m128i packed1 = _mm_packs_epi32(g2, g3);
       __m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -74,13 +72,13 @@ class QuantizeTile8 {
     }
 
   private:
-    INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+    INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
       const __m128i neg128 = _mm_set1_epi8(-128);
       const __m128i pos127 = _mm_set1_epi8(127);
-      __m128i g0 = QuantizerGrab(input0, mult_reg_);
-      __m128i g1 = QuantizerGrab(input1, mult_reg_);
-      __m128i g2 = QuantizerGrab(input2, mult_reg_);
-      __m128i g3 = QuantizerGrab(input3, mult_reg_);
+      __m128i g0 = QuantizerGrab(input0, mult_reg);
+      __m128i g1 = QuantizerGrab(input1, mult_reg);
+      __m128i g2 = QuantizerGrab(input2, mult_reg);
+      __m128i g3 = QuantizerGrab(input3, mult_reg);
       __m128i packed0 = _mm_packs_epi32(g0, g1);
       __m128i packed1 = _mm_packs_epi32(g2, g3);
       __m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -96,9 +94,6 @@ class QuantizeTile8 {
       return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127);
       // No permute needed.  packs is in order for SSE.
     }
-
-  private:
-    const FRegister mult_reg_;
 };
 
 // pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
@@ -111,9 +106,9 @@ struct Kernels8 {
   }
  private:
-  INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+  INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3)
  public:
-  INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
+  INTGEMM_QUANTIZE(INTGEMM_SSSE3)
 
   // Version with unsigned int + 127
   // Currently A is prepared by quantization but this could theoretically change.
@@ -125,10 +120,10 @@ struct Kernels8 {
     assert(size % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
     assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
-    ssse3::QuantizeTile8 q(quant_mult);
+    FRegister q = set1_ps<FRegister>(quant_mult);
     const float *end = input + size;
     for (; input != end; input += 16, output += 16) {
-      *reinterpret_cast<__m128i*>(output) = q.ConsecutiveU(input);
+      *reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
     }
   }
 
@@ -138,7 +133,7 @@ struct Kernels8 {
   INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
   INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
-  INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t)
+  INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
 
   INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
     ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);