github.com/marian-nmt/intgemm/intgemm.git
author    Kenneth Heafield <github@kheafield.com>  2020-09-15 23:53:06 +0300
committer Kenneth Heafield <github@kheafield.com>  2020-09-15 23:53:06 +0300
commit    ada32a77770a98d5983647a8c02b5b65e18bb754 (patch)
tree      c439d2c7ac7765b17d3d0a8b17e4bb70c3c354c9
parent    fdbd834e92da7d32c0363e8eb5200ec301d0f146 (diff)
Change quant_mult from member variable to static parameter
Might fix gcc 5.4
-rw-r--r--  intgemm/avx2_gemm.h    74
-rw-r--r--  intgemm/avx512_gemm.h  44
-rw-r--r--  intgemm/interleave.h   26
-rw-r--r--  intgemm/multiply.h     12
-rw-r--r--  intgemm/sse2_gemm.h    26
-rw-r--r--  intgemm/ssse3_gemm.h   51
6 files changed, 102 insertions(+), 131 deletions(-)
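
The refactor is mechanical but worth seeing in miniature: each quantizer class used to broadcast quant_mult into a member register in its constructor, forcing callers to build an object per use; after this commit the caller broadcasts once and threads the register through static methods. A minimal sketch of the pattern, using SSE intrinsics as a stand-in (hypothetical QuantizerBefore/QuantizerAfter names, target attributes omitted; not code from the library):

#include <immintrin.h>

// Before: the broadcast register was per-object state.
class QuantizerBefore {
 public:
  explicit QuantizerBefore(float mult) : mult_(_mm_set1_ps(mult)) {}
  __m128i Consecutive(const float *input) const {
    return _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult_));
  }
 private:
  const __m128 mult_;  // the member variable this commit removes
};

// After: stateless; the caller broadcasts once and passes the register in.
struct QuantizerAfter {
  static inline __m128i Consecutive(__m128 mult_reg, const float *input) {
    return _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(input), mult_reg));
  }
};

void Use(const float *input, float quant_mult) {
  __m128 q = _mm_set1_ps(quant_mult);  // hoisted broadcast, done once
  __m128i r = QuantizerAfter::Consecutive(q, input);
  (void)r;
}

The commit message only claims this "might" fix GCC 5.4; at minimum the stateless form hoists the broadcast explicitly instead of relying on the compiler to keep a const member in a register.
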
diff --git a/intgemm/avx2_gemm.h b/intgemm/avx2_gemm.h
index d111b32..5e81475 100644
--- a/intgemm/avx2_gemm.h
+++ b/intgemm/avx2_gemm.h
@@ -19,34 +19,30 @@ INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
class QuantizeTile16 {
public:
- INTGEMM_AVX2 explicit QuantizeTile16(float mult) : mult_(_mm256_set1_ps(mult)) {}
-
- INTGEMM_AVX2 Register Consecutive(const float *input) const {
- return Tile(input, input + 8);
+ INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+ return Tile(mult_reg, input, input + 8);
}
- INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
- return Tile(
+ INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+ return Tile(mult_reg,
input,
input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
}
- INTGEMM_AVX2 Register ForReshape(const float *input, Index cols) const {
+ INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// 8 rows in the first 128-bit register, 8 in the second register.
- return Tile(input, input + 8 * cols);
+ return Tile(mult_reg, input, input + 8 * cols);
}
private:
- INTGEMM_AVX2 __m256i Tile(const float *input0, const float *input1) const {
- Register g0 = QuantizerGrab(input0, mult_);
- Register g1 = QuantizerGrab(input1, mult_);
+ INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
+ Register g0 = QuantizerGrab(input0, mult_reg);
+ Register g1 = QuantizerGrab(input1, mult_reg);
Register packed = _mm256_packs_epi32(g0, g1);
// Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
// Technically this could be removed if the PrepareB did the same reordering internally.
return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
-
- const FRegister mult_;
};
struct Kernels16 {
@@ -61,10 +57,10 @@ struct Kernels16 {
INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- avx2::QuantizeTile16 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m256i*>(output) = q.Consecutive(input);
+ *reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
@@ -96,17 +92,15 @@ struct Kernels16 {
*/
class QuantizeTile8 {
public:
- INTGEMM_AVX2 explicit QuantizeTile8(float quant_mult) : mult_(_mm256_set1_ps(quant_mult)) {}
-
- INTGEMM_AVX2 inline __m256i Consecutive(const float *input) const {
- return Tile(input, input + 8, input + 16, input + 24);
+ INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
+ return Tile(quant_mult, input, input + 8, input + 16, input + 24);
}
- INTGEMM_AVX2 inline __m256i ConsecutiveU(const float *input) const {
- return TileU(input, input + 8, input + 16, input + 24);
+ INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
+ return TileU(quant_mult, input, input + 8, input + 16, input + 24);
}
- INTGEMM_AVX2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -117,24 +111,24 @@ class QuantizeTile8 {
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
- return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+ return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
}
- INTGEMM_AVX2 inline __m256i ForReshape(const float *input, Index cols) const {
+ INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
// Put higher rows in the second half of the register. These will jumble
// around in the same way then conveniently land in the right place.
- return Tile(input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
+ return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
}
- INTGEMM_AVX2 inline __m256i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, mult_);
- __m256i g1 = avx2::QuantizerGrab(input1, mult_);
- __m256i g2 = avx2::QuantizerGrab(input2, mult_);
- __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+ __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -151,16 +145,16 @@ class QuantizeTile8 {
private:
//A version that produces uint8_ts
- INTGEMM_AVX2 inline __m256i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i pos127 = _mm256_set1_epi8(127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
- __m256i g0 = avx2::QuantizerGrab(input0, mult_);
- __m256i g1 = avx2::QuantizerGrab(input1, mult_);
- __m256i g2 = avx2::QuantizerGrab(input2, mult_);
- __m256i g3 = avx2::QuantizerGrab(input3, mult_);
+ __m256i g0 = avx2::QuantizerGrab(input0, quant_mult);
+ __m256i g1 = avx2::QuantizerGrab(input1, quant_mult);
+ __m256i g2 = avx2::QuantizerGrab(input2, quant_mult);
+ __m256i g3 = avx2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
@@ -175,8 +169,6 @@ class QuantizeTile8 {
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
-
- const __m256 mult_;
};
struct Kernels8 {
@@ -187,9 +179,9 @@ struct Kernels8 {
Quantize(input, output, quant_mult, rows * cols);
}
private:
- INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2, __m256i, avx2)
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
public:
- INTGEMM_QUANTIZE(INTGEMM_AVX2, __m256i, avx2)
+ INTGEMM_QUANTIZE(INTGEMM_AVX2)
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
@@ -200,10 +192,10 @@ struct Kernels8 {
INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 32 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
- avx2::QuantizeTile8 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 32, output += 32) {
- *reinterpret_cast<__m256i*>(output) = q.ConsecutiveU(input);
+ *reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
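
The reordering comment in QuantizeTile16::Tile above ("Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15") is easy to verify in isolation: _mm256_packs_epi32 packs within each 128-bit lane, so the two sources interleave in 64-bit blocks, and the 0xd8 permute restores sequential order. A standalone demo, not part of the commit (compile with -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  int32_t lo[8], hi[8];
  for (int i = 0; i < 8; ++i) { lo[i] = i; hi[i] = 8 + i; }
  __m256i g0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(lo));
  __m256i g1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(hi));
  __m256i packed = _mm256_packs_epi32(g0, g1);             // packs per 128-bit lane
  __m256i fixed = _mm256_permute4x64_epi64(packed, 0xd8);  // 64-bit lanes 0, 2, 1, 3
  int16_t out[16];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), packed);
  std::printf("packed:");  // prints 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
  for (int i = 0; i < 16; ++i) std::printf(" %d", out[i]);
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), fixed);
  std::printf("\nfixed: ");  // prints 0 through 15 in order
  for (int i = 0; i < 16; ++i) std::printf(" %d", out[i]);
  std::printf("\n");
}
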
diff --git a/intgemm/avx512_gemm.h b/intgemm/avx512_gemm.h
index 6108289..f9fb1eb 100644
--- a/intgemm/avx512_gemm.h
+++ b/intgemm/avx512_gemm.h
@@ -66,36 +66,27 @@ INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const f
// being used for the quantizer.
class QuantizeTile16 {
public:
- /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
- INTGEMM_AVX512BW explicit QuantizeTile16(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
auto input0 = input;
auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
- auto g0 = QuantizerGrabHalves(input0, input1, mult_reg_);
- auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, mult_reg_);
+ auto g0 = QuantizerGrabHalves(input0, input1, quant_mult);
+ auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult);
auto packed = packs_epi32(g0, g1);
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
- INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
- __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, mult_reg_);
+ INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
+ __m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult);
+ __m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult);
__m512i packed = packs_epi32(g0, g1);
// Permute within 256-bit lanes, so same as INTGEMM_AVX2
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
-
- private:
- const __m512 mult_reg_;
};
class QuantizeTile8 {
public:
- /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
- INTGEMM_AVX512BW explicit QuantizeTile8(float mult) : mult_reg_(_mm512_set1_ps(mult)) {}
-
- INTGEMM_AVX512BW Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
static const __m512i neg127 = _mm512_set1_epi8(-127);
static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
@@ -110,10 +101,10 @@ class QuantizeTile8 {
cols_left -= sizeof(Register) / sizeof(float);
}
- auto g0 = QuantizerGrab(inputs[0], mult_reg_);
- auto g1 = QuantizerGrab(inputs[1], mult_reg_);
- auto g2 = QuantizerGrab(inputs[2], mult_reg_);
- auto g3 = QuantizerGrab(inputs[3], mult_reg_);
+ auto g0 = QuantizerGrab(inputs[0], quant_mult);
+ auto g1 = QuantizerGrab(inputs[1], quant_mult);
+ auto g2 = QuantizerGrab(inputs[2], quant_mult);
+ auto g3 = QuantizerGrab(inputs[3], quant_mult);
auto packed0 = packs_epi32(g0, g1);
auto packed1 = packs_epi32(g2, g3);
@@ -122,17 +113,17 @@ class QuantizeTile8 {
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
- INTGEMM_AVX512BW inline __m512i ForReshape(const float *input, Index cols) const {
+ INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
// 32-bit format.
- __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, mult_reg_);
- __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, mult_reg_);
- __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, mult_reg_);
- __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, mult_reg_);
+ __m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult);
+ __m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
+ __m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
+ __m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
// Pack 32-bit to 16-bit.
__m512i packed0 = packs_epi32(g0, g1);
__m512i packed1 = packs_epi32(g2, g3);
@@ -143,9 +134,6 @@ class QuantizeTile8 {
// 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
-
- private:
- const __m512 mult_reg_;
};
struct Kernels16 {
diff --git a/intgemm/interleave.h b/intgemm/interleave.h
index 1ac14b7..1ec686b 100644
--- a/intgemm/interleave.h
+++ b/intgemm/interleave.h
@@ -179,7 +179,7 @@ template <class Register> static inline void Transpose8InLane(
// ... ...
#define INTGEMM_PREPARE_B_8(target, QuantClass) \
target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
- QuantClass q(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
/* Currently all multipliers have a stride of 8 columns.*/ \
const Index kColStride = 8; \
assert(cols % kColStride == 0); \
@@ -193,14 +193,14 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
This isn't quite Transpose8InLane because it's half the number of columns, \
so each register starts with two rows instead of being one row. \
The quantizers know to skip a row.*/ \
- output[0] = q.ForReshape(input + cols * (r ) + c, cols); \
- output[1] = q.ForReshape(input + cols * (r + 1) + c, cols); \
- output[2] = q.ForReshape(input + cols * (r + 4) + c, cols); \
- output[3] = q.ForReshape(input + cols * (r + 5) + c, cols); \
- output[4] = q.ForReshape(input + cols * (r + 8) + c, cols); \
- output[5] = q.ForReshape(input + cols * (r + 9) + c, cols); \
- output[6] = q.ForReshape(input + cols * (r + 12) + c, cols); \
- output[7] = q.ForReshape(input + cols * (r + 13) + c, cols); \
+ output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \
+ output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \
+ output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \
+ output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \
+ output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \
+ output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \
+ output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \
+ output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \
Interleave8(output[0], output[1]); \
Interleave8(output[2], output[3]); \
Interleave8(output[4], output[5]); \
@@ -212,7 +212,7 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
#define INTGEMM_PREPARE_B_16(target, QuantClass) \
target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
- QuantClass q(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
assert(cols % 8 == 0); \
assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
@@ -222,7 +222,7 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
/* gcc unrolls this loop and uses registers for output[k]*/ \
for (Index k = 0; k < 8; ++k) { \
- output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \
+ output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \
} \
Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
} \
@@ -270,13 +270,13 @@ target static inline void PrepareBTransposed(const float* input, Integer* output
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
\
- Quantizer quantizer(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
Register* output_it = reinterpret_cast<Register*>(output); \
Index r = 0; \
Index c = 0; \
while (r < rows) { \
for (Index ri = 0; ri < 8; ++ri) \
- *output_it++ = quantizer.ConsecutiveWithWrapping(input + (r + ri) * cols + c, cols - c, cols, 8); \
+ *output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \
c += RegisterElemsInt; \
while (c >= cols) { \
r += kColStride; \
diff --git a/intgemm/multiply.h b/intgemm/multiply.h
index 4b7558d..e201e09 100644
--- a/intgemm/multiply.h
+++ b/intgemm/multiply.h
@@ -47,16 +47,16 @@ INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i p
// Quantize function used for SSSE3 and AVX2.
// Separate function for thread to work around gcc 7 bug that doesn't imbue
// target attributes across #pragma omp parallel.
-#define INTGEMM_QUANTIZE_THREAD(target, Register, name) \
+#define INTGEMM_QUANTIZE_THREAD(target) \
target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
- name::QuantizeTile8 q(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
INTGEMM_OMP_FOR \
for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
- *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ *reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \
} \
}
-#define INTGEMM_QUANTIZE(target, Register, name) \
+#define INTGEMM_QUANTIZE(target) \
target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
@@ -68,7 +68,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
} \
std::size_t overhang = size & (kBatch - 1); \
if (!overhang) return; \
- name::QuantizeTile8 q(quant_mult); \
+ FRegister q = set1_ps<FRegister>(quant_mult); \
/* Each does size(Register) / 32 == kBatch / 4 floats at a time.
* If we're allowed to read one of them, then we can read the whole register. */ \
const float *inputs[4]; \
@@ -80,7 +80,7 @@ target static void Quantize(const float *const input, int8_t *const output, floa
for (; i < 4; ++i) { \
inputs[i] = &input[fast_end]; \
} \
- Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \
+ Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \
std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \
}
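
Beyond the signature change, the Quantize macro above preserves a subtle overhang trick: when size is not a multiple of the batch, it still quantizes one full register by pointing the leftover quarter-inputs at valid memory (&input[fast_end]), then memcpy's only the overhang bytes. A scalar sketch of the same idea (names QuantizeScalar/QuantizeOne are mine, illustration only):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

constexpr std::size_t kBatch = 16;  // stand-in for sizeof(Register)

inline int8_t QuantizeOne(float v, float quant_mult) {
  long r = std::lrintf(v * quant_mult);                            // scale + round
  return static_cast<int8_t>(std::max(-128L, std::min(127L, r)));  // saturate
}

void QuantizeScalar(const float *input, int8_t *output, float quant_mult,
                    std::size_t size) {
  std::size_t fast_end = size & ~(kBatch - 1);
  for (std::size_t i = 0; i < fast_end; ++i)  // full batches
    output[i] = QuantizeOne(input[i], quant_mult);
  std::size_t overhang = size & (kBatch - 1);
  if (!overhang) return;
  // Build one last full batch anyway; out-of-range slots re-read a valid
  // float, mirroring how the macro points spare inputs at &input[fast_end].
  int8_t result[kBatch];
  for (std::size_t i = 0; i < kBatch; ++i)
    result[i] = QuantizeOne(input[fast_end + (i < overhang ? i : 0)], quant_mult);
  std::memcpy(output + fast_end, result, overhang);  // keep only the valid bytes
}

The real macro does this with a single register-wide Tile call; the scalar loop just makes the pointer clamping visible.
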
diff --git a/intgemm/sse2_gemm.h b/intgemm/sse2_gemm.h
index dc3fa60..cd49efe 100644
--- a/intgemm/sse2_gemm.h
+++ b/intgemm/sse2_gemm.h
@@ -19,30 +19,26 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
- INTGEMM_SSE2 explicit QuantizeTile16(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- INTGEMM_SSE2 inline __m128i Consecutive(const float *input) const {
- return Tile(input, input + 4);
+ INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) {
+ return Tile(mult_reg, input, input + 4);
}
- INTGEMM_SSE2 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
- return Tile(
+ INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
+ return Tile(mult_reg,
input,
input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
}
- INTGEMM_SSE2 inline __m128i ForReshape(const float *input, int) const {
- return Consecutive(input);
+ INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) {
+ return Consecutive(mult_reg, input);
}
private:
- INTGEMM_SSE2 __m128i Tile(const float *input0, const float *input1) const {
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input1, mult_reg_);
+ INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) {
+ __m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg);
+ __m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg);
return _mm_packs_epi32(g0, g1);
}
-
- const __m128 mult_reg_;
};
// This should be pure SSE2 (and below).
@@ -58,10 +54,10 @@ struct Kernels16 {
assert(size % 8 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- sse2::QuantizeTile16 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 8, output += 8) {
- *reinterpret_cast<__m128i*>(output) = q.Consecutive(input);
+ *reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
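
One hunk in this file is not purely a rename: Tile now spells the grab as kernels::quantize(loadu_ps<__m128>(input), mult_reg) rather than QuantizerGrab. The diff treats the two as drop-in equivalents; per lane, either amounts to scale, round to nearest, then saturate on pack. A scalar model of one 16-bit lane (illustration only):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int16_t QuantizeLane16(float value, float quant_mult) {
  long rounded = std::lrintf(value * quant_mult);  // _mm_mul_ps + _mm_cvtps_epi32
  return static_cast<int16_t>(
      std::max(-32768L, std::min(32767L, rounded)));  // _mm_packs_epi32 saturation
}
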
diff --git a/intgemm/ssse3_gemm.h b/intgemm/ssse3_gemm.h
index beaf4b1..865fe12 100644
--- a/intgemm/ssse3_gemm.h
+++ b/intgemm/ssse3_gemm.h
@@ -21,22 +21,20 @@ INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
class QuantizeTile8 {
public:
- INTGEMM_SSSE3 explicit QuantizeTile8(float mult) : mult_reg_(_mm_set1_ps(mult)) {}
-
- INTGEMM_SSSE3 inline __m128i ForReshape(const float *input, Index cols) const {
+ INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// Skip a row.
- return Tile(input, input + 4, input + 2 * cols, input + 2 * cols + 4);
+ return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4);
}
- INTGEMM_SSSE3 inline __m128i Consecutive(const float *input) const {
- return Tile(input, input + 4, input + 8, input + 12);
+ INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) {
+ return Tile(mult_reg, input, input + 4, input + 8, input + 12);
}
- INTGEMM_SSSE3 inline __m128i ConsecutiveU(const float *input) const {
- return TileU(input, input + 4, input + 8, input + 12);
+ INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) {
+ return TileU(mult_reg, input, input + 4, input + 8, input + 12);
}
- INTGEMM_SSSE3 Register ConsecutiveWithWrapping(const float *input, Index cols_left, Index cols, Index row_step) const {
+ INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
@@ -47,16 +45,16 @@ class QuantizeTile8 {
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
- return Tile(inputs[0], inputs[1], inputs[2], inputs[3]);
+ return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]);
}
// Quantize 16xfloat into 16xint8_t
- INTGEMM_SSSE3 inline __m128i Tile(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input1, mult_reg_);
- __m128i g2 = QuantizerGrab(input2, mult_reg_);
- __m128i g3 = QuantizerGrab(input3, mult_reg_);
+ __m128i g0 = QuantizerGrab(input0, mult_reg);
+ __m128i g1 = QuantizerGrab(input1, mult_reg);
+ __m128i g2 = QuantizerGrab(input2, mult_reg);
+ __m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -74,13 +72,13 @@ class QuantizeTile8 {
}
private:
- INTGEMM_SSSE3 inline __m128i TileU(const float *input0, const float *input1, const float *input2, const float *input3) const {
+ INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
const __m128i pos127 = _mm_set1_epi8(127);
- __m128i g0 = QuantizerGrab(input0, mult_reg_);
- __m128i g1 = QuantizerGrab(input1, mult_reg_);
- __m128i g2 = QuantizerGrab(input2, mult_reg_);
- __m128i g3 = QuantizerGrab(input3, mult_reg_);
+ __m128i g0 = QuantizerGrab(input0, mult_reg);
+ __m128i g1 = QuantizerGrab(input1, mult_reg);
+ __m128i g2 = QuantizerGrab(input2, mult_reg);
+ __m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
@@ -96,9 +94,6 @@ class QuantizeTile8 {
return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127);
// No permute needed. packs is in order for SSE.
}
-
- private:
- const FRegister mult_reg_;
};
// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
@@ -111,9 +106,9 @@ struct Kernels8 {
}
private:
- INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3, __m128i, ssse3)
+ INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3)
public:
- INTGEMM_QUANTIZE(INTGEMM_SSSE3, __m128i, ssse3)
+ INTGEMM_QUANTIZE(INTGEMM_SSSE3)
// Version with unsigned int + 127
// Currently A is prepared by quantization but this could theoretically change.
@@ -125,10 +120,10 @@ struct Kernels8 {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
- ssse3::QuantizeTile8 q(quant_mult);
+ FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
- *reinterpret_cast<__m128i*>(output) = q.ConsecutiveU(input);
+ *reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
@@ -138,7 +133,7 @@ struct Kernels8 {
INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, ssse3::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
- INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, ssse3::QuantizeTile8, int8_t)
+ INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
ssse3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);