github.com/marian-nmt/intgemm/intgemm.git
Diffstat (limited to 'multiply.h')
-rw-r--r--  multiply.h  208
1 file changed, 104 insertions(+), 104 deletions(-)
diff --git a/multiply.h b/multiply.h
index abc224c..0aa86aa 100644
--- a/multiply.h
+++ b/multiply.h
@@ -139,42 +139,42 @@ INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t<CP
// width must be a multiple of the register size.
// B_cols must be a multiple of 8.
// Multiply16
-#define INTGEMM_MULTIPLY16(Integer, target, cpu_type) \
+#define INTGEMM_MULTIPLY16(Register, target, cpu_type) \
template <typename Callback> target static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
- assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0); \
+ assert(width % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(B_cols % 8 == 0); \
- assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); \
- assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \
- const int simd_width = width / (sizeof(Integer) / sizeof(int16_t)); \
+ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
+ const int simd_width = width / (sizeof(Register) / sizeof(int16_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
- const Integer *B0_col = reinterpret_cast<const Integer *>(B); \
+ const Register *B0_col = reinterpret_cast<const Register *>(B); \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
- const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \
+ const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
int k = 0; \
- Integer a = *(A_row + k); \
- Integer sum0 = madd_epi16(a, *(B0_col + k * 8)); \
- Integer sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
- Integer sum2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
- Integer sum3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
- Integer sum4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
- Integer sum5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
- Integer sum6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
- Integer sum7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
+ Register a = *(A_row + k); \
+ Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \
+ Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
+ Register sum2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
+ Register sum3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
+ Register sum4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
+ Register sum5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
+ Register sum6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
+ Register sum7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
for (int k = 1; k < simd_width; ++k) { \
- Integer a = *(A_row + k); \
+ Register a = *(A_row + k); \
/* Multiply 16-bit, horizontally add to packed 32-bit integers.*/ \
- Integer mult0 = madd_epi16(a, *(B0_col + k * 8)); \
- Integer mult1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
- Integer mult2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
- Integer mult3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
- Integer mult4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
- Integer mult5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
- Integer mult6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
- Integer mult7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
+ Register mult0 = madd_epi16(a, *(B0_col + k * 8)); \
+ Register mult1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
+ Register mult2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
+ Register mult3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
+ Register mult4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
+ Register mult5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
+ Register mult6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
+ Register mult7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
/* Sum packed 32-bit integers with danger of overflow. TODO: accumulate in 64-bit every so often.*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
@@ -186,8 +186,8 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
sum7 = add_epi32(sum7, mult7); \
} \
/* Reduce sums within 128-bit lanes.*/ \
- Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
- Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
+ Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
+ Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
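
The inner loop above is built entirely on madd_epi16, which wraps _mm_madd_epi16 and its wider AVX2/AVX512 counterparts. As a reference point, here is a minimal scalar model of the 128-bit form; the name madd_epi16_model is illustrative and not part of this header:

// Scalar model of the 128-bit madd_epi16: 8 int16 pairs in, 4 int32 out,
// each output lane a[2i]*b[2i] + a[2i+1]*b[2i+1]. A single product fits in
// int32; the pairwise sum can overflow only when both pairs are
// (-32768)*(-32768), where the hardware wraps, so the cast below mirrors
// that. Overflow in practice comes from the add_epi32 accumulation across k
// flagged in the TODO above.
#include <array>
#include <cstdint>

std::array<int32_t, 4> madd_epi16_model(const std::array<int16_t, 8>& a,
                                        const std::array<int16_t, 8>& b) {
  std::array<int32_t, 4> out;
  for (int i = 0; i < 4; ++i) {
    out[i] = static_cast<int32_t>(int64_t{a[2 * i]} * b[2 * i] +
                                  int64_t{a[2 * i + 1]} * b[2 * i + 1]);
  }
  return out;
}
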
@@ -196,30 +196,30 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
} \
// An int8 PrepareBias version of the above code, using the add-127 technique
-#define INTGEMM_PREPAREBIASFOR8(Integer, target, cpu_type) \
+#define INTGEMM_PREPAREBIASFOR8(Register, target, cpu_type) \
template <class Callback> target static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { \
- assert(width % (sizeof(Integer) / sizeof(int8_t)) == 0); \
+ assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
- assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \
- const int simd_width = width / (sizeof(Integer) / sizeof(int8_t)); \
+ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
+ const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
- const Integer *B0_col = reinterpret_cast<const Integer *>(B); \
- const Integer a = set1_epi8<Integer>(1); \
+ const Register *B0_col = reinterpret_cast<const Register *>(B); \
+ const Register a = set1_epi8<Register>(1); \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
- /*const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width);*/ \
+ /*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
int k = 0; \
- Integer sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
- Integer sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
- Integer sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
- Integer sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
- Integer sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
- Integer sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
- Integer sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
- Integer sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
+ Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
+ Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
+ Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
+ Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
+ Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
+ Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
+ Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
+ Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \
- Integer ones = set1_epi16<Integer>(1); \
+ Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
@@ -229,16 +229,16 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
for (int k = 1; k < simd_width; ++k) { \
- /*Integer a = *(A_row + k);*/ \
+ /*Register a = *(A_row + k);*/ \
/* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \
- Integer mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
- Integer mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
- Integer mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
- Integer mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
- Integer mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
- Integer mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
- Integer mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
- Integer mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
+ Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
+ Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
+ Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
+ Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
+ Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
+ Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
+ Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
+ Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add.*/ \
mult0 = madd_epi16(mult0, ones); \
mult1 = madd_epi16(mult1, ones); \
@@ -260,8 +260,8 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
\
} \
/* Reduce sums within 128-bit lanes.*/ \
- Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
- Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
+ Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
+ Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, 0, B0_colidx, 1, B_cols); \
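
What this computes: with a set to all ones, maddubs_epi16 followed by madd_epi16(..., ones) reduces each prepared column of B to its plain column sum. The add-127 technique needs these sums because Multiply8Shift below receives A+127 as its unsigned operand, and (A+127)*B = A*B + 127*colsum(B). A scalar sketch of the resulting bias correction, assuming row-major unprepared B and an illustrative unquant_mult scale (intgemm folds this into the PrepareBias callback rather than a standalone function):

// Scalar sketch, not intgemm's API: subtract the unquantized 127*colsum(B)
// from the bias so that the shifted multiply (A+127)*B lands on A*B + bias.
// Assumes row-major, unprepared B; the real code works on the prepared
// interleaved layout via the callback.
#include <cstddef>
#include <cstdint>

void correct_bias_sketch(const int8_t* B, std::size_t width, std::size_t B_cols,
                         float unquant_mult, float* bias) {
  for (std::size_t j = 0; j < B_cols; ++j) {
    int32_t colsum = 0;
    for (std::size_t k = 0; k < width; ++k) {
      colsum += B[k * B_cols + j];  // column sum over the shared dimension
    }
    bias[j] -= 127.0f * unquant_mult * static_cast<float>(colsum);
  }
}
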
@@ -269,33 +269,33 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
} \
// An int8 version of the above code, using the add-127 technique
-#define INTGEMM_MULTIPLY8SHIFT(Integer, target, cpu_type) \
+#define INTGEMM_MULTIPLY8SHIFT(Register, target, cpu_type) \
template <class Callback> target static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
- assert(width % (sizeof(Integer) / sizeof(int8_t)) == 0); \
+ assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
- assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); \
- assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \
- const int simd_width = width / (sizeof(Integer) / sizeof(int8_t)); \
+ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
+ const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
- const Integer *B0_col = reinterpret_cast<const Integer *>(B); \
+ const Register *B0_col = reinterpret_cast<const Register *>(B); \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
- const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \
+ const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
int k = 0; \
- Integer a = *(A_row + k); \
- Integer sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
- Integer sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
- Integer sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
- Integer sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
- Integer sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
- Integer sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
- Integer sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
- Integer sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
+ Register a = *(A_row + k); \
+ Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
+ Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
+ Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
+ Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
+ Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
+ Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
+ Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
+ Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \
- Integer ones = set1_epi16<Integer>(1); \
+ Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
@@ -305,16 +305,16 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
for (int k = 1; k < simd_width; ++k) { \
- Integer a = *(A_row + k); \
+ Register a = *(A_row + k); \
/* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \
- Integer mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
- Integer mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
- Integer mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
- Integer mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
- Integer mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
- Integer mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
- Integer mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
- Integer mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
+ Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
+ Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
+ Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
+ Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
+ Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
+ Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
+ Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
+ Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add.*/ \
mult0 = madd_epi16(mult0, ones); \
mult1 = madd_epi16(mult1, ones); \
@@ -336,8 +336,8 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
\
} \
/* Reduce sums within 128-bit lanes.*/ \
- Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
- Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
+ Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
+ Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
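
Why every iteration immediately applies madd_epi16(mult, ones): maddubs_epi16 multiplies unsigned 8-bit by signed 8-bit and horizontally adds pairs into saturating int16, and two products of magnitude up to 255*128 can exceed int16 range. Upcasting each partial result to 32-bit before the add therefore keeps the accumulation exact. A scalar model of the 128-bit form (the name is illustrative):

// Scalar model of the 128-bit maddubs_epi16: unsigned a times signed b,
// pairwise horizontal add with signed saturation to int16. The possible
// saturation here is why the macros upcast to 32-bit with madd_epi16(x, ones)
// instead of accumulating in 16-bit.
#include <algorithm>
#include <array>
#include <cstdint>

std::array<int16_t, 8> maddubs_epi16_model(const std::array<uint8_t, 16>& a,
                                           const std::array<int8_t, 16>& b) {
  std::array<int16_t, 8> out;
  for (int i = 0; i < 8; ++i) {
    int32_t sum = int32_t{a[2 * i]} * b[2 * i] + int32_t{a[2 * i + 1]} * b[2 * i + 1];
    out[i] = static_cast<int16_t>(std::clamp<int32_t>(sum, -32768, 32767));
  }
  return out;
}
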
@@ -493,35 +493,35 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
}
//INTGEMM_AVX2 or INTGEMM_SSSE3 multiply
-#define INTGEMM_MULTIPLY8(Integer, target, cpu_type) \
+#define INTGEMM_MULTIPLY8(Register, target, cpu_type) \
template <typename Callback> target static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
- assert(width % sizeof(Integer) == 0); \
+ assert(width % sizeof(Register) == 0); \
assert(B_cols % 8 == 0); \
- assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); \
- assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \
- const int simd_width = width / sizeof(Integer); \
+ assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
+ const int simd_width = width / sizeof(Register); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
- const Integer *B0_col = reinterpret_cast<const Integer*>(B); \
+ const Register *B0_col = reinterpret_cast<const Register*>(B); \
/*Go over 8 columns of B at a time.*/ \
for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
/*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
/*Iterate over shared (inner) dimension.*/ \
- const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); \
- const Integer *A_end = A_live + simd_width; \
- const Integer *B_live = B0_col; \
+ const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width); \
+ const Register *A_end = A_live + simd_width; \
+ const Register *B_live = B0_col; \
/* Rather than initializing as zeros and adding, just initialize the first.*/ \
- Integer a = *(A_live++); \
- Integer a_positive = abs_epi8(a); \
+ Register a = *(A_live++); \
+ Register a_positive = abs_epi8(a); \
/* These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.*/ \
- Integer sum0 = maddubs_epi16(a_positive, sign_epi8(B_live[0], a)); \
- Integer sum1 = maddubs_epi16(a_positive, sign_epi8(B_live[1], a)); \
- Integer sum2 = maddubs_epi16(a_positive, sign_epi8(B_live[2], a)); \
- Integer sum3 = maddubs_epi16(a_positive, sign_epi8(B_live[3], a)); \
- Integer sum4 = maddubs_epi16(a_positive, sign_epi8(B_live[4], a)); \
- Integer sum5 = maddubs_epi16(a_positive, sign_epi8(B_live[5], a)); \
- Integer sum6 = maddubs_epi16(a_positive, sign_epi8(B_live[6], a)); \
- Integer sum7 = maddubs_epi16(a_positive, sign_epi8(B_live[7], a)); \
+ Register sum0 = maddubs_epi16(a_positive, sign_epi8(B_live[0], a)); \
+ Register sum1 = maddubs_epi16(a_positive, sign_epi8(B_live[1], a)); \
+ Register sum2 = maddubs_epi16(a_positive, sign_epi8(B_live[2], a)); \
+ Register sum3 = maddubs_epi16(a_positive, sign_epi8(B_live[3], a)); \
+ Register sum4 = maddubs_epi16(a_positive, sign_epi8(B_live[4], a)); \
+ Register sum5 = maddubs_epi16(a_positive, sign_epi8(B_live[5], a)); \
+ Register sum6 = maddubs_epi16(a_positive, sign_epi8(B_live[6], a)); \
+ Register sum7 = maddubs_epi16(a_positive, sign_epi8(B_live[7], a)); \
B_live += 8; \
/* Use A as the loop variable so the add can be done where gcc likes it for branch prediction.*/ \
for (; A_live != A_end; ++A_live, B_live += 8) { \
@@ -544,7 +544,7 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
* _mm512_srai_epi32(_mm512_slli_epi32(sum, 16), 16),
* _mm512_srai_epi32(sum, 16));
*/ \
- Integer ones = set1_epi16<Integer>(1); \
+ Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
@@ -553,8 +553,8 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
- Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
- Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
+ Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
+ Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
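
The trick in INTGEMM_MULTIPLY8: SSSE3/AVX2 offer no signed-times-signed 8-bit multiply-add, so the code transfers A's sign onto B and uses maddubs_epi16(abs_epi8(a), sign_epi8(b, a)). sign_epi8 negates b where a is negative and zeroes it where a is zero, so abs(a)*sign(b, a) equals a*b lane by lane. A scalar check of one product term (illustrative name; assumes a != -128, since abs_epi8(-128) stays 0x80, an edge the quantizer can avoid by clipping to [-127, 127]):

// Scalar model of one product inside maddubs_epi16(abs(a), sign(b, a)).
// Assumes a != -128 (see above); -b can also saturate for b == -128 on the
// real instruction, another reason quantized values are best kept in
// [-127, 127].
#include <cstdint>

int32_t signed_term_model(int8_t a, int8_t b) {
  uint8_t a_positive = static_cast<uint8_t>(a < 0 ? -int{a} : int{a});          // abs_epi8
  int8_t b_signed = a > 0 ? b : (a < 0 ? static_cast<int8_t>(-b) : int8_t{0});  // sign_epi8
  return int32_t{a_positive} * b_signed;  // one maddubs_epi16 term
}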