diff options
Diffstat (limited to 'multiply.h')
-rw-r--r-- | multiply.h | 208 |
1 files changed, 104 insertions, 104 deletions
@@ -139,42 +139,42 @@ INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t<CP // width must be a multiple of the register size. // B_cols must be a multiple of 8. // Multiply16 -#define INTGEMM_MULTIPLY16(Integer, target, cpu_type) \ +#define INTGEMM_MULTIPLY16(Register, target, cpu_type) \ template <typename Callback> target static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \ - assert(width % (sizeof(Integer) / sizeof(int16_t)) == 0); \ + assert(width % (sizeof(Register) / sizeof(int16_t)) == 0); \ assert(B_cols % 8 == 0); \ - assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); \ - assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \ - const int simd_width = width / (sizeof(Integer) / sizeof(int16_t)); \ + assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ + const int simd_width = width / (sizeof(Register) / sizeof(int16_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ - const Integer *B0_col = reinterpret_cast<const Integer *>(B); \ + const Register *B0_col = reinterpret_cast<const Register *>(B); \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ /* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \ for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ - const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \ + const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \ /* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \ Iterate over shared (inner) dimension.*/ \ int k = 0; \ - Integer a = *(A_row + k); \ - Integer sum0 = madd_epi16(a, *(B0_col + k * 8)); \ - Integer sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \ - Integer sum2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \ - Integer sum3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \ - Integer sum4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \ - Integer sum5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \ - Integer sum6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \ - Integer sum7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \ + Register a = *(A_row + k); \ + Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \ + Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \ + Register sum2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \ + Register sum3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \ + Register sum4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \ + Register sum5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \ + Register sum6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \ + Register sum7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \ for (int k = 1; k < simd_width; ++k) { \ - Integer a = *(A_row + k); \ + Register a = *(A_row + k); \ /* Multiply 16-bit, horizontally add to packed 32-bit integers.*/ \ - Integer mult0 = madd_epi16(a, *(B0_col + k * 8)); \ - Integer mult1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \ - Integer mult2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \ - Integer mult3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \ - Integer mult4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \ - Integer mult5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \ - Integer mult6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \ - Integer mult7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \ + Register mult0 = madd_epi16(a, *(B0_col + k * 8)); \ + Register mult1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \ + Register mult2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \ + Register mult3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \ + Register mult4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \ + Register mult5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \ + Register mult6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \ + Register mult7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \ /* Sum packed 32-bit integers with danger of overflow. TODO: accumulate in 64-bit every so often.*/ \ sum0 = add_epi32(sum0, mult0); \ sum1 = add_epi32(sum1, mult1); \ @@ -186,8 +186,8 @@ template <typename Callback> target static void Multiply(const int16_t *A, const sum7 = add_epi32(sum7, mult7); \ } \ /* Reduce sums within 128-bit lanes.*/ \ - Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ - Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ + Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ + Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ /*The specific implementation may need to reduce further.*/ \ auto total = PermuteSummer(pack0123, pack4567); \ RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \ @@ -196,30 +196,30 @@ template <typename Callback> target static void Multiply(const int16_t *A, const } \ //An int8_prepbias version of the above code, using the add 127 technique -#define INTGEMM_PREPAREBIASFOR8(Integer, target, cpu_type) \ +#define INTGEMM_PREPAREBIASFOR8(Register, target, cpu_type) \ template <class Callback> target static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { \ - assert(width % (sizeof(Integer) / sizeof(int8_t)) == 0); \ + assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \ assert(B_cols % 8 == 0); \ - assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \ - const int simd_width = width / (sizeof(Integer) / sizeof(int8_t)); \ + assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ + const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ - const Integer *B0_col = reinterpret_cast<const Integer *>(B); \ - const Integer a = set1_epi8<Integer>(1); \ + const Register *B0_col = reinterpret_cast<const Register *>(B); \ + const Register a = set1_epi8<Register>(1); \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ - /*const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width);*/ \ + /*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \ /* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \ Iterate over shared (inner) dimension.*/ \ int k = 0; \ - Integer sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ - Integer sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ - Integer sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ - Integer sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ - Integer sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ - Integer sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ - Integer sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ - Integer sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ + Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ + Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ + Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ + Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ + Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ + Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ + Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ + Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ /* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \ - Integer ones = set1_epi16<Integer>(1); \ + Register ones = set1_epi16<Register>(1); \ sum0 = madd_epi16(sum0, ones); \ sum1 = madd_epi16(sum1, ones); \ sum2 = madd_epi16(sum2, ones); \ @@ -229,16 +229,16 @@ template <typename Callback> target static void Multiply(const int16_t *A, const sum6 = madd_epi16(sum6, ones); \ sum7 = madd_epi16(sum7, ones); \ for (int k = 1; k < simd_width; ++k) { \ - /*Integer a = *(A_row + k);*/ \ + /*Register a = *(A_row + k);*/ \ /* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \ - Integer mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \ - Integer mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ - Integer mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ - Integer mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ - Integer mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ - Integer mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ - Integer mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ - Integer mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ + Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \ + Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ + Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ + Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ + Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ + Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ + Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ + Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ /* Upcast to 32-bit and horizontally add.*/ \ mult0 = madd_epi16(mult0, ones); \ mult1 = madd_epi16(mult1, ones); \ @@ -260,8 +260,8 @@ template <typename Callback> target static void Multiply(const int16_t *A, const \ } \ /* Reduce sums within 128-bit lanes.*/ \ - Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ - Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ + Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ + Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ /*The specific implementation may need to reduce further.*/ \ auto total = PermuteSummer(pack0123, pack4567); \ RunCallback(callback_impl, total, 0, B0_colidx, 1, B_cols); \ @@ -269,33 +269,33 @@ template <typename Callback> target static void Multiply(const int16_t *A, const } \ //An int8 version of the above code, using the add 127 technique -#define INTGEMM_MULTIPLY8SHIFT(Integer, target, cpu_type) \ +#define INTGEMM_MULTIPLY8SHIFT(Register, target, cpu_type) \ template <class Callback> target static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \ - assert(width % (sizeof(Integer) / sizeof(int8_t)) == 0); \ + assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \ assert(B_cols % 8 == 0); \ - assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); \ - assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \ - const int simd_width = width / (sizeof(Integer) / sizeof(int8_t)); \ + assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ + const int simd_width = width / (sizeof(Register) / sizeof(int8_t)); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ - const Integer *B0_col = reinterpret_cast<const Integer *>(B); \ + const Register *B0_col = reinterpret_cast<const Register *>(B); \ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ /* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \ for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ - const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \ + const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \ /* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \ Iterate over shared (inner) dimension.*/ \ int k = 0; \ - Integer a = *(A_row + k); \ - Integer sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ - Integer sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ - Integer sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ - Integer sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ - Integer sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ - Integer sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ - Integer sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ - Integer sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ + Register a = *(A_row + k); \ + Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \ + Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ + Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ + Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ + Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ + Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ + Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ + Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ /* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \ - Integer ones = set1_epi16<Integer>(1); \ + Register ones = set1_epi16<Register>(1); \ sum0 = madd_epi16(sum0, ones); \ sum1 = madd_epi16(sum1, ones); \ sum2 = madd_epi16(sum2, ones); \ @@ -305,16 +305,16 @@ template <typename Callback> target static void Multiply(const int16_t *A, const sum6 = madd_epi16(sum6, ones); \ sum7 = madd_epi16(sum7, ones); \ for (int k = 1; k < simd_width; ++k) { \ - Integer a = *(A_row + k); \ + Register a = *(A_row + k); \ /* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \ - Integer mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \ - Integer mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ - Integer mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ - Integer mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ - Integer mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ - Integer mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ - Integer mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ - Integer mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ + Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \ + Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \ + Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \ + Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \ + Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \ + Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \ + Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \ + Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \ /* Upcast to 32-bit and horizontally add.*/ \ mult0 = madd_epi16(mult0, ones); \ mult1 = madd_epi16(mult1, ones); \ @@ -336,8 +336,8 @@ template <typename Callback> target static void Multiply(const int16_t *A, const \ } \ /* Reduce sums within 128-bit lanes.*/ \ - Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ - Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ + Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ + Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ /*The specific implementation may need to reduce further.*/ \ auto total = PermuteSummer(pack0123, pack4567); \ RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \ @@ -493,35 +493,35 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a))); } //INTGEMM_AVX2 or INTGEMM_SSSE3 multiply -#define INTGEMM_MULTIPLY8(Integer, target, cpu_type) \ +#define INTGEMM_MULTIPLY8(Register, target, cpu_type) \ template <typename Callback> target static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \ - assert(width % sizeof(Integer) == 0); \ + assert(width % sizeof(Register) == 0); \ assert(B_cols % 8 == 0); \ - assert(reinterpret_cast<uintptr_t>(A) % sizeof(Integer) == 0); \ - assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \ - const int simd_width = width / sizeof(Integer); \ + assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \ + const int simd_width = width / sizeof(Register); \ auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \ - const Integer *B0_col = reinterpret_cast<const Integer*>(B); \ + const Register *B0_col = reinterpret_cast<const Register*>(B); \ /*Go over 8 columns of B at a time.*/ \ for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ /*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \ for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ /*Iterate over shared (inner) dimension.*/ \ - const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); \ - const Integer *A_end = A_live + simd_width; \ - const Integer *B_live = B0_col; \ + const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width); \ + const Register *A_end = A_live + simd_width; \ + const Register *B_live = B0_col; \ /* Rather than initializing as zeros and adding, just initialize the first.*/ \ - Integer a = *(A_live++); \ - Integer a_positive = abs_epi8(a); \ + Register a = *(A_live++); \ + Register a_positive = abs_epi8(a); \ /* These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.*/ \ - Integer sum0 = maddubs_epi16(a_positive, sign_epi8(B_live[0], a)); \ - Integer sum1 = maddubs_epi16(a_positive, sign_epi8(B_live[1], a)); \ - Integer sum2 = maddubs_epi16(a_positive, sign_epi8(B_live[2], a)); \ - Integer sum3 = maddubs_epi16(a_positive, sign_epi8(B_live[3], a)); \ - Integer sum4 = maddubs_epi16(a_positive, sign_epi8(B_live[4], a)); \ - Integer sum5 = maddubs_epi16(a_positive, sign_epi8(B_live[5], a)); \ - Integer sum6 = maddubs_epi16(a_positive, sign_epi8(B_live[6], a)); \ - Integer sum7 = maddubs_epi16(a_positive, sign_epi8(B_live[7], a)); \ + Register sum0 = maddubs_epi16(a_positive, sign_epi8(B_live[0], a)); \ + Register sum1 = maddubs_epi16(a_positive, sign_epi8(B_live[1], a)); \ + Register sum2 = maddubs_epi16(a_positive, sign_epi8(B_live[2], a)); \ + Register sum3 = maddubs_epi16(a_positive, sign_epi8(B_live[3], a)); \ + Register sum4 = maddubs_epi16(a_positive, sign_epi8(B_live[4], a)); \ + Register sum5 = maddubs_epi16(a_positive, sign_epi8(B_live[5], a)); \ + Register sum6 = maddubs_epi16(a_positive, sign_epi8(B_live[6], a)); \ + Register sum7 = maddubs_epi16(a_positive, sign_epi8(B_live[7], a)); \ B_live += 8; \ /* Use A as the loop variable so the add can be done where gcc likes it for branch prediction.*/ \ for (; A_live != A_end; ++A_live, B_live += 8) { \ @@ -544,7 +544,7 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( * _mm512_srai_epi32(_mm512_slli_epi32(sum, 16), 16), * _mm512_srai_epi32(sum, 16)); */ \ - Integer ones = set1_epi16<Integer>(1); \ + Register ones = set1_epi16<Register>(1); \ sum0 = madd_epi16(sum0, ones); \ sum1 = madd_epi16(sum1, ones); \ sum2 = madd_epi16(sum2, ones); \ @@ -553,8 +553,8 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( sum5 = madd_epi16(sum5, ones); \ sum6 = madd_epi16(sum6, ones); \ sum7 = madd_epi16(sum7, ones); \ - Integer pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ - Integer pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ + Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \ + Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \ auto total = PermuteSummer(pack0123, pack4567); \ RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \ } \ |