diff options
author | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2019-09-05 21:02:49 +0300 |
---|---|---|
committer | Mateusz Chudyk <mateuszchudyk@gmail.com> | 2019-09-13 18:02:28 +0300 |
commit | 30c7e3ab2d11723977ee2402c28f75b92280650b (patch) | |
tree | b53d3c8da8b2715f6b791051913067e47e6b82c5 | |
parent | 41dbd6bf88522fd30f0d92b4fe36f89f67898edf (diff) |
Change order of loops in multiplication
-rw-r--r-- | multiply.h | 6 |
1 file changed, 3 insertions, 3 deletions
@@ -501,11 +501,11 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
   assert(reinterpret_cast<uintptr_t>(B) % sizeof(Integer) == 0); \
   const int simd_width = width / sizeof(Integer); \
   auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
-  const Integer *B0_col = reinterpret_cast<const Integer*>(B); \
   /*Go over 8 columns of B at a time.*/ \
-  for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
+  for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
+    const Integer *B0_col = reinterpret_cast<const Integer*>(B); \
+    for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
     /*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
-    for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
       /*Iterate over shared (inner) dimension.*/ \
       const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); \
       const Integer *A_end = A_live + simd_width; \