diff options
Diffstat (limited to 'src/GenerateKernelU8S8S32ACC32Avx512.cc')
-rw-r--r-- | src/GenerateKernelU8S8S32ACC32Avx512.cc | 17 |
1 file changed, 7 insertions, 10 deletions
diff --git a/src/GenerateKernelU8S8S32ACC32Avx512.cc b/src/GenerateKernelU8S8S32ACC32Avx512.cc
index d1729e4..12243ee 100644
--- a/src/GenerateKernelU8S8S32ACC32Avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC32Avx512.cc
@@ -48,7 +48,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::genComputeBlock<
     int rowRegs,
     int colRegs,
     int lda,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   // used for matrix A
   asmjit::X86Zmm AReg = x86::zmm31;
 
@@ -71,9 +71,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::genComputeBlock<
       a->vpmaddubsw(res1, AReg, BReg);
       a->vpmaddwd(res1, oneReg, res1);
       a->vpaddd(
-          CRegs_avx512_[i * leadingDimCRegAssign + j],
+          CRegs_avx512_[i * leadingDimCReg + j],
           res1,
-          CRegs_avx512_[i * leadingDimCRegAssign + j]);
+          CRegs_avx512_[i * leadingDimCReg + j]);
     }
     a->prefetcht0(x86::dword_ptr(B_pf, j * VLEN_ * sizeof(int8_t)));
   }
@@ -93,10 +93,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
     asmjit::X86Gp C_Offset,
     asmjit::X86Gp ldcReg,
     bool accum,
-    int leadingDimCRegAssign) {
-  // temp register
-  asmjit::X86Zmm tmpReg = x86::zmm28;
-
+    int leadingDimCReg) {
   for (int i = 0; i < rowRegs; ++i) {
     if (i != 0) {
       a->add(C_Offset, ldcReg);
@@ -107,13 +104,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
     for (int j = 0; j < colRegs; ++j) {
       if (accum) {
         a->vpaddd(
-            CRegs_avx512_[i * leadingDimCRegAssign + j],
-            CRegs_avx512_[i * leadingDimCRegAssign + j],
+            CRegs_avx512_[i * leadingDimCReg + j],
+            CRegs_avx512_[i * leadingDimCReg + j],
             x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)));
       }
       a->vmovups(
           x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)),
-          CRegs_avx512_[i * leadingDimCRegAssign + j]);
+          CRegs_avx512_[i * leadingDimCReg + j]);
     }
   }
 }