diff options
Diffstat (limited to 'src/GenerateKernelU8S8S32ACC16.cc')
-rw-r--r-- | src/GenerateKernelU8S8S32ACC16.cc | 16 |
1 files changed, 10 insertions, 6 deletions
diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc index f83012b..cbd5877 100644 --- a/src/GenerateKernelU8S8S32ACC16.cc +++ b/src/GenerateKernelU8S8S32ACC16.cc @@ -23,12 +23,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::initCRegs< int rowRegs, int colRegs, int leadingDimCReg) { + using CRegs = x86::Ymm; for (int i = 0; i < rowRegs; ++i) { for (int j = 0; j < colRegs; ++j) { a->vxorps( - CRegs_avx2_[i * leadingDimCReg + j], - CRegs_avx2_[i * leadingDimCReg + j], - CRegs_avx2_[i * leadingDimCReg + j]); + CRegs(i * leadingDimCReg + j), + CRegs(i * leadingDimCReg + j), + CRegs(i * leadingDimCReg + j)); } } } @@ -54,6 +55,8 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock< x86::Ymm tmpReg = x86::ymm14; + using CRegs = x86::Ymm; + for (int i = 0; i < rowRegs; ++i) { // broadcast A a->vpbroadcastw( @@ -62,9 +65,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock< a->vpmaddubsw( tmpReg, AReg, x86::dword_ptr(buffer_B, j * VLEN_ * sizeof(int8_t))); a->vpaddsw( - CRegs_avx2_[i * leadingDimCReg + j], + CRegs(i * leadingDimCReg + j), tmpReg, - CRegs_avx2_[i * leadingDimCReg + j]); + CRegs(i * leadingDimCReg + j)); // Prefetching is hurting performance in some cases // because prefetch instructions itself consumes a slot // in pipeline issue thus slowing down the kernel. @@ -93,12 +96,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs< x86::Xmm extractDest128 = x86::xmm15; x86::Ymm extractDest256 = x86::ymm15; + using CRegs = x86::Ymm; for (int i = 0; i < rowRegs; ++i) { a->imul(C_Offset, ldcReg, static_cast<asmjit::Imm>(i * sizeof(int32_t))); for (int j = 0; j < colRegs; ++j) { for (int idx = 0; idx < 2; ++idx) { a->vextracti128( - extractDest128, CRegs_avx2_[i * leadingDimCReg + j], idx); + extractDest128, CRegs(i * leadingDimCReg + j), idx); a->vpmovsxwd(extractDest256, extractDest128); x86::Mem destAddr = x86::dword_ptr( a->zcx(), C_Offset, 0, (j * 2 + idx) * 8 * sizeof(int32_t)); |