From 61928df38bece4c1c16b98296355450f13ca0afe Mon Sep 17 00:00:00 2001
From: Jianyu Huang
Date: Mon, 1 Jul 2019 11:55:51 -0700
Subject: Clean up some code for JIT code generator (#101)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/101

Some code cleanup:
- Both ```leadingDimCReg``` and ```leadingDimCRegAssign``` are used in ```GenerateKernelU8S8S32ACC32.cc```. We should unify them and use only one variable name.
- Remove the redundant register variable ```asmjit::X86Ymm tmpReg = x86::ymm14;```.

Reviewed By: dskhudia

Differential Revision: D15673269

fbshipit-source-id: 81eb3673d0ff97391557413a13f1972561a1f2db
---
 src/GenerateKernelU8S8S32ACC16.cc       | 24 +++++++++++++-----------
 src/GenerateKernelU8S8S32ACC16Avx512.cc | 18 +++++++++---------
 src/GenerateKernelU8S8S32ACC32.cc       | 17 +++++++----------
 src/GenerateKernelU8S8S32ACC32Avx512.cc | 17 +++++++----------
 4 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc
index 082518c..718b883 100644
--- a/src/GenerateKernelU8S8S32ACC16.cc
+++ b/src/GenerateKernelU8S8S32ACC16.cc
@@ -34,13 +34,13 @@ void CodeGenBase::initCRegs<
     asmjit::X86Emitter* a,
     int rowRegs,
     int colRegs,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   for (int i = 0; i < rowRegs; ++i) {
     for (int j = 0; j < colRegs; ++j) {
       a->vxorps(
-          CRegs_avx2_[i * leadingDimCRegAssign + j],
-          CRegs_avx2_[i * leadingDimCRegAssign + j],
-          CRegs_avx2_[i * leadingDimCRegAssign + j]);
+          CRegs_avx2_[i * leadingDimCReg + j],
+          CRegs_avx2_[i * leadingDimCReg + j],
+          CRegs_avx2_[i * leadingDimCReg + j]);
     }
   }
 }
@@ -60,7 +60,7 @@ void CodeGenBase::genComputeBlock<
     int rowRegs,
     int colRegs,
     int lda,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   // used for matrix A
   asmjit::X86Ymm AReg = x86::ymm12;
 
@@ -74,9 +74,9 @@ void CodeGenBase::genComputeBlock<
       a->vpmaddubsw(
           tmpReg, AReg, x86::dword_ptr(buffer_B, j * VLEN_ * sizeof(int8_t)));
       a->vpaddsw(
-          CRegs_avx2_[i * leadingDimCRegAssign + j],
+          CRegs_avx2_[i * leadingDimCReg + j],
           tmpReg,
-          CRegs_avx2_[i * leadingDimCRegAssign + j]);
+          CRegs_avx2_[i * leadingDimCReg + j]);
       // Prefetching is hurting performance in some cases
       // because prefetch instructions itself consumes a slot
       // in pipeline issue thus slowing down the kernel.
@@ -101,7 +101,7 @@ void CodeGenBase::storeCRegs<
     asmjit::X86Gp C_Offset,
     asmjit::X86Gp ldcReg,
     bool accum,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   asmjit::X86Xmm extractDest128 = x86::xmm15;
   asmjit::X86Ymm extractDest256 = x86::ymm15;
 
@@ -110,7 +110,7 @@ void CodeGenBase::storeCRegs<
     for (int j = 0; j < colRegs; ++j) {
       for (int idx = 0; idx < 2; ++idx) {
         a->vextracti128(
-            extractDest128, CRegs_avx2_[i * leadingDimCRegAssign + j], idx);
+            extractDest128, CRegs_avx2_[i * leadingDimCReg + j], idx);
         a->vpmovsxwd(extractDest256, extractDest128);
         asmjit::X86Mem destAddr = x86::dword_ptr(
             a->zcx(), C_Offset, 0, (j * 2 + idx) * 8 * sizeof(int32_t));
@@ -289,7 +289,8 @@ CodeGenBase::getOrCreate(
     a->jl(Loopk);
 
     // store C matrix
-    storeCRegs(a, rowRegs, colRegs, C_Offset, ldcReg, accum);
+    storeCRegs(
+        a, rowRegs, colRegs, C_Offset, ldcReg, accum);
 
     // increment A for next block
     a->sub(buffer_A, kSize);
@@ -339,7 +340,8 @@ CodeGenBase::getOrCreate(
     a->jl(LoopkRem);
 
     // store C matrix
-    storeCRegs(a, rowRegs, colRegs, C_Offset, ldcReg, accum);
+    storeCRegs(
+        a, rowRegs, colRegs, C_Offset, ldcReg, accum);
   }
 
   asmjit::FuncUtils::emitEpilog(a, layout);
diff --git a/src/GenerateKernelU8S8S32ACC16Avx512.cc b/src/GenerateKernelU8S8S32ACC16Avx512.cc
index e5687eb..c95757b 100644
--- a/src/GenerateKernelU8S8S32ACC16Avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC16Avx512.cc
@@ -22,13 +22,13 @@ void CodeGenBase::initCRegs<
     asmjit::X86Emitter* a,
     int rowRegs,
     int colRegs,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   for (int i = 0; i < rowRegs; ++i) {
     for (int j = 0; j < colRegs; ++j) {
       a->vxorps(
-          CRegs_avx512_[i * leadingDimCRegAssign + j],
-          CRegs_avx512_[i * leadingDimCRegAssign + j],
-          CRegs_avx512_[i * leadingDimCRegAssign + j]);
+          CRegs_avx512_[i * leadingDimCReg + j],
+          CRegs_avx512_[i * leadingDimCReg + j],
+          CRegs_avx512_[i * leadingDimCReg + j]);
     }
   }
 }
@@ -48,7 +48,7 @@ void CodeGenBase::genComputeBlock<
     int rowRegs,
     int colRegs,
     int lda,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   // used for matrix A
   asmjit::X86Zmm AReg = x86::zmm29;
 
@@ -69,9 +69,9 @@ void CodeGenBase::genComputeBlock<
       a->vpmaddubsw(
           tmpReg, AReg, AllRegs_avx512_[27-j]);
       a->vpaddsw(
-          CRegs_avx512_[i * leadingDimCRegAssign + j],
+          CRegs_avx512_[i * leadingDimCReg + j],
           tmpReg,
-          CRegs_avx512_[i * leadingDimCRegAssign + j]);
+          CRegs_avx512_[i * leadingDimCReg + j]);
       // Prefetching is hurting performance in some cases
       // because prefetch instructions itself consumes a slot
      // in pipeline issue thus slowing down the kernel.
@@ -96,7 +96,7 @@ void CodeGenBase::storeCRegs<
     asmjit::X86Gp C_Offset,
     asmjit::X86Gp ldcReg,
     bool accum,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   asmjit::X86Ymm extractDest256 = x86::ymm31;
   asmjit::X86Zmm extractDest512 = x86::zmm31;
 
@@ -105,7 +105,7 @@ void CodeGenBase::storeCRegs<
     for (int j = 0; j < colRegs; ++j) {
       for (int idx = 0; idx < 2; ++idx) {
         a->vextracti32x8(
-            extractDest256, CRegs_avx512_[i * leadingDimCRegAssign + j], idx);
+            extractDest256, CRegs_avx512_[i * leadingDimCReg + j], idx);
         a->vpmovsxwd(extractDest512, extractDest256);
         asmjit::X86Mem destAddr = x86::dword_ptr(
             a->zcx(), C_Offset, 0, (j * 2 + idx) * 16 * sizeof(int32_t));
diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc
index d044530..58643ad 100644
--- a/src/GenerateKernelU8S8S32ACC32.cc
+++ b/src/GenerateKernelU8S8S32ACC32.cc
@@ -60,7 +60,7 @@ void CodeGenBase::genComputeBlock<
     int rowRegs,
     int colRegs,
     int lda,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   // used for matrix A
   asmjit::X86Ymm AReg = x86::ymm12;
 
@@ -83,9 +83,9 @@ void CodeGenBase::genComputeBlock<
         a->vpmaddubsw(res1, AReg, BReg);
         a->vpmaddwd(res1, oneReg, res1);
         a->vpaddd(
-            CRegs_avx2_[i * leadingDimCRegAssign + j],
+            CRegs_avx2_[i * leadingDimCReg + j],
             res1,
-            CRegs_avx2_[i * leadingDimCRegAssign + j]);
+            CRegs_avx2_[i * leadingDimCReg + j]);
       }
       a->prefetcht0(x86::dword_ptr(B_pf, j * VLEN_ * sizeof(int8_t)));
     }
@@ -105,10 +105,7 @@ void CodeGenBase::storeCRegs<
     asmjit::X86Gp C_Offset,
     asmjit::X86Gp ldcReg,
     bool accum,
-    int leadingDimCRegAssign) {
-  // temp register
-  asmjit::X86Ymm tmpReg = x86::ymm14;
-
+    int leadingDimCReg) {
   for (int i = 0; i < rowRegs; ++i) {
     if (i != 0) {
       a->add(C_Offset, ldcReg);
@@ -116,13 +113,13 @@ void CodeGenBase::storeCRegs<
     for (int j = 0; j < colRegs; ++j) {
       if (accum) {
         a->vpaddd(
-            CRegs_avx2_[i * leadingDimCRegAssign + j],
-            CRegs_avx2_[i * leadingDimCRegAssign + j],
+            CRegs_avx2_[i * leadingDimCReg + j],
+            CRegs_avx2_[i * leadingDimCReg + j],
             x86::dword_ptr(a->zcx(), C_Offset, 0, j * 8 * sizeof(int32_t)));
       }
       a->vmovups(
           x86::dword_ptr(a->zcx(), C_Offset, 0, j * 8 * sizeof(int32_t)),
-          CRegs_avx2_[i * leadingDimCRegAssign + j]);
+          CRegs_avx2_[i * leadingDimCReg + j]);
     }
   }
 }
diff --git a/src/GenerateKernelU8S8S32ACC32Avx512.cc b/src/GenerateKernelU8S8S32ACC32Avx512.cc
index d1729e4..12243ee 100644
--- a/src/GenerateKernelU8S8S32ACC32Avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC32Avx512.cc
@@ -48,7 +48,7 @@ void CodeGenBase::genComputeBlock<
     int rowRegs,
     int colRegs,
     int lda,
-    int leadingDimCRegAssign) {
+    int leadingDimCReg) {
   // used for matrix A
   asmjit::X86Zmm AReg = x86::zmm31;
 
@@ -71,9 +71,9 @@ void CodeGenBase::genComputeBlock<
         a->vpmaddubsw(res1, AReg, BReg);
         a->vpmaddwd(res1, oneReg, res1);
         a->vpaddd(
-            CRegs_avx512_[i * leadingDimCRegAssign + j],
+            CRegs_avx512_[i * leadingDimCReg + j],
             res1,
-            CRegs_avx512_[i * leadingDimCRegAssign + j]);
+            CRegs_avx512_[i * leadingDimCReg + j]);
       }
       a->prefetcht0(x86::dword_ptr(B_pf, j * VLEN_ * sizeof(int8_t)));
     }
@@ -93,10 +93,7 @@ void CodeGenBase::storeCRegs<
     asmjit::X86Gp C_Offset,
     asmjit::X86Gp ldcReg,
     bool accum,
-    int leadingDimCRegAssign) {
-  // temp register
-  asmjit::X86Zmm tmpReg = x86::zmm28;
-
+    int leadingDimCReg) {
   for (int i = 0; i < rowRegs; ++i) {
     if (i != 0) {
       a->add(C_Offset, ldcReg);
@@ -107,13 +104,13 @@ void CodeGenBase::storeCRegs<
     for (int j = 0; j < colRegs; ++j) {
       if (accum) {
         a->vpaddd(
-            CRegs_avx512_[i * leadingDimCRegAssign + j],
-            CRegs_avx512_[i * leadingDimCRegAssign + j],
+            CRegs_avx512_[i * leadingDimCReg + j],
+            CRegs_avx512_[i * leadingDimCReg + j],
             x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)));
       }
       a->vmovups(
           x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)),
-          CRegs_avx512_[i * leadingDimCRegAssign + j]);
+          CRegs_avx512_[i * leadingDimCReg + j]);
     }
   }
 }
-- 
cgit v1.2.3
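Editor's note (not part of the patch): the parameter being renamed here acts like a leading dimension in BLAS. It maps the two-dimensional tile of accumulator registers (```rowRegs``` x ```colRegs```) onto the flat ```CRegs_avx2_``` / ```CRegs_avx512_``` arrays through the index ```i * leadingDimCReg + j```, and the cleanup makes all three generator routines (initCRegs, genComputeBlock, storeCRegs) spell that stride the same way. The sketch below is a minimal illustration of that indexing only; it does not use asmjit, plain integers stand in for vector registers, and names such as ```initCRegsMock``` are made up for this example.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the JIT generator's initCRegs: zeroes a
// rowRegs x colRegs tile stored in a flat array, using the same
// cRegs[i * leadingDimCReg + j] indexing as the patched kernels.
void initCRegsMock(std::vector<int>& cRegsMock, int rowRegs, int colRegs,
                   int leadingDimCReg) {
  for (int i = 0; i < rowRegs; ++i) {
    for (int j = 0; j < colRegs; ++j) {
      // In the real kernel this slot is a vector register cleared by vxorps.
      cRegsMock[i * leadingDimCReg + j] = 0;
    }
  }
}

int main() {
  // A 3x4 accumulator tile; the "leading dimension" is the stride between
  // consecutive rows of the tile in the flat array (here, the column count).
  const int rowRegs = 3, colRegs = 4, leadingDimCReg = colRegs;
  std::vector<int> cRegsMock(rowRegs * leadingDimCReg, -1);

  initCRegsMock(cRegsMock, rowRegs, colRegs, leadingDimCReg);

  // Print the tile row by row; every slot has been zeroed.
  for (int i = 0; i < rowRegs; ++i) {
    for (int j = 0; j < colRegs; ++j) {
      std::printf("%d ", cRegsMock[i * leadingDimCReg + j]);
    }
    std::printf("\n");
  }
  return 0;
}
```

Because the same stride is reused when the tile is zeroed, accumulated into, and written back to memory, keeping one parameter name across those routines is what makes the cleanup safe to read and review.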