Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJianyu Huang <jianyuhuang@fb.com>2019-07-01 21:55:51 +0300
committerFacebook Github Bot <facebook-github-bot@users.noreply.github.com>2019-07-01 22:13:52 +0300
commit61928df38bece4c1c16b98296355450f13ca0afe (patch)
tree8cd05dc094d40f1233f322416df003796f60a32d
parent278c146b929caf751f8e4daf31a039effe2bfb0c (diff)
Clean up some code for JIT code generator (#101)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/101 Some code cleanup: - Both ```leadingDimCReg``` and ```leadingDimCRegAssign``` are used in ```GenerateKernelU8S8S32ACC32.cc```. We should unify them to only use one variable name. - Remove some redundant register variable ```asmjit::X86Ymm tmpReg = x86::ymm14;```. Reviewed By: dskhudia Differential Revision: D15673269 fbshipit-source-id: 81eb3673d0ff97391557413a13f1972561a1f2db
-rw-r--r--src/GenerateKernelU8S8S32ACC16.cc24
-rw-r--r--src/GenerateKernelU8S8S32ACC16Avx512.cc18
-rw-r--r--src/GenerateKernelU8S8S32ACC32.cc17
-rw-r--r--src/GenerateKernelU8S8S32ACC32Avx512.cc17
4 files changed, 36 insertions, 40 deletions
diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc
index 082518c..718b883 100644
--- a/src/GenerateKernelU8S8S32ACC16.cc
+++ b/src/GenerateKernelU8S8S32ACC16.cc
@@ -34,13 +34,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::initCRegs<
asmjit::X86Emitter* a,
int rowRegs,
int colRegs,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
for (int i = 0; i < rowRegs; ++i) {
for (int j = 0; j < colRegs; ++j) {
a->vxorps(
- CRegs_avx2_[i * leadingDimCRegAssign + j],
- CRegs_avx2_[i * leadingDimCRegAssign + j],
- CRegs_avx2_[i * leadingDimCRegAssign + j]);
+ CRegs_avx2_[i * leadingDimCReg + j],
+ CRegs_avx2_[i * leadingDimCReg + j],
+ CRegs_avx2_[i * leadingDimCReg + j]);
}
}
}
@@ -60,7 +60,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock<
int rowRegs,
int colRegs,
int lda,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
// used for matrix A
asmjit::X86Ymm AReg = x86::ymm12;
@@ -74,9 +74,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock<
a->vpmaddubsw(
tmpReg, AReg, x86::dword_ptr(buffer_B, j * VLEN_ * sizeof(int8_t)));
a->vpaddsw(
- CRegs_avx2_[i * leadingDimCRegAssign + j],
+ CRegs_avx2_[i * leadingDimCReg + j],
tmpReg,
- CRegs_avx2_[i * leadingDimCRegAssign + j]);
+ CRegs_avx2_[i * leadingDimCReg + j]);
// Prefetching is hurting performance in some cases
// because prefetch instructions itself consumes a slot
// in pipeline issue thus slowing down the kernel.
@@ -101,7 +101,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs<
asmjit::X86Gp C_Offset,
asmjit::X86Gp ldcReg,
bool accum,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
asmjit::X86Xmm extractDest128 = x86::xmm15;
asmjit::X86Ymm extractDest256 = x86::ymm15;
@@ -110,7 +110,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs<
for (int j = 0; j < colRegs; ++j) {
for (int idx = 0; idx < 2; ++idx) {
a->vextracti128(
- extractDest128, CRegs_avx2_[i * leadingDimCRegAssign + j], idx);
+ extractDest128, CRegs_avx2_[i * leadingDimCReg + j], idx);
a->vpmovsxwd(extractDest256, extractDest128);
asmjit::X86Mem destAddr = x86::dword_ptr(
a->zcx(), C_Offset, 0, (j * 2 + idx) * 8 * sizeof(int32_t));
@@ -289,7 +289,8 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx2>(
a->jl(Loopk);
// store C matrix
- storeCRegs<inst_set_t::avx2>(a, rowRegs, colRegs, C_Offset, ldcReg, accum);
+ storeCRegs<inst_set_t::avx2>(
+ a, rowRegs, colRegs, C_Offset, ldcReg, accum);
// increment A for next block
a->sub(buffer_A, kSize);
@@ -339,7 +340,8 @@ CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::getOrCreate<inst_set_t::avx2>(
a->jl(LoopkRem);
// store C matrix
- storeCRegs<inst_set_t::avx2>(a, rowRegs, colRegs, C_Offset, ldcReg, accum);
+ storeCRegs<inst_set_t::avx2>(
+ a, rowRegs, colRegs, C_Offset, ldcReg, accum);
}
asmjit::FuncUtils::emitEpilog(a, layout);
diff --git a/src/GenerateKernelU8S8S32ACC16Avx512.cc b/src/GenerateKernelU8S8S32ACC16Avx512.cc
index e5687eb..c95757b 100644
--- a/src/GenerateKernelU8S8S32ACC16Avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC16Avx512.cc
@@ -22,13 +22,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::initCRegs<
asmjit::X86Emitter* a,
int rowRegs,
int colRegs,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
for (int i = 0; i < rowRegs; ++i) {
for (int j = 0; j < colRegs; ++j) {
a->vxorps(
- CRegs_avx512_[i * leadingDimCRegAssign + j],
- CRegs_avx512_[i * leadingDimCRegAssign + j],
- CRegs_avx512_[i * leadingDimCRegAssign + j]);
+ CRegs_avx512_[i * leadingDimCReg + j],
+ CRegs_avx512_[i * leadingDimCReg + j],
+ CRegs_avx512_[i * leadingDimCReg + j]);
}
}
}
@@ -48,7 +48,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock<
int rowRegs,
int colRegs,
int lda,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
// used for matrix A
asmjit::X86Zmm AReg = x86::zmm29;
@@ -69,9 +69,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock<
a->vpmaddubsw(
tmpReg, AReg, AllRegs_avx512_[27-j]);
a->vpaddsw(
- CRegs_avx512_[i * leadingDimCRegAssign + j],
+ CRegs_avx512_[i * leadingDimCReg + j],
tmpReg,
- CRegs_avx512_[i * leadingDimCRegAssign + j]);
+ CRegs_avx512_[i * leadingDimCReg + j]);
// Prefetching is hurting performance in some cases
// because prefetch instructions itself consumes a slot
// in pipeline issue thus slowing down the kernel.
@@ -96,7 +96,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs<
asmjit::X86Gp C_Offset,
asmjit::X86Gp ldcReg,
bool accum,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
asmjit::X86Ymm extractDest256 = x86::ymm31;
asmjit::X86Zmm extractDest512 = x86::zmm31;
@@ -105,7 +105,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs<
for (int j = 0; j < colRegs; ++j) {
for (int idx = 0; idx < 2; ++idx) {
a->vextracti32x8(
- extractDest256, CRegs_avx512_[i * leadingDimCRegAssign + j], idx);
+ extractDest256, CRegs_avx512_[i * leadingDimCReg + j], idx);
a->vpmovsxwd(extractDest512, extractDest256);
asmjit::X86Mem destAddr = x86::dword_ptr(
a->zcx(), C_Offset, 0, (j * 2 + idx) * 16 * sizeof(int32_t));
diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc
index d044530..58643ad 100644
--- a/src/GenerateKernelU8S8S32ACC32.cc
+++ b/src/GenerateKernelU8S8S32ACC32.cc
@@ -60,7 +60,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::genComputeBlock<
int rowRegs,
int colRegs,
int lda,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
// used for matrix A
asmjit::X86Ymm AReg = x86::ymm12;
@@ -83,9 +83,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::genComputeBlock<
a->vpmaddubsw(res1, AReg, BReg);
a->vpmaddwd(res1, oneReg, res1);
a->vpaddd(
- CRegs_avx2_[i * leadingDimCRegAssign + j],
+ CRegs_avx2_[i * leadingDimCReg + j],
res1,
- CRegs_avx2_[i * leadingDimCRegAssign + j]);
+ CRegs_avx2_[i * leadingDimCReg + j]);
}
a->prefetcht0(x86::dword_ptr(B_pf, j * VLEN_ * sizeof(int8_t)));
}
@@ -105,10 +105,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
asmjit::X86Gp C_Offset,
asmjit::X86Gp ldcReg,
bool accum,
- int leadingDimCRegAssign) {
- // temp register
- asmjit::X86Ymm tmpReg = x86::ymm14;
-
+ int leadingDimCReg) {
for (int i = 0; i < rowRegs; ++i) {
if (i != 0) {
a->add(C_Offset, ldcReg);
@@ -116,13 +113,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
for (int j = 0; j < colRegs; ++j) {
if (accum) {
a->vpaddd(
- CRegs_avx2_[i * leadingDimCRegAssign + j],
- CRegs_avx2_[i * leadingDimCRegAssign + j],
+ CRegs_avx2_[i * leadingDimCReg + j],
+ CRegs_avx2_[i * leadingDimCReg + j],
x86::dword_ptr(a->zcx(), C_Offset, 0, j * 8 * sizeof(int32_t)));
}
a->vmovups(
x86::dword_ptr(a->zcx(), C_Offset, 0, j * 8 * sizeof(int32_t)),
- CRegs_avx2_[i * leadingDimCRegAssign + j]);
+ CRegs_avx2_[i * leadingDimCReg + j]);
}
}
}
diff --git a/src/GenerateKernelU8S8S32ACC32Avx512.cc b/src/GenerateKernelU8S8S32ACC32Avx512.cc
index d1729e4..12243ee 100644
--- a/src/GenerateKernelU8S8S32ACC32Avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC32Avx512.cc
@@ -48,7 +48,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::genComputeBlock<
int rowRegs,
int colRegs,
int lda,
- int leadingDimCRegAssign) {
+ int leadingDimCReg) {
// used for matrix A
asmjit::X86Zmm AReg = x86::zmm31;
@@ -71,9 +71,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::genComputeBlock<
a->vpmaddubsw(res1, AReg, BReg);
a->vpmaddwd(res1, oneReg, res1);
a->vpaddd(
- CRegs_avx512_[i * leadingDimCRegAssign + j],
+ CRegs_avx512_[i * leadingDimCReg + j],
res1,
- CRegs_avx512_[i * leadingDimCRegAssign + j]);
+ CRegs_avx512_[i * leadingDimCReg + j]);
}
a->prefetcht0(x86::dword_ptr(B_pf, j * VLEN_ * sizeof(int8_t)));
}
@@ -93,10 +93,7 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
asmjit::X86Gp C_Offset,
asmjit::X86Gp ldcReg,
bool accum,
- int leadingDimCRegAssign) {
- // temp register
- asmjit::X86Zmm tmpReg = x86::zmm28;
-
+ int leadingDimCReg) {
for (int i = 0; i < rowRegs; ++i) {
if (i != 0) {
a->add(C_Offset, ldcReg);
@@ -107,13 +104,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
for (int j = 0; j < colRegs; ++j) {
if (accum) {
a->vpaddd(
- CRegs_avx512_[i * leadingDimCRegAssign + j],
- CRegs_avx512_[i * leadingDimCRegAssign + j],
+ CRegs_avx512_[i * leadingDimCReg + j],
+ CRegs_avx512_[i * leadingDimCReg + j],
x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)));
}
a->vmovups(
x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)),
- CRegs_avx512_[i * leadingDimCRegAssign + j]);
+ CRegs_avx512_[i * leadingDimCReg + j]);
}
}
}