Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'src/GenerateKernelU8S8S32ACC16.cc')
-rw-r--r-- src/GenerateKernelU8S8S32ACC16.cc | 16
1 files changed, 10 insertions, 6 deletions
diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc
index f83012b..cbd5877 100644
--- a/src/GenerateKernelU8S8S32ACC16.cc
+++ b/src/GenerateKernelU8S8S32ACC16.cc
@@ -23,12 +23,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::initCRegs<
int rowRegs,
int colRegs,
int leadingDimCReg) {
+ using CRegs = x86::Ymm;
for (int i = 0; i < rowRegs; ++i) {
for (int j = 0; j < colRegs; ++j) {
a->vxorps(
- CRegs_avx2_[i * leadingDimCReg + j],
- CRegs_avx2_[i * leadingDimCReg + j],
- CRegs_avx2_[i * leadingDimCReg + j]);
+ CRegs(i * leadingDimCReg + j),
+ CRegs(i * leadingDimCReg + j),
+ CRegs(i * leadingDimCReg + j));
}
}
}
@@ -54,6 +55,8 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock<
x86::Ymm tmpReg = x86::ymm14;
+ using CRegs = x86::Ymm;
+
for (int i = 0; i < rowRegs; ++i) {
// broadcast A
a->vpbroadcastw(
@@ -62,9 +65,9 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::genComputeBlock<
a->vpmaddubsw(
tmpReg, AReg, x86::dword_ptr(buffer_B, j * VLEN_ * sizeof(int8_t)));
a->vpaddsw(
- CRegs_avx2_[i * leadingDimCReg + j],
+ CRegs(i * leadingDimCReg + j),
tmpReg,
- CRegs_avx2_[i * leadingDimCReg + j]);
+ CRegs(i * leadingDimCReg + j));
// Prefetching hurts performance in some cases
// because each prefetch instruction itself consumes
// an issue slot in the pipeline, thus slowing down the kernel.
@@ -93,12 +96,13 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int16_t>::storeCRegs<
x86::Xmm extractDest128 = x86::xmm15;
x86::Ymm extractDest256 = x86::ymm15;
+ using CRegs = x86::Ymm;
for (int i = 0; i < rowRegs; ++i) {
a->imul(C_Offset, ldcReg, static_cast<asmjit::Imm>(i * sizeof(int32_t)));
for (int j = 0; j < colRegs; ++j) {
for (int idx = 0; idx < 2; ++idx) {
a->vextracti128(
- extractDest128, CRegs_avx2_[i * leadingDimCReg + j], idx);
+ extractDest128, CRegs(i * leadingDimCReg + j), idx);
a->vpmovsxwd(extractDest256, extractDest128);
x86::Mem destAddr = x86::dword_ptr(
a->zcx(), C_Offset, 0, (j * 2 + idx) * 8 * sizeof(int32_t));