Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYoung Jin Kim <youki@microsoft.com>2019-09-25 02:38:34 +0300
committerYoung Jin Kim <youki@microsoft.com>2019-09-25 02:38:34 +0300
commit97caeee5af56b3d8ca56499f6107d6b3e7f21684 (patch)
treeb0e97b502a70126945ab84975597ae8a335ddd44
parent49e8018ab2397c175354317b35c6be6dd68f8932 (diff)
JIT code working on windows (AVX512)
-rw-r--r--include/fbgemm/Fbgemm.h12
-rw-r--r--src/GenerateKernelU8S8S32ACC32Avx512.cc40
-rw-r--r--src/QuantUtilsAvx2.cc2
-rw-r--r--src/Utils.cc3
4 files changed, 53 insertions, 4 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index 90d1ee9..1585f97 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -661,7 +661,11 @@ class FBGEMM_API PackAWithIm2Col
~PackAWithIm2Col() {
if (rowOffsetAllocatedHere) {
+#ifdef _MSC_VER
+ _aligned_free(row_offset_);
+#else
free(row_offset_);
+#endif
}
}
@@ -752,7 +756,11 @@ class FBGEMM_API PackAWithRowOffset final
~PackAWithRowOffset() {
if (rowOffsetAllocatedHere) {
+#ifdef _MSC_VER
+ _aligned_free(row_offset_);
+#else
free(row_offset_);
+#endif
}
}
@@ -845,7 +853,11 @@ class FBGEMM_API PackAWithQuantRowOffset final
~PackAWithQuantRowOffset() {
if (rowOffsetAllocatedHere) {
+#ifdef _MSC_VER
+ _aligned_free(row_offset_);
+#else
free(row_offset_);
+#endif
}
}
diff --git a/src/GenerateKernelU8S8S32ACC32Avx512.cc b/src/GenerateKernelU8S8S32ACC32Avx512.cc
index d1729e4..d5269c4 100644
--- a/src/GenerateKernelU8S8S32ACC32Avx512.cc
+++ b/src/GenerateKernelU8S8S32ACC32Avx512.cc
@@ -109,10 +109,12 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs<
a->vpaddd(
CRegs_avx512_[i * leadingDimCRegAssign + j],
CRegs_avx512_[i * leadingDimCRegAssign + j],
- x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)));
+ x86::dword_ptr(a->gpzRef(9), C_Offset, 0, j * 16 * sizeof(int32_t)));
+// x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)));
}
a->vmovups(
- x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)),
+ x86::dword_ptr(a->gpzRef(9), C_Offset, 0, j * 16 * sizeof(int32_t)),
+// x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)),
CRegs_avx512_[i * leadingDimCRegAssign + j]);
}
}
@@ -208,18 +210,34 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>(
int mRegBlocksRem = mc % mRegBlockSize;
// arguments to the function created
+#ifdef _MSC_VER
+ asmjit::X86Gp buffer_A = a->zcx();
+ asmjit::X86Gp buffer_B = a->zdx();
+ asmjit::X86Gp B_pf = a->gpzRef(8);
+ asmjit::X86Gp CBase = a->gpzRef(9);
+ asmjit::X86Gp kSize = a->zdi(); // a->zsi(); // x86::esi; // a->zsi();
+ asmjit::X86Gp ldcReg = a->zsi(); // a->zdi(); // x86::edi; // a->zdi();
+#else
asmjit::X86Gp buffer_A = a->zdi();
asmjit::X86Gp buffer_B = a->zsi();
asmjit::X86Gp B_pf = a->zdx();
asmjit::X86Gp CBase = a->zcx();
asmjit::X86Gp kSize = a->gpzRef(8);
asmjit::X86Gp ldcReg = a->gpzRef(9);
+#endif
asmjit::FuncDetail func;
+#ifdef _MSC_VER
+ //func.init(asmjit::FuncSignature4<void, uint8_t*, int8_t*, int8_t*, int32_t*>(
+ // asmjit::CallConv::kIdHost));
+ func.init(asmjit::FuncSignature6<void, uint8_t*, int8_t*, int8_t*, int32_t*, int, int>(
+ asmjit::CallConv::kIdHost));
+#else
func.init(
asmjit::
FuncSignature6<void, uint8_t*, int8_t*, int8_t*, int32_t*, int, int>(
asmjit::CallConv::kIdHost));
+#endif
asmjit::FuncFrameInfo ffi;
ffi.setDirtyRegs(
@@ -228,10 +246,20 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>(
asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15));
ffi.setDirtyRegs(
asmjit::X86Reg::kKindGp,
+#ifdef _MSC_VER
asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15));
+ //asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15));
+#else
+ asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15));
+#endif
asmjit::FuncArgsMapper args(&func);
+#ifdef _MSC_VER
+// args.assignAll(buffer_A, buffer_B, B_pf, CBase);
args.assignAll(buffer_A, buffer_B, B_pf, CBase, kSize, ldcReg);
+#else
+ args.assignAll(buffer_A, buffer_B, B_pf, CBase, kSize, ldcReg);
+#endif
args.updateFrameInfo(ffi);
@@ -241,6 +269,14 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>(
asmjit::FuncUtils::emitProlog(a, layout);
asmjit::FuncUtils::allocArgs(a, layout, args);
+//#ifdef _MSC_VER
+// // retrieve parameters from stack
+// a->mov(kSize, asmjit::x86::dword_ptr(asmjit::x86::rsp, func.getArg(4).getStackOffset())); //0x20)); //func.getArg(4).getStackOffset()));
+// std::cout << "func.getArg(4).getStackOffset(): " << func.getArg(4).getStackOffset() << std::endl;
+// a->mov(ldcReg, asmjit::x86::dword_ptr(asmjit::x86::rsp, func.getArg(5).getStackOffset())); //;0x28)); //func.getArg(5).getStackOffset()));
+// std::cout << "func.getArg(5).getStackOffset(): " << func.getArg(5).getStackOffset() << std::endl;
+//#endif
+
asmjit::Label LoopMBlocks = a->newLabel();
asmjit::Label LoopNBlocks = a->newLabel();
asmjit::Label Loopk = a->newLabel();
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index 7f43ced..8746793 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -23,7 +23,7 @@ void QuantizeAvx2(
uint8_t* dst,
int len,
const TensorQuantizationParams& qparams) {
-#if defined(__AVX2__) && defined(__FMA__)
+#if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER))
constexpr int VLEN = 8;
std::size_t i = 0;
__m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale);
diff --git a/src/Utils.cc b/src/Utils.cc
index 355a5cb..7306127 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -181,7 +181,8 @@ void transpose_simd(
if (cpuinfo_initialize()) {
if (fbgemmHasAvx512Support()) {
#ifdef _MSC_VER
- internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst);
+// internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst);
+ internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst);
#else
internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst);
#endif