diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-09-25 02:38:34 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-09-25 02:38:34 +0300 |
commit | 97caeee5af56b3d8ca56499f6107d6b3e7f21684 (patch) | |
tree | b0e97b502a70126945ab84975597ae8a335ddd44 | |
parent | 49e8018ab2397c175354317b35c6be6dd68f8932 (diff) |
JIT code working on windows (AVX512)
-rw-r--r-- | include/fbgemm/Fbgemm.h | 12 | ||||
-rw-r--r-- | src/GenerateKernelU8S8S32ACC32Avx512.cc | 40 | ||||
-rw-r--r-- | src/QuantUtilsAvx2.cc | 2 | ||||
-rw-r--r-- | src/Utils.cc | 3 |
4 files changed, 53 insertions, 4 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 90d1ee9..1585f97 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -661,7 +661,11 @@ class FBGEMM_API PackAWithIm2Col ~PackAWithIm2Col() { if (rowOffsetAllocatedHere) { +#ifdef _MSC_VER + _aligned_free(row_offset_); +#else free(row_offset_); +#endif } } @@ -752,7 +756,11 @@ class FBGEMM_API PackAWithRowOffset final ~PackAWithRowOffset() { if (rowOffsetAllocatedHere) { +#ifdef _MSC_VER + _aligned_free(row_offset_); +#else free(row_offset_); +#endif } } @@ -845,7 +853,11 @@ class FBGEMM_API PackAWithQuantRowOffset final ~PackAWithQuantRowOffset() { if (rowOffsetAllocatedHere) { +#ifdef _MSC_VER + _aligned_free(row_offset_); +#else free(row_offset_); +#endif } } diff --git a/src/GenerateKernelU8S8S32ACC32Avx512.cc b/src/GenerateKernelU8S8S32ACC32Avx512.cc index d1729e4..d5269c4 100644 --- a/src/GenerateKernelU8S8S32ACC32Avx512.cc +++ b/src/GenerateKernelU8S8S32ACC32Avx512.cc @@ -109,10 +109,12 @@ void CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::storeCRegs< a->vpaddd( CRegs_avx512_[i * leadingDimCRegAssign + j], CRegs_avx512_[i * leadingDimCRegAssign + j], - x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t))); + x86::dword_ptr(a->gpzRef(9), C_Offset, 0, j * 16 * sizeof(int32_t))); +// x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t))); } a->vmovups( - x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)), + x86::dword_ptr(a->gpzRef(9), C_Offset, 0, j * 16 * sizeof(int32_t)), +// x86::dword_ptr(a->zcx(), C_Offset, 0, j * 16 * sizeof(int32_t)), CRegs_avx512_[i * leadingDimCRegAssign + j]); } } @@ -208,18 +210,34 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>( int mRegBlocksRem = mc % mRegBlockSize; // arguments to the function created +#ifdef _MSC_VER + asmjit::X86Gp buffer_A = a->zcx(); + asmjit::X86Gp buffer_B = a->zdx(); + asmjit::X86Gp B_pf = a->gpzRef(8); + asmjit::X86Gp CBase = a->gpzRef(9); + asmjit::X86Gp kSize = a->zdi(); // a->zsi(); // x86::esi; // a->zsi(); + asmjit::X86Gp ldcReg = a->zsi(); // a->zdi(); // x86::edi; // a->zdi(); +#else asmjit::X86Gp buffer_A = a->zdi(); asmjit::X86Gp buffer_B = a->zsi(); asmjit::X86Gp B_pf = a->zdx(); asmjit::X86Gp CBase = a->zcx(); asmjit::X86Gp kSize = a->gpzRef(8); asmjit::X86Gp ldcReg = a->gpzRef(9); +#endif asmjit::FuncDetail func; +#ifdef _MSC_VER + //func.init(asmjit::FuncSignature4<void, uint8_t*, int8_t*, int8_t*, int32_t*>( + // asmjit::CallConv::kIdHost)); + func.init(asmjit::FuncSignature6<void, uint8_t*, int8_t*, int8_t*, int32_t*, int, int>( + asmjit::CallConv::kIdHost)); +#else func.init( asmjit:: FuncSignature6<void, uint8_t*, int8_t*, int8_t*, int32_t*, int, int>( asmjit::CallConv::kIdHost)); +#endif asmjit::FuncFrameInfo ffi; ffi.setDirtyRegs( @@ -228,10 +246,20 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>( asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15)); ffi.setDirtyRegs( asmjit::X86Reg::kKindGp, +#ifdef _MSC_VER asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15)); + //asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15)); +#else + asmjit::Utils::mask(8, 9, 10, 11, 12, 13, 14, 15)); +#endif asmjit::FuncArgsMapper args(&func); +#ifdef _MSC_VER +// args.assignAll(buffer_A, buffer_B, B_pf, CBase); args.assignAll(buffer_A, buffer_B, B_pf, CBase, kSize, ldcReg); +#else + args.assignAll(buffer_A, buffer_B, B_pf, CBase, kSize, ldcReg); +#endif args.updateFrameInfo(ffi); @@ -241,6 +269,14 @@ CodeGenBase<uint8_t, int8_t, int32_t, int32_t>::getOrCreate<inst_set_t::avx512>( asmjit::FuncUtils::emitProlog(a, layout); asmjit::FuncUtils::allocArgs(a, layout, args); +//#ifdef _MSC_VER +// // retrieve parameters from stack +// a->mov(kSize, asmjit::x86::dword_ptr(asmjit::x86::rsp, func.getArg(4).getStackOffset())); //0x20)); //func.getArg(4).getStackOffset())); +// std::cout << "func.getArg(4).getStackOffset(): " << func.getArg(4).getStackOffset() << std::endl; +// a->mov(ldcReg, asmjit::x86::dword_ptr(asmjit::x86::rsp, func.getArg(5).getStackOffset())); //;0x28)); //func.getArg(5).getStackOffset())); +// std::cout << "func.getArg(5).getStackOffset(): " << func.getArg(5).getStackOffset() << std::endl; +//#endif + asmjit::Label LoopMBlocks = a->newLabel(); asmjit::Label LoopNBlocks = a->newLabel(); asmjit::Label Loopk = a->newLabel(); diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index 7f43ced..8746793 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -23,7 +23,7 @@ void QuantizeAvx2( uint8_t* dst, int len, const TensorQuantizationParams& qparams) { -#if defined(__AVX2__) && defined(__FMA__) +#if defined(__AVX2__) && (defined(__FMA__) || defined(_MSC_VER)) constexpr int VLEN = 8; std::size_t i = 0; __m256 inverse_scale_v = _mm256_set1_ps(1.f / qparams.scale); diff --git a/src/Utils.cc b/src/Utils.cc index 355a5cb..7306127 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -181,7 +181,8 @@ void transpose_simd( if (cpuinfo_initialize()) { if (fbgemmHasAvx512Support()) { #ifdef _MSC_VER - internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst); +// internal::transpose_8x8(M, N, src, ld_src, dst, ld_dst); + internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst); #else internal::transpose_16x16(M, N, src, ld_src, dst, ld_dst); #endif |