author    | Egor Chesakov <Egor.Chesakov@microsoft.com> | 2021-06-15 20:59:28 +0300
committer | GitHub <noreply@github.com>                 | 2021-06-15 20:59:28 +0300
commit    | ddae375636ca6cdbdf781fcb748d4b160dbfcfc4 (patch)
tree      | c7b7271b1813ba36b641644a43b8d228fabaa9d9 /src/coreclr/jit
parent    | 56727d58ac649723e8807fea6f1882f8972e7ea1 (diff)
Split 16-byte SIMD store around GC struct fields into two 8-byte SIMD stores (x86) / two 8-byte mov-s (x64) (#53116)
Fixes #51638 by:
1) Constructing `ASG(OBJ(addr), 0)` for structs that have GC fields and keeping the current IR (i.e. `ASG(BLK(addr), 0)`) for other types. This bookkeeping lets the JIT keep track of the class layout.
2a) Emitting a sequence of `mov [m64],r64` instead of `movdqu [m128],xmm` when zeroing structs with GC fields that are not guaranteed to be on the stack on win-x64 or linux-x64.
2b) Emitting a sequence of `movq [m64],xmm` when zeroing such structs on win-x86.
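The hazard behind the fix: a 16-byte `movdqu` store carries no atomicity guarantee at 8-byte granularity, so a concurrent GC scan could observe one of two adjacent object references half-written. The sketch below is a simplified, hypothetical model of the store-selection logic this change introduces; `planZeroingStores` and `ZeroStore` are illustrative names, not JIT APIs, and the model glosses over the fact that on x86 the real JIT only takes the SIMD/movq path for blocks of at least 16 bytes.

```cpp
#include <cstdio>
#include <vector>

// Illustrative only: models which store instructions the unrolled INITBLK
// path picks after this change. Not actual JIT code.
struct ZeroStore
{
    const char* ins;   // mnemonic sketch
    unsigned    width; // bytes written per store
};

std::vector<ZeroStore> planZeroingStores(unsigned size, bool onHeapWithGCPtrs, bool isAmd64)
{
    std::vector<ZeroStore> plan;
    unsigned written = 0;

    // 16-byte SIMD stores may be split into multiple memory accesses by the
    // hardware, so they are only used when no GC pointer can be observed
    // mid-update (i.e. the destination is a local, or the layout has no GC refs).
    if (!onHeapWithGCPtrs)
    {
        for (; written + 16 <= size; written += 16)
            plan.push_back({"movdqu [m128], xmm", 16});
    }

    for (; written + 8 <= size; written += 8)
    {
        // x64 falls back to 8-byte GP-register stores; x86 keeps using the
        // zeroed XMM register via movq, which writes 8 bytes in one access.
        plan.push_back({isAmd64 ? "mov [m64], r64" : "movq [m64], xmm", 8});
    }

    // Any 1/2/4-byte tail is written with ordinary GP stores (not modeled).
    return plan;
}

int main()
{
    // A 24-byte struct with GC fields stored into a heap object on x64:
    for (const ZeroStore& s : planZeroingStores(24, true, true))
        std::printf("%s  ; %u bytes\n", s.ins, s.width);
    // Prints three "mov [m64], r64" stores instead of movdqu + mov.
}
```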
Diffstat (limited to 'src/coreclr/jit')
-rw-r--r-- | src/coreclr/jit/codegenxarch.cpp | 62
-rw-r--r-- | src/coreclr/jit/gentree.h        | 7
-rw-r--r-- | src/coreclr/jit/importer.cpp     | 24
-rw-r--r-- | src/coreclr/jit/lowerxarch.cpp   | 15
-rw-r--r-- | src/coreclr/jit/lsraxarch.cpp    | 14
5 files changed, 98 insertions, 24 deletions
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 7fd371e103e..2bd0142381f 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2752,26 +2752,47 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         src = src->AsUnOp()->gtGetOp1();
     }
 
+    unsigned size = node->GetLayout()->GetSize();
+
+    // An SSE mov that accesses data larger than 8 bytes may be implemented using
+    // multiple memory accesses. Hence, the JIT must not use such stores when
+    // INITBLK zeroes a struct that contains GC pointers and can be observed by
+    // other threads (i.e. when dstAddr is not an address of a local).
+    // For example, this can happen when initializing a struct field of an object.
+    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
+
+#ifdef TARGET_AMD64
+    // On Amd64 the JIT will not use SIMD stores for such structs and instead
+    // will always allocate a GP register for src node.
+    const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
+#else
+    // On X86 the JIT will use movq for structs that are larger than 16 bytes
+    // since it is more beneficial than using two mov-s from a GP register.
+    const bool willUseSimdMov = (size >= 16);
+#endif
+
     if (!src->isContained())
     {
         srcIntReg = genConsumeReg(src);
     }
     else
     {
-        // If src is contained then it must be 0 and the size must be a multiple
-        // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions.
+        // If src is contained then it must be 0.
         assert(src->IsIntegralConst(0));
-        assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0);
+        assert(willUseSimdMov);
+#ifdef TARGET_AMD64
+        assert(size % 16 == 0);
+#else
+        assert(size % 8 == 0);
+#endif
     }
 
     emitter* emit = GetEmitter();
-    unsigned size = node->GetLayout()->GetSize();
 
     assert(size <= INT32_MAX);
     assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
 
-    // Fill as much as possible using SSE2 stores.
-    if (size >= XMM_REGSIZE_BYTES)
+    if (willUseSimdMov)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
@@ -2791,9 +2812,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
         }
 
-        instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
+        instruction simdMov      = simdUnalignedMovIns();
+        unsigned    regSize      = XMM_REGSIZE_BYTES;
+        unsigned    bytesWritten = 0;
+
+        while (bytesWritten < size)
         {
+#ifdef TARGET_X86
+            if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
+            {
+                simdMov = INS_movq;
+                regSize = 8;
+            }
+#endif
+            if (bytesWritten + regSize > size)
+            {
+                assert(srcIntReg != REG_NA);
+                break;
+            }
+
             if (dstLclNum != BAD_VAR_NUM)
             {
                 emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
@@ -2803,11 +2840,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                 emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
                                     dstAddrIndexScale, dstOffset);
             }
+
+            dstOffset += regSize;
+            bytesWritten += regSize;
         }
 
-        // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
-        // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need
-        // to allocate a GPR just for the remainder.
+        size -= bytesWritten;
     }
 
     // Fill the remainder using normal stores.
@@ -4604,7 +4642,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
         // The VM doesn't allow such large array elements but let's be sure.
         noway_assert(scale <= INT32_MAX);
 #else  // !TARGET_64BIT
-        tmpReg = node->GetSingleTempReg();
+        tmpReg = node->GetSingleTempReg();
 #endif // !TARGET_64BIT
 
     GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg,
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index ef65f7d7aae..e2a5045e906 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -5686,6 +5686,13 @@ public:
     bool gtBlkOpGcUnsafe;
 #endif
 
+#ifdef TARGET_XARCH
+    bool IsOnHeapAndContainsReferences()
+    {
+        return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr();
+    }
+#endif
+
     GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout)
         : GenTreeIndir(oper, type, addr, nullptr)
         , m_layout(layout)
diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp
index f14d8d0cd40..fc8544f9138 100644
--- a/src/coreclr/jit/importer.cpp
+++ b/src/coreclr/jit/importer.cpp
@@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block)
                                        "type operand incompatible with type of address");
                 }
 
-                size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size
-                op2  = gtNewIconNode(0);                                     // Value
-                op1  = impPopStack().val;                                    // Dest
-                op1  = gtNewBlockVal(op1, size);
-                op1  = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
+                op2 = gtNewIconNode(0);  // Value
+                op1 = impPopStack().val; // Dest
+
+                if (eeIsValueClass(resolvedToken.hClass))
+                {
+                    op1 = gtNewStructVal(resolvedToken.hClass, op1);
+                    if (op1->OperIs(GT_OBJ))
+                    {
+                        gtSetObjGcInfo(op1->AsObj());
+                    }
+                }
+                else
+                {
+                    size = info.compCompHnd->getClassSize(resolvedToken.hClass);
+                    assert(size == TARGET_POINTER_SIZE);
+                    op1  = gtNewBlockVal(op1, size);
+                }
+
+                op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
 
                 goto SPILL_APPEND;
 
             case CEE_INITBLK:
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 4d425300604..55bfab94f6f 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -216,11 +216,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
 
         if (fill == 0)
         {
-            // If the size is multiple of XMM register size there's no need to load 0 in a GPR,
-            // codegen will use xorps to generate 0 directly in the temporary XMM register.
-            if ((size % XMM_REGSIZE_BYTES) == 0)
+            if (size >= XMM_REGSIZE_BYTES)
             {
-                src->SetContained();
+                const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+#ifdef TARGET_AMD64
+                const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
+#else
+                const bool willUseOnlySimdMov = (size % 8 == 0);
+#endif
+                if (willUseOnlySimdMov)
+                {
+                    src->SetContained();
+                }
             }
         }
 #ifdef TARGET_AMD64
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 811010f4f37..854e4521ec9 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
             // The return value will be on the X87 stack, and we will need to move it.
             dstCandidates = allRegs(registerType);
 #else  // !TARGET_X86
-            dstCandidates = RBM_FLOATRET;
+            dstCandidates = RBM_FLOATRET;
 #endif // !TARGET_X86
         }
         else if (registerType == TYP_LONG)
@@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
     switch (blkNode->gtBlkOpKind)
     {
         case GenTreeBlk::BlkOpKindUnroll:
-            if (size >= XMM_REGSIZE_BYTES)
+        {
+#ifdef TARGET_AMD64
+            const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+            const bool willUseSimdMov       = canUse16BytesSimdMov && (size >= 16);
+#else
+            const bool willUseSimdMov = (size >= 16);
+#endif
+            if (willUseSimdMov)
             {
                 buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
                 SetContainsAVXFlags();
@@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                 srcRegMask = allByteRegs();
             }
 #endif
-            break;
+        }
+        break;
 
         case GenTreeBlk::BlkOpKindRepInstr:
             dstAddrRegMask = RBM_RDI;
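Lowering and LSRA must agree with codegen on when a zero source can stay contained (i.e. when no GP register is needed): only when every byte of the block will be written by SIMD stores. The sketch below is a rough model of that predicate mirroring the `willUseOnlySimdMov` condition from `LowerBlockStore` above; `canContainZeroSource` is an illustrative name, not a JIT function.

```cpp
// Hypothetical model of the containment decision for a constant-zero INITBLK
// source. Not actual JIT code.
bool canContainZeroSource(unsigned size, bool onHeapWithGCPtrs, bool isAmd64)
{
    // Below one XMM register (16 bytes) the block is zeroed with GP stores,
    // so a register holding 0 is required either way.
    if (size < 16)
        return false;

    if (isAmd64)
    {
        // x64 only stays all-SIMD when 16-byte stores are safe and the size
        // is a multiple of 16; otherwise the 8-byte tail needs a GP register.
        return !onHeapWithGCPtrs && (size % 16 == 0);
    }

    // x86 can finish any 8-byte-multiple tail with movq from the zeroed XMM.
    return (size % 8 == 0);
}
```

When this predicate holds, codegen can materialize the zero with `xorps` directly in the temporary XMM register (as the removed lowering comment noted), which is why LSRA skips allocating a GP register for `src` in that case.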