github.com/dotnet/runtime.git
author    Egor Chesakov <Egor.Chesakov@microsoft.com>  2021-06-15 20:59:28 +0300
committer GitHub <noreply@github.com>                  2021-06-15 20:59:28 +0300
commit    ddae375636ca6cdbdf781fcb748d4b160dbfcfc4 (patch)
tree      c7b7271b1813ba36b641644a43b8d228fabaa9d9 /src/coreclr/jit
parent    56727d58ac649723e8807fea6f1882f8972e7ea1 (diff)
Split 16-byte SIMD store around GC struct fields into two 8-byte SIMD stores (x86) / two 8-byte movs (x64) (#53116)
Fixes #51638 by:
1) Constructing `ASG(OBJ(addr), 0)` for structs that have GC fields, and keeping the current IR (i.e. `ASG(BLK(addr), 0)`) for other types. This bookkeeping lets the JIT maintain information about the class layout.
2a) Emitting a sequence of `mov [m64],r64` instead of `movdqu [m128],xmm` when zeroing structs with GC fields that are not guaranteed to be on the stack, on win-x64 and linux-x64.
2b) Emitting a sequence of `movq [m64],xmm` when zeroing such structs on win-x86.
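For intuition, here is a minimal standalone sketch of the strategy choice described above (not the JIT's actual code; the function and its parameters are illustrative), with the emitted instruction shapes noted in comments:

```cpp
// Sketch of the zeroing-strategy choice. A 16-byte SIMD store may be split
// by the hardware into smaller accesses, so another thread (or the GC)
// could observe a torn GC pointer in the destination.
enum class ZeroStrategy
{
    Simd16, // movdqu [m128], xmm -- fastest, but only safe without a tearing hazard
    GpMov8, // mov    [m64],  r64 -- x64: pointer-sized GP stores, each atomic
    Movq8,  // movq   [m64],  xmm -- x86: 8-byte SIMD stores covering two 4-byte refs
};

ZeroStrategy PickZeroStrategy(bool hasGCFields, bool dstKnownOnStack, bool isX64)
{
    if (!hasGCFields || dstKnownOnStack)
    {
        // Nothing to tear, or no other thread can observe the destination.
        return ZeroStrategy::Simd16;
    }
    return isX64 ? ZeroStrategy::GpMov8 : ZeroStrategy::Movq8;
}
```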
Diffstat (limited to 'src/coreclr/jit')
-rw-r--r--  src/coreclr/jit/codegenxarch.cpp | 62
-rw-r--r--  src/coreclr/jit/gentree.h        |  7
-rw-r--r--  src/coreclr/jit/importer.cpp     | 24
-rw-r--r--  src/coreclr/jit/lowerxarch.cpp   | 15
-rw-r--r--  src/coreclr/jit/lsraxarch.cpp    | 14
5 files changed, 98 insertions(+), 24 deletions(-)
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 7fd371e103e..2bd0142381f 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2752,26 +2752,47 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
src = src->AsUnOp()->gtGetOp1();
}
+ unsigned size = node->GetLayout()->GetSize();
+
+ // An SSE mov that accesses data larger than 8 bytes may be implemented using
+ // multiple memory accesses. Hence, the JIT must not use such stores when
+ // INITBLK zeroes a struct that contains GC pointers and can be observed by
+ // other threads (i.e. when dstAddr is not an address of a local).
+ // For example, this can happen when initializing a struct field of an object.
+ const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
+
+#ifdef TARGET_AMD64
+ // On Amd64 the JIT will not use SIMD stores for such structs and instead
+ // will always allocate a GP register for src node.
+ const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
+#else
+ // On X86 the JIT will use movq for structs that are 16 bytes or larger,
+ // since it is more beneficial than using two 4-byte mov stores from a GP register.
+ const bool willUseSimdMov = (size >= 16);
+#endif
+
if (!src->isContained())
{
srcIntReg = genConsumeReg(src);
}
else
{
- // If src is contained then it must be 0 and the size must be a multiple
- // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions.
+ // If src is contained then it must be 0.
assert(src->IsIntegralConst(0));
- assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0);
+ assert(willUseSimdMov);
+#ifdef TARGET_AMD64
+ assert(size % 16 == 0);
+#else
+ assert(size % 8 == 0);
+#endif
}
emitter* emit = GetEmitter();
- unsigned size = node->GetLayout()->GetSize();
assert(size <= INT32_MAX);
assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
- // Fill as much as possible using SSE2 stores.
- if (size >= XMM_REGSIZE_BYTES)
+ if (willUseSimdMov)
{
regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
@@ -2791,9 +2812,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
#endif
}
- instruction simdMov = simdUnalignedMovIns();
- for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
+ instruction simdMov = simdUnalignedMovIns();
+ unsigned regSize = XMM_REGSIZE_BYTES;
+ unsigned bytesWritten = 0;
+
+ while (bytesWritten < size)
{
+#ifdef TARGET_X86
+ if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
+ {
+ simdMov = INS_movq;
+ regSize = 8;
+ }
+#endif
+ if (bytesWritten + regSize > size)
+ {
+ assert(srcIntReg != REG_NA);
+ break;
+ }
+
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
@@ -2803,11 +2840,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
+
+ dstOffset += regSize;
+ bytesWritten += regSize;
}
- // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
- // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need
- // to allocate a GPR just for the remainder.
+ size -= bytesWritten;
}
// Fill the remainder using normal stores.
@@ -4604,7 +4642,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
// The VM doesn't allow such large array elements but let's be sure.
noway_assert(scale <= INT32_MAX);
#else // !TARGET_64BIT
- tmpReg = node->GetSingleTempReg();
+ tmpReg = node->GetSingleTempReg();
#endif // !TARGET_64BIT
GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg,
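Condensing the new unrolled-store loop from the hunk above into a self-contained sketch: `EmitStore` is a hypothetical stand-in for the emitter calls, and the chunking follows the x86 path (16-byte `movdqu` chunks with an 8-byte `movq` fallback):

```cpp
#include <cstdio>

// Hypothetical stand-in for the JIT's emitter call: just log each store.
static void EmitStore(unsigned chunkSize, unsigned offset)
{
    std::printf("store %u bytes at +%u\n", chunkSize, offset);
}

// Mirrors the chunking logic of the new loop in genCodeForInitBlkUnroll.
static void ZeroWithSimd(unsigned size, bool canUse16BytesSimdMov, unsigned& dstOffset)
{
    unsigned regSize      = 16; // XMM_REGSIZE_BYTES
    unsigned bytesWritten = 0;

    while (bytesWritten < size)
    {
        // Fall back to 8-byte movq when 16-byte stores are unsafe (GC fields
        // on the heap) or when fewer than 16 bytes remain.
        if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
        {
            regSize = 8;
        }
        if (bytesWritten + regSize > size)
        {
            break; // a sub-8-byte tail is left to the scalar GPR stores
        }
        EmitStore(regSize, dstOffset);
        dstOffset    += regSize;
        bytesWritten += regSize;
    }
    // The real code then does `size -= bytesWritten` before the remainder loop.
}

int main()
{
    unsigned dstOffset = 0;
    // 24-byte struct with GC fields on the heap: three 8-byte movq stores.
    ZeroWithSimd(24, /* canUse16BytesSimdMov */ false, dstOffset);
}
```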
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index ef65f7d7aae..e2a5045e906 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -5686,6 +5686,13 @@ public:
bool gtBlkOpGcUnsafe;
#endif
+#ifdef TARGET_XARCH
+ bool IsOnHeapAndContainsReferences()
+ {
+ return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr();
+ }
+#endif
+
GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout)
: GenTreeIndir(oper, type, addr, nullptr)
, m_layout(layout)
diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp
index f14d8d0cd40..fc8544f9138 100644
--- a/src/coreclr/jit/importer.cpp
+++ b/src/coreclr/jit/importer.cpp
@@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block)
"type operand incompatible with type of address");
}
- size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size
- op2 = gtNewIconNode(0); // Value
- op1 = impPopStack().val; // Dest
- op1 = gtNewBlockVal(op1, size);
- op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
+ op2 = gtNewIconNode(0); // Value
+ op1 = impPopStack().val; // Dest
+
+ if (eeIsValueClass(resolvedToken.hClass))
+ {
+ op1 = gtNewStructVal(resolvedToken.hClass, op1);
+ if (op1->OperIs(GT_OBJ))
+ {
+ gtSetObjGcInfo(op1->AsObj());
+ }
+ }
+ else
+ {
+ size = info.compCompHnd->getClassSize(resolvedToken.hClass);
+ assert(size == TARGET_POINTER_SIZE);
+ op1 = gtNewBlockVal(op1, size);
+ }
+
+ op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
goto SPILL_APPEND;
case CEE_INITBLK:
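The importer hunk above changes the IR shape built for `initobj`. A hedged before/after sketch (shapes per the commit message; the dump notation is approximate):

```cpp
// initobj on a struct type with GC fields -- IR built by the importer:
//
//   before:  ASG(BLK(addr, <size>),   CNS_INT 0)  // raw-size block; layout lost
//   after:   ASG(OBJ(addr, <layout>), CNS_INT 0)  // ClassLayout kept, so later
//                                                 // phases can query HasGCPtr()
//
// initobj on a non-value (reference) type keeps the BLK form, and its size
// is asserted to be exactly TARGET_POINTER_SIZE.
```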
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 4d425300604..55bfab94f6f 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -216,11 +216,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
if (fill == 0)
{
- // If the size is multiple of XMM register size there's no need to load 0 in a GPR,
- // codegen will use xorps to generate 0 directly in the temporary XMM register.
- if ((size % XMM_REGSIZE_BYTES) == 0)
+ if (size >= XMM_REGSIZE_BYTES)
{
- src->SetContained();
+ const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+#ifdef TARGET_AMD64
+ const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
+#else
+ const bool willUseOnlySimdMov = (size % 8 == 0);
+#endif
+ if (willUseOnlySimdMov)
+ {
+ src->SetContained();
+ }
}
}
#ifdef TARGET_AMD64
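Restating the lowering rule above as a standalone predicate, a sketch assuming `XMM_REGSIZE_BYTES` is 16 and with illustrative names:

```cpp
// When may lowering keep the zero source contained (no GPR allocated)?
// Only when every chunk of the block can be written with SIMD stores alone.
bool CanContainZeroSource(unsigned size, bool canUse16BytesSimdMov, bool isX64)
{
    const unsigned xmmRegSize = 16; // assumed value of XMM_REGSIZE_BYTES
    if (size < xmmRegSize)
    {
        return false; // small blocks go through the GPR path
    }
    if (isX64)
    {
        // x64 has no movq fallback here, so all chunks must be 16-byte stores.
        return canUse16BytesSimdMov && (size % 16 == 0);
    }
    // x86 can cover any 8-byte-aligned remainder with movq.
    return (size % 8) == 0;
}
```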
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 811010f4f37..854e4521ec9 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
// The return value will be on the X87 stack, and we will need to move it.
dstCandidates = allRegs(registerType);
#else // !TARGET_X86
- dstCandidates = RBM_FLOATRET;
+ dstCandidates = RBM_FLOATRET;
#endif // !TARGET_X86
}
else if (registerType == TYP_LONG)
@@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
switch (blkNode->gtBlkOpKind)
{
case GenTreeBlk::BlkOpKindUnroll:
- if (size >= XMM_REGSIZE_BYTES)
+ {
+#ifdef TARGET_AMD64
+ const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+ const bool willUseSimdMov = canUse16BytesSimdMov && (size >= 16);
+#else
+ const bool willUseSimdMov = (size >= 16);
+#endif
+ if (willUseSimdMov)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
SetContainsAVXFlags();
@@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
srcRegMask = allByteRegs();
}
#endif
- break;
+ }
+ break;
case GenTreeBlk::BlkOpKindRepInstr:
dstAddrRegMask = RBM_RDI;