github.com/dotnet/runtime.git
author    Tanner Gooding <tagoo@outlook.com>  2021-05-18 13:33:52 +0300
committer GitHub <noreply@github.com>         2021-05-18 13:33:52 +0300
commit    a2b7648d3b23fa00442ebc24a12d255895a0945e (patch)
tree      e08a820c93e71977e3fb89b32f0b19e442d2a257 /src/coreclr
parent    68ebecbc84197d76f2ab9863e39341dacf99c48f (diff)
Port SIMDIntrinsicGetItem and SIMDIntrinsicSetItem to be implemented via HWIntrinsics (#52288)
* Port SIMDIntrinsicGetItem and SIMDIntrinsicSetItem to be implemented using SimdAsHWIntrinsic
* Apply suggestions from code review
* Resolving mismerge
* Added a comment explaining why we sometimes return and sometimes do containment checks
* Update src/coreclr/jit/lsraarm64.cpp

Co-authored-by: Egor Chesakov <Egor.Chesakov@microsoft.com>
Diffstat (limited to 'src/coreclr')
-rw-r--r--  src/coreclr/jit/codegen.h                    |    2
-rw-r--r--  src/coreclr/jit/codegenarm64.cpp             |  247
-rw-r--r--  src/coreclr/jit/compiler.h                   |   23
-rw-r--r--  src/coreclr/jit/decomposelongs.cpp           |  120
-rw-r--r--  src/coreclr/jit/decomposelongs.h             |    7
-rw-r--r--  src/coreclr/jit/emitxarch.cpp                |  135
-rw-r--r--  src/coreclr/jit/gentree.cpp                  |  164
-rw-r--r--  src/coreclr/jit/hwintrinsic.cpp              |   77
-rw-r--r--  src/coreclr/jit/hwintrinsicarm64.cpp         |   55
-rw-r--r--  src/coreclr/jit/hwintrinsiccodegenarm64.cpp  |  122
-rw-r--r--  src/coreclr/jit/hwintrinsiccodegenxarch.cpp  |  194
-rw-r--r--  src/coreclr/jit/hwintrinsiclistarm64.h       |    4
-rw-r--r--  src/coreclr/jit/hwintrinsiclistxarch.h       |    4
-rw-r--r--  src/coreclr/jit/hwintrinsicxarch.cpp         |  349
-rw-r--r--  src/coreclr/jit/lclmorph.cpp                 |    4
-rw-r--r--  src/coreclr/jit/lower.h                      |    2
-rw-r--r--  src/coreclr/jit/lowerarmarch.cpp             |   58
-rw-r--r--  src/coreclr/jit/lowerxarch.cpp               |  781
-rw-r--r--  src/coreclr/jit/lsraarm64.cpp                |   69
-rw-r--r--  src/coreclr/jit/lsraxarch.cpp                |  119
-rw-r--r--  src/coreclr/jit/morph.cpp                    |   67
-rw-r--r--  src/coreclr/jit/simd.cpp                     |  121
-rw-r--r--  src/coreclr/jit/simdashwintrinsic.cpp        |   52
-rw-r--r--  src/coreclr/jit/simdashwintrinsic.h          |    9
-rw-r--r--  src/coreclr/jit/simdashwintrinsiclistarm64.h |    1
-rw-r--r--  src/coreclr/jit/simdashwintrinsiclistxarch.h |    2
-rw-r--r--  src/coreclr/jit/simdcodegenxarch.cpp         |  394
-rw-r--r--  src/coreclr/jit/simdintrinsiclist.h          |   13
28 files changed, 1657 insertions(+), 1538 deletions(-)
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 99cc72d8b12..92242ca8f34 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -977,8 +977,6 @@ protected:
void genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode);
void genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode);
void genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode);
- void genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode);
- void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode);
void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode);
void genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode);
void genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode);
diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 14213cbcdc9..f0377bfe6a6 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3873,17 +3873,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
genSIMDIntrinsicBinOp(simdNode);
break;
- case SIMDIntrinsicGetItem:
- genSIMDIntrinsicGetItem(simdNode);
- break;
-
- case SIMDIntrinsicSetX:
- case SIMDIntrinsicSetY:
- case SIMDIntrinsicSetZ:
- case SIMDIntrinsicSetW:
- genSIMDIntrinsicSetItem(simdNode);
- break;
-
case SIMDIntrinsicUpperSave:
genSIMDIntrinsicUpperSave(simdNode);
break;
@@ -4346,242 +4335,6 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode)
genProduceReg(simdNode);
}
-//------------------------------------------------------------------------------------
-// genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
-//
-// Arguments:
-// simdNode - The GT_SIMD node
-//
-// Return Value:
-// None.
-//
-void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
-{
- assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
-
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
- var_types simdType = op1->TypeGet();
- assert(varTypeIsSIMD(simdType));
-
- // op1 of TYP_SIMD12 should be considered as TYP_SIMD16
- if (simdType == TYP_SIMD12)
- {
- simdType = TYP_SIMD16;
- }
-
- var_types baseType = simdNode->GetSimdBaseType();
- regNumber targetReg = simdNode->GetRegNum();
- assert(targetReg != REG_NA);
- var_types targetType = simdNode->TypeGet();
- assert(targetType == genActualType(baseType));
-
- // GetItem has 2 operands:
- // - the source of SIMD type (op1)
- // - the index of the value to be returned.
- genConsumeOperands(simdNode);
-
- emitAttr baseTypeSize = emitTypeSize(baseType);
- unsigned baseTypeScale = genLog2(EA_SIZE_IN_BYTES(baseTypeSize));
-
- if (op2->IsCnsIntOrI())
- {
- assert(op2->isContained());
-
- ssize_t index = op2->AsIntCon()->gtIconVal;
-
- // We only need to generate code for the get if the index is valid
- // If the index is invalid, previously generated for the range check will throw
- if (GetEmitter()->isValidVectorIndex(emitTypeSize(simdType), baseTypeSize, index))
- {
- if (op1->isContained())
- {
- int offset = (int)index * genTypeSize(baseType);
- instruction ins = ins_Load(baseType);
-
- assert(!op1->isUsedFromReg());
-
- if (op1->OperIsLocal())
- {
- unsigned varNum = op1->AsLclVarCommon()->GetLclNum();
-
- GetEmitter()->emitIns_R_S(ins, emitActualTypeSize(baseType), targetReg, varNum, offset);
- }
- else
- {
- assert(op1->OperGet() == GT_IND);
-
- GenTree* addr = op1->AsIndir()->Addr();
- assert(!addr->isContained());
- regNumber baseReg = addr->GetRegNum();
-
- // ldr targetReg, [baseReg, #offset]
- GetEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(baseType), targetReg, baseReg, offset);
- }
- }
- else
- {
- assert(op1->isUsedFromReg());
- regNumber srcReg = op1->GetRegNum();
-
- instruction ins;
- if (varTypeIsFloating(baseType))
- {
- assert(genIsValidFloatReg(targetReg));
- // dup targetReg, srcReg[#index]
- ins = INS_dup;
- }
- else
- {
- assert(genIsValidIntReg(targetReg));
- if (varTypeIsUnsigned(baseType) || (baseTypeSize == EA_8BYTE))
- {
- // umov targetReg, srcReg[#index]
- ins = INS_umov;
- }
- else
- {
- // smov targetReg, srcReg[#index]
- ins = INS_smov;
- }
- }
- GetEmitter()->emitIns_R_R_I(ins, baseTypeSize, targetReg, srcReg, index);
- }
- }
- }
- else
- {
- assert(!op2->isContained());
-
- regNumber baseReg = REG_NA;
- regNumber indexReg = op2->GetRegNum();
-
- if (op1->isContained())
- {
- // Optimize the case of op1 is in memory and trying to access ith element.
- assert(!op1->isUsedFromReg());
- if (op1->OperIsLocal())
- {
- unsigned varNum = op1->AsLclVarCommon()->GetLclNum();
-
- baseReg = simdNode->ExtractTempReg();
-
- // Load the address of varNum
- GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, varNum, 0);
- }
- else
- {
- // Require GT_IND addr to be not contained.
- assert(op1->OperGet() == GT_IND);
-
- GenTree* addr = op1->AsIndir()->Addr();
- assert(!addr->isContained());
-
- baseReg = addr->GetRegNum();
- }
- }
- else
- {
- assert(op1->isUsedFromReg());
- regNumber srcReg = op1->GetRegNum();
-
- unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
- noway_assert(compiler->lvaSIMDInitTempVarNum != BAD_VAR_NUM);
-
- baseReg = simdNode->ExtractTempReg();
-
- // Load the address of simdInitTempVarNum
- GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, simdInitTempVarNum, 0);
-
- // Store the vector to simdInitTempVarNum
- GetEmitter()->emitIns_R_R(INS_str, emitTypeSize(simdType), srcReg, baseReg);
- }
-
- assert(genIsValidIntReg(indexReg));
- assert(genIsValidIntReg(baseReg));
- assert(baseReg != indexReg);
-
- // Load item at baseReg[index]
- GetEmitter()->emitIns_R_R_R_Ext(ins_Load(baseType), baseTypeSize, targetReg, baseReg, indexReg, INS_OPTS_LSL,
- baseTypeScale);
- }
-
- genProduceReg(simdNode);
-}
-
-//------------------------------------------------------------------------------------
-// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
-//
-// Arguments:
-// simdNode - The GT_SIMD node
-//
-// Return Value:
-// None.
-//
-void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
-{
- // Determine index based on intrinsic ID
- int index = -1;
- switch (simdNode->gtSIMDIntrinsicID)
- {
- case SIMDIntrinsicSetX:
- index = 0;
- break;
- case SIMDIntrinsicSetY:
- index = 1;
- break;
- case SIMDIntrinsicSetZ:
- index = 2;
- break;
- case SIMDIntrinsicSetW:
- index = 3;
- break;
-
- default:
- unreached();
- }
- assert(index != -1);
-
- // op1 is the SIMD vector
- // op2 is the value to be set
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
-
- var_types baseType = simdNode->GetSimdBaseType();
- regNumber targetReg = simdNode->GetRegNum();
- assert(targetReg != REG_NA);
- var_types targetType = simdNode->TypeGet();
- assert(varTypeIsSIMD(targetType));
-
- assert(op2->TypeGet() == baseType);
- assert(simdNode->GetSimdSize() >= ((index + 1) * genTypeSize(baseType)));
-
- genConsumeOperands(simdNode);
- regNumber op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- assert(genIsValidFloatReg(targetReg));
- assert(genIsValidFloatReg(op1Reg));
- assert(genIsValidIntReg(op2Reg) || genIsValidFloatReg(op2Reg));
- assert(targetReg != op2Reg);
-
- emitAttr attr = emitTypeSize(baseType);
-
- // Insert mov if register assignment requires it
- GetEmitter()->emitIns_Mov(INS_mov, EA_16BYTE, targetReg, op1Reg, /* canSkip */ false);
-
- if (genIsValidIntReg(op2Reg))
- {
- GetEmitter()->emitIns_R_R_I(INS_ins, attr, targetReg, op2Reg, index);
- }
- else
- {
- GetEmitter()->emitIns_R_R_I_I(INS_ins, attr, targetReg, op2Reg, index, 0);
- }
-
- genProduceReg(simdNode);
-}
-
//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD16 vector to
// the given register, if any, or to memory.
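
For reference, the instruction selection in the removed genSIMDIntrinsicGetItem (dup for floating-point lanes, umov for unsigned or 8-byte integer lanes, smov otherwise) condenses to the standalone sketch below; the enum and function names are simplified stand-ins, not the JIT's actual types:

    #include <cassert>

    // Hypothetical condensation of the removed ARM64 extract-instruction
    // selection: float/double lanes move to a register with 'dup',
    // unsigned or 8-byte integer lanes zero-extend with 'umov', and
    // smaller signed lanes sign-extend with 'smov'.
    enum ExtractIns { Dup, Umov, Smov };

    ExtractIns selectExtractIns(bool isFloating, bool isUnsigned, unsigned laneSizeBytes)
    {
        if (isFloating)
            return Dup;
        if (isUnsigned || (laneSizeBytes == 8))
            return Umov;
        return Smov;
    }

    int main()
    {
        assert(selectExtractIns(true, false, 4) == Dup);   // float lane
        assert(selectExtractIns(false, true, 1) == Umov);  // unsigned byte lane
        assert(selectExtractIns(false, false, 8) == Umov); // long lane
        assert(selectExtractIns(false, false, 2) == Smov); // short lane
    }
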
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 56f7b44d669..0efcbdf5ced 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2906,6 +2906,21 @@ public:
GenTreeHWIntrinsic* gtNewSimdCreateBroadcastNode(
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic);
+ GenTreeHWIntrinsic* gtNewSimdGetElementNode(var_types type,
+ GenTree* op1,
+ GenTree* op2,
+ CorInfoType simdBaseJitType,
+ unsigned simdSize,
+ bool isSimdAsHWIntrinsic);
+
+ GenTreeHWIntrinsic* gtNewSimdWithElementNode(var_types type,
+ GenTree* op1,
+ GenTree* op2,
+ GenTree* op3,
+ CorInfoType simdBaseJitType,
+ unsigned simdSize,
+ bool isSimdAsHWIntrinsic);
+
GenTreeHWIntrinsic* gtNewSimdAsHWIntrinsicNode(var_types type,
NamedIntrinsic hwIntrinsicID,
CorInfoType simdBaseJitType,
@@ -4116,6 +4131,7 @@ protected:
GenTree* impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, CorInfoType simdBaseJitType);
GenTree* addRangeCheckIfNeeded(
NamedIntrinsic intrinsic, GenTree* immOp, bool mustExpand, int immLowerBound, int immUpperBound);
+ GenTree* addRangeCheckForHWIntrinsic(GenTree* immOp, int immLowerBound, int immUpperBound);
#ifdef TARGET_XARCH
GenTree* impBaseIntrinsic(NamedIntrinsic intrinsic,
@@ -5899,8 +5915,8 @@ private:
unsigned* indexOut,
unsigned* simdSizeOut,
bool ignoreUsedInSIMDIntrinsic = false);
- GenTree* fgMorphFieldAssignToSIMDIntrinsicSet(GenTree* tree);
- GenTree* fgMorphFieldToSIMDIntrinsicGet(GenTree* tree);
+ GenTree* fgMorphFieldAssignToSimdSetElement(GenTree* tree);
+ GenTree* fgMorphFieldToSimdGetElement(GenTree* tree);
bool fgMorphCombineSIMDFieldAssignments(BasicBlock* block, Statement* stmt);
void impMarkContiguousSIMDFieldAssignments(Statement* stmt);
@@ -8545,9 +8561,6 @@ private:
// Normalizes TYP_STRUCT value in case of GT_CALL, GT_RET_EXPR and arg nodes.
GenTree* impSIMDPopStack(var_types type, bool expectAddr = false, CORINFO_CLASS_HANDLE structType = nullptr);
- // Create a GT_SIMD tree for a Get property of SIMD vector with a fixed index.
- GenTreeSIMD* impSIMDGetFixed(var_types simdType, CorInfoType simdBaseJitType, unsigned simdSize, int index);
-
// Transforms operands and returns the SIMD intrinsic to be applied on
// transformed operands to obtain given relop result.
SIMDIntrinsicID impSIMDRelOp(SIMDIntrinsicID relOpIntrinsicId,
diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp
index 882168e0224..3ad6ce9b9ff 100644
--- a/src/coreclr/jit/decomposelongs.cpp
+++ b/src/coreclr/jit/decomposelongs.cpp
@@ -240,11 +240,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
nextNode = DecomposeRotate(use);
break;
-#ifdef FEATURE_SIMD
- case GT_SIMD:
- nextNode = DecomposeSimd(use);
+#ifdef FEATURE_HW_INTRINSICS
+ case GT_HWINTRINSIC:
+ nextNode = DecomposeHWIntrinsic(use);
break;
-#endif // FEATURE_SIMD
+#endif // FEATURE_HW_INTRINSICS
case GT_LOCKADD:
case GT_XORR:
@@ -1622,10 +1622,10 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use)
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}
-#ifdef FEATURE_SIMD
+#ifdef FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
-// DecomposeSimd: Decompose GT_SIMD.
+// DecomposeHWIntrinsic: Decompose GT_HWINTRINSIC.
//
// Arguments:
// use - the LIR::Use object for the def that needs to be decomposed.
@@ -1633,22 +1633,21 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use)
// Return Value:
// The next node to process.
//
-GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use)
+GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
{
- GenTree* tree = use.Def();
- genTreeOps oper = tree->OperGet();
-
- assert(oper == GT_SIMD);
+ GenTree* tree = use.Def();
+ assert(tree->OperIs(GT_HWINTRINSIC));
- GenTreeSIMD* simdTree = tree->AsSIMD();
+ GenTreeHWIntrinsic* hwintrinsicTree = tree->AsHWIntrinsic();
- switch (simdTree->gtSIMDIntrinsicID)
+ switch (hwintrinsicTree->gtHWIntrinsicId)
{
- case SIMDIntrinsicGetItem:
- return DecomposeSimdGetItem(use);
+ case NI_Vector128_GetElement:
+ case NI_Vector256_GetElement:
+ return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
default:
- noway_assert(!"unexpected GT_SIMD node in long decomposition");
+ noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition");
break;
}
@@ -1656,72 +1655,75 @@ GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use)
}
//------------------------------------------------------------------------
-// DecomposeSimdGetItem: Decompose GT_SIMD -- SIMDIntrinsicGetItem.
+// DecomposeHWIntrinsicGetElement: Decompose GT_HWINTRINSIC -- NI_Vector*_GetElement.
//
-// Decompose a get[i] node on Vector<long>. For:
+// Decompose a get[i] node on Vector*<long>. For:
//
-// GT_SIMD{get_item}[long](simd_var, index)
+// GT_HWINTRINSIC{GetElement}[long](simd_var, index)
//
// create:
//
// tmp_simd_var = simd_var
// tmp_index = index
-// loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
-// hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+// loResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, tmp_index * 2)
+// hiResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, tmp_index * 2 + 1)
// return: GT_LONG(loResult, hiResult)
//
-// This isn't optimal codegen, since SIMDIntrinsicGetItem sometimes requires
+// This isn't optimal codegen, since NI_Vector*_GetElement sometimes requires
// temps that could be shared, for example.
//
// Arguments:
// use - the LIR::Use object for the def that needs to be decomposed.
+// node - the hwintrinsic node to decompose
//
// Return Value:
// The next node to process.
//
-GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
+GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node)
{
- GenTree* tree = use.Def();
- genTreeOps oper = tree->OperGet();
-
- assert(oper == GT_SIMD);
+ assert(node == use.Def());
+ assert(varTypeIsLong(node));
+ assert((node->gtHWIntrinsicId == NI_Vector128_GetElement) || (node->gtHWIntrinsicId == NI_Vector256_GetElement));
- GenTreeSIMD* simdTree = tree->AsSIMD();
- var_types baseType = simdTree->GetSimdBaseType();
- unsigned simdSize = simdTree->GetSimdSize();
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
+ var_types simdBaseType = node->GetSimdBaseType();
+ unsigned simdSize = node->GetSimdSize();
- assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
- assert(varTypeIsLong(baseType));
- assert(varTypeIsLong(simdTree));
- assert(varTypeIsSIMD(simdTree->AsOp()->gtOp1->gtType));
- assert(simdTree->AsOp()->gtOp2->gtType == TYP_INT);
+ assert(varTypeIsLong(simdBaseType));
+ assert(varTypeIsSIMD(op1->TypeGet()));
+ assert(op2->TypeIs(TYP_INT));
- bool indexIsConst = simdTree->AsOp()->gtOp2->IsCnsIntOrI();
+ bool indexIsConst = op2->OperIsConst();
ssize_t index = 0;
+
if (indexIsConst)
{
- index = simdTree->AsOp()->gtOp2->AsIntCon()->gtIconVal;
+ index = op2->AsIntCon()->IconValue();
}
- GenTree* simdTmpVar = RepresentOpAsLocalVar(simdTree->AsOp()->gtOp1, simdTree, &simdTree->AsOp()->gtOp1);
+ GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->gtOp1);
unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
- JITDUMP("[DecomposeSimdGetItem]: Saving op1 tree to a temp var:\n");
+ JITDUMP("[DecomposeHWIntrinsicGetElement]: Saving op1 tree to a temp var:\n");
DISPTREERANGE(Range(), simdTmpVar);
Range().Remove(simdTmpVar);
+ op1 = node->gtGetOp1();
GenTree* indexTmpVar = nullptr;
unsigned indexTmpVarNum = 0;
+
if (!indexIsConst)
{
- indexTmpVar = RepresentOpAsLocalVar(simdTree->AsOp()->gtOp2, simdTree, &simdTree->AsOp()->gtOp2);
+ indexTmpVar = RepresentOpAsLocalVar(op2, node, &node->gtOp2);
indexTmpVarNum = indexTmpVar->AsLclVarCommon()->GetLclNum();
- JITDUMP("[DecomposeSimdGetItem]: Saving op2 tree to a temp var:\n");
+ JITDUMP("[DecomposeHWIntrinsicGetElement]: Saving op2 tree to a temp var:\n");
DISPTREERANGE(Range(), indexTmpVar);
Range().Remove(indexTmpVar);
+ op2 = node->gtGetOp2();
}
// Create:
- // loResult = GT_SIMD{get_item}[int](tmp_simd_var, index * 2)
+ // loResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, index * 2)
GenTree* simdTmpVar1 = simdTmpVar;
GenTree* indexTimesTwo1;
@@ -1729,34 +1731,34 @@ GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
if (indexIsConst)
{
// Reuse the existing index constant node.
- indexTimesTwo1 = simdTree->AsOp()->gtOp2;
+ indexTimesTwo1 = op2;
Range().Remove(indexTimesTwo1);
- indexTimesTwo1->AsIntCon()->gtIconVal = index * 2;
+ indexTimesTwo1->AsIntCon()->SetIconValue(index * 2);
- Range().InsertBefore(simdTree, simdTmpVar1, indexTimesTwo1);
+ Range().InsertBefore(node, simdTmpVar1, indexTimesTwo1);
}
else
{
GenTree* indexTmpVar1 = indexTmpVar;
GenTree* two1 = m_compiler->gtNewIconNode(2, TYP_INT);
indexTimesTwo1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar1, two1);
- Range().InsertBefore(simdTree, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1);
+ Range().InsertBefore(node, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1);
}
- GenTree* loResult = m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar1, indexTimesTwo1, SIMDIntrinsicGetItem,
- CORINFO_TYPE_INT, simdSize);
- Range().InsertBefore(simdTree, loResult);
+ GenTree* loResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar1, indexTimesTwo1,
+ node->gtHWIntrinsicId, CORINFO_TYPE_INT, simdSize);
+ Range().InsertBefore(node, loResult);
// Create:
- // hiResult = GT_SIMD{get_item}[int](tmp_simd_var, index * 2 + 1)
+ // hiResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, index * 2 + 1)
- GenTree* simdTmpVar2 = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->AsOp()->gtOp1->gtType);
+ GenTree* simdTmpVar2 = m_compiler->gtNewLclLNode(simdTmpVarNum, op1->TypeGet());
GenTree* indexTimesTwoPlusOne;
if (indexIsConst)
{
indexTimesTwoPlusOne = m_compiler->gtNewIconNode(index * 2 + 1, TYP_INT);
- Range().InsertBefore(simdTree, simdTmpVar2, indexTimesTwoPlusOne);
+ Range().InsertBefore(node, simdTmpVar2, indexTimesTwoPlusOne);
}
else
{
@@ -1765,22 +1767,22 @@ GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
GenTree* indexTimesTwo2 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar2, two2);
GenTree* one = m_compiler->gtNewIconNode(1, TYP_INT);
indexTimesTwoPlusOne = m_compiler->gtNewOperNode(GT_ADD, TYP_INT, indexTimesTwo2, one);
- Range().InsertBefore(simdTree, simdTmpVar2, indexTmpVar2, two2, indexTimesTwo2);
- Range().InsertBefore(simdTree, one, indexTimesTwoPlusOne);
+ Range().InsertBefore(node, simdTmpVar2, indexTmpVar2, two2, indexTimesTwo2);
+ Range().InsertBefore(node, one, indexTimesTwoPlusOne);
}
- GenTree* hiResult = m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, SIMDIntrinsicGetItem,
- CORINFO_TYPE_INT, simdSize);
- Range().InsertBefore(simdTree, hiResult);
+ GenTree* hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne,
+ node->gtHWIntrinsicId, CORINFO_TYPE_INT, simdSize);
+ Range().InsertBefore(node, hiResult);
// Done with the original tree; remove it.
- Range().Remove(simdTree);
+ Range().Remove(node);
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}
-#endif // FEATURE_SIMD
+#endif // FEATURE_HW_INTRINSICS
//------------------------------------------------------------------------
// StoreNodeToVar: Check if the user is a STORE_LCL_VAR, and if it isn't,
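
The decomposition above turns one 64-bit element read into two 32-bit reads at lane indices 2*i and 2*i+1. A self-contained sketch of that index arithmetic on a little-endian target, using a plain array in place of a vector register:

    #include <cassert>
    #include <cstdint>

    // Mirrors DecomposeHWIntrinsicGetElement: on a little-endian target,
    // long lane i of a vector occupies int lanes 2*i (low half) and
    // 2*i + 1 (high half).
    int64_t getLongElement(const int32_t* intLanes, int i)
    {
        uint32_t lo = (uint32_t)intLanes[2 * i];
        uint32_t hi = (uint32_t)intLanes[2 * i + 1];
        return (int64_t)(((uint64_t)hi << 32) | lo);
    }

    int main()
    {
        // A Vector128<long> holding { 0x1111111122222222, 0x3333333344444444 },
        // viewed as four int lanes.
        int32_t lanes[4] = {0x22222222, 0x11111111, 0x44444444, 0x33333333};
        assert(getLongElement(lanes, 0) == 0x1111111122222222LL);
        assert(getLongElement(lanes, 1) == 0x3333333344444444LL);
    }
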
diff --git a/src/coreclr/jit/decomposelongs.h b/src/coreclr/jit/decomposelongs.h
index cc3bddab128..42ea8ada70d 100644
--- a/src/coreclr/jit/decomposelongs.h
+++ b/src/coreclr/jit/decomposelongs.h
@@ -55,8 +55,11 @@ private:
GenTree* DecomposeRotate(LIR::Use& use);
GenTree* DecomposeMul(LIR::Use& use);
GenTree* DecomposeUMod(LIR::Use& use);
- GenTree* DecomposeSimd(LIR::Use& use);
- GenTree* DecomposeSimdGetItem(LIR::Use& use);
+
+#ifdef FEATURE_HW_INTRINSICS
+ GenTree* DecomposeHWIntrinsic(LIR::Use& use);
+ GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
+#endif // FEATURE_HW_INTRINSICS
// Helper functions
GenTree* FinalizeDecomposition(LIR::Use& use, GenTree* loResult, GenTree* hiResult, GenTree* insertResultAfter);
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 5fd16415d4f..c22d52bdc49 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -8499,8 +8499,58 @@ void emitter::emitDispIns(
}
else
{
- attr = id->idOpSize();
- sstr = codeGen->genSizeStr(attr);
+ emitAttr sizeAttr = id->idOpSize();
+ attr = sizeAttr;
+
+ switch (ins)
+ {
+ case INS_vextractf128:
+ case INS_vextracti128:
+ case INS_vinsertf128:
+ case INS_vinserti128:
+ {
+ sizeAttr = EA_16BYTE;
+ break;
+ }
+
+ case INS_pextrb:
+ case INS_pinsrb:
+ {
+ sizeAttr = EA_1BYTE;
+ break;
+ }
+
+ case INS_pextrw:
+ case INS_pextrw_sse41:
+ case INS_pinsrw:
+ {
+ sizeAttr = EA_2BYTE;
+ break;
+ }
+
+ case INS_extractps:
+ case INS_insertps:
+ case INS_pextrd:
+ case INS_pinsrd:
+ {
+ sizeAttr = EA_4BYTE;
+ break;
+ }
+
+ case INS_pextrq:
+ case INS_pinsrq:
+ {
+ sizeAttr = EA_8BYTE;
+ break;
+ }
+
+ default:
+ {
+ break;
+ }
+ }
+
+ sstr = codeGen->genSizeStr(sizeAttr);
if (ins == INS_lea)
{
@@ -9031,6 +9081,36 @@ void emitter::emitDispIns(
assert(IsThreeOperandAVXInstruction(ins));
printf("%s, ", emitRegName(id->idReg1(), attr));
printf("%s, ", emitRegName(id->idReg2(), attr));
+
+ switch (ins)
+ {
+ case INS_vinsertf128:
+ case INS_vinserti128:
+ {
+ attr = EA_16BYTE;
+ break;
+ }
+
+ case INS_pinsrb:
+ case INS_pinsrw:
+ case INS_pinsrd:
+ {
+ attr = EA_4BYTE;
+ break;
+ }
+
+ case INS_pinsrq:
+ {
+ attr = EA_8BYTE;
+ break;
+ }
+
+ default:
+ {
+ break;
+ }
+ }
+
printf("%s, ", emitRegName(id->idReg3(), attr));
val = emitGetInsSC(id);
goto PRINT_CONSTANT;
@@ -9044,7 +9124,55 @@ void emitter::emitDispIns(
printf("%s", emitRegName(id->idReg4(), attr));
break;
case IF_RRW_RRW_CNS:
- printf("%s,", emitRegName(id->idReg1(), attr));
+ {
+ emitAttr tgtAttr = attr;
+
+ switch (ins)
+ {
+ case INS_vextractf128:
+ case INS_vextracti128:
+ {
+ tgtAttr = EA_16BYTE;
+ break;
+ }
+
+ case INS_extractps:
+ case INS_pextrb:
+ case INS_pextrw:
+ case INS_pextrw_sse41:
+ case INS_pextrd:
+ {
+ tgtAttr = EA_4BYTE;
+ break;
+ }
+
+ case INS_pextrq:
+ {
+ tgtAttr = EA_8BYTE;
+ break;
+ }
+
+ case INS_pinsrb:
+ case INS_pinsrw:
+ case INS_pinsrd:
+ {
+ attr = EA_4BYTE;
+ break;
+ }
+
+ case INS_pinsrq:
+ {
+ attr = EA_8BYTE;
+ break;
+ }
+
+ default:
+ {
+ break;
+ }
+ }
+
+ printf("%s,", emitRegName(id->idReg1(), tgtAttr));
printf(" %s", emitRegName(id->idReg2(), attr));
val = emitGetInsSC(id);
#ifdef TARGET_AMD64
@@ -9061,6 +9189,7 @@ void emitter::emitDispIns(
goto PRINT_CONSTANT;
}
break;
+ }
case IF_RRD:
case IF_RWR:
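
The disassembly changes above pick the displayed operand size from the instruction itself rather than from the node's operand size. A table-driven sketch of that mapping (string keys for brevity; the real code switches on the instruction enum):

    #include <cassert>
    #include <cstring>

    // Display size in bytes implied by each x86 element extract/insert
    // instruction, condensed from the switches added to emitDispIns.
    struct InsDispSize { const char* name; int bytes; };
    static const InsDispSize kDispSizes[] = {
        {"pextrb", 1},        {"pinsrb", 1},
        {"pextrw", 2},        {"pinsrw", 2},
        {"extractps", 4},     {"insertps", 4}, {"pextrd", 4}, {"pinsrd", 4},
        {"pextrq", 8},        {"pinsrq", 8},
        {"vextractf128", 16}, {"vextracti128", 16},
        {"vinsertf128", 16},  {"vinserti128", 16},
    };

    int dispSizeBytes(const char* ins, int operandSizeBytes)
    {
        for (const InsDispSize& s : kDispSizes)
        {
            if (strcmp(s.name, ins) == 0)
                return s.bytes; // the instruction dictates the display size
        }
        return operandSizeBytes; // all other instructions: unchanged
    }

    int main()
    {
        assert(dispSizeBytes("pextrb", 16) == 1);
        assert(dispSizeBytes("vinsertf128", 32) == 16);
        assert(dispSizeBytes("movups", 16) == 16);
    }
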
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index b55131194b7..cb19c0600ab 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -19266,6 +19266,170 @@ GenTreeHWIntrinsic* Compiler::gtNewSimdCreateBroadcastNode(
return gtNewSimdHWIntrinsicNode(type, op1, hwIntrinsicID, simdBaseJitType, simdSize);
}
+GenTreeHWIntrinsic* Compiler::gtNewSimdGetElementNode(var_types type,
+ GenTree* op1,
+ GenTree* op2,
+ CorInfoType simdBaseJitType,
+ unsigned simdSize,
+ bool isSimdAsHWIntrinsic)
+{
+ NamedIntrinsic intrinsicId = NI_Vector128_GetElement;
+ var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
+
+ assert(varTypeIsArithmetic(simdBaseType));
+
+#if defined(TARGET_XARCH)
+ switch (simdBaseType)
+ {
+ // Using software fallback if simdBaseType is not supported by hardware
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ case TYP_LONG:
+ case TYP_ULONG:
+ assert(compIsaSupportedDebugOnly(InstructionSet_SSE41));
+ break;
+
+ case TYP_DOUBLE:
+ case TYP_FLOAT:
+ case TYP_SHORT:
+ case TYP_USHORT:
+ assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));
+ break;
+
+ default:
+ unreached();
+ }
+
+ if (simdSize == 32)
+ {
+ intrinsicId = NI_Vector256_GetElement;
+ }
+#elif defined(TARGET_ARM64)
+ if (simdSize == 8)
+ {
+ intrinsicId = NI_Vector64_GetElement;
+ }
+#else
+#error Unsupported platform
+#endif // !TARGET_XARCH && !TARGET_ARM64
+
+ int immUpperBound = getSIMDVectorLength(simdSize, simdBaseType) - 1;
+ bool rangeCheckNeeded = !op2->OperIsConst();
+
+ if (!rangeCheckNeeded)
+ {
+ ssize_t imm8 = op2->AsIntCon()->IconValue();
+ rangeCheckNeeded = (imm8 < 0) || (imm8 > immUpperBound);
+ }
+
+ if (rangeCheckNeeded)
+ {
+ op2 = addRangeCheckForHWIntrinsic(op2, 0, immUpperBound);
+ }
+
+ if (isSimdAsHWIntrinsic)
+ {
+ return gtNewSimdAsHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize);
+ }
+
+ return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize);
+}
+
+GenTreeHWIntrinsic* Compiler::gtNewSimdWithElementNode(var_types type,
+ GenTree* op1,
+ GenTree* op2,
+ GenTree* op3,
+ CorInfoType simdBaseJitType,
+ unsigned simdSize,
+ bool isSimdAsHWIntrinsic)
+{
+ NamedIntrinsic hwIntrinsicID = NI_Vector128_WithElement;
+ var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
+
+ assert(varTypeIsArithmetic(simdBaseType));
+ assert(op2->OperIsConst());
+
+ ssize_t imm8 = op2->AsIntCon()->IconValue();
+ ssize_t count = simdSize / genTypeSize(simdBaseType);
+
+ assert(0 <= imm8 && imm8 < count);
+
+#if defined(TARGET_XARCH)
+ switch (simdBaseType)
+ {
+ // Using software fallback if simdBaseType is not supported by hardware
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ assert(compIsaSupportedDebugOnly(InstructionSet_SSE41));
+ break;
+
+ case TYP_LONG:
+ case TYP_ULONG:
+ assert(compIsaSupportedDebugOnly(InstructionSet_SSE41_X64));
+ break;
+
+ case TYP_DOUBLE:
+ case TYP_FLOAT:
+ case TYP_SHORT:
+ case TYP_USHORT:
+ assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));
+ break;
+
+ default:
+ unreached();
+ }
+
+ if (simdSize == 32)
+ {
+ hwIntrinsicID = NI_Vector256_WithElement;
+ }
+#elif defined(TARGET_ARM64)
+ switch (simdBaseType)
+ {
+ case TYP_LONG:
+ case TYP_ULONG:
+ case TYP_DOUBLE:
+ if (simdSize == 8)
+ {
+ if (isSimdAsHWIntrinsic)
+ {
+ return gtNewSimdAsHWIntrinsicNode(type, op3, NI_Vector64_Create, simdBaseJitType, simdSize);
+ }
+
+ return gtNewSimdHWIntrinsicNode(type, op3, NI_Vector64_Create, simdBaseJitType, simdSize);
+ }
+ break;
+
+ case TYP_FLOAT:
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_SHORT:
+ case TYP_USHORT:
+ case TYP_INT:
+ case TYP_UINT:
+ break;
+
+ default:
+ unreached();
+ }
+
+ hwIntrinsicID = NI_AdvSimd_Insert;
+#else
+#error Unsupported platform
+#endif // !TARGET_XARCH && !TARGET_ARM64
+
+ if (isSimdAsHWIntrinsic)
+ {
+ return gtNewSimdAsHWIntrinsicNode(type, op1, op2, op3, hwIntrinsicID, simdBaseJitType, simdSize);
+ }
+
+ return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, hwIntrinsicID, simdBaseJitType, simdSize);
+}
+
GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID)
{
SetOpLclRelatedToSIMDIntrinsic(op1);
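
gtNewSimdGetElementNode above only inserts a bounds check when it cannot prove the index safe at JIT time; that decision reduces to the predicate below (a simplified, standalone restatement):

    #include <cassert>
    #include <cstddef>

    // Condensed from gtNewSimdGetElementNode: a range check is required
    // unless the index is a compile-time constant already within
    // [0, laneCount - 1].
    bool needsRangeCheck(bool indexIsConst, ptrdiff_t imm, int laneCount)
    {
        if (!indexIsConst)
            return true; // runtime index: always check
        return (imm < 0) || (imm > laneCount - 1);
    }

    int main()
    {
        assert(needsRangeCheck(false, 0, 4));  // variable index
        assert(!needsRangeCheck(true, 3, 4));  // constant, in range
        assert(needsRangeCheck(true, 4, 4));   // constant, out of range
        assert(needsRangeCheck(true, -1, 4));  // constant, negative
    }
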
diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp
index ea13b509e54..f659dd40a66 100644
--- a/src/coreclr/jit/hwintrinsic.cpp
+++ b/src/coreclr/jit/hwintrinsic.cpp
@@ -578,36 +578,7 @@ GenTree* Compiler::addRangeCheckIfNeeded(
assert(!immOp->IsCnsIntOrI());
assert(varTypeIsUnsigned(immOp));
- // Bounds check for value of an immediate operand
- // (immLowerBound <= immOp) && (immOp <= immUpperBound)
- //
- // implemented as a single comparison in the form of
- //
- // if ((immOp - immLowerBound) >= (immUpperBound - immLowerBound + 1))
- // {
- // throw new ArgumentOutOfRangeException();
- // }
- //
- // The value of (immUpperBound - immLowerBound + 1) is denoted as adjustedUpperBound.
-
- const ssize_t adjustedUpperBound = (ssize_t)immUpperBound - immLowerBound + 1;
- GenTree* adjustedUpperBoundNode = gtNewIconNode(adjustedUpperBound, TYP_INT);
-
- GenTree* immOpDup = nullptr;
-
- immOp = impCloneExpr(immOp, &immOpDup, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL,
- nullptr DEBUGARG("Clone an immediate operand for immediate value bounds check"));
-
- if (immLowerBound != 0)
- {
- immOpDup = gtNewOperNode(GT_SUB, TYP_INT, immOpDup, gtNewIconNode(immLowerBound, TYP_INT));
- }
-
- GenTreeBoundsChk* hwIntrinsicChk = new (this, GT_HW_INTRINSIC_CHK)
- GenTreeBoundsChk(GT_HW_INTRINSIC_CHK, TYP_VOID, immOpDup, adjustedUpperBoundNode, SCK_RNGCHK_FAIL);
- hwIntrinsicChk->gtThrowKind = SCK_ARG_RNG_EXCPN;
-
- return gtNewOperNode(GT_COMMA, immOp->TypeGet(), hwIntrinsicChk, immOp);
+ return addRangeCheckForHWIntrinsic(immOp, immLowerBound, immUpperBound);
}
else
{
@@ -616,6 +587,52 @@ GenTree* Compiler::addRangeCheckIfNeeded(
}
//------------------------------------------------------------------------
+// addRangeCheckForHWIntrinsic: add a GT_HW_INTRINSIC_CHK node for an intrinsic
+//
+// Arguments:
+// immOp -- the immediate operand of the intrinsic
+// immLowerBound -- lower incl. bound for a value of the immediate operand (for a non-full-range imm-intrinsic)
+// immUpperBound -- upper incl. bound for a value of the immediate operand (for a non-full-range imm-intrinsic)
+//
+// Return Value:
+// add a GT_HW_INTRINSIC_CHK node for non-full-range imm-intrinsic, which would throw ArgumentOutOfRangeException
+// when the imm-argument is not in the valid range
+//
+GenTree* Compiler::addRangeCheckForHWIntrinsic(GenTree* immOp, int immLowerBound, int immUpperBound)
+{
+ // Bounds check for value of an immediate operand
+ // (immLowerBound <= immOp) && (immOp <= immUpperBound)
+ //
+ // implemented as a single comparison in the form of
+ //
+ // if ((immOp - immLowerBound) >= (immUpperBound - immLowerBound + 1))
+ // {
+ // throw new ArgumentOutOfRangeException();
+ // }
+ //
+ // The value of (immUpperBound - immLowerBound + 1) is denoted as adjustedUpperBound.
+
+ const ssize_t adjustedUpperBound = (ssize_t)immUpperBound - immLowerBound + 1;
+ GenTree* adjustedUpperBoundNode = gtNewIconNode(adjustedUpperBound, TYP_INT);
+
+ GenTree* immOpDup = nullptr;
+
+ immOp = impCloneExpr(immOp, &immOpDup, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL,
+ nullptr DEBUGARG("Clone an immediate operand for immediate value bounds check"));
+
+ if (immLowerBound != 0)
+ {
+ immOpDup = gtNewOperNode(GT_SUB, TYP_INT, immOpDup, gtNewIconNode(immLowerBound, TYP_INT));
+ }
+
+ GenTreeBoundsChk* hwIntrinsicChk = new (this, GT_HW_INTRINSIC_CHK)
+ GenTreeBoundsChk(GT_HW_INTRINSIC_CHK, TYP_VOID, immOpDup, adjustedUpperBoundNode, SCK_RNGCHK_FAIL);
+ hwIntrinsicChk->gtThrowKind = SCK_ARG_RNG_EXCPN;
+
+ return gtNewOperNode(GT_COMMA, immOp->TypeGet(), hwIntrinsicChk, immOp);
+}
+
+//------------------------------------------------------------------------
// compSupportsHWIntrinsic: check whether a given instruction is enabled via configuration
//
// Arguments:
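
The comment in addRangeCheckForHWIntrinsic describes folding the two-sided bounds test into a single comparison: because the subtraction is compared as unsigned, values below the lower bound wrap around to large values and also fail. A self-contained demonstration:

    #include <cassert>
    #include <cstdint>

    // (lower <= x && x <= upper) expressed as one unsigned comparison,
    // the same trick addRangeCheckForHWIntrinsic emits: x - lower wraps
    // to a large unsigned value when x < lower, so a single '<' catches
    // both out-of-range directions.
    bool inRange(int32_t x, int32_t lower, int32_t upper)
    {
        return (uint32_t)(x - lower) < (uint32_t)(upper - lower + 1);
    }

    int main()
    {
        for (int32_t x = -16; x <= 16; x++)
        {
            assert(inRange(x, 2, 5) == ((2 <= x) && (x <= 5)));
        }
    }
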
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index 4f57bc38ace..41c96e3156d 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -256,8 +256,6 @@ void HWIntrinsicInfo::lookupImmBounds(
case NI_AdvSimd_StoreSelectedScalar:
case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
case NI_AdvSimd_Arm64_InsertSelectedScalar:
- case NI_Vector64_GetElement:
- case NI_Vector128_GetElement:
immUpperBound = Compiler::getSIMDVectorLength(simdSize, baseType) - 1;
break;
@@ -418,6 +416,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}
+ case NI_Vector64_GetElement:
+ case NI_Vector128_GetElement:
+ {
+ assert(!sig->hasThis());
+ assert(numArgs == 2);
+
+ if (!featureSIMD || !compExactlyDependsOn(InstructionSet_AdvSimd))
+ {
+ return nullptr;
+ }
+
+ op2 = impPopStack().val;
+ op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize));
+
+ const bool isSimdAsHWIntrinsic = true;
+ retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ break;
+ }
+
case NI_Vector64_get_Zero:
case NI_Vector64_get_AllBitsSet:
case NI_Vector128_get_Zero:
@@ -454,38 +471,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
impPopStack(); // pop the indexOp that we already have.
GenTree* vectorOp = impSIMDPopStack(getSIMDTypeForSize(simdSize));
- switch (simdBaseType)
- {
- case TYP_LONG:
- case TYP_ULONG:
- case TYP_DOUBLE:
- if (simdSize == 16)
- {
- retNode = gtNewSimdHWIntrinsicNode(retType, vectorOp, gtNewIconNode(imm8), valueOp,
- NI_AdvSimd_Insert, simdBaseJitType, simdSize);
- }
- else
- {
- retNode =
- gtNewSimdHWIntrinsicNode(retType, valueOp, NI_Vector64_Create, simdBaseJitType, simdSize);
- }
- break;
-
- case TYP_FLOAT:
- case TYP_BYTE:
- case TYP_UBYTE:
- case TYP_SHORT:
- case TYP_USHORT:
- case TYP_INT:
- case TYP_UINT:
- retNode = gtNewSimdHWIntrinsicNode(retType, vectorOp, gtNewIconNode(imm8), valueOp,
- NI_AdvSimd_Insert, simdBaseJitType, simdSize);
- break;
-
- default:
- return nullptr;
- }
-
+ retNode = gtNewSimdWithElementNode(retType, vectorOp, indexOp, valueOp, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
break;
}
diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
index 8ae7457dbc2..3352c9ba595 100644
--- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -796,15 +796,127 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_Vector64_GetElement:
case NI_Vector128_GetElement:
- case NI_Vector64_ToScalar:
- case NI_Vector128_ToScalar:
{
- ssize_t indexValue = 0;
- if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement))
+ assert(intrin.numOperands == 2);
+
+ var_types simdType = Compiler::getSIMDTypeForSize(node->GetSimdSize());
+
+ if (simdType == TYP_SIMD12)
+ {
+ // op1 of TYP_SIMD12 should be considered as TYP_SIMD16
+ simdType = TYP_SIMD16;
+ }
+
+ if (!intrin.op2->OperIsConst())
+ {
+ assert(!intrin.op2->isContained());
+
+ emitAttr baseTypeSize = emitTypeSize(intrin.baseType);
+ unsigned baseTypeScale = genLog2(EA_SIZE_IN_BYTES(baseTypeSize));
+
+ regNumber baseReg;
+ regNumber indexReg = op2Reg;
+
+ // Optimize the case where op1 is in memory and we are accessing the ith element.
+ if (!intrin.op1->isUsedFromReg())
+ {
+ assert(intrin.op1->isContained());
+
+ if (intrin.op1->OperIsLocal())
+ {
+ unsigned varNum = intrin.op1->AsLclVarCommon()->GetLclNum();
+ baseReg = node->ExtractTempReg();
+
+ // Load the address of varNum
+ GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, varNum, 0);
+ }
+ else
+ {
+ // Require GT_IND addr to be not contained.
+ assert(intrin.op1->OperIs(GT_IND));
+
+ GenTree* addr = intrin.op1->AsIndir()->Addr();
+ assert(!addr->isContained());
+ baseReg = addr->GetRegNum();
+ }
+ }
+ else
+ {
+ unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
+ noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
+
+ baseReg = node->ExtractTempReg();
+
+ // Load the address of simdInitTempVarNum
+ GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, simdInitTempVarNum, 0);
+
+ // Store the vector to simdInitTempVarNum
+ GetEmitter()->emitIns_R_R(INS_str, emitTypeSize(simdType), op1Reg, baseReg);
+ }
+
+ assert(genIsValidIntReg(indexReg));
+ assert(genIsValidIntReg(baseReg));
+ assert(baseReg != indexReg);
+
+ // Load item at baseReg[index]
+ GetEmitter()->emitIns_R_R_R_Ext(ins_Load(intrin.baseType), baseTypeSize, targetReg, baseReg,
+ indexReg, INS_OPTS_LSL, baseTypeScale);
+ }
+ else if (!GetEmitter()->isValidVectorIndex(emitTypeSize(simdType), emitTypeSize(intrin.baseType),
+ intrin.op2->AsIntCon()->IconValue()))
+ {
+ // We only need to generate code for the get if the index is valid.
+ // If the index is invalid, the code previously generated for the range check will throw.
+ }
+ else if (!intrin.op1->isUsedFromReg())
+ {
+ assert(intrin.op1->isContained());
+ assert(intrin.op2->IsCnsIntOrI());
+
+ int offset = (int)intrin.op2->AsIntCon()->IconValue() * genTypeSize(intrin.baseType);
+ instruction ins = ins_Load(intrin.baseType);
+
+ assert(!intrin.op1->isUsedFromReg());
+
+ if (intrin.op1->OperIsLocal())
+ {
+ unsigned varNum = intrin.op1->AsLclVarCommon()->GetLclNum();
+ GetEmitter()->emitIns_R_S(ins, emitActualTypeSize(intrin.baseType), targetReg, varNum, offset);
+ }
+ else
+ {
+ assert(intrin.op1->OperIs(GT_IND));
+
+ GenTree* addr = intrin.op1->AsIndir()->Addr();
+ assert(!addr->isContained());
+ regNumber baseReg = addr->GetRegNum();
+
+ // ldr targetReg, [baseReg, #offset]
+ GetEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(intrin.baseType), targetReg, baseReg,
+ offset);
+ }
+ }
+ else
{
assert(intrin.op2->IsCnsIntOrI());
- indexValue = intrin.op2->AsIntCon()->gtIconVal;
+ ssize_t indexValue = intrin.op2->AsIntCon()->IconValue();
+
+ // no-op if vector is float/double, targetReg == op1Reg and fetching for 0th index.
+ if ((varTypeIsFloating(intrin.baseType) && (targetReg == op1Reg) && (indexValue == 0)))
+ {
+ break;
+ }
+
+ GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(intrin.baseType), targetReg, op1Reg, indexValue,
+ INS_OPTS_NONE);
}
+ break;
+ }
+
+ case NI_Vector64_ToScalar:
+ case NI_Vector128_ToScalar:
+ {
+ const ssize_t indexValue = 0;
// no-op if vector is float/double, targetReg == op1Reg and fetching for 0th index.
if ((varTypeIsFloating(intrin.baseType) && (targetReg == op1Reg) && (indexValue == 0)))
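
AArch64 has no lane extract that takes the index in a register, so for a non-constant index the codegen above spills the vector to a stack temp and loads [base + index << log2(laneSize)]. The same idea in portable C++ (a simplified model, assuming 4-byte lanes):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Model of the variable-index path: store the vector to a temp slot
    // (str q0, [temp]), then load the element with a scaled register
    // offset (ldr w0, [temp, index, lsl #2]).
    int32_t getElementAtRuntimeIndex(const uint8_t vec16[16], unsigned index)
    {
        uint8_t temp[16]; // stands in for lvaSIMDInitTempVarNum
        memcpy(temp, vec16, 16);

        int32_t result;
        memcpy(&result, temp + (index << 2), sizeof(result));
        return result;
    }

    int main()
    {
        int32_t lanes[4] = {10, 20, 30, 40};
        uint8_t bytes[16];
        memcpy(bytes, lanes, sizeof(bytes));
        assert(getElementAtRuntimeIndex(bytes, 2) == 30);
    }
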
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 9680e91fde3..be6e5779269 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -226,13 +226,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
{
assert(ival == -1);
-
- if (intrinsicId == NI_SSE2_Extract)
- {
- // extract instructions return to GP-registers, so it needs int size as the emitsize
- simdSize = emitTypeSize(TYP_INT);
- }
-
auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); };
if (op2->IsCnsIntOrI())
@@ -1146,15 +1139,15 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
genConsumeHWIntrinsicOperands(node);
regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->GetRegNum();
- assert(node->gtGetOp2() == nullptr);
-
- emitter* emit = GetEmitter();
- emitAttr attr = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
- instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+ emitter* emit = GetEmitter();
+ var_types simdType = Compiler::getSIMDTypeForSize(node->GetSimdSize());
+ emitAttr attr = emitActualTypeSize(simdType);
+ instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
switch (intrinsicId)
{
@@ -1184,6 +1177,160 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
break;
}
+ case NI_Vector128_GetElement:
+ case NI_Vector256_GetElement:
+ {
+ if (simdType == TYP_SIMD12)
+ {
+ // op1 of TYP_SIMD12 should be considered as TYP_SIMD16
+ simdType = TYP_SIMD16;
+ }
+
+ // Optimize the case where op1 is in memory and we are accessing the ith element.
+ if (!op1->isUsedFromReg())
+ {
+ assert(op1->isContained());
+
+ regNumber baseReg;
+ regNumber indexReg;
+ int offset = 0;
+
+ if (op1->OperIsLocal())
+ {
+ // There are three parts to the total offset here:
+ // {offset of local} + {offset of vector field (lclFld only)} + {offset of element within vector}.
+ bool isEBPbased;
+ unsigned varNum = op1->AsLclVarCommon()->GetLclNum();
+ offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
+
+#if !FEATURE_FIXED_OUT_ARGS
+ if (!isEBPbased)
+ {
+ // Adjust the offset by the amount currently pushed on the CPU stack
+ offset += genStackLevel;
+ }
+#else
+ assert(genStackLevel == 0);
+#endif // !FEATURE_FIXED_OUT_ARGS
+
+ if (op1->OperIs(GT_LCL_FLD))
+ {
+ offset += op1->AsLclFld()->GetLclOffs();
+ }
+ baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
+ }
+ else
+ {
+ // Require GT_IND addr to be not contained.
+ assert(op1->OperIs(GT_IND));
+
+ GenTree* addr = op1->AsIndir()->Addr();
+ assert(!addr->isContained());
+ baseReg = addr->GetRegNum();
+ }
+
+ if (op2->OperIsConst())
+ {
+ assert(op2->isContained());
+ indexReg = REG_NA;
+ offset += (int)op2->AsIntCon()->IconValue() * genTypeSize(baseType);
+ }
+ else
+ {
+ indexReg = op2->GetRegNum();
+ assert(genIsValidIntReg(indexReg));
+ }
+
+ // Now, load the desired element.
+ GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
+ emitTypeSize(baseType), // Of the vector baseType
+ targetReg, // To targetReg
+ baseReg, // Base Reg
+ indexReg, // Indexed
+ genTypeSize(baseType), // by the size of the baseType
+ offset);
+ }
+ else if (op2->OperIsConst())
+ {
+ assert(intrinsicId == NI_Vector128_GetElement);
+ assert(varTypeIsFloating(baseType));
+ assert(op1Reg != REG_NA);
+
+ ssize_t ival = op2->AsIntCon()->IconValue();
+
+ if (baseType == TYP_FLOAT)
+ {
+ if (ival == 1)
+ {
+ if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE3))
+ {
+ emit->emitIns_R_R(INS_movshdup, attr, targetReg, op1Reg);
+ }
+ else
+ {
+ emit->emitIns_SIMD_R_R_R_I(INS_shufps, attr, targetReg, op1Reg, op1Reg,
+ static_cast<int8_t>(0x55));
+ }
+ }
+ else if (ival == 2)
+ {
+ emit->emitIns_SIMD_R_R_R(INS_unpckhps, attr, targetReg, op1Reg, op1Reg);
+ }
+ else
+ {
+ assert(ival == 3);
+ emit->emitIns_SIMD_R_R_R_I(INS_shufps, attr, targetReg, op1Reg, op1Reg,
+ static_cast<int8_t>(0xFF));
+ }
+ }
+ else
+ {
+ assert(baseType == TYP_DOUBLE);
+ assert(ival == 1);
+ emit->emitIns_SIMD_R_R_R(INS_unpckhpd, attr, targetReg, op1Reg, op1Reg);
+ }
+ }
+ else
+ {
+ // We don't have an instruction to implement this intrinsic if the index is not a constant.
+ // So we will use the SIMD temp location to store the vector, and then load the desired element.
+ // The range check will already have been performed, so at this point we know we have an index
+ // within the bounds of the vector.
+
+ unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
+ noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
+
+ bool isEBPbased;
+ unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
+
+#if !FEATURE_FIXED_OUT_ARGS
+ if (!isEBPbased)
+ {
+ // Adjust the offset by the amount currently pushed on the CPU stack
+ offs += genStackLevel;
+ }
+#else
+ assert(genStackLevel == 0);
+#endif // !FEATURE_FIXED_OUT_ARGS
+
+ regNumber indexReg = op2->GetRegNum();
+
+ // Store the vector to the temp location.
+ GetEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
+ emitTypeSize(simdType), op1Reg, simdInitTempVarNum, 0);
+
+ // Now, load the desired element.
+ GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
+ emitTypeSize(baseType), // Of the vector baseType
+ targetReg, // To targetReg
+ (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
+ indexReg, // Indexed
+ genTypeSize(baseType), // by the size of the baseType
+ offs);
+ }
+ break;
+ }
+
case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
{
@@ -1543,25 +1690,12 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
case NI_SSE41_Extract:
case NI_SSE41_X64_Extract:
{
- regNumber tmpTargetReg = REG_NA;
- instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
- if (baseType == TYP_FLOAT)
- {
- tmpTargetReg = node->ExtractTempReg();
- }
+ assert(!varTypeIsFloating(baseType));
- auto emitSwCase = [&](int8_t i) {
- if (baseType == TYP_FLOAT)
- {
- // extract instructions return to GP-registers, so it needs int size as the emitsize
- inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
- emit->emitIns_Mov(INS_movd, EA_4BYTE, targetReg, tmpTargetReg, /* canSkip */ false);
- }
- else
- {
- inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
- }
- };
+ instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+ emitAttr attr = emitActualTypeSize(node->TypeGet());
+
+ auto emitSwCase = [&](int8_t i) { inst_RV_TT_IV(ins, attr, targetReg, op1, i); };
if (op2->IsCnsIntOrI())
{
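
For constant float indices, the xarch path above broadcasts the requested lane into element 0: movshdup or shufps with 0x55 for lane 1, unpckhps for lane 2, and shufps with 0xFF for lane 3. A scalar model of shufps with both operands in the same register shows why those control bytes work:

    #include <cassert>

    // Scalar model of 'shufps dst, src, imm8' when dst == src: result
    // lane i takes source lane (imm8 >> (2*i)) & 3. imm8 = 0x55 selects
    // lane 1 everywhere; imm8 = 0xFF selects lane 3 everywhere.
    void shufpsSameReg(float result[4], const float src[4], unsigned imm8)
    {
        float tmp[4];
        for (int i = 0; i < 4; i++)
            tmp[i] = src[(imm8 >> (2 * i)) & 3];
        for (int i = 0; i < 4; i++)
            result[i] = tmp[i];
    }

    int main()
    {
        float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        float r[4];
        shufpsSameReg(r, v, 0x55);
        assert(r[0] == 2.0f); // lane 1 broadcast: GetElement(v, 1)
        shufpsSameReg(r, v, 0xFF);
        assert(r[0] == 4.0f); // lane 3 broadcast: GetElement(v, 3)
    }
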
diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h
index da9f788db49..71c3f56121a 100644
--- a/src/coreclr/jit/hwintrinsiclistarm64.h
+++ b/src/coreclr/jit/hwintrinsiclistarm64.h
@@ -32,7 +32,7 @@ HARDWARE_INTRINSIC(Vector64, Dot,
HARDWARE_INTRINSIC(Vector64, get_AllBitsSet, 8, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector64, get_Count, 8, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector64, get_Zero, 8, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
-HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_NoJmpTableIMM|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector64, op_Equality, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector64, op_Inequality, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector64, ToScalar, 8, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen)
@@ -65,7 +65,7 @@ HARDWARE_INTRINSIC(Vector128, Dot,
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
-HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_NoJmpTableIMM|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, GetLower, 16, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Vector128, GetUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(Vector128, op_Equality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index e5aec99897e..82d9a4356b2 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -48,7 +48,7 @@ HARDWARE_INTRINSIC(Vector128, Dot,
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, op_Equality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, op_Inequality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
@@ -80,7 +80,7 @@ HARDWARE_INTRINSIC(Vector256, get_Zero,
HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, GetLower, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, op_Equality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, op_Inequality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen)
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index 73a43ab0584..0155e4120b5 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -978,14 +978,29 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
case NI_Vector128_WithElement:
{
assert(sig->numArgs == 3);
- GenTree* indexOp = impStackTop(1).val;
- if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType) ||
- !indexOp->OperIsConst())
+
+ if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType))
{
// Using software fallback if
// 1. JIT/hardware don't support SSE2 instructions
- // 2. simdBaseType is not a numeric type (throw execptions)
- // 3. index is not a constant
+ // 2. simdBaseType is not a numeric type (throw exceptions)
+ return nullptr;
+ }
+
+ GenTree* indexOp = impStackTop(1).val;
+
+ if (!indexOp->OperIsConst())
+ {
+ // Index is not a constant, use the software fallback
+ return nullptr;
+ }
+
+ ssize_t imm8 = indexOp->AsIntCon()->IconValue();
+ ssize_t count = simdSize / genTypeSize(simdBaseType);
+
+ if (imm8 >= count || imm8 < 0)
+ {
+ // Using software fallback if index is out of range (throw exception)
return nullptr;
}
@@ -1018,177 +1033,15 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
default:
- return nullptr;
- }
-
- ssize_t imm8 = indexOp->AsIntCon()->IconValue();
- ssize_t cachedImm8 = imm8;
- ssize_t count = simdSize / genTypeSize(simdBaseType);
-
- if (imm8 >= count || imm8 < 0)
- {
- // Using software fallback if index is out of range (throw exeception)
- return nullptr;
+ unreached();
}
GenTree* valueOp = impPopStack().val;
- impPopStack(); // pops the indexOp that we already have.
+ impPopStack(); // Pop the indexOp now that we know it's valid
GenTree* vectorOp = impSIMDPopStack(getSIMDTypeForSize(simdSize));
- GenTree* clonedVectorOp = nullptr;
-
- if (simdSize == 32)
- {
- // Extract the half vector that will be modified
- assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
-
- // copy `vectorOp` to accept the modified half vector
- vectorOp = impCloneExpr(vectorOp, &clonedVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL,
- nullptr DEBUGARG("Clone Vector for Vector256<T>.WithElement"));
-
- if (imm8 >= count / 2)
- {
- imm8 -= count / 2;
- vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1), NI_AVX_ExtractVector128,
- simdBaseJitType, simdSize);
- }
- else
- {
- vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, NI_Vector256_GetLower, simdBaseJitType,
- simdSize);
- }
- }
-
- GenTree* immNode = gtNewIconNode(imm8);
-
- switch (simdBaseType)
- {
- case TYP_LONG:
- case TYP_ULONG:
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, immNode, NI_SSE41_X64_Insert,
- simdBaseJitType, 16);
- break;
-
- case TYP_FLOAT:
- {
- if (!compOpportunisticallyDependsOn(InstructionSet_SSE41))
- {
- // Emulate Vector128<float>.WithElement by SSE instructions
- if (imm8 == 0)
- {
- // vector.WithElement(0, value)
- // =>
- // movss xmm0, xmm1 (xmm0 = vector, xmm1 = value)
- valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe,
- CORINFO_TYPE_FLOAT, 16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, NI_SSE_MoveScalar,
- CORINFO_TYPE_FLOAT, 16);
- }
- else if (imm8 == 1)
- {
- // vector.WithElement(1, value)
- // =>
- // shufps xmm1, xmm0, 0 (xmm0 = vector, xmm1 = value)
- // shufps xmm1, xmm0, 226
- GenTree* tmpOp =
- gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe,
- CORINFO_TYPE_FLOAT, 16);
- GenTree* dupVectorOp = nullptr;
- vectorOp = impCloneExpr(vectorOp, &dupVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL,
- nullptr DEBUGARG("Clone Vector for Vector128<float>.WithElement"));
- tmpOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmpOp, vectorOp, gtNewIconNode(0),
- NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16);
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmpOp, dupVectorOp, gtNewIconNode(226),
- NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16);
- }
- else
- {
- ssize_t controlBits1 = 0;
- ssize_t controlBits2 = 0;
- if (imm8 == 2)
- {
- controlBits1 = 48;
- controlBits2 = 132;
- }
- else
- {
- controlBits1 = 32;
- controlBits2 = 36;
- }
- // vector.WithElement(2, value)
- // =>
- // shufps xmm1, xmm0, 48 (xmm0 = vector, xmm1 = value)
- // shufps xmm0, xmm1, 132
- //
- // vector.WithElement(3, value)
- // =>
- // shufps xmm1, xmm0, 32 (xmm0 = vector, xmm1 = value)
- // shufps xmm0, xmm1, 36
- GenTree* tmpOp =
- gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe,
- CORINFO_TYPE_FLOAT, 16);
- GenTree* dupVectorOp = nullptr;
- vectorOp = impCloneExpr(vectorOp, &dupVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL,
- nullptr DEBUGARG("Clone Vector for Vector128<float>.WithElement"));
- valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, tmpOp, gtNewIconNode(controlBits1),
- NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16);
- retNode =
- gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, dupVectorOp, gtNewIconNode(controlBits2),
- NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16);
- }
- break;
- }
- else
- {
- valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe,
- CORINFO_TYPE_FLOAT, 16);
- immNode->AsIntCon()->SetIconValue(imm8 * 16);
- FALLTHROUGH;
- }
- }
-
- case TYP_BYTE:
- case TYP_UBYTE:
- case TYP_INT:
- case TYP_UINT:
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, immNode, NI_SSE41_Insert,
- simdBaseJitType, 16);
- break;
-
- case TYP_SHORT:
- case TYP_USHORT:
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, immNode, NI_SSE2_Insert,
- simdBaseJitType, 16);
- break;
-
- case TYP_DOUBLE:
- {
- // vector.WithElement(0, value)
- // =>
- // movsd xmm0, xmm1 (xmm0 = vector, xmm1 = value)
- //
- // vector.WithElement(1, value)
- // =>
- // unpcklpd xmm0, xmm1 (xmm0 = vector, xmm1 = value)
- valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe,
- CORINFO_TYPE_DOUBLE, 16);
- NamedIntrinsic in = (imm8 == 0) ? NI_SSE2_MoveScalar : NI_SSE2_UnpackLow;
- retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, in, CORINFO_TYPE_DOUBLE, 16);
- break;
- }
-
- default:
- return nullptr;
- }
-
- if (simdSize == 32)
- {
- assert(clonedVectorOp);
- int upperOrLower = (cachedImm8 >= count / 2) ? 1 : 0;
- retNode = gtNewSimdHWIntrinsicNode(retType, clonedVectorOp, retNode, gtNewIconNode(upperOrLower),
- NI_AVX_InsertVector128, simdBaseJitType, simdSize);
- }
-
+ retNode = gtNewSimdWithElementNode(retType, vectorOp, indexOp, valueOp, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
break;
}
@@ -1205,14 +1058,12 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
case NI_Vector128_GetElement:
{
assert(sig->numArgs == 2);
- GenTree* indexOp = impStackTop().val;
- if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType) ||
- !indexOp->OperIsConst())
+
+ if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType))
{
// Using software fallback if
// 1. JIT/hardware don't support SSE2 instructions
// 2. simdBaseType is not a numeric type (throws exceptions)
- // 3. index is not a constant
return nullptr;
}
@@ -1223,15 +1074,9 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
- if (!compExactlyDependsOn(InstructionSet_SSE41))
- {
- return nullptr;
- }
- break;
-
case TYP_LONG:
case TYP_ULONG:
- if (!compExactlyDependsOn(InstructionSet_SSE41_X64))
+ if (!compExactlyDependsOn(InstructionSet_SSE41))
{
return nullptr;
}
@@ -1245,144 +1090,14 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
default:
- break;
- }
-
- ssize_t imm8 = indexOp->AsIntCon()->IconValue();
- ssize_t count = simdSize / genTypeSize(simdBaseType);
-
- if (imm8 >= count || imm8 < 0)
- {
- // Using software fallback if index is out of range (throw exeception)
- return nullptr;
- }
-
- impPopStack();
- GenTree* vectorOp = impSIMDPopStack(getSIMDTypeForSize(simdSize));
- NamedIntrinsic resIntrinsic = NI_Illegal;
-
- if (simdSize == 32)
- {
- assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
-
- if (imm8 >= count / 2)
- {
- imm8 -= count / 2;
- vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1), NI_AVX_ExtractVector128,
- simdBaseJitType, simdSize);
- }
- else
- {
- vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, NI_Vector256_GetLower, simdBaseJitType,
- simdSize);
- }
- }
-
- if (imm8 == 0 && (genTypeSize(simdBaseType) >= 4))
- {
- switch (simdBaseType)
- {
- case TYP_LONG:
- resIntrinsic = NI_SSE2_X64_ConvertToInt64;
- break;
-
- case TYP_ULONG:
- resIntrinsic = NI_SSE2_X64_ConvertToUInt64;
- break;
-
- case TYP_INT:
- resIntrinsic = NI_SSE2_ConvertToInt32;
- break;
-
- case TYP_UINT:
- resIntrinsic = NI_SSE2_ConvertToUInt32;
- break;
-
- case TYP_FLOAT:
- case TYP_DOUBLE:
- resIntrinsic = NI_Vector128_ToScalar;
- break;
-
- default:
- return nullptr;
- }
-
- return gtNewSimdHWIntrinsicNode(retType, vectorOp, resIntrinsic, simdBaseJitType, 16);
+ unreached();
}
- GenTree* immNode = gtNewIconNode(imm8);
-
- switch (simdBaseType)
- {
- case TYP_LONG:
- case TYP_ULONG:
- retNode =
- gtNewSimdHWIntrinsicNode(retType, vectorOp, immNode, NI_SSE41_X64_Extract, simdBaseJitType, 16);
- break;
-
- case TYP_FLOAT:
- {
- if (!compOpportunisticallyDependsOn(InstructionSet_SSE41))
- {
- assert(imm8 >= 1);
- assert(imm8 <= 3);
- // Emulate Vector128<float>.GetElement(i) by SSE instructions
- // vector.GetElement(i)
- // =>
- // shufps xmm0, xmm0, control
- // (xmm0 = vector, control = i + 228)
- immNode->AsIntCon()->SetIconValue(228 + imm8);
- GenTree* clonedVectorOp = nullptr;
- vectorOp = impCloneExpr(vectorOp, &clonedVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL,
- nullptr DEBUGARG("Clone Vector for Vector128<float>.GetElement"));
- vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, clonedVectorOp, immNode,
- NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16);
- return gtNewSimdHWIntrinsicNode(retType, vectorOp, NI_Vector128_ToScalar, CORINFO_TYPE_FLOAT,
- 16);
- }
- FALLTHROUGH;
- }
-
- case TYP_UBYTE:
- case TYP_INT:
- case TYP_UINT:
- retNode =
- gtNewSimdHWIntrinsicNode(retType, vectorOp, immNode, NI_SSE41_Extract, simdBaseJitType, 16);
- break;
-
- case TYP_BYTE:
- // We do not have SSE41/SSE2 Extract APIs on signed small int, so need a CAST on the result
- retNode = gtNewSimdHWIntrinsicNode(TYP_UBYTE, vectorOp, immNode, NI_SSE41_Extract,
- CORINFO_TYPE_UBYTE, 16);
- retNode = gtNewCastNode(TYP_INT, retNode, true, TYP_BYTE);
- break;
-
- case TYP_SHORT:
- case TYP_USHORT:
- // We do not have SSE41/SSE2 Extract APIs on signed small int, so need a CAST on the result
- retNode = gtNewSimdHWIntrinsicNode(TYP_USHORT, vectorOp, immNode, NI_SSE2_Extract,
- CORINFO_TYPE_USHORT, 16);
- if (simdBaseType == TYP_SHORT)
- {
- retNode = gtNewCastNode(TYP_INT, retNode, true, TYP_SHORT);
- }
- break;
-
- case TYP_DOUBLE:
- assert(imm8 == 1);
- // vector.GetElement(1)
- // =>
- // pshufd xmm1, xmm0, 0xEE (xmm0 = vector)
- vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(0xEE), NI_SSE2_Shuffle,
- CORINFO_TYPE_INT, 16);
- retNode =
- gtNewSimdHWIntrinsicNode(TYP_DOUBLE, vectorOp, NI_Vector128_ToScalar, CORINFO_TYPE_DOUBLE, 16);
- break;
-
- default:
- return nullptr;
- }
+ GenTree* op2 = impPopStack().val;
+ GenTree* op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize));
+ retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
break;
}
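
The SSE-only float fallback deleted above reads lane i with a single shufps whose control byte is 228 + i. A scalar model of the SHUFPS lane selection (plain C++ assuming the documented two-bits-per-lane semantics, not JIT code) shows why that constant works:

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of SHUFPS: the low two result lanes select from 'a',
// the high two from 'b', two control bits per lane.
std::array<float, 4> Shufps(const std::array<float, 4>& a, const std::array<float, 4>& b, uint8_t ctl)
{
    return {a[ctl & 3], a[(ctl >> 2) & 3], b[(ctl >> 4) & 3], b[(ctl >> 6) & 3]};
}

int main()
{
    std::array<float, 4> v = {10.0f, 11.0f, 12.0f, 13.0f};

    // 228 is 0b11100100, the identity selection [0, 1, 2, 3]; adding
    // i (1..3) only changes the low two bits, i.e. which lane of 'v'
    // is routed to lane 0 -- exactly the "control = i + 228" trick.
    for (int i = 1; i <= 3; i++)
    {
        std::array<float, 4> r = Shufps(v, v, static_cast<uint8_t>(228 + i));
        assert(r[0] == v[i]);
    }
    return 0;
}
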
diff --git a/src/coreclr/jit/lclmorph.cpp b/src/coreclr/jit/lclmorph.cpp
index db3e134b607..9e49f30bd70 100644
--- a/src/coreclr/jit/lclmorph.cpp
+++ b/src/coreclr/jit/lclmorph.cpp
@@ -917,7 +917,7 @@ private:
// a variable into a LCL_FLD but that blocks enregistration so we need to
// detect those cases where we can use LCL_VAR instead, perhaps in conjunction
// with CAST and/or BITCAST.
- // Also skip SIMD variables for now, fgMorphFieldAssignToSIMDIntrinsicSet and
+ // Also skip SIMD variables for now, fgMorphFieldAssignToSimdSetElement and
// others need to be updated to recognize LCL_FLDs.
return;
}
@@ -958,7 +958,7 @@ private:
if (varTypeIsSIMD(indir->TypeGet()))
{
// TODO-ADDR: Skip SIMD indirs for now, SIMD typed LCL_FLDs works most of the time
- // but there are exceptions - fgMorphFieldAssignToSIMDIntrinsicSet for example.
+ // but there are exceptions - fgMorphFieldAssignToSimdSetElement for example.
// And more importantly, SIMD call args have to be wrapped in OBJ nodes currently.
return;
}
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 83ce67d2685..bc061bf7548 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -330,6 +330,8 @@ private:
#if defined(TARGET_XARCH)
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
+ void LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node);
+ void LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node);
#elif defined(TARGET_ARM64)
bool IsValidConstForMovImm(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node);
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index c1845995dc9..6774a4ef176 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1659,48 +1659,21 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
{
switch (simdNode->gtSIMDIntrinsicID)
{
- GenTree* op1;
- GenTree* op2;
-
case SIMDIntrinsicInit:
- op1 = simdNode->AsOp()->gtOp1;
+ {
+ GenTree* op1 = simdNode->AsOp()->gtOp1;
if (op1->IsIntegralConst(0))
{
MakeSrcContained(simdNode, op1);
}
break;
+ }
case SIMDIntrinsicInitArray:
// We have an array and an index, which may be contained.
CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2());
break;
- case SIMDIntrinsicGetItem:
- {
- // This implements get_Item method. The sources are:
- // - the source SIMD struct
- // - index (which element to get)
- // The result is simdBaseType of SIMD struct.
- op1 = simdNode->AsOp()->gtOp1;
- op2 = simdNode->AsOp()->gtOp2;
-
- // If the index is a constant, mark it as contained.
- if (op2->IsCnsIntOrI())
- {
- MakeSrcContained(simdNode, op2);
- }
-
- if (IsContainableMemoryOp(op1))
- {
- MakeSrcContained(simdNode, op1);
- if (op1->OperGet() == GT_IND)
- {
- op1->AsIndir()->Addr()->ClearContained();
- }
- }
- break;
- }
-
default:
break;
}
@@ -1765,8 +1738,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_AdvSimd_InsertScalar:
case NI_AdvSimd_LoadAndInsertScalar:
case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
- case NI_Vector64_GetElement:
- case NI_Vector128_GetElement:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op2));
if (intrin.op2->IsCnsIntOrI())
@@ -1832,6 +1803,29 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;
+ case NI_Vector64_GetElement:
+ case NI_Vector128_GetElement:
+ {
+ assert(hasImmediateOperand);
+ assert(varTypeIsIntegral(intrin.op2));
+
+ if (intrin.op2->IsCnsIntOrI())
+ {
+ MakeSrcContained(node, intrin.op2);
+ }
+
+ if (IsContainableMemoryOp(intrin.op1))
+ {
+ MakeSrcContained(node, intrin.op1);
+
+ if (intrin.op1->OperIs(GT_IND))
+ {
+ intrin.op1->AsIndir()->Addr()->ClearContained();
+ }
+ }
+ break;
+ }
+
default:
unreached();
}
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 31279fb8b43..c9b0d6ef7ae 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -728,26 +728,6 @@ void Lowering::LowerSIMD(GenTreeSIMD* simdNode)
}
}
-#ifdef TARGET_XARCH
- if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND))
- {
- // If SIMD vector is already in memory, we force its
- // addr to be evaluated into a reg. This would allow
- // us to generate [regBase] or [regBase+offset] or
- // [regBase+sizeOf(SIMD vector simdBaseType)*regIndex]
- // to access the required SIMD vector element directly
- // from memory.
- //
- // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we
- // might be able update GT_LEA to fold the regIndex
- // or offset in some cases. Instead with this
- // approach we always evaluate GT_LEA into a reg.
- // Ideally, we should be able to lower GetItem intrinsic
- // into GT_IND(newAddr) where newAddr combines
- // the addr of SIMD vector with the given index.
- simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
- }
-#endif
ContainCheckSIMD(simdNode);
}
#endif // FEATURE_SIMD
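
The block removed here is not lost; the same forced-address trick reappears in LowerHWIntrinsicGetElement below. A plain C++ sketch of the [regBase + sizeOf(simdBaseType) * regIndex] access it enables (illustrative only, not generated code):

#include <cstdint>
#include <cstring>

// Once the vector's address is in a register, the element read is a
// single scaled-index load from memory (shown here for int lanes).
int32_t LoadElement(const void* vectorBase, uint32_t index)
{
    int32_t elem;
    std::memcpy(&elem, static_cast<const uint8_t*>(vectorBase) + sizeof(int32_t) * index, sizeof(elem));
    return elem; // in spirit: mov eax, [rax + rcx*4]
}
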
@@ -950,6 +930,33 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
return;
}
+ case NI_Vector128_GetElement:
+ case NI_Vector256_GetElement:
+ {
+ LowerHWIntrinsicGetElement(node);
+
+ if ((node->gtHWIntrinsicId == NI_Vector128_GetElement) ||
+ (node->gtHWIntrinsicId == NI_Vector256_GetElement))
+ {
+ // Most NI_Vector*_GetElement intrinsics are lowered to
+ // alternative nodes, such as the Extract intrinsics,
+ // which are themselves lowered.
+ //
+ // However, certain types may not have a direct equivalent
+ // in which case we specially handle them directly as GetElement
+ // and want to do the relevant containment checks.
+ break;
+ }
+ return;
+ }
+
+ case NI_Vector128_WithElement:
+ case NI_Vector256_WithElement:
+ {
+ LowerHWIntrinsicWithElement(node);
+ return;
+ }
+
case NI_Vector128_op_Equality:
case NI_Vector256_op_Equality:
{
@@ -971,6 +978,38 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
+ case NI_SSE41_Extract:
+ {
+ if (varTypeIsFloating(node->GetSimdBaseType()))
+ {
+ assert(node->GetSimdBaseType() == TYP_FLOAT);
+ assert(node->gtOp1 != nullptr);
+ assert(node->gtOp2 != nullptr);
+ assert(node->GetSimdSize() == 16);
+
+ GenTree* op2 = node->gtGetOp2();
+
+ if (!op2->OperIsConst())
+ {
+ // Extract allows the full index range while GetElement only allows
+ // 0-3, so we mask the index here so that codegen works.
+
+ GenTree* msk = comp->gtNewIconNode(3, TYP_INT);
+ BlockRange().InsertAfter(op2, msk);
+
+ GenTree* tmp = comp->gtNewOperNode(GT_AND, TYP_INT, op2, msk);
+ BlockRange().InsertAfter(msk, tmp);
+ LowerNode(tmp);
+
+ node->gtOp2 = tmp;
+ }
+
+ node->gtHWIntrinsicId = NI_Vector128_GetElement;
+ LowerNode(node);
+ }
+ break;
+ }
+
case NI_SSE2_Insert:
case NI_SSE41_Insert:
case NI_SSE41_X64_Insert:
@@ -2539,6 +2578,611 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
}
//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicGetElement: Lowers a Vector128 or Vector256 GetElement call
+//
+// Arguments:
+// node - The hardware intrinsic node.
+//
+void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
+{
+ NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+ var_types simdType = node->gtType;
+ CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
+ var_types simdBaseType = node->GetSimdBaseType();
+ unsigned simdSize = node->GetSimdSize();
+
+ assert(!varTypeIsSIMD(simdType));
+ assert(varTypeIsArithmetic(simdBaseType));
+ assert(simdSize != 0);
+
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = node->gtGetOp2();
+
+ assert(op1 != nullptr);
+ assert(op2 != nullptr);
+
+ if (op1->OperIs(GT_IND))
+ {
+ // If the vector is already in memory, we force its
+ // addr to be evaluated into a reg. This would allow
+ // us to generate [regBase] or [regBase + offset] or
+ // [regBase + sizeOf(simdBaseType) * regIndex] to access
+ // the required vector element directly from memory.
+ //
+ // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we
+ // might be able to update GT_LEA to fold the regIndex
+ // or offset in some cases. Instead with this
+ // approach we always evaluate GT_LEA into a reg.
+ // Ideally, we should be able to lower GetItem intrinsic
+ // into GT_IND(newAddr) where newAddr combines
+ // the addr of the vector with the given index.
+ op1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
+ }
+
+ if (!op2->OperIsConst())
+ {
+ // We will specially handle GetElement in codegen when op2 isn't a constant
+ return;
+ }
+
+ // We should have a bounds check inserted for any index outside the allowed range
+ // but we need to generate some code anyway, so we'll simply mask the index here.
+
+ ssize_t count = simdSize / genTypeSize(simdBaseType);
+ ssize_t imm8 = static_cast<uint8_t>(op2->AsIntCon()->IconValue()) % count;
+
+ assert(0 <= imm8 && imm8 < count);
+
+ if (IsContainableMemoryOp(op1))
+ {
+ // We will specially handle GetElement in codegen when op1 is already in memory
+ op2->AsIntCon()->SetIconValue(imm8);
+ return;
+ }
+
+ switch (simdBaseType)
+ {
+ // The importer falls back to software if simdBaseType is not supported by hardware
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41));
+ break;
+
+ case TYP_LONG:
+ case TYP_ULONG:
+ // We either support TYP_LONG or we have been decomposed into two TYP_INT extracts
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41_X64));
+ break;
+
+ case TYP_DOUBLE:
+ case TYP_FLOAT:
+ case TYP_SHORT:
+ case TYP_USHORT:
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2));
+ break;
+
+ default:
+ unreached();
+ }
+
+ // Remove the index node up front to simplify downstream logic
+ BlockRange().Remove(op2);
+
+ // Spare GenTrees to be used for the lowering logic below
+ // Defined upfront to avoid naming conflicts, etc...
+ GenTree* idx = nullptr;
+ GenTree* tmp1 = nullptr;
+ GenTree* tmp2 = nullptr;
+
+ if (intrinsicId == NI_Vector256_GetElement)
+ {
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));
+
+ if (imm8 >= count / 2)
+ {
+ // idx = CNS_INT int 1
+ // /--* op1 simd32
+ // +--* idx int
+ // op1 = * HWINTRINSIC simd32 T ExtractVector128
+
+ // This is roughly the following managed code:
+ // ...
+ // op1 = Avx.ExtractVector128(op1, 0x01);
+
+ imm8 -= count / 2;
+
+ idx = comp->gtNewIconNode(1);
+ BlockRange().InsertBefore(node, idx);
+
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, idx, NI_AVX_ExtractVector128, simdBaseJitType,
+ simdSize);
+ BlockRange().InsertAfter(idx, tmp1);
+ LowerNode(tmp1);
+ }
+ else
+ {
+ // /--* op1 simd32
+ // op1 = * HWINTRINSIC simd32 T GetLower
+
+ // This is roughly the following managed code:
+ // ...
+ // op1 = op1.GetLower();
+
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, 16);
+ BlockRange().InsertBefore(node, tmp1);
+ LowerNode(tmp1);
+ }
+
+ op1 = tmp1;
+ }
+
+ NamedIntrinsic resIntrinsic = NI_Illegal;
+
+ if (imm8 == 0 && (genTypeSize(simdBaseType) >= 4))
+ {
+ switch (simdBaseType)
+ {
+ case TYP_LONG:
+ resIntrinsic = NI_SSE2_X64_ConvertToInt64;
+ break;
+
+ case TYP_ULONG:
+ resIntrinsic = NI_SSE2_X64_ConvertToUInt64;
+ break;
+
+ case TYP_INT:
+ resIntrinsic = NI_SSE2_ConvertToInt32;
+ break;
+
+ case TYP_UINT:
+ resIntrinsic = NI_SSE2_ConvertToUInt32;
+ break;
+
+ case TYP_FLOAT:
+ case TYP_DOUBLE:
+ resIntrinsic = NI_Vector128_ToScalar;
+ break;
+
+ default:
+ unreached();
+ }
+
+ op2 = nullptr;
+ }
+ else
+ {
+ op2 = comp->gtNewIconNode(imm8);
+ BlockRange().InsertBefore(node, op2);
+
+ switch (simdBaseType)
+ {
+ case TYP_LONG:
+ case TYP_ULONG:
+ {
+ resIntrinsic = NI_SSE41_X64_Extract;
+ break;
+ }
+
+ case TYP_FLOAT:
+ case TYP_DOUBLE:
+ {
+ // We specially handle float and double for more efficient codegen
+ resIntrinsic = NI_Vector128_GetElement;
+ break;
+ }
+
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ {
+ resIntrinsic = NI_SSE41_Extract;
+ break;
+ }
+
+ case TYP_SHORT:
+ case TYP_USHORT:
+ {
+ resIntrinsic = NI_SSE2_Extract;
+ break;
+ }
+
+ default:
+ unreached();
+ }
+ }
+
+ assert(resIntrinsic != NI_Illegal);
+
+ node->gtHWIntrinsicId = resIntrinsic;
+ node->gtOp1 = op1;
+ node->gtOp2 = op2;
+ node->SetSimdSize(16);
+
+ if (!varTypeIsFloating(simdBaseType))
+ {
+ assert(node->gtHWIntrinsicId != intrinsicId);
+ LowerNode(node);
+ }
+
+ if ((simdBaseType == TYP_BYTE) || (simdBaseType == TYP_SHORT))
+ {
+ // The intrinsic zeros the upper bits, so we need an explicit
+ // cast to ensure the result is properly sign extended
+
+ LIR::Use use;
+ bool foundUse = BlockRange().TryGetUse(node, &use);
+
+ GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, /* isUnsigned */ true, simdBaseType);
+ BlockRange().InsertAfter(node, cast);
+
+ if (foundUse)
+ {
+ use.ReplaceWith(comp, cast);
+ }
+ LowerNode(cast);
+ }
+}
+
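
A standalone model of the masking above (a hypothetical helper, not the JIT's API): the narrowing cast matches the one-byte imm8 encoding and the modulo keeps codegen in range, while the separately inserted bounds check still throws before an out-of-range result can be observed.

#include <cassert>
#include <cstddef>
#include <cstdint>

std::ptrdiff_t MaskElementIndex(std::ptrdiff_t icon, unsigned simdSize, unsigned elemSize)
{
    std::ptrdiff_t count = simdSize / elemSize;
    std::ptrdiff_t imm8  = static_cast<uint8_t>(icon) % count;

    assert((0 <= imm8) && (imm8 < count));
    return imm8;
}

// e.g. for Vector128<int> (count = 4): MaskElementIndex(5, 16, 4) == 1,
// and an icon of -1 first wraps to 255, then 255 % 4 == 3.
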
+//----------------------------------------------------------------------------------------------
+// Lowering::LowerHWIntrinsicWithElement: Lowers a Vector128 or Vector256 WithElement call
+//
+// Arguments:
+// node - The hardware intrinsic node.
+//
+void Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node)
+{
+ NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
+ var_types simdType = node->gtType;
+ CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
+ var_types simdBaseType = node->GetSimdBaseType();
+ unsigned simdSize = node->GetSimdSize();
+
+ assert(varTypeIsSIMD(simdType));
+ assert(varTypeIsArithmetic(simdBaseType));
+ assert(simdSize != 0);
+
+ GenTree* op1 = node->gtGetOp1();
+ GenTree* op2 = nullptr;
+ GenTree* op3 = nullptr;
+
+ assert(op1->OperIsList());
+ assert(node->gtGetOp2() == nullptr);
+
+ GenTreeArgList* argList = op1->AsArgList();
+
+ op1 = argList->Current();
+ argList = argList->Rest();
+
+ op2 = argList->Current();
+ argList = argList->Rest();
+
+ op3 = argList->Current();
+ argList = argList->Rest();
+
+ assert(op1 != nullptr);
+ assert(op2 != nullptr);
+ assert(op3 != nullptr);
+
+ assert(op2->OperIsConst());
+ assert(argList == nullptr);
+
+ ssize_t imm8 = op2->AsIntCon()->IconValue();
+ ssize_t cachedImm8 = imm8;
+ ssize_t count = simdSize / genTypeSize(simdBaseType);
+
+ assert(0 <= imm8 && imm8 < count);
+
+ switch (simdBaseType)
+ {
+ // The importer falls back to software if simdBaseType is not supported by hardware
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41));
+ break;
+
+ case TYP_LONG:
+ case TYP_ULONG:
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41_X64));
+ break;
+
+ case TYP_DOUBLE:
+ case TYP_FLOAT:
+ case TYP_SHORT:
+ case TYP_USHORT:
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2));
+ break;
+
+ default:
+ unreached();
+ }
+
+ // Remove the index node up front to simplify downstream logic
+ BlockRange().Remove(op2);
+
+ // Spare GenTrees to be used for the lowering logic below
+ // Defined upfront to avoid naming conflicts, etc...
+ GenTree* idx = nullptr;
+ GenTree* tmp1 = nullptr;
+ GenTree* tmp2 = nullptr;
+ GenTree* tmp3 = nullptr;
+ GenTree* tmpv = nullptr;
+
+ if (intrinsicId == NI_Vector256_WithElement)
+ {
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));
+
+ // We will be constructing the following parts:
+ // ...
+ // /--* op1 simd32
+ // * STORE_LCL_VAR simd32
+ // tmpv = LCL_VAR simd32
+ // op1 = LCL_VAR simd32
+
+ node->gtOp1 = op1;
+ LIR::Use op1Use(BlockRange(), &node->gtOp1, node);
+ ReplaceWithLclVar(op1Use);
+ tmpv = node->gtOp1;
+
+ op1 = comp->gtClone(tmpv);
+ BlockRange().InsertBefore(op3, op1);
+
+ if (imm8 >= count / 2)
+ {
+ // We will be constructing the following parts:
+ // ...
+ // idx = CNS_INT int 1
+ // /--* op1 simd32
+ // +--* idx int
+ // op1 = * HWINTRINSIC simd32 T ExtractVector128
+
+ // This is roughly the following managed code:
+ // ...
+ // op1 = Avx.ExtractVector128(op1, 0x01);
+
+ imm8 -= count / 2;
+
+ idx = comp->gtNewIconNode(1);
+ BlockRange().InsertAfter(op1, idx);
+
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, idx, NI_AVX_ExtractVector128, simdBaseJitType,
+ simdSize);
+ BlockRange().InsertAfter(idx, tmp1);
+ LowerNode(tmp1);
+ }
+ else
+ {
+ // We will be constructing the following parts:
+ // ...
+ // /--* op1 simd32
+ // op1 = * HWINTRINSIC simd32 T GetLower
+
+ // This is roughly the following managed code:
+ // ...
+ // op1 = op1.GetLower();
+
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize);
+ BlockRange().InsertAfter(op1, tmp1);
+ LowerNode(tmp1);
+ }
+
+ op1 = tmp1;
+ }
+
+ NamedIntrinsic resIntrinsic = NI_Illegal;
+
+ idx = comp->gtNewIconNode(imm8);
+ BlockRange().InsertBefore(node, idx);
+
+ switch (simdBaseType)
+ {
+ case TYP_LONG:
+ case TYP_ULONG:
+ {
+ op2 = idx;
+ resIntrinsic = NI_SSE41_X64_Insert;
+ break;
+ }
+
+ case TYP_FLOAT:
+ {
+ // We will be constructing the following parts:
+ // ...
+ // /--* op3 float
+ // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe
+
+ // This is roughly the following managed code:
+ // ...
+ // tmp1 = Vector128.CreateScalarUnsafe(op3);
+
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op3, NI_Vector128_CreateScalarUnsafe, CORINFO_TYPE_FLOAT,
+ 16);
+ BlockRange().InsertBefore(idx, tmp1);
+ LowerNode(tmp1);
+
+ if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+ {
+ if (imm8 == 0)
+ {
+ // We will be constructing the following parts:
+ // ...
+ // /--* op1 simd16
+ // +--* op2 simd16
+ // node = * HWINTRINSIC simd16 T MoveScalar
+
+ // This is roughly the following managed code:
+ // ...
+ // node = Sse.MoveScalar(op1, op2);
+
+ op2 = tmp1;
+ resIntrinsic = NI_SSE_MoveScalar;
+ }
+ else
+ {
+ // We will be constructing the following parts:
+ // ...
+ // /--* op1 simd16
+ // * STORE_LCL_VAR simd16
+ // op2 = LCL_VAR simd16
+ // tmp2 = LCL_VAR simd16
+ // idx = CNS_INT int 0
+ // /--* tmp1 simd16
+ // +--* tmp2 simd16
+ // +--* idx int
+ // op1 = * HWINTRINSIC simd16 T Shuffle
+ // idx = CNS_INT int 226
+ // /--* op1 simd16
+ // +--* tmp2 simd16
+ // +--* idx int
+ // op1 = * HWINTRINSIC simd16 T Shuffle
+
+ // This is roughly the following managed code:
+ // ...
+ // tmp2 = Sse.Shuffle(tmp1, op1, 0 or 48 or 32);
+ // node = Sse.Shuffle(tmp2, op1, 226 or 132 or 36);
+
+ node->gtOp1 = op1;
+ LIR::Use op1Use(BlockRange(), &node->gtOp1, node);
+ ReplaceWithLclVar(op1Use);
+ op2 = node->gtOp1;
+
+ tmp2 = comp->gtClone(op2);
+ BlockRange().InsertAfter(tmp1, tmp2);
+
+ ssize_t controlBits1;
+ ssize_t controlBits2;
+
+ switch (imm8)
+ {
+ case 1:
+ {
+ controlBits1 = 0;
+ controlBits2 = 226;
+ break;
+ }
+
+ case 2:
+ {
+ controlBits1 = 48;
+ controlBits2 = 132;
+ break;
+ }
+
+ case 3:
+ {
+ controlBits1 = 32;
+ controlBits2 = 36;
+ break;
+ }
+
+ default:
+ unreached();
+ }
+
+ idx = comp->gtNewIconNode(controlBits1);
+ BlockRange().InsertAfter(tmp2, idx);
+
+ if (imm8 == 1)
+ {
+ std::swap(tmp1, tmp2);
+ }
+
+ op1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, tmp1, idx, NI_SSE_Shuffle,
+ CORINFO_TYPE_FLOAT, 16);
+ BlockRange().InsertAfter(idx, op1);
+ LowerNode(op1);
+
+ idx = comp->gtNewIconNode(controlBits2);
+ BlockRange().InsertAfter(op1, idx);
+
+ op1 = comp->gtNewArgList(op1, op2, idx);
+ op2 = nullptr;
+ resIntrinsic = NI_SSE_Shuffle;
+ }
+ break;
+ }
+ else
+ {
+ op3 = tmp1;
+ idx->AsIntCon()->SetIconValue(imm8 * 16);
+ FALLTHROUGH;
+ }
+ }
+
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ {
+ op1 = comp->gtNewArgList(op1, op3, idx);
+ op2 = nullptr;
+ resIntrinsic = NI_SSE41_Insert;
+ break;
+ }
+
+ case TYP_SHORT:
+ case TYP_USHORT:
+ {
+ op1 = comp->gtNewArgList(op1, op3, idx);
+ op2 = nullptr;
+ resIntrinsic = NI_SSE2_Insert;
+ break;
+ }
+
+ case TYP_DOUBLE:
+ {
+ // We will be constructing the following parts:
+ // ...
+ // /--* op3 double
+ // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe
+
+ // This is roughly the following managed code:
+ // ...
+ // tmp1 = Vector128.CreateScalarUnsafe(op3);
+
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op3, NI_Vector128_CreateScalarUnsafe, CORINFO_TYPE_DOUBLE,
+ 16);
+ BlockRange().InsertBefore(idx, tmp1);
+ LowerNode(tmp1);
+
+ op2 = tmp1;
+ resIntrinsic = (imm8 == 0) ? NI_SSE2_MoveScalar : NI_SSE2_UnpackLow;
+ break;
+ }
+
+ default:
+ unreached();
+ }
+
+ assert(resIntrinsic != NI_Illegal);
+
+ if (tmpv != nullptr)
+ {
+ tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, resIntrinsic, simdBaseJitType, 16);
+ BlockRange().InsertBefore(node, tmp1);
+ LowerNode(tmp1);
+
+ idx = comp->gtNewIconNode((cachedImm8 >= count / 2) ? 1 : 0);
+ BlockRange().InsertAfter(tmp1, idx);
+
+ op1 = comp->gtNewArgList(tmpv, tmp1, idx);
+ op2 = nullptr;
+ resIntrinsic = NI_AVX_InsertVector128;
+ }
+
+ node->gtHWIntrinsicId = resIntrinsic;
+ node->gtOp1 = op1;
+ node->gtOp2 = op2;
+
+ assert(node->gtHWIntrinsicId != intrinsicId);
+ LowerNode(node);
+}
+
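
A scalar model of the Vector256 WithElement decomposition above, written over an 8-lane int vector (illustrative plain C++, not the JIT's node graph): pick the 128-bit half holding the lane, rewrite it with the adjusted index, then reinsert it into the clone of the original vector.

#include <array>
#include <cassert>
#include <cstddef>

std::array<int, 8> WithElement256(std::array<int, 8> vec, std::ptrdiff_t imm8, int value)
{
    const std::ptrdiff_t count = 8;
    std::ptrdiff_t half = (imm8 >= count / 2) ? 1 : 0; // ExtractVector128(vec, 1) vs GetLower()

    std::array<int, 4> lane128;
    for (int i = 0; i < 4; i++)
    {
        lane128[i] = vec[half * 4 + i]; // extract the selected half
    }

    lane128[imm8 - half * (count / 2)] = value; // 128-bit insert with the adjusted index

    for (int i = 0; i < 4; i++)
    {
        vec[half * 4 + i] = lane128[i]; // InsertVector128(vec, lane128, half)
    }
    return vec;
}

int main()
{
    std::array<int, 8> v = {0, 1, 2, 3, 4, 5, 6, 7};
    assert(WithElement256(v, 6, 42)[6] == 42); // upper half, adjusted index 2
    return 0;
}
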
+//----------------------------------------------------------------------------------------------
// Lowering::LowerHWIntrinsicDot: Lowers a Vector128 or Vector256 Dot call
//
// Arguments:
@@ -4668,12 +5312,9 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
{
switch (simdNode->gtSIMDIntrinsicID)
{
- GenTree* op1;
- GenTree* op2;
-
case SIMDIntrinsicInit:
{
- op1 = simdNode->AsOp()->gtOp1;
+ GenTree* op1 = simdNode->AsOp()->gtOp1;
#ifndef TARGET_64BIT
if (op1->OperGet() == GT_LONG)
{
@@ -4712,34 +5353,6 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode)
CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2());
break;
- case SIMDIntrinsicGetItem:
- {
- // This implements get_Item method. The sources are:
- // - the source SIMD struct
- // - index (which element to get)
- // The result is simdBaseType of SIMD struct.
- op1 = simdNode->AsOp()->gtOp1;
- op2 = simdNode->AsOp()->gtOp2;
-
- if (op1->OperGet() == GT_IND)
- {
- assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0);
- op1->AsIndir()->Addr()->ClearContained();
- }
- // If the index is a constant, mark it as contained.
- CheckImmedAndMakeContained(simdNode, op2);
-
- if (IsContainableMemoryOp(op1))
- {
- MakeSrcContained(simdNode, op1);
- if (op1->OperGet() == GT_IND)
- {
- op1->AsIndir()->Addr()->ClearContained();
- }
- }
- }
- break;
-
case SIMDIntrinsicShuffleSSE2:
// Second operand is an integer constant and marked as contained.
assert(simdNode->AsOp()->gtOp2->IsCnsIntOrI());
@@ -5212,9 +5825,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
if ((node->GetSimdSize() == 8) || (node->GetSimdSize() == 12))
{
- // TODO-XArch-CQ: Ideally we would key this off of the size containingNode
- // expects vs the size node actually is or would be if spilled to the stack
- return;
+ // We still want to handle GetElement for Vector2/3
+ if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector256_GetElement))
+ {
+ // TODO-XArch-CQ: Ideally we would key this off of the size containingNode
+ // expects vs the size the node actually is or would be if spilled to the stack
+ return;
+ }
}
// TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained
@@ -5390,8 +6007,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
switch (intrinsicId)
{
case NI_SSE2_Extract:
- case NI_SSE41_Extract:
- case NI_SSE41_X64_Extract:
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
{
@@ -5444,6 +6059,15 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
+ case NI_SSE41_Extract:
+ case NI_SSE41_X64_Extract:
+ {
+ assert(!varTypeIsFloating(simdBaseType));
+ // TODO-XARCH-CQ: These intrinsics are "ins reg/mem, xmm, imm8" and don't
+ // currently support containment.
+ break;
+ }
+
case NI_AVX_Permute:
{
// These intrinsics can have op2 be imm or reg/mem
@@ -5515,6 +6139,49 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
+ case HW_Category_Helper:
+ {
+ // We don't currently have any IMM intrinsics which are also commutative
+ assert(!isCommutative);
+
+ switch (intrinsicId)
+ {
+ case NI_Vector128_GetElement:
+ case NI_Vector256_GetElement:
+ {
+ if (op1->OperIs(GT_IND))
+ {
+ assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0);
+ op1->AsIndir()->Addr()->ClearContained();
+ }
+
+ if (op2->OperIsConst())
+ {
+ MakeSrcContained(node, op2);
+ }
+
+ if (IsContainableMemoryOp(op1))
+ {
+ MakeSrcContained(node, op1);
+
+ if (op1->OperIs(GT_IND))
+ {
+ op1->AsIndir()->Addr()->ClearContained();
+ }
+ }
+ break;
+ }
+
+ default:
+ {
+ assert(!"Unhandled containment for helper binary hardware intrinsic");
+ break;
+ }
+ }
+
+ break;
+ }
+
default:
{
unreached();
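
pextrb and pextrw zero-extend the selected lane into the destination register, which is why LowerHWIntrinsicGetElement above appends a sign-extending cast for TYP_BYTE and TYP_SHORT. A scalar model of what that cast recovers:

#include <cassert>
#include <cstdint>

// What the inserted GT_CAST(TYP_INT <- TYP_BYTE) accomplishes after a
// zero-extending extract.
int32_t SignExtendExtractedByte(uint32_t zeroExtended)
{
    return static_cast<int8_t>(static_cast<uint8_t>(zeroExtended));
}

int main()
{
    // Lane value 0xFF is -1 as a signed byte; without the cast the
    // zero-extended 0x000000FF (255) would be returned instead.
    assert(SignExtendExtractedByte(0xFFu) == -1);
    return 0;
}
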
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index d7b51ff42bb..223dc906bad 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -809,44 +809,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
// No special handling required.
break;
- case SIMDIntrinsicGetItem:
- {
- op1 = simdTree->gtGetOp1();
- op2 = simdTree->gtGetOp2();
-
- // We have an object and an index, either of which may be contained.
- bool setOp2DelayFree = false;
- if (!op2->IsCnsIntOrI() && (!op1->isContained() || op1->OperIsLocal()))
- {
- // If the index is not a constant and the object is not contained or is a local
- // we will need a general purpose register to calculate the address
- // internal register must not clobber input index
- // TODO-Cleanup: An internal register will never clobber a source; this code actually
- // ensures that the index (op2) doesn't interfere with the target.
- buildInternalIntRegisterDefForNode(simdTree);
- setOp2DelayFree = true;
- }
- srcCount += BuildOperandUses(op1);
- if (!op2->isContained())
- {
- RefPosition* op2Use = BuildUse(op2);
- if (setOp2DelayFree)
- {
- setDelayFree(op2Use);
- }
- srcCount++;
- }
-
- if (!op2->IsCnsIntOrI() && (!op1->isContained()))
- {
- // If vector is not already in memory (contained) and the index is not a constant,
- // we will use the SIMD temp location to store the vector.
- compiler->getSIMDInitTempVarNum();
- }
- buildUses = false;
- }
- break;
-
case SIMDIntrinsicSub:
case SIMDIntrinsicBitwiseAnd:
case SIMDIntrinsicBitwiseOr:
@@ -854,10 +816,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
// No special handling required.
break;
- case SIMDIntrinsicSetX:
- case SIMDIntrinsicSetY:
- case SIMDIntrinsicSetZ:
- case SIMDIntrinsicSetW:
case SIMDIntrinsicNarrow:
{
// Op1 will write to dst before Op2 is free
@@ -904,10 +862,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
case SIMDIntrinsicCopyToArray:
case SIMDIntrinsicCopyToArrayX:
case SIMDIntrinsicNone:
- case SIMDIntrinsicGetX:
- case SIMDIntrinsicGetY:
- case SIMDIntrinsicGetZ:
- case SIMDIntrinsicGetW:
case SIMDIntrinsicHWAccel:
case SIMDIntrinsicWiden:
case SIMDIntrinsicInvalid:
@@ -1202,6 +1156,29 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
}
}
+ if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement))
+ {
+ assert(!op2DelayFree);
+
+ if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal()))
+ {
+ // If the index is not a constant and the object is either not contained or is a local,
+ // we will need a general purpose register to calculate the address. The internal
+ // register must not clobber the input index.
+ // TODO-Cleanup: An internal register will never clobber a source; this code actually
+ // ensures that the index (op2) doesn't interfere with the target.
+ buildInternalIntRegisterDefForNode(intrinsicTree);
+ op2DelayFree = true;
+ }
+
+ if (!intrin.op2->IsCnsIntOrI() && !intrin.op1->isContained())
+ {
+ // If the index is not a constant and op1 is in a register,
+ // we will use the SIMD temp location to store the vector.
+ compiler->getSIMDInitTempVarNum();
+ }
+ }
+
srcCount += op2DelayFree ? BuildDelayFreeUses(intrin.op2) : BuildOperandUses(intrin.op2);
if (intrin.op3 != nullptr)
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index ad750fad953..5c76005fc86 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1956,96 +1956,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
case SIMDIntrinsicEqual:
break;
- case SIMDIntrinsicGetItem:
- {
- // This implements get_Item method. The sources are:
- // - the source SIMD struct
- // - index (which element to get)
- // The result is baseType of SIMD struct.
- // op1 may be a contained memory op, but if so we will consume its address.
- // op2 may be a contained constant.
- op1 = simdTree->gtGetOp1();
- op2 = simdTree->gtGetOp2();
-
- if (!op1->isContained())
- {
- // If the index is not a constant, we will use the SIMD temp location to store the vector.
- // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we
- // can use that in the process of extracting the element.
- //
- // If the index is a constant and base type is a small int we can use pextrw, but on AVX
- // we will need a temp if are indexing into the upper half of the AVX register.
- // In all other cases with constant index, we need a temp xmm register to extract the
- // element if index is other than zero.
-
- if (!op2->IsCnsIntOrI())
- {
- (void)compiler->getSIMDInitTempVarNum();
- }
- else if (!varTypeIsFloating(simdTree->GetSimdBaseType()))
- {
- bool needFloatTemp;
- if (varTypeIsSmallInt(simdTree->GetSimdBaseType()) &&
- (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported))
- {
- int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->GetSimdBaseType());
- needFloatTemp = (byteShiftCnt >= 16);
- }
- else
- {
- needFloatTemp = !op2->IsIntegralConst(0);
- }
-
- if (needFloatTemp)
- {
- buildInternalFloatRegisterDefForNode(simdTree);
- }
- }
-#ifdef TARGET_X86
- // This logic is duplicated from genSIMDIntrinsicGetItem().
- // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
- // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
- // cases will require this, so the non-byteable registers can be excluded.
-
- var_types baseType = simdTree->GetSimdBaseType();
- if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
- {
- bool ZeroOrSignExtnReqd = true;
- unsigned baseSize = genTypeSize(baseType);
- if (baseSize == 1)
- {
- if ((op2->AsIntCon()->gtIconVal % 2) == 1)
- {
- ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
- }
- }
- else
- {
- assert(baseSize == 2);
- ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
- }
- if (ZeroOrSignExtnReqd)
- {
- dstCandidates = allByteRegs();
- }
- }
-#endif // TARGET_X86
- }
- }
- break;
-
- case SIMDIntrinsicSetX:
- case SIMDIntrinsicSetY:
- case SIMDIntrinsicSetZ:
- case SIMDIntrinsicSetW:
- // We need an internal integer register for SSE2 codegen
- if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
- {
- buildInternalIntRegisterDefForNode(simdTree);
- }
-
- break;
-
case SIMDIntrinsicCast:
break;
@@ -2122,13 +2032,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
assert(simdTree->gtGetOp2()->isContainedIntOrIImmed());
break;
- case SIMDIntrinsicGetX:
- case SIMDIntrinsicGetY:
- case SIMDIntrinsicGetZ:
- case SIMDIntrinsicGetW:
- assert(!"Get intrinsics should not be seen during Lowering.");
- unreached();
-
default:
noway_assert(!"Unimplemented SIMD node type.");
unreached();
@@ -2281,6 +2184,20 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
break;
}
+ case NI_Vector128_GetElement:
+ case NI_Vector256_GetElement:
+ {
+ assert(numArgs == 2);
+
+ if (!op2->OperIsConst() && !op1->isContained())
+ {
+ // If the index is not a constant and op1 is in a register,
+ // we will use the SIMD temp location to store the vector.
+ compiler->getSIMDInitTempVarNum();
+ }
+ break;
+ }
+
case NI_Vector128_ToVector256:
case NI_Vector128_ToVector256Unsafe:
case NI_Vector256_GetLower:
@@ -2342,12 +2259,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
case NI_SSE41_Extract:
{
- if (baseType == TYP_FLOAT)
- {
- buildInternalIntRegisterDefForNode(intrinsicTree);
- }
+ assert(!varTypeIsFloating(baseType));
+
#ifdef TARGET_X86
- else if (varTypeIsByte(baseType))
+ if (varTypeIsByte(baseType))
{
dstCandidates = allByteRegs();
}
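
When the index is not a constant and the vector lives in a register, both allocators above reserve the SIMD init temp so the vector can be spilled and indexed. A sketch of the resulting store-then-indexed-load pattern using SSE2 intrinsics (a model of the idea, not the emitted code):

#include <emmintrin.h> // SSE2
#include <cstdint>

int32_t GetElementVarIndex(__m128i v, uint32_t index)
{
    alignas(16) int32_t tmp[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(tmp), v); // spill to the temp slot
    return tmp[index & 3];                               // masked, in-range indexed load
}

int main()
{
    __m128i v = _mm_set_epi32(3, 2, 1, 0); // lanes low to high: 0, 1, 2, 3
    return (GetElementVarIndex(v, 2) == 2) ? 0 : 1;
}
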
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 8dbbbebf72a..2aac4cb1360 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -6014,7 +6014,7 @@ GenTree* Compiler::fgMorphField(GenTree* tree, MorphAddrContext* mac)
// if this field belongs to simd struct, translate it to simd intrinsic.
if (mac == nullptr)
{
- GenTree* newTree = fgMorphFieldToSIMDIntrinsicGet(tree);
+ GenTree* newTree = fgMorphFieldToSimdGetElement(tree);
if (newTree != tree)
{
newTree = fgMorphSmpOp(newTree);
@@ -12027,29 +12027,33 @@ GenTree* Compiler::getSIMDStructFromField(GenTree* tree,
}
/*****************************************************************************
-* If a read operation tries to access simd struct field, then transform the
-* operation to the SIMD intrinsic SIMDIntrinsicGetItem, and return the new tree.
-* Otherwise, return the old tree.
+* If a read operation tries to access a simd struct field, then transform the operation
+* into a SimdGetElementNode, and return the new tree. Otherwise, return the old tree.
* Argument:
* tree - GenTree*. If this pointer points to simd struct which is used for simd
-* intrinsic, we will morph it as simd intrinsic SIMDIntrinsicGetItem.
+* intrinsic, we will morph it as simd intrinsic NI_Vector128_GetElement.
* Return:
* A GenTree* which points to the new tree. If the tree is not for simd intrinsic,
* return nullptr.
*/
-GenTree* Compiler::fgMorphFieldToSIMDIntrinsicGet(GenTree* tree)
+GenTree* Compiler::fgMorphFieldToSimdGetElement(GenTree* tree)
{
unsigned index = 0;
CorInfoType simdBaseJitType = CORINFO_TYPE_UNDEF;
unsigned simdSize = 0;
GenTree* simdStructNode = getSIMDStructFromField(tree, &simdBaseJitType, &index, &simdSize);
+
if (simdStructNode != nullptr)
{
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
+ GenTree* op2 = gtNewIconNode(index, TYP_INT);
+
+ assert(simdSize <= 16);
assert(simdSize >= ((index + 1) * genTypeSize(simdBaseType)));
- GenTree* op2 = gtNewIconNode(index);
- tree = gtNewSIMDNode(simdBaseType, simdStructNode, op2, SIMDIntrinsicGetItem, simdBaseJitType, simdSize);
+
+ tree = gtNewSimdGetElementNode(simdBaseType, simdStructNode, op2, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
#ifdef DEBUG
tree->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED;
#endif
@@ -12058,9 +12062,8 @@ GenTree* Compiler::fgMorphFieldToSIMDIntrinsicGet(GenTree* tree)
}
/*****************************************************************************
-* Transform an assignment of a SIMD struct field to SIMD intrinsic
-* SIMDIntrinsicSet*, and return a new tree. If it is not such an assignment,
-* then return the old tree.
+* Transform an assignment of a SIMD struct field into a SimdWithElementNode, and
+* return a new tree. If it is not such an assignment, then return the old tree.
* Argument:
* tree - GenTree*. If this pointer points to simd struct which is used for simd
* intrinsic, we will morph it as simd intrinsic set.
@@ -12069,46 +12072,32 @@ GenTree* Compiler::fgMorphFieldToSIMDIntrinsicGet(GenTree* tree)
* return nullptr.
*/
-GenTree* Compiler::fgMorphFieldAssignToSIMDIntrinsicSet(GenTree* tree)
+GenTree* Compiler::fgMorphFieldAssignToSimdSetElement(GenTree* tree)
{
assert(tree->OperGet() == GT_ASG);
- GenTree* op1 = tree->gtGetOp1();
- GenTree* op2 = tree->gtGetOp2();
unsigned index = 0;
CorInfoType simdBaseJitType = CORINFO_TYPE_UNDEF;
unsigned simdSize = 0;
- GenTree* simdOp1Struct = getSIMDStructFromField(op1, &simdBaseJitType, &index, &simdSize);
- if (simdOp1Struct != nullptr)
+ GenTree* simdStructNode = getSIMDStructFromField(tree->gtGetOp1(), &simdBaseJitType, &index, &simdSize);
+
+ if (simdStructNode != nullptr)
{
+ var_types simdType = simdStructNode->gtType;
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
- // Generate the simd set intrinsic
+ assert(simdSize <= 16);
assert(simdSize >= ((index + 1) * genTypeSize(simdBaseType)));
- SIMDIntrinsicID simdIntrinsicID = SIMDIntrinsicInvalid;
- switch (index)
- {
- case 0:
- simdIntrinsicID = SIMDIntrinsicSetX;
- break;
- case 1:
- simdIntrinsicID = SIMDIntrinsicSetY;
- break;
- case 2:
- simdIntrinsicID = SIMDIntrinsicSetZ;
- break;
- case 3:
- simdIntrinsicID = SIMDIntrinsicSetW;
- break;
- default:
- noway_assert(!"There is no set intrinsic for index bigger than 3");
- }
+ GenTree* op2 = gtNewIconNode(index, TYP_INT);
+ GenTree* op3 = tree->gtGetOp2();
+ NamedIntrinsic intrinsicId = NI_Vector128_WithElement;
- GenTree* target = gtClone(simdOp1Struct);
+ GenTree* target = gtClone(simdStructNode);
assert(target != nullptr);
- var_types simdType = target->gtType;
- GenTree* simdTree = gtNewSIMDNode(simdType, simdOp1Struct, op2, simdIntrinsicID, simdBaseJitType, simdSize);
+
+ GenTree* simdTree = gtNewSimdWithElementNode(simdType, simdStructNode, op2, op3, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
tree->AsOp()->gtOp1 = target;
tree->AsOp()->gtOp2 = simdTree;
@@ -12258,7 +12247,7 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, MorphAddrContext* mac)
// We should check whether op2 should be assigned to a SIMD field or not.
// If it is, we should translate the tree to a simd intrinsic.
assert(!fgGlobalMorph || ((tree->gtDebugFlags & GTF_DEBUG_NODE_MORPHED) == 0));
- GenTree* newTree = fgMorphFieldAssignToSIMDIntrinsicSet(tree);
+ GenTree* newTree = fgMorphFieldAssignToSimdSetElement(tree);
typ = tree->TypeGet();
op1 = tree->gtGetOp1();
op2 = tree->gtGetOp2();
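
fgMorphFieldToSimdGetElement derives the lane index from the field's byte offset divided by the element size (getSIMDStructFromField returns that index). A small illustrative helper (hypothetical, not the JIT's implementation):

#include <cassert>

unsigned FieldOffsetToLane(unsigned fieldOffset, unsigned elemSize)
{
    return fieldOffset / elemSize;
}

int main()
{
    // Vector3 has float elements: reading v.Y (byte offset 4) becomes
    // GetElement(v, 1); assigning v.Z (byte offset 8) becomes
    // WithElement(v, 2, value).
    assert(FieldOffsetToLane(4, sizeof(float)) == 1);
    assert(FieldOffsetToLane(8, sizeof(float)) == 2);
    return 0;
}
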
diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp
index ecded3a159a..0cb2947cff0 100644
--- a/src/coreclr/jit/simd.cpp
+++ b/src/coreclr/jit/simd.cpp
@@ -1167,7 +1167,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in
switch (intrinsicId)
{
case SIMDIntrinsicInit:
- case SIMDIntrinsicGetItem:
case SIMDIntrinsicSub:
case SIMDIntrinsicEqual:
case SIMDIntrinsicBitwiseAnd:
@@ -1212,7 +1211,8 @@ GenTree* Compiler::impSIMDPopStack(var_types type, bool expectAddr, CORINFO_CLAS
// SIMD type struct that it points to.
if (expectAddr)
{
- assert(tree->TypeGet() == TYP_BYREF);
+ assert(tree->TypeIs(TYP_BYREF, TYP_I_IMPL));
+
if (tree->OperGet() == GT_ADDR)
{
tree = tree->gtGetOp1();
@@ -1285,29 +1285,6 @@ GenTree* Compiler::impSIMDPopStack(var_types type, bool expectAddr, CORINFO_CLAS
return tree;
}
-// impSIMDGetFixed: Create a GT_SIMD tree for a Get property of SIMD vector with a fixed index.
-//
-// Arguments:
-// simdBaseJitType - The base (element) JIT type of the SIMD vector.
-// simdSize - The total size in bytes of the SIMD vector.
-// index - The index of the field to get.
-//
-// Return Value:
-// Returns a GT_SIMD node with the SIMDIntrinsicGetItem intrinsic id.
-//
-GenTreeSIMD* Compiler::impSIMDGetFixed(var_types simdType, CorInfoType simdBaseJitType, unsigned simdSize, int index)
-{
- var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
- assert(simdSize >= ((index + 1) * genTypeSize(simdBaseType)));
-
- // op1 is a SIMD source.
- GenTree* op1 = impSIMDPopStack(simdType, true);
-
- GenTree* op2 = gtNewIconNode(index);
- GenTreeSIMD* simdTree = gtNewSIMDNode(simdBaseType, op1, op2, SIMDIntrinsicGetItem, simdBaseJitType, simdSize);
- return simdTree;
-}
-
#ifdef TARGET_XARCH
// impSIMDLongRelOpEqual: transforms operands and returns the SIMD intrinsic to be applied on
// transformed operands to obtain == comparison result.
@@ -2306,11 +2283,13 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode,
simdTree = op2;
if (op3 != nullptr)
{
- simdTree = gtNewSIMDNode(simdType, simdTree, op3, SIMDIntrinsicSetZ, simdBaseJitType, size);
+ simdTree = gtNewSimdWithElementNode(simdType, simdTree, gtNewIconNode(2, TYP_INT), op3, simdBaseJitType,
+ size, /* isSimdAsHWIntrinsic */ true);
}
if (op4 != nullptr)
{
- simdTree = gtNewSIMDNode(simdType, simdTree, op4, SIMDIntrinsicSetW, simdBaseJitType, size);
+ simdTree = gtNewSimdWithElementNode(simdType, simdTree, gtNewIconNode(3, TYP_INT), op4, simdBaseJitType,
+ size, /* isSimdAsHWIntrinsic */ true);
}
copyBlkDst = op1;
@@ -2343,94 +2322,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode,
}
break;
- case SIMDIntrinsicGetItem:
- {
- // op1 is a SIMD variable that is "this" arg
- // op2 is an index of TYP_INT
- op2 = impSIMDPopStack(TYP_INT);
- op1 = impSIMDPopStack(simdType, instMethod);
- int vectorLength = getSIMDVectorLength(size, simdBaseType);
- if (!op2->IsCnsIntOrI() || op2->AsIntCon()->gtIconVal >= vectorLength || op2->AsIntCon()->gtIconVal < 0)
- {
- // We need to bounds-check the length of the vector.
- // For that purpose, we need to clone the index expression.
- GenTree* index = op2;
- if ((index->gtFlags & GTF_SIDE_EFFECT) != 0)
- {
- op2 = fgInsertCommaFormTemp(&index);
- }
- else
- {
- op2 = gtCloneExpr(index);
- }
-
- // For the non-constant case, we don't want to CSE the SIMD value, as we will just need to store
- // it to the stack to do the indexing anyway.
- op1->gtFlags |= GTF_DONT_CSE;
-
- GenTree* lengthNode = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, vectorLength);
- GenTreeBoundsChk* simdChk =
- new (this, GT_SIMD_CHK) GenTreeBoundsChk(GT_SIMD_CHK, TYP_VOID, index, lengthNode, SCK_RNGCHK_FAIL);
-
- // Create a GT_COMMA tree for the bounds check.
- op2 = gtNewOperNode(GT_COMMA, op2->TypeGet(), simdChk, op2);
- }
-
- assert(op1->TypeGet() == simdType);
- assert(op2->TypeGet() == TYP_INT);
-
- simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, simdIntrinsicID, simdBaseJitType, size);
- retVal = simdTree;
- }
- break;
-
- case SIMDIntrinsicGetW:
- retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 3);
- break;
-
- case SIMDIntrinsicGetZ:
- retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 2);
- break;
-
- case SIMDIntrinsicGetY:
- retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 1);
- break;
-
- case SIMDIntrinsicGetX:
- retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 0);
- break;
-
- case SIMDIntrinsicSetW:
- case SIMDIntrinsicSetZ:
- case SIMDIntrinsicSetY:
- case SIMDIntrinsicSetX:
- {
- // op2 is the value to be set at indexTemp position
- // op1 is SIMD vector that is going to be modified, which is a byref
-
- // If op1 has a side-effect, then don't make it an intrinsic.
- // It would be in-efficient to read the entire vector into xmm reg,
- // modify it and write back entire xmm reg.
- //
- // TODO-CQ: revisit this later.
- op1 = impStackTop(1).val;
- if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0)
- {
- return nullptr;
- }
-
- op2 = impSIMDPopStack(simdBaseType);
- op1 = impSIMDPopStack(simdType, instMethod);
-
- GenTree* src = gtCloneExpr(op1);
- assert(src != nullptr);
- simdTree = gtNewSIMDNode(simdType, src, op2, simdIntrinsicID, simdBaseJitType, size);
-
- copyBlkDst = gtNewOperNode(GT_ADDR, TYP_BYREF, op1);
- doCopyBlk = true;
- }
- break;
-
// Unary operators that take and return a Vector.
case SIMDIntrinsicCast:
case SIMDIntrinsicConvertToSingle:
diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp
index c093fb05f75..7369172c5c9 100644
--- a/src/coreclr/jit/simdashwintrinsic.cpp
+++ b/src/coreclr/jit/simdashwintrinsic.cpp
@@ -230,6 +230,12 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic,
isInstanceMethod = true;
argClass = clsHnd;
+
+ if (SimdAsHWIntrinsicInfo::BaseTypeFromThisArg(intrinsic))
+ {
+ assert(simdBaseJitType == CORINFO_TYPE_UNDEF);
+ simdBaseJitType = getBaseJitTypeAndSizeOfSIMDType(clsHnd, &simdSize);
+ }
}
else if ((clsHnd == m_simdHandleCache->SIMDVectorHandle) && (numArgs != 0))
{
@@ -436,6 +442,39 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic,
}
#if defined(TARGET_XARCH)
+ case NI_VectorT256_get_Item:
+ case NI_VectorT128_get_Item:
+ {
+ switch (simdBaseType)
+ {
+ // Using software fallback if simdBaseType is not supported by hardware
+ case TYP_BYTE:
+ case TYP_UBYTE:
+ case TYP_INT:
+ case TYP_UINT:
+ case TYP_LONG:
+ case TYP_ULONG:
+ if (!compExactlyDependsOn(InstructionSet_SSE41))
+ {
+ return nullptr;
+ }
+ break;
+
+ case TYP_DOUBLE:
+ case TYP_FLOAT:
+ case TYP_SHORT:
+ case TYP_USHORT:
+ // short/ushort/float/double are supported by SSE2
+ break;
+
+ default:
+ unreached();
+ }
+ break;
+ }
+#endif // TARGET_XARCH
+
+#if defined(TARGET_XARCH)
case NI_VectorT128_Dot:
{
if (!compOpportunisticallyDependsOn(InstructionSet_SSE41))
@@ -737,6 +776,13 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic,
break;
}
+ case NI_VectorT128_get_Item:
+ case NI_VectorT256_get_Item:
+ {
+ return gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
+ }
+
case NI_Vector2_op_Division:
case NI_Vector3_op_Division:
{
@@ -1058,6 +1104,12 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic,
break;
}
+ case NI_VectorT128_get_Item:
+ {
+ return gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize,
+ /* isSimdAsHWIntrinsic */ true);
+ }
+
case NI_VectorT128_Max:
case NI_VectorT128_Min:
{
diff --git a/src/coreclr/jit/simdashwintrinsic.h b/src/coreclr/jit/simdashwintrinsic.h
index 41dd1c5a53b..176507d0b66 100644
--- a/src/coreclr/jit/simdashwintrinsic.h
+++ b/src/coreclr/jit/simdashwintrinsic.h
@@ -26,6 +26,9 @@ enum class SimdAsHWIntrinsicFlag : unsigned int
// Indicates the operands should be swapped in importation.
NeedsOperandsSwapped = 0x04,
+
+ // Base type should come from the this argument
+ BaseTypeFromThisArg = 0x08,
};
inline SimdAsHWIntrinsicFlag operator~(SimdAsHWIntrinsicFlag value)
@@ -124,6 +127,12 @@ struct SimdAsHWIntrinsicInfo
SimdAsHWIntrinsicFlag flags = lookupFlags(id);
return (flags & SimdAsHWIntrinsicFlag::NeedsOperandsSwapped) == SimdAsHWIntrinsicFlag::NeedsOperandsSwapped;
}
+
+ static bool BaseTypeFromThisArg(NamedIntrinsic id)
+ {
+ SimdAsHWIntrinsicFlag flags = lookupFlags(id);
+ return (flags & SimdAsHWIntrinsicFlag::BaseTypeFromThisArg) == SimdAsHWIntrinsicFlag::BaseTypeFromThisArg;
+ }
};
#endif // _SIMD_AS_HWINTRINSIC_H_
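
BaseTypeFromThisArg follows the header's existing scoped-enum flag pattern: '&' has to be overloaded explicitly, and the masked value is compared back against the flag itself. A self-contained sketch of the same pattern (hypothetical Flag enum, not the header's type):

#include <cassert>

enum class Flag : unsigned int
{
    None                = 0x00,
    InstanceMethod      = 0x01,
    BaseTypeFromThisArg = 0x08,
};

inline Flag operator&(Flag a, Flag b)
{
    return static_cast<Flag>(static_cast<unsigned int>(a) & static_cast<unsigned int>(b));
}

inline Flag operator|(Flag a, Flag b)
{
    return static_cast<Flag>(static_cast<unsigned int>(a) | static_cast<unsigned int>(b));
}

int main()
{
    Flag flags = Flag::InstanceMethod | Flag::BaseTypeFromThisArg;
    assert((flags & Flag::BaseTypeFromThisArg) == Flag::BaseTypeFromThisArg);
    return 0;
}
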
diff --git a/src/coreclr/jit/simdashwintrinsiclistarm64.h b/src/coreclr/jit/simdashwintrinsiclistarm64.h
index fc75eca9f3f..4eba54135a5 100644
--- a/src/coreclr/jit/simdashwintrinsiclistarm64.h
+++ b/src/coreclr/jit/simdashwintrinsiclistarm64.h
@@ -112,6 +112,7 @@ SIMD_AS_HWINTRINSIC_NM(VectorT128, EqualsInstance, "Equals",
SIMD_AS_HWINTRINSIC_ID(VectorT128, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Floor, NI_AdvSimd_Arm64_Floor}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_AllBitsSet, 0, {NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Item, 2, {NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item}, SimdAsHWIntrinsicFlag::InstanceMethod | SimdAsHWIntrinsicFlag::BaseTypeFromThisArg)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_One, 0, {NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Zero, 0, {NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThan, 2, {NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None)
diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h
index 99e5c29ff8a..af75fb75fae 100644
--- a/src/coreclr/jit/simdashwintrinsiclistxarch.h
+++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h
@@ -112,6 +112,7 @@ SIMD_AS_HWINTRINSIC_NM(VectorT128, EqualsInstance, "Equals",
SIMD_AS_HWINTRINSIC_ID(VectorT128, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE41_Floor, NI_SSE41_Floor}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_AllBitsSet, 0, {NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Item, 2, {NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item}, SimdAsHWIntrinsicFlag::InstanceMethod | SimdAsHWIntrinsicFlag::BaseTypeFromThisArg)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_One, 0, {NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Zero, 0, {NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThan, 2, {NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_VectorT128_GreaterThan, NI_VectorT128_GreaterThan, NI_SSE_CompareGreaterThan, NI_SSE2_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None)
@@ -149,6 +150,7 @@ SIMD_AS_HWINTRINSIC_NM(VectorT256, EqualsInstance, "Equals",
SIMD_AS_HWINTRINSIC_ID(VectorT256, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AVX_Floor, NI_AVX_Floor}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT256, get_AllBitsSet, 0, {NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Count, 0, {NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Item, 2, {NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item}, SimdAsHWIntrinsicFlag::InstanceMethod | SimdAsHWIntrinsicFlag::BaseTypeFromThisArg)
SIMD_AS_HWINTRINSIC_ID(VectorT256, get_One, 0, {NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Zero, 0, {NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero}, SimdAsHWIntrinsicFlag::None)
SIMD_AS_HWINTRINSIC_ID(VectorT256, GreaterThan, 2, {NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX_CompareGreaterThan, NI_AVX_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None)
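Reading these rows: each SIMD_AS_HWINTRINSIC_ID entry supplies one NamedIntrinsic per supported base type, and since get_Item funnels every element type through the same GetElement expansion, all ten columns carry the same marker. The column order below is my reading of the table layout, not stated in this hunk:

    // Assumed per-column base types, left to right:
    //   byte, sbyte, short, ushort, int, uint, long, ulong, float, double
    // get_Item: every column is NI_VectorT128_get_Item (or the VectorT256
    // equivalent), so the per-type dispatch happens later, in
    // impSimdAsHWIntrinsicSpecial.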
diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp
index 4b0b6cfd0b9..4523fe48a89 100644
--- a/src/coreclr/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/jit/simdcodegenxarch.cpp
@@ -1563,389 +1563,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
genProduceReg(simdNode);
}
-//------------------------------------------------------------------------------------
-// genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
-//
-// Arguments:
-// simdNode - The GT_SIMD node
-//
-// Return Value:
-// None.
-//
-void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
-{
- assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
-
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
- var_types simdType = op1->TypeGet();
- assert(varTypeIsSIMD(simdType));
-
- // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
-    // since it is in an XMM register.
- if (simdType == TYP_SIMD12)
- {
- simdType = TYP_SIMD16;
- }
-
- var_types baseType = simdNode->GetSimdBaseType();
- regNumber targetReg = simdNode->GetRegNum();
- assert(targetReg != REG_NA);
- var_types targetType = simdNode->TypeGet();
- assert(targetType == genActualType(baseType));
-
- // GetItem has 2 operands:
- // - the source of SIMD type (op1)
- // - the index of the value to be returned.
- genConsumeOperands(simdNode);
- regNumber srcReg = op1->GetRegNum();
-
-    // Optimize the case where op1 is in memory and we are accessing the i'th element.
- if (!op1->isUsedFromReg())
- {
- assert(op1->isContained());
-
- regNumber baseReg;
- regNumber indexReg;
- int offset = 0;
-
- if (op1->OperIsLocal())
- {
- // There are three parts to the total offset here:
- // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}.
- bool isEBPbased;
- unsigned varNum = op1->AsLclVarCommon()->GetLclNum();
- offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
-
-#if !FEATURE_FIXED_OUT_ARGS
- if (!isEBPbased)
- {
- // Adjust the offset by the amount currently pushed on the CPU stack
- offset += genStackLevel;
- }
-#else
- assert(genStackLevel == 0);
-#endif // !FEATURE_FIXED_OUT_ARGS
-
- if (op1->OperGet() == GT_LCL_FLD)
- {
- offset += op1->AsLclFld()->GetLclOffs();
- }
- baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
- }
- else
- {
-        // Require that the GT_IND addr is not contained.
- assert(op1->OperGet() == GT_IND);
-
- GenTree* addr = op1->AsIndir()->Addr();
- assert(!addr->isContained());
- baseReg = addr->GetRegNum();
- }
-
- if (op2->isContainedIntOrIImmed())
- {
- indexReg = REG_NA;
- offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType);
- }
- else
- {
- indexReg = op2->GetRegNum();
- assert(genIsValidIntReg(indexReg));
- }
-
- // Now, load the desired element.
- GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
- emitTypeSize(baseType), // Of the vector baseType
- targetReg, // To targetReg
- baseReg, // Base Reg
- indexReg, // Indexed
- genTypeSize(baseType), // by the size of the baseType
- offset);
- genProduceReg(simdNode);
- return;
- }
-
- // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
- // For the non-constant case, we will use the SIMD temp location to store the vector, and
-    // then load the desired element.
- // The range check will already have been performed, so at this point we know we have an index
- // within the bounds of the vector.
- if (!op2->IsCnsIntOrI())
- {
- unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
- noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
- bool isEBPbased;
- unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
-
-#if !FEATURE_FIXED_OUT_ARGS
- if (!isEBPbased)
- {
- // Adjust the offset by the amount currently pushed on the CPU stack
- offs += genStackLevel;
- }
-#else
- assert(genStackLevel == 0);
-#endif // !FEATURE_FIXED_OUT_ARGS
-
- regNumber indexReg = op2->GetRegNum();
-
- // Store the vector to the temp location.
- GetEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
- emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
-
- // Now, load the desired element.
- GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
- emitTypeSize(baseType), // Of the vector baseType
- targetReg, // To targetReg
- (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
- indexReg, // Indexed
- genTypeSize(baseType), // by the size of the baseType
- offs);
- genProduceReg(simdNode);
- return;
- }
-
- noway_assert(op2->isContained());
- noway_assert(op2->IsCnsIntOrI());
- unsigned int index = (unsigned int)op2->AsIntCon()->gtIconVal;
- unsigned int byteShiftCnt = index * genTypeSize(baseType);
-
- // In general we shouldn't have an index greater than or equal to the length of the vector.
- // However, if we have an out-of-range access, under minOpts it will not be optimized
- // away. The code will throw before we reach this point, but we still need to generate
- // code. In that case, we will simply mask off the upper bits.
- if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength())
- {
- byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1);
- index = byteShiftCnt / genTypeSize(baseType);
- }
-
- regNumber tmpReg = REG_NA;
- if (simdNode->AvailableTempRegCount() != 0)
- {
- tmpReg = simdNode->GetSingleTempReg();
- }
- else
- {
- assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) ||
- (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16)));
- }
-
- if (byteShiftCnt >= 16)
- {
- assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
- byteShiftCnt -= 16;
- regNumber newSrcReg;
- if (varTypeIsFloating(baseType))
- {
- newSrcReg = targetReg;
- }
- else
- {
- // Integer types
- assert(tmpReg != REG_NA);
- newSrcReg = tmpReg;
- }
- GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01);
-
- srcReg = newSrcReg;
- }
-
- // Generate the following sequence:
- // 1) baseType is floating point
- // movaps targetReg, srcReg
- // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element
- //
- // 2) baseType is not floating point
- // movaps tmpReg, srcReg <-- not generated if accessing zero'th element
- // OR if tmpReg == srcReg
- // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element
- // mov_xmm2i targetReg, tmpReg
- if (varTypeIsFloating(baseType))
- {
- inst_Mov(simdType, targetReg, srcReg, /* canSkip */ true);
-
- if (byteShiftCnt != 0)
- {
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
- assert((byteShiftCnt > 0) && (byteShiftCnt < 32));
- GetEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
- }
- }
- else
- {
- if (varTypeIsSmallInt(baseType))
- {
-            // Note that pextrw extracts a 16-bit value by index and zero-extends it to 32 bits.
-            // For Vector<short> we also need to sign-extend the 16-bit value in targetReg.
-            // For Vector<byte>, index/2 gives the index of the 16-bit value to extract; shift right
-            // by 8 bits if the index is odd. For Vector<sbyte>, also sign-extend targetReg.
-
- unsigned baseSize = genTypeSize(baseType);
- if (baseSize == 1)
- {
- index /= 2;
- }
- // We actually want index % 8 for the AVX case (for SSE it will never be > 8).
- // Note that this doesn't matter functionally, because the instruction uses just the
- // low 3 bits of index, but it's better to use the right value.
- if (index > 8)
- {
- assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
- index -= 8;
- }
-
- assert((index >= 0) && (index <= 8));
- GetEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
-
- bool ZeroOrSignExtnReqd = true;
- if (baseSize == 1)
- {
- if ((op2->AsIntCon()->gtIconVal % 2) == 1)
- {
-                    // Right shift the extracted word by 8 bits when extracting a byte-sized element at an odd index.
-                    inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
-
-                    // Since pextrw zero-extends to 32 bits, we need sign extension in the case of TYP_BYTE.
-                    ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
- }
- // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
- }
- else
- {
-                // Since pextrw zero-extends to 32 bits, we need sign extension in the case of TYP_SHORT.
- assert(baseSize == 2);
- ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
- }
-
- if (ZeroOrSignExtnReqd)
- {
- // Zero/sign extend the byte/short to 32-bits
- inst_Mov_Extend(baseType, /* srcInReg */ false, targetReg, targetReg, /* canSkip */ false,
- emitTypeSize(baseType));
- }
- }
- else
- {
- // We need a temp xmm register if the baseType is not floating point and
- // accessing non-zero'th element.
- if (byteShiftCnt != 0)
- {
- assert(tmpReg != REG_NA);
-
- inst_Mov(simdType, tmpReg, srcReg, /* canSkip */ true);
-
- assert((byteShiftCnt > 0) && (byteShiftCnt <= 32));
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
- GetEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
- }
- else
- {
- tmpReg = srcReg;
- }
-
- assert(tmpReg != REG_NA);
- inst_Mov(baseType, targetReg, tmpReg, /* canSkip */ false);
- }
- }
-
- genProduceReg(simdNode);
-}
-
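Summarizing the removed constant-index path: compute the element's byte offset, shift it down to lane zero with psrldq, then move it out of the XMM register. A self-contained distillation of that arithmetic (illustration only, not JIT code):

    #include <cassert>

    // Byte offset of element 'index', clamped the way the removed code
    // clamps an out-of-range access that survives under minOpts.
    unsigned byteShiftForElement(unsigned index, unsigned elemSize, unsigned vecRegBytes)
    {
        assert((elemSize != 0) && ((vecRegBytes & (vecRegBytes - 1)) == 0));
        unsigned byteShiftCnt = index * elemSize;
        if (byteShiftCnt >= vecRegBytes)
        {
            byteShiftCnt &= (vecRegBytes - 1); // mask off the upper bits
        }
        return byteShiftCnt;
    }

    // Element 2 of a Vector<float> in a 16-byte XMM register:
    //   byteShiftForElement(2, 4, 16) == 8  ->  psrldq xmm, 8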
-//------------------------------------------------------------------------------------
-// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
-//
-// Arguments:
-// simdNode - The GT_SIMD node
-//
-// Return Value:
-// None.
-//
-// TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
-//
-void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
-{
- // Determine index based on intrinsic ID
- int index = -1;
- switch (simdNode->gtSIMDIntrinsicID)
- {
- case SIMDIntrinsicSetX:
- index = 0;
- break;
- case SIMDIntrinsicSetY:
- index = 1;
- break;
- case SIMDIntrinsicSetZ:
- index = 2;
- break;
- case SIMDIntrinsicSetW:
- index = 3;
- break;
-
- default:
- unreached();
- }
- assert(index != -1);
-
- // op1 is the SIMD vector
- // op2 is the value to be set
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
-
- var_types baseType = simdNode->GetSimdBaseType();
- regNumber targetReg = simdNode->GetRegNum();
- assert(targetReg != REG_NA);
- var_types targetType = simdNode->TypeGet();
- assert(varTypeIsSIMD(targetType));
-
-    // The following assert must hold:
-    // this is supported only on Vector2f/3f/4f right now.
- noway_assert(baseType == TYP_FLOAT);
- assert(op2->TypeGet() == baseType);
- assert(simdNode->GetSimdSize() >= ((index + 1) * genTypeSize(baseType)));
-
- genConsumeOperands(simdNode);
- regNumber op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
- inst_Mov(targetType, targetReg, op1Reg, /* canSkip */ true);
-
- // Right now this intrinsic is supported only for float base type vectors.
-    // If we need to support other base type vectors in the future,
-    // the logic below needs modification.
- noway_assert(baseType == TYP_FLOAT);
-
- if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
- {
- // We need one additional int register as scratch
- regNumber tmpReg = simdNode->GetSingleTempReg();
- assert(genIsValidIntReg(tmpReg));
-
- // Move the value from xmm reg to an int reg
- inst_Mov(TYP_INT, tmpReg, op2Reg, /* canSkip */ false, emitActualTypeSize(baseType));
-
- assert((index >= 0) && (index <= 15));
-
- // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
- // since every float has two 16-bit words.
- GetEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index);
-
- // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
- inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
- GetEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1);
- }
- else
- {
- unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index));
- assert((insertpsImm >= 0) && (insertpsImm <= 255));
- inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, (int8_t)insertpsImm);
- }
-
- genProduceReg(simdNode);
-}
-
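For reference, the insertps immediate that the removed SSE4.1 path builds packs two 2-bit lane selectors; per the instruction's documented encoding, bits 7:6 pick the source lane, bits 5:4 the destination lane, and bits 3:0 zero-mask lanes. A sketch, assuming INSERTPS_SOURCE_SELECT/INSERTPS_TARGET_SELECT expand to those positions:

    #include <cassert>

    // Sketch of the immediate; the macro expansion is my assumption, the
    // bit layout is the one documented for insertps itself.
    unsigned insertpsImm(unsigned srcLane, unsigned dstLane)
    {
        assert((srcLane <= 3) && (dstLane <= 3));
        return (srcLane << 6) | (dstLane << 4); // zero mask (bits 3:0) left clear
    }

    // The removed code always selects source lane 0 (the scalar in op2Reg)
    // and targets lane 'index', i.e. insertpsImm(0, index).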
//------------------------------------------------------------------------
// genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
//
@@ -2357,21 +1974,10 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
genSIMDIntrinsicRelOp(simdNode);
break;
- case SIMDIntrinsicGetItem:
- genSIMDIntrinsicGetItem(simdNode);
- break;
-
case SIMDIntrinsicShuffleSSE2:
genSIMDIntrinsicShuffleSSE2(simdNode);
break;
- case SIMDIntrinsicSetX:
- case SIMDIntrinsicSetY:
- case SIMDIntrinsicSetZ:
- case SIMDIntrinsicSetW:
- genSIMDIntrinsicSetItem(simdNode);
- break;
-
case SIMDIntrinsicUpperSave:
genSIMDIntrinsicUpperSave(simdNode);
break;
diff --git a/src/coreclr/jit/simdintrinsiclist.h b/src/coreclr/jit/simdintrinsiclist.h
index fb806804a56..258fecfdd65 100644
--- a/src/coreclr/jit/simdintrinsiclist.h
+++ b/src/coreclr/jit/simdintrinsiclist.h
@@ -57,19 +57,6 @@ SIMD_INTRINSIC(".ctor", true, InitFixed,
SIMD_INTRINSIC("CopyTo", true, CopyToArray, "CopyToArray", TYP_VOID, 2, {TYP_BYREF, TYP_REF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
SIMD_INTRINSIC("CopyTo", true, CopyToArrayX, "CopyToArray", TYP_VOID, 3, {TYP_BYREF, TYP_REF, TYP_INT }, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-// Get operations
-SIMD_INTRINSIC("get_Item", true, GetItem, "get[i]", TYP_UNKNOWN, 2, {TYP_BYREF, TYP_INT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-SIMD_INTRINSIC("get_X", true, GetX, "getX", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("get_Y", true, GetY, "getY", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("get_Z", true, GetZ, "getZ", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("get_W", true, GetW, "getW", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-
-// Set operations
-SIMD_INTRINSIC("set_X", true, SetX, "setX", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("set_Y", true, SetY, "setY", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("set_Z", true, SetZ, "setZ", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("set_W", true, SetW, "setW", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-
// Arithmetic Operations
SIMD_INTRINSIC("op_Subtraction", false, Sub, "-", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})