Diffstat (limited to 'src/coreclr/jit/lowerxarch.cpp')
-rw-r--r-- | src/coreclr/jit/lowerxarch.cpp | 207
1 file changed, 204 insertions, 3 deletions
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 44d6781d081..40ccaebec0f 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1023,6 +1023,8 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
 
         case NI_Vector128_Create:
         case NI_Vector256_Create:
+        case NI_Vector128_CreateScalar:
+        case NI_Vector256_CreateScalar:
         {
             // We don't directly support the Vector128.Create or Vector256.Create methods in codegen
             // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate so we expect
@@ -1705,8 +1707,9 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
     GenTree* tmp2 = nullptr;
     GenTree* tmp3 = nullptr;
 
-    bool   isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val);
-    size_t argCnt     = node->GetOperandCount();
+    bool   isConstant     = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val);
+    bool   isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar);
+    size_t argCnt         = node->GetOperandCount();
 
     if (isConstant)
     {
@@ -1745,6 +1748,156 @@
     }
     else if (argCnt == 1)
     {
+        if (isCreateScalar)
+        {
+            switch (simdBaseType)
+            {
+                case TYP_BYTE:
+                case TYP_UBYTE:
+                {
+                    // Types need to be explicitly zero-extended to ensure upper-bits are zero
+                    //
+                    // We need to explicitly use TYP_UBYTE since unsigned is ignored for small types
+                    // Explicitly handle both BYTE and UBYTE to account for reinterpret casts and the like
+                    //
+                    // The from type is INT since that is the input type tracked by IR, where-as the target
+                    // type needs to be UBYTE so it implicitly zero-extends back to TYP_INT
+
+                    tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* unsigned */ true, TYP_UBYTE);
+                    BlockRange().InsertAfter(op1, tmp1);
+                    LowerNode(tmp1);
+
+                    node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32, tmp1);
+                    node->SetSimdBaseJitType(CORINFO_TYPE_INT);
+                    break;
+                }
+
+                case TYP_SHORT:
+                case TYP_USHORT:
+                {
+                    // Types need to be explicitly zero-extended to ensure upper-bits are zero
+                    //
+                    // We need to explicitly use TYP_USHORT since unsigned is ignored for small types
+                    // Explicitly handle both SHORT and USHORT to account for reinterpret casts and the like
+                    //
+                    // The from type is INT since that is the input type tracked by IR, where-as the target
+                    // type needs to be USHORT so it implicitly zero-extends back to TYP_INT
+
+                    tmp1 = comp->gtNewCastNode(TYP_INT, op1, /* unsigned */ true, TYP_USHORT);
+                    BlockRange().InsertAfter(op1, tmp1);
+                    LowerNode(tmp1);
+
+                    node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32, tmp1);
+                    node->SetSimdBaseJitType(CORINFO_TYPE_INT);
+                    break;
+                }
+
+                case TYP_INT:
+                {
+                    node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128Int32);
+                    break;
+                }
+
+                case TYP_UINT:
+                {
+                    node->ChangeHWIntrinsicId(NI_SSE2_ConvertScalarToVector128UInt32);
+                    break;
+                }
+
+#if defined(TARGET_AMD64)
+                case TYP_LONG:
+                {
+                    node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertScalarToVector128Int64);
+                    break;
+                }
+
+                case TYP_ULONG:
+                {
+                    node->ChangeHWIntrinsicId(NI_SSE2_X64_ConvertScalarToVector128UInt64);
+                    break;
+                }
+#endif // TARGET_AMD64
+
+                case TYP_FLOAT:
+                {
+                    tmp1 = comp->gtNewZeroConNode(simdType);
+                    BlockRange().InsertBefore(op1, tmp1);
+                    LowerNode(tmp1);
+
+                    if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+                    {
+                        // Sse41.Insert has:
+                        //  * Bits 0-3: zmask
+                        //  * Bits 4-5: count_d
+                        //  * Bits 6-7: count_s (register form only)
+                        //
+                        // We want zmask 0b1110 (0xE) to zero elements 1/2/3
+                        // We want count_d 0b00 (0x0) to insert the value to element 0
+                        // We want count_s 0b00 (0x0) as we're just taking element 0 of the source
+
+                        idx = comp->gtNewIconNode(0x0E);
+                        BlockRange().InsertAfter(op1, idx);
+                        LowerNode(idx);
+
+                        node->ResetHWIntrinsicId(NI_SSE41_Insert, comp, tmp1, op1, idx);
+                    }
+                    else
+                    {
+                        node->ResetHWIntrinsicId(NI_SSE_MoveScalar, comp, tmp1, op1);
+                    }
+                    break;
+                }
+
+                case TYP_DOUBLE:
+                {
+                    tmp1 = comp->gtNewZeroConNode(simdType);
+                    BlockRange().InsertBefore(op1, tmp1);
+                    LowerNode(tmp1);
+
+                    node->ResetHWIntrinsicId(NI_SSE2_MoveScalar, comp, tmp1, op1);
+                    break;
+                }
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            if (simdSize == 32)
+            {
+                // We're creating a Vector256 scalar so we need to treat the original op as Vector128,
+                // we need to unsafely extend up to Vector256 (which is actually safe since the 128-bit
+                // op will zero extend up to 256-bits), and then we need to replace the original use
+                // with the new TYP_SIMD32 node.
+
+                node->ChangeType(TYP_SIMD16);
+                node->SetSimdSize(16);
+                LowerNode(node);
+
+                tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node, NI_Vector128_ToVector256Unsafe, simdBaseJitType,
+                                                      16);
+
+                LIR::Use use;
+                bool     foundUse = BlockRange().TryGetUse(node, &use);
+                BlockRange().InsertAfter(node, tmp2);
+
+                if (foundUse)
+                {
+                    use.ReplaceWith(tmp2);
+                }
+                else
+                {
+                    node->ClearUnusedValue();
+                    tmp2->SetUnusedValue();
+                }
+
+                node = tmp2->AsHWIntrinsic();
+            }
+
+            return LowerNode(node);
+        }
+
         // We have the following (where simd is simd16 or simd32):
         //          /--* op1  T
         //   node = *  HWINTRINSIC   simd   T Create
@@ -1822,6 +1975,8 @@
         return LowerNode(node);
     }
 
+    assert(intrinsicId == NI_Vector128_Create);
+
     // We will be constructing the following parts:
     //          /--* op1  T
     //   tmp1 = *  HWINTRINSIC   simd16 T CreateScalarUnsafe
@@ -6940,7 +7095,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                 case NI_SSSE3_AlignRight:
                 case NI_SSE41_Blend:
                 case NI_SSE41_DotProduct:
-                case NI_SSE41_Insert:
                 case NI_SSE41_X64_Insert:
                 case NI_SSE41_MultipleSumAbsoluteDifferences:
                 case NI_AVX_Blend:
@@ -6968,6 +7122,53 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     break;
                 }
 
+                case NI_SSE41_Insert:
+                {
+                    GenTree* lastOp = node->Op(numArgs);
+
+                    if ((simdBaseType == TYP_FLOAT) && lastOp->IsCnsIntOrI())
+                    {
+                        // Sse41.Insert has:
+                        //  * Bits 0-3: zmask
+                        //  * Bits 4-5: count_d
+                        //  * Bits 6-7: count_s (register form only)
+                        //
+                        // Where zmask specifies which elements to zero
+                        // Where count_d specifies the destination index the value is being inserted to
+                        // Where count_s specifies the source index of the value being inserted
+
+                        ssize_t ival = lastOp->AsIntConCommon()->IconValue();
+
+                        ssize_t zmask   = (ival & 0x0F);
+                        ssize_t count_d = (ival & 0x30) >> 4;
+                        ssize_t count_s = (ival & 0xC0) >> 6;
+
+                        if (op1->IsVectorZero())
+                        {
+                            // When op1 is zero, we can contain op1 and modify the mask
+                            // to zero everything except for the element we're inserting to
+
+                            MakeSrcContained(node, op1);
+
+                            zmask |= ~(1 << count_d);
+                            zmask &= 0x0F;
+
+                            ival = (count_s << 6) | (count_d << 4) | (zmask);
+                            lastOp->AsIntConCommon()->SetIconValue(ival);
+                        }
+                    }
+
+                    if (TryGetContainableHWIntrinsicOp(node, &op2, &supportsRegOptional))
+                    {
+                        MakeSrcContained(node, op2);
+                    }
+                    else if (supportsRegOptional)
+                    {
+                        op2->SetRegOptional();
+                    }
+                    break;
+                }
+
                 default:
                 {
                     assert(!"Unhandled containment for ternary hardware intrinsic with immediate operand");
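
Note on the small-integer paths in the lowering hunk above: NI_SSE2_ConvertScalarToVector128Int32 maps to movd, which copies a full 32-bit register into element 0, so the explicit cast to TYP_UBYTE/TYP_USHORT (a movzx) is what guarantees the upper bits of that register are already zero before the move. The following is a minimal host-side sketch of that contract, not JIT code; CreateScalarByte is a made-up helper and the byte layout assumes a little-endian target, as on x86/x64.

    // Sketch: why the zero-extending cast matters for Vector128.CreateScalar<byte>.
    // The lowering emits movzx (the cast) followed by movd (ConvertScalarToVector128Int32);
    // without the widening cast, stale bits 8-31 of the source register would leak
    // into bytes 1-3 of element 0.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static void CreateScalarByte(uint8_t value, uint8_t vector[16])
    {
        uint32_t widened = value;         // zero-extend: TYP_UBYTE cast back to TYP_INT (movzx)
        std::memset(vector, 0, 16);       // all upper elements are zero
        std::memcpy(vector, &widened, 4); // 32-bit scalar into element 0 (movd), little-endian
    }

    int main()
    {
        uint8_t v[16];
        CreateScalarByte(0xFF, v);

        assert(v[0] == 0xFF); // the scalar lands in the lowest byte
        for (int i = 1; i < 16; i++)
        {
            assert(v[i] == 0); // every other byte stays zero
        }
        return 0;
    }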
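
The new NI_SSE41_Insert containment case rewrites the insertps immediate when op1 is a zero vector: widening the zmask makes the instruction itself zero every element except the destination, so the explicit zero operand can be contained. Below is a small self-contained sketch of just that bit manipulation; AdjustInsertImmediateForZeroOp1 is a hypothetical stand-in for the zmask/count_d/count_s math in the hunk above, not an actual JIT helper.

    #include <cassert>
    #include <cstdint>

    // Recompute an insertps immediate assuming the first source operand is all zeros:
    // grow the zmask so the instruction zeroes every element other than the destination.
    static uint8_t AdjustInsertImmediateForZeroOp1(uint8_t ival)
    {
        uint8_t zmask   = (ival & 0x0F);      // bits 0-3: elements to zero
        uint8_t count_d = (ival & 0x30) >> 4; // bits 4-5: destination element index
        uint8_t count_s = (ival & 0xC0) >> 6; // bits 6-7: source element index (register form)

        zmask |= static_cast<uint8_t>(~(1 << count_d)); // zero everything...
        zmask &= 0x0F;                                  // ...except the destination element

        return static_cast<uint8_t>((count_s << 6) | (count_d << 4) | zmask);
    }

    int main()
    {
        // The 0x0E immediate produced for CreateScalar<float> already zeroes elements 1-3,
        // so it is left unchanged.
        assert(AdjustInsertImmediateForZeroOp1(0x0E) == 0x0E);

        // Inserting into element 1 of a zero vector: the zmask grows to cover elements 0, 2 and 3.
        assert(AdjustInsertImmediateForZeroOp1(0x10) == 0x1D);
        return 0;
    }

For the CreateScalar<float> lowering this rewrite is a no-op, since the 0x0E immediate already zeroes the upper three elements; for other destination indices the zmask simply expands to cover the remaining lanes.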