-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: Unblock Vector###<long> intrinsics on x86 #112728
base: main
Are you sure you want to change the base?
Changes from 11 commits
3a130c8
7f220c2
78dc31d
7330c3e
69065ee
86ebdae
cdb0910
5d6fb3f
bb03516
cba4ab0
1f97bd9
71145ab
42a6ab8
1c98e23
c80c566
fb2cf30
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,7 +90,7 @@ void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range) | |
|
||
//------------------------------------------------------------------------ | ||
// DecomposeLongs::DecomposeRangeHelper: | ||
// Decompiose each node in the current range. | ||
// Decompose each node in the current range. | ||
// | ||
// Decomposition is done as an execution-order walk. Decomposition of | ||
// a particular node can create new nodes that need to be further | ||
|
@@ -122,44 +122,82 @@ void DecomposeLongs::DecomposeRangeHelper() | |
GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) | ||
{ | ||
// Handle the case where we are implicitly using the lower half of a long lclVar. | ||
if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal()) | ||
if (tree->TypeIs(TYP_INT) && tree->OperIsLocal()) | ||
{ | ||
LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon()); | ||
if (varTypeIsLong(varDsc) && varDsc->lvPromoted) | ||
{ | ||
#ifdef DEBUG | ||
if (m_compiler->verbose) | ||
{ | ||
printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted " | ||
"half:\n"); | ||
m_compiler->gtDispTreeRange(Range(), tree); | ||
} | ||
#endif // DEBUG | ||
JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted " | ||
"half:\n"); | ||
DISPTREERANGE(Range(), tree); | ||
|
||
unsigned loVarNum = varDsc->lvFieldLclStart; | ||
tree->AsLclVarCommon()->SetLclNum(loVarNum); | ||
return tree->gtNext; | ||
} | ||
} | ||
|
||
if (tree->TypeGet() != TYP_LONG) | ||
if (!tree->TypeIs(TYP_LONG)) | ||
{ | ||
return tree->gtNext; | ||
} | ||
|
||
#ifdef DEBUG | ||
if (m_compiler->verbose) | ||
{ | ||
printf("Decomposing TYP_LONG tree. BEFORE:\n"); | ||
m_compiler->gtDispTreeRange(Range(), tree); | ||
} | ||
#endif // DEBUG | ||
|
||
LIR::Use use; | ||
if (!Range().TryGetUse(tree, &use)) | ||
{ | ||
LIR::Use::MakeDummyUse(Range(), tree, &use); | ||
} | ||
|
||
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) | ||
if (!use.IsDummyUse()) | ||
{ | ||
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory. | ||
// Here we do a conservative check for specific cases where it is certain the load/store | ||
// can be contained. In those cases, we can skip decomposition. | ||
|
||
GenTree* user = use.User(); | ||
|
||
if (user->OperIsHWIntrinsic()) | ||
{ | ||
if (tree->OperIs(GT_CNS_LNG) || (tree->OperIs(GT_IND, GT_LCL_FLD) && (user == tree->gtNext))) | ||
{ | ||
return tree->gtNext; | ||
} | ||
} | ||
else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled()) | ||
{ | ||
// We're looking for this common pattern, with operands in either order in the LIR sequence: | ||
// t1 = * HWINTRINSIC long ToScalar | ||
// t0 = LCL_VAR byref | ||
// /--* t0 byref | ||
// +--* t1 long | ||
// * STOREIND long | ||
|
||
GenTree* next = tree->gtNext; | ||
if ((user != next) && !m_compiler->gtTreeHasSideEffects(next, GTF_SIDE_EFFECT)) | ||
{ | ||
next = next->gtNext; | ||
} | ||
|
||
if ((user == next) && HWIntrinsicInfo::IsVectorToScalar(tree->AsHWIntrinsic()->GetHWIntrinsicId())) | ||
{ | ||
return tree->gtNext; | ||
} | ||
} | ||
} | ||
|
||
if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic()) | ||
{ | ||
// We should only get here if we matched the second pattern above. | ||
assert(HWIntrinsicInfo::IsVectorToScalar(tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId())); | ||
|
||
return tree->gtNext; | ||
} | ||
#endif // FEATURE_HW_INTRINSICS && TARGET_X86 | ||
|
||
JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n"); | ||
DISPTREERANGE(Range(), tree); | ||
|
||
GenTree* nextNode = nullptr; | ||
switch (tree->OperGet()) | ||
{ | ||
|
@@ -270,19 +308,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) | |
|
||
// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list | ||
// element into two elements: one for each half of the GT_LONG. | ||
if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST)) | ||
if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST)) | ||
{ | ||
DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp()); | ||
} | ||
|
||
#ifdef DEBUG | ||
if (m_compiler->verbose) | ||
{ | ||
// NOTE: st_lcl_var doesn't dump properly afterwards. | ||
printf("Decomposing TYP_LONG tree. AFTER:\n"); | ||
m_compiler->gtDispTreeRange(Range(), use.Def()); | ||
} | ||
#endif | ||
// NOTE: st_lcl_var doesn't dump properly afterwards. | ||
JITDUMP("Decomposing TYP_LONG tree. AFTER:\n"); | ||
DISPTREERANGE(Range(), use.Def()); | ||
|
||
// When casting from a decomposed long to a smaller integer we can discard the high part. | ||
if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) && | ||
|
@@ -1707,6 +1740,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use) | |
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree); | ||
} | ||
|
||
case NI_Vector128_ToScalar: | ||
case NI_Vector256_ToScalar: | ||
case NI_Vector512_ToScalar: | ||
{ | ||
return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree); | ||
} | ||
|
||
case NI_EVEX_MoveMask: | ||
{ | ||
return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree); | ||
|
@@ -1751,9 +1791,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW | |
{ | ||
assert(node == use.Def()); | ||
assert(varTypeIsLong(node)); | ||
assert((node->GetHWIntrinsicId() == NI_Vector128_GetElement) || | ||
(node->GetHWIntrinsicId() == NI_Vector256_GetElement) || | ||
(node->GetHWIntrinsicId() == NI_Vector512_GetElement)); | ||
assert(HWIntrinsicInfo::IsVectorGetElement(node->GetHWIntrinsicId())); | ||
|
||
GenTree* op1 = node->Op(1); | ||
GenTree* op2 = node->Op(2); | ||
|
@@ -1835,6 +1873,93 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW | |
return FinalizeDecomposition(use, loResult, hiResult, hiResult); | ||
} | ||
|
||
//------------------------------------------------------------------------ | ||
// DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar. | ||
// | ||
// create: | ||
// | ||
// tmp_simd_var = simd_var | ||
// lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var) | ||
// hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1) | ||
// - or - | ||
// GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32)) | ||
// return: GT_LONG(lo_result, hi_result) | ||
// | ||
// Arguments: | ||
// use - the LIR::Use object for the def that needs to be decomposed. | ||
// node - the hwintrinsic node to decompose | ||
// | ||
// Return Value: | ||
// The GT_LONG node wrapping the upper and lower halves. | ||
// | ||
GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node) | ||
{ | ||
assert(node == use.Def()); | ||
assert(varTypeIsLong(node)); | ||
assert(HWIntrinsicInfo::IsVectorToScalar(node->GetHWIntrinsicId())); | ||
|
||
GenTree* op1 = node->Op(1); | ||
NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); | ||
var_types simdBaseType = node->GetSimdBaseType(); | ||
unsigned simdSize = node->GetSimdSize(); | ||
|
||
assert(varTypeIsLong(simdBaseType)); | ||
assert(varTypeIsSIMD(op1)); | ||
|
||
GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->Op(1)); | ||
unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum(); | ||
JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n"); | ||
DISPTREERANGE(Range(), simdTmpVar); | ||
|
||
GenTreeHWIntrinsic* loResult = | ||
m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, intrinsicId, CORINFO_TYPE_INT, simdSize); | ||
Range().InsertAfter(simdTmpVar, loResult); | ||
|
||
simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet()); | ||
Range().InsertAfter(loResult, simdTmpVar); | ||
|
||
GenTreeHWIntrinsic* hiResult; | ||
if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41)) | ||
{ | ||
NamedIntrinsic getElement = NI_Illegal; | ||
switch (simdSize) | ||
{ | ||
case 16: | ||
getElement = NI_Vector128_GetElement; | ||
break; | ||
case 32: | ||
getElement = NI_Vector256_GetElement; | ||
break; | ||
case 64: | ||
getElement = NI_Vector512_GetElement; | ||
break; | ||
default: | ||
unreached(); | ||
} | ||
|
||
GenTree* one = m_compiler->gtNewIconNode(1); | ||
hiResult = | ||
m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, one, getElement, CORINFO_TYPE_INT, simdSize); | ||
|
||
Range().InsertAfter(simdTmpVar, one, hiResult); | ||
} | ||
else | ||
{ | ||
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2)); | ||
|
||
GenTree* thirtyTwo = m_compiler->gtNewIconNode(32); | ||
GenTree* shift = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this missing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ToScalar is the original ; Method Program:ToScalar(System.Runtime.Intrinsics.Vector128`1[ulong]):ulong (FullOpts)
G_M34649_IG01: ;; offset=0x0000
push ebp
mov ebp, esp
movups xmm0, xmmword ptr [ebp+0x08]
;; size=7 bbWeight=1 PerfScore 4.25
G_M34649_IG02: ;; offset=0x0007
movd eax, xmm0
psrlq xmm0, 32
movd edx, xmm0
;; size=13 bbWeight=1 PerfScore 4.50
G_M34649_IG03: ;; offset=0x0014
pop ebp
ret 16
;; size=4 bbWeight=1 PerfScore 2.50
; Total bytes of code: 24 |
||
node->GetSimdBaseJitType(), simdSize); | ||
hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, shift, intrinsicId, CORINFO_TYPE_INT, simdSize); | ||
|
||
Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult); | ||
} | ||
|
||
Range().Remove(node); | ||
|
||
return FinalizeDecomposition(use, loResult, hiResult, hiResult); | ||
} | ||
|
||
//------------------------------------------------------------------------ | ||
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask | ||
// | ||
|
@@ -2262,6 +2387,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum) | |
{ | ||
return; | ||
} | ||
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86) | ||
if (varDsc->lvIsParam) | ||
{ | ||
// Promotion blocks combined read optimizations for SIMD loads of long params | ||
return; | ||
} | ||
Comment on lines +2379 to +2383
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In isolation, this change produced a small number of diffs and was mostly an improvement. A few regressions show up in the SPMI reports, but the overall impact is good, especially considering the places we can load a long to vector with |
||
#endif // FEATURE_HW_INTRINSICS && TARGET_X86 | ||
|
||
varDsc->lvFieldCnt = 2; | ||
varDsc->lvFieldLclStart = m_compiler->lvaCount; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`gtTreeHasSideEffects` is for HIR, not LIR. For LIR you should use `OperEffects`. But also, relying on the execution order in this way is an anti-pattern. It means optimizations will subtly break from unrelated changes that people may make in the future. Can the whole thing be changed to use an appropriate `IsInvariantInRange` check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, yeah, I wasn't happy with this but wasn't sure the best way to handle it. I started down the path of using `IsInvariantInRange`, but that's private to `Lowering`, so I'd have to move it to the public surface and then have lowering pass itself to `DecomposeLongs`. If that change is ok, I'll go ahead and do it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds ok to me. Alternatively this could be a static method on `SideEffectSet` or `LIR` and then each of `Lower` and `DecomposeLongs` would have a cached `SideEffectSet` to use for it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. I used `IsSafeToContainMem` instead of `IsInvariantInRange` because the name and arg order make more sense (to me, at least) in this context.