Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

JIT: Unblock Vector###<long> intrinsics on x86 #112728

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5674,6 +5674,13 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
// These intrinsics are "ins reg/mem, xmm"
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
attr = emitActualTypeSize(baseType);
#if defined(TARGET_X86)
if (varTypeIsLong(baseType))
{
ins = INS_movq;
attr = EA_8BYTE;
}
#endif // TARGET_X86
break;
}

Expand Down
186 changes: 153 additions & 33 deletions src/coreclr/jit/decomposelongs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,19 @@ void DecomposeLongs::DecomposeBlock(BasicBlock* block)
// Return Value:
// None.
//
void DecomposeLongs::DecomposeRange(Compiler* compiler, LIR::Range& range)
void DecomposeLongs::DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range)
{
    assert(compiler != nullptr);

    // Stand up a temporary decomposer, point it at the caller-supplied range,
    // and run the standard execution-order decomposition walk over it.
    DecomposeLongs rangeDecomposer(compiler, lowering);
    rangeDecomposer.m_range = &range;
    rangeDecomposer.DecomposeRangeHelper();
}

//------------------------------------------------------------------------
// DecomposeLongs::DecomposeRangeHelper:
// Decompiose each node in the current range.
// Decompose each node in the current range.
//
// Decomposition is done as an execution-order walk. Decomposition of
// a particular node can create new nodes that need to be further
Expand Down Expand Up @@ -122,44 +122,70 @@ void DecomposeLongs::DecomposeRangeHelper()
GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
{
// Handle the case where we are implicitly using the lower half of a long lclVar.
if ((tree->TypeGet() == TYP_INT) && tree->OperIsLocal())
if (tree->TypeIs(TYP_INT) && tree->OperIsLocal())
{
LclVarDsc* varDsc = m_compiler->lvaGetDesc(tree->AsLclVarCommon());
if (varTypeIsLong(varDsc) && varDsc->lvPromoted)
{
#ifdef DEBUG
if (m_compiler->verbose)
{
printf("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
"half:\n");
m_compiler->gtDispTreeRange(Range(), tree);
}
#endif // DEBUG
JITDUMP("Changing implicit reference to lo half of long lclVar to an explicit reference of its promoted "
"half:\n");
DISPTREERANGE(Range(), tree);

unsigned loVarNum = varDsc->lvFieldLclStart;
tree->AsLclVarCommon()->SetLclNum(loVarNum);
return tree->gtNext;
}
}

if (tree->TypeGet() != TYP_LONG)
if (!tree->TypeIs(TYP_LONG))
{
return tree->gtNext;
}

#ifdef DEBUG
if (m_compiler->verbose)
{
printf("Decomposing TYP_LONG tree. BEFORE:\n");
m_compiler->gtDispTreeRange(Range(), tree);
}
#endif // DEBUG

LIR::Use use;
if (!Range().TryGetUse(tree, &use))
{
LIR::Use::MakeDummyUse(Range(), tree, &use);
}

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
if (!use.IsDummyUse())
{
// HWIntrinsics can consume/produce a long directly, provided its source/target is memory.
// Here we do a conservative check for specific cases where it is certain the load/store
// can be contained. In those cases, we can skip decomposition.

GenTree* user = use.User();

if (user->OperIsHWIntrinsic())
{
if (tree->OperIs(GT_CNS_LNG) ||
(tree->OperIs(GT_IND, GT_LCL_FLD) && m_lowering->IsSafeToContainMem(user, tree)))
{
return tree->gtNext;
}
}
else if (user->OperIs(GT_STOREIND) && tree->OperIsHWIntrinsic() && m_compiler->opts.OptimizationEnabled())
{
if (m_lowering->IsSafeToContainMem(user, tree))
{
return tree->gtNext;
}
}
}

if (tree->OperIs(GT_STOREIND) && tree->AsStoreInd()->Data()->OperIsHWIntrinsic())
{
// We should only get here if we matched the second pattern above.
assert(HWIntrinsicInfo::IsVectorToScalar(tree->AsStoreInd()->Data()->AsHWIntrinsic()->GetHWIntrinsicId()));

return tree->gtNext;
}
#endif // FEATURE_HW_INTRINSICS && TARGET_X86

JITDUMP("Decomposing TYP_LONG tree. BEFORE:\n");
DISPTREERANGE(Range(), tree);

GenTree* nextNode = nullptr;
switch (tree->OperGet())
{
Expand Down Expand Up @@ -270,19 +296,14 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)

// If we replaced the argument to a GT_FIELD_LIST element with a GT_LONG node, split that field list
// element into two elements: one for each half of the GT_LONG.
if ((use.Def()->OperGet() == GT_LONG) && !use.IsDummyUse() && (use.User()->OperGet() == GT_FIELD_LIST))
if (use.Def()->OperIs(GT_LONG) && !use.IsDummyUse() && use.User()->OperIs(GT_FIELD_LIST))
{
DecomposeFieldList(use.User()->AsFieldList(), use.Def()->AsOp());
}

#ifdef DEBUG
if (m_compiler->verbose)
{
// NOTE: st_lcl_var doesn't dump properly afterwards.
printf("Decomposing TYP_LONG tree. AFTER:\n");
m_compiler->gtDispTreeRange(Range(), use.Def());
}
#endif
// NOTE: st_lcl_var doesn't dump properly afterwards.
JITDUMP("Decomposing TYP_LONG tree. AFTER:\n");
DISPTREERANGE(Range(), use.Def());

// When casting from a decomposed long to a smaller integer we can discard the high part.
if (m_compiler->opts.OptimizationEnabled() && !use.IsDummyUse() && use.User()->OperIs(GT_CAST) &&
Expand Down Expand Up @@ -1707,6 +1728,13 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
}

case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
{
return DecomposeHWIntrinsicToScalar(use, hwintrinsicTree);
}

case NI_EVEX_MoveMask:
{
return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree);
Expand Down Expand Up @@ -1751,9 +1779,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
{
assert(node == use.Def());
assert(varTypeIsLong(node));
assert((node->GetHWIntrinsicId() == NI_Vector128_GetElement) ||
(node->GetHWIntrinsicId() == NI_Vector256_GetElement) ||
(node->GetHWIntrinsicId() == NI_Vector512_GetElement));
assert(HWIntrinsicInfo::IsVectorGetElement(node->GetHWIntrinsicId()));

GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
Expand Down Expand Up @@ -1835,6 +1861,93 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}

//------------------------------------------------------------------------
// DecomposeHWIntrinsicToScalar: Decompose GT_HWINTRINSIC -- NI_Vector*_ToScalar.
//
// create:
//
//   tmp_simd_var = simd_var
//   lo_result = GT_HWINTRINSIC{ToScalar}[int](tmp_simd_var)
//   hi_result = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, 1)
//     - or -
//   GT_HWINTRINSIC{ToScalar}[int](GT_RSZ(tmp_simd_var, 32))
//   return: GT_LONG(lo_result, hi_result)
//
// Arguments:
//    use - the LIR::Use object for the def that needs to be decomposed.
//    node - the hwintrinsic node to decompose
//
// Return Value:
//    The GT_LONG node wrapping the upper and lower halves.
//
GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node)
{
    assert(node == use.Def());
    assert(varTypeIsLong(node));
    assert(HWIntrinsicInfo::IsVectorToScalar(node->GetHWIntrinsicId()));

    GenTree*       op1          = node->Op(1);
    NamedIntrinsic intrinsicId  = node->GetHWIntrinsicId();
    var_types      simdBaseType = node->GetSimdBaseType();
    unsigned       simdSize     = node->GetSimdSize();

    assert(varTypeIsLong(simdBaseType));
    assert(varTypeIsSIMD(op1));

    // The source vector is consumed twice (once per 32-bit half), so spill it to a temp local.
    GenTree* simdTmpVar    = RepresentOpAsLocalVar(op1, node, &node->Op(1));
    unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
    JITDUMP("[DecomposeHWIntrinsicToScalar]: Saving op1 tree to a temp var:\n");
    DISPTREERANGE(Range(), simdTmpVar);

    // Low half: ToScalar on the temp with an int base type reads element 0's low 32 bits.
    GenTreeHWIntrinsic* loResult =
        m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, intrinsicId, CORINFO_TYPE_INT, simdSize);
    Range().InsertAfter(simdTmpVar, loResult);

    // Second use of the temp for the high half.
    simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
    Range().InsertAfter(loResult, simdTmpVar);

    GenTreeHWIntrinsic* hiResult;
    if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
    {
        // SSE4.1+: extract int element 1 directly (pextrd).
        NamedIntrinsic getElement = NI_Illegal;
        switch (simdSize)
        {
            case 16:
                getElement = NI_Vector128_GetElement;
                break;
            case 32:
                getElement = NI_Vector256_GetElement;
                break;
            case 64:
                getElement = NI_Vector512_GetElement;
                break;
            default:
                unreached();
        }

        GenTree* one = m_compiler->gtNewIconNode(1);
        hiResult =
            m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, one, getElement, CORINFO_TYPE_INT, simdSize);

        Range().InsertAfter(simdTmpVar, one, hiResult);
    }
    else
    {
        // SSE2 fallback: shift the upper 32 bits down (psrlq 32), then ToScalar the result.
        assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_SSE2));

        GenTree* thirtyTwo = m_compiler->gtNewIconNode(32);
        GenTree* shift     = m_compiler->gtNewSimdBinOpNode(GT_RSZ, op1->TypeGet(), simdTmpVar, thirtyTwo,
                                                            node->GetSimdBaseJitType(), simdSize);
        hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, shift, intrinsicId, CORINFO_TYPE_INT, simdSize);

        Range().InsertAfter(simdTmpVar, thirtyTwo, shift, hiResult);
    }

    // The original long-typed node is fully replaced by the two int halves.
    Range().Remove(node);

    return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}

//------------------------------------------------------------------------
// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
//
Expand Down Expand Up @@ -2262,6 +2375,13 @@ void DecomposeLongs::TryPromoteLongVar(unsigned lclNum)
{
return;
}
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_X86)
if (varDsc->lvIsParam)
{
// Promotion blocks combined read optimizations for SIMD loads of long params
return;
}
Comment on lines +2379 to +2383
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In isolation, this change produced a small number of diffs and was mostly an improvement. A few regressions show up in the SPMI reports, but the overall impact is good, especially considering the places we can load a long to vector with movq

#endif // FEATURE_HW_INTRINSICS && TARGET_X86

varDsc->lvFieldCnt = 2;
varDsc->lvFieldLclStart = m_compiler->lvaCount;
Expand Down
8 changes: 6 additions & 2 deletions src/coreclr/jit/decomposelongs.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#define _DECOMPOSELONGS_H_

#include "compiler.h"
#include "lower.h"

class DecomposeLongs
{
public:
DecomposeLongs(Compiler* compiler)
DecomposeLongs(Compiler* compiler, Lowering* lowering)
: m_compiler(compiler)
, m_lowering(lowering)
{
}

void PrepareForDecomposition();
void DecomposeBlock(BasicBlock* block);

static void DecomposeRange(Compiler* compiler, LIR::Range& range);
static void DecomposeRange(Compiler* compiler, Lowering* lowering, LIR::Range& range);

private:
inline LIR::Range& Range() const
Expand Down Expand Up @@ -64,6 +66,7 @@ class DecomposeLongs
#ifdef FEATURE_HW_INTRINSICS
GenTree* DecomposeHWIntrinsic(LIR::Use& use);
GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
GenTree* DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIntrinsic* node);
GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS

Expand All @@ -80,6 +83,7 @@ class DecomposeLongs

// Data
Compiler* m_compiler;
Lowering* m_lowering;
LIR::Range* m_range;
};

Expand Down
21 changes: 15 additions & 6 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20611,22 +20611,31 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;

case INS_movd:
case INS_movq: // only MOVQ xmm, xmm is different (emitted by Sse2.MoveScalar, should use MOVDQU instead)
case INS_movq:
if (memAccessKind == PERFSCORE_MEMORY_NONE)
{
// movd r32, xmm or xmm, r32
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = PERFSCORE_LATENCY_3C;
if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2()))
{
// movq xmm, xmm
result.insThroughput = PERFSCORE_THROUGHPUT_3X;
result.insLatency = PERFSCORE_LATENCY_1C;
}
else
{
// movd r32/64, xmm or xmm, r32/64
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = PERFSCORE_LATENCY_3C;
}
}
else if (memAccessKind == PERFSCORE_MEMORY_READ)
{
// movd xmm, m32
// ins xmm, m32/64
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_2C;
}
else
{
// movd m32, xmm
// ins m32/64, xmm
assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_2C;
Expand Down
Loading
Loading