Skip to content

Commit ba9bd22

Browse files
frederik-hjrbyrnesarsenm
authored
[AMDGPU] Account for existing SDWA selections (#123221)
The si-peephole-sdwa pass adjusts the selections on sdwa instructions to the selections on their operands during its conversions. For instance, if an instruction selects `BYTE_0` and its operand selects `WORD_1`, the combined selection should be `BYTE_2`, i.e. "`BYTE_0` of `WORD_1`". The existing implementation does not always handle this correctly in some complex situations with instructions across different basic blocks as demonstrated by the test cases included in this PR. This PR adds an additional selection combination step to the conversion to fix this issue. It reverts the changes made by PR #123942 which had disabled the conversion of preexisting SDWA instructions completely as a quick fix. --------- Co-authored-by: Jeffrey Byrnes <[email protected]> Co-authored-by: Matt Arsenault <[email protected]>
1 parent 0735cec commit ba9bd22

22 files changed

+1757
-327
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

+154-42
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class SIPeepholeSDWA {
6262
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
6363
void pseudoOpConvertToVOP2(MachineInstr &MI,
6464
const GCNSubtarget &ST) const;
65+
MachineInstr *createSDWAVersion(MachineInstr &MI);
6566
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
6667
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
6768

@@ -85,11 +86,18 @@ class SIPeepholeSDWALegacy : public MachineFunctionPass {
8586
}
8687
};
8788

89+
using namespace AMDGPU::SDWA;
90+
8891
class SDWAOperand {
8992
private:
9093
MachineOperand *Target; // Operand that would be used in converted instruction
9194
MachineOperand *Replaced; // Operand that would be replace by Target
9295

96+
/// Returns true iff the SDWA selection of this SDWAOperand can be combined
97+
/// with the SDWA selections of its uses in \p MI.
98+
virtual bool canCombineSelections(const MachineInstr &MI,
99+
const SIInstrInfo *TII) = 0;
100+
93101
public:
94102
SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
95103
: Target(TargetOp), Replaced(ReplacedOp) {
@@ -118,8 +126,6 @@ class SDWAOperand {
118126
#endif
119127
};
120128

121-
using namespace AMDGPU::SDWA;
122-
123129
class SDWASrcOperand : public SDWAOperand {
124130
private:
125131
SdwaSel SrcSel;
@@ -131,13 +137,15 @@ class SDWASrcOperand : public SDWAOperand {
131137
SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
132138
SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
133139
bool Sext_ = false)
134-
: SDWAOperand(TargetOp, ReplacedOp),
135-
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
140+
: SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
141+
Neg(Neg_), Sext(Sext_) {}
136142

137143
MachineInstr *potentialToConvert(const SIInstrInfo *TII,
138144
const GCNSubtarget &ST,
139145
SDWAOperandsMap *PotentialMatches = nullptr) override;
140146
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
147+
bool canCombineSelections(const MachineInstr &MI,
148+
const SIInstrInfo *TII) override;
141149

142150
SdwaSel getSrcSel() const { return SrcSel; }
143151
bool getAbs() const { return Abs; }
@@ -158,15 +166,16 @@ class SDWADstOperand : public SDWAOperand {
158166
DstUnused DstUn;
159167

160168
public:
161-
162169
SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
163170
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
164-
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
171+
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
165172

166173
MachineInstr *potentialToConvert(const SIInstrInfo *TII,
167174
const GCNSubtarget &ST,
168175
SDWAOperandsMap *PotentialMatches = nullptr) override;
169176
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
177+
bool canCombineSelections(const MachineInstr &MI,
178+
const SIInstrInfo *TII) override;
170179

171180
SdwaSel getDstSel() const { return DstSel; }
172181
DstUnused getDstUnused() const { return DstUn; }
@@ -187,6 +196,8 @@ class SDWADstPreserveOperand : public SDWADstOperand {
187196
Preserve(PreserveOp) {}
188197

189198
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
199+
bool canCombineSelections(const MachineInstr &MI,
200+
const SIInstrInfo *TII) override;
190201

191202
MachineOperand *getPreservedOperand() const { return Preserve; }
192203

@@ -314,6 +325,38 @@ static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
314325
return nullptr;
315326
}
316327

328+
/// Combine an SDWA instruction's existing SDWA selection \p Sel with
329+
/// the SDWA selection \p OperandSel of its operand. If the selections
330+
/// are compatible, return the combined selection, otherwise return a
331+
/// nullopt.
332+
/// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1:
333+
/// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
334+
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
335+
if (Sel == SdwaSel::DWORD)
336+
return OperandSel;
337+
338+
if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
339+
return Sel;
340+
341+
if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
342+
Sel == SdwaSel::BYTE_3)
343+
return {};
344+
345+
if (OperandSel == SdwaSel::WORD_0)
346+
return Sel;
347+
348+
if (OperandSel == SdwaSel::WORD_1) {
349+
if (Sel == SdwaSel::BYTE_0)
350+
return SdwaSel::BYTE_2;
351+
if (Sel == SdwaSel::BYTE_1)
352+
return SdwaSel::BYTE_3;
353+
if (Sel == SdwaSel::WORD_0)
354+
return SdwaSel::WORD_1;
355+
}
356+
357+
return {};
358+
}
359+
317360
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
318361
const MachineOperand *SrcOp) const {
319362
uint64_t Mods = 0;
@@ -350,7 +393,8 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
350393

351394
for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
352395
// Check that all instructions that use Reg can be converted
353-
if (!isConvertibleToSDWA(UseMI, ST, TII))
396+
if (!isConvertibleToSDWA(UseMI, ST, TII) ||
397+
!canCombineSelections(UseMI, TII))
354398
return nullptr;
355399

356400
// Now that it's guaranteed all uses are legal, iterate over the uses again
@@ -372,7 +416,9 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
372416
if (!PotentialMO)
373417
return nullptr;
374418

375-
return PotentialMO->getParent();
419+
MachineInstr *Parent = PotentialMO->getParent();
420+
421+
return canCombineSelections(*Parent, TII) ? Parent : nullptr;
376422
}
377423

378424
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -451,13 +497,55 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
451497
}
452498
copyRegOperand(*Src, *getTargetOperand());
453499
if (!IsPreserveSrc) {
454-
SrcSel->setImm(getSrcSel());
500+
SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
501+
SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
455502
SrcMods->setImm(getSrcMods(TII, Src));
456503
}
457504
getTargetOperand()->setIsKill(false);
458505
return true;
459506
}
460507

508+
/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
509+
/// instruction \p MI can be combined with the selection \p OpSel.
510+
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
511+
AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
512+
assert(TII->isSDWA(MI.getOpcode()));
513+
514+
const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
515+
SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());
516+
517+
return combineSdwaSel(SrcSel, OpSel).has_value();
518+
}
519+
520+
/// Verify that \p Op is the same register as the operand of the SDWA
521+
/// instruction \p MI named by \p SrcOpName and that the SDWA
522+
/// selection \p SrcSelOpName can be combined with the \p OpSel.
523+
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
524+
AMDGPU::OpName SrcOpName,
525+
AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
526+
SdwaSel OpSel) {
527+
assert(TII->isSDWA(MI.getOpcode()));
528+
529+
const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
530+
if (!Src || !isSameReg(*Src, *Op))
531+
return true;
532+
533+
return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
534+
}
535+
536+
bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
537+
const SIInstrInfo *TII) {
538+
if (!TII->isSDWA(MI.getOpcode()))
539+
return true;
540+
541+
using namespace AMDGPU;
542+
543+
return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
544+
getReplacedOperand(), getSrcSel()) &&
545+
canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
546+
getReplacedOperand(), getSrcSel());
547+
}
548+
461549
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
462550
const GCNSubtarget &ST,
463551
SDWAOperandsMap *PotentialMatches) {
@@ -476,7 +564,8 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
476564
return nullptr;
477565
}
478566

479-
return PotentialMO->getParent();
567+
MachineInstr *Parent = PotentialMO->getParent();
568+
return canCombineSelections(*Parent, TII) ? Parent : nullptr;
480569
}
481570

482571
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
@@ -498,7 +587,10 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
498587
copyRegOperand(*Operand, *getTargetOperand());
499588
MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
500589
assert(DstSel);
501-
DstSel->setImm(getDstSel());
590+
591+
SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
592+
DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());
593+
502594
MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
503595
assert(DstUnused);
504596
DstUnused->setImm(getDstUnused());
@@ -509,6 +601,14 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
509601
return true;
510602
}
511603

604+
bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
605+
const SIInstrInfo *TII) {
606+
if (!TII->isSDWA(MI.getOpcode()))
607+
return true;
608+
609+
return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
610+
}
611+
512612
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
513613
const SIInstrInfo *TII) {
514614
// MI should be moved right before v_or_b32.
@@ -538,6 +638,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
538638
return SDWADstOperand::convertToSDWA(MI, TII);
539639
}
540640

641+
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
642+
const SIInstrInfo *TII) {
643+
return SDWADstOperand::canCombineSelections(MI, TII);
644+
}
645+
541646
std::optional<int64_t>
542647
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
543648
if (Op.isImm()) {
@@ -962,11 +1067,8 @@ bool isConvertibleToSDWA(MachineInstr &MI,
9621067
const SIInstrInfo* TII) {
9631068
// Check if this is already an SDWA instruction
9641069
unsigned Opc = MI.getOpcode();
965-
if (TII->isSDWA(Opc)) {
966-
// FIXME: Reenable after fixing selection handling.
967-
// Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
968-
return false;
969-
}
1070+
if (TII->isSDWA(Opc))
1071+
return true;
9701072

9711073
// Check if this instruction has opcode that supports SDWA
9721074
if (AMDGPU::getSDWAOp(Opc) == -1)
@@ -1024,21 +1126,13 @@ bool isConvertibleToSDWA(MachineInstr &MI,
10241126
}
10251127
} // namespace
10261128

1027-
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
1028-
const SDWAOperandsVector &SDWAOperands) {
1029-
1030-
LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
1031-
1032-
// Convert to sdwa
1033-
int SDWAOpcode;
1129+
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
10341130
unsigned Opcode = MI.getOpcode();
1035-
if (TII->isSDWA(Opcode)) {
1036-
SDWAOpcode = Opcode;
1037-
} else {
1038-
SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
1039-
if (SDWAOpcode == -1)
1040-
SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
1041-
}
1131+
assert(!TII->isSDWA(Opcode));
1132+
1133+
int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
1134+
if (SDWAOpcode == -1)
1135+
SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
10421136
assert(SDWAOpcode != -1);
10431137

10441138
const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
@@ -1172,6 +1266,24 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
11721266
SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
11731267
}
11741268

1269+
return SDWAInst.getInstr();
1270+
}
1271+
1272+
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
1273+
const SDWAOperandsVector &SDWAOperands) {
1274+
LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
1275+
1276+
MachineInstr *SDWAInst;
1277+
if (TII->isSDWA(MI.getOpcode())) {
1278+
// Clone the instruction to allow revoking changes
1279+
// made to MI during the processing of the operands
1280+
// if the conversion fails.
1281+
SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
1282+
MI.getParent()->insert(MI.getIterator(), SDWAInst);
1283+
} else {
1284+
SDWAInst = createSDWAVersion(MI);
1285+
}
1286+
11751287
// Apply all sdwa operand patterns.
11761288
bool Converted = false;
11771289
for (auto &Operand : SDWAOperands) {
@@ -1190,19 +1302,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
11901302
Converted |= Operand->convertToSDWA(*SDWAInst, TII);
11911303
}
11921304

1193-
if (Converted) {
1194-
ConvertedInstructions.push_back(SDWAInst);
1195-
for (MachineOperand &MO : SDWAInst->uses()) {
1196-
if (!MO.isReg())
1197-
continue;
1198-
1199-
MRI->clearKillFlags(MO.getReg());
1200-
}
1201-
} else {
1305+
if (!Converted) {
12021306
SDWAInst->eraseFromParent();
12031307
return false;
12041308
}
12051309

1310+
ConvertedInstructions.push_back(SDWAInst);
1311+
for (MachineOperand &MO : SDWAInst->uses()) {
1312+
if (!MO.isReg())
1313+
continue;
1314+
1315+
MRI->clearKillFlags(MO.getReg());
1316+
}
12061317
LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
12071318
++NumSDWAInstructionsPeepholed;
12081319

@@ -1285,10 +1396,11 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
12851396

12861397
for (const auto &OperandPair : SDWAOperands) {
12871398
const auto &Operand = OperandPair.second;
1288-
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
1289-
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
1399+
MachineInstr *PotentialMI =
1400+
Operand->potentialToConvert(TII, ST, &PotentialMatches);
1401+
1402+
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
12901403
PotentialMatches[PotentialMI].push_back(Operand.get());
1291-
}
12921404
}
12931405

12941406
for (auto &PotentialPair : PotentialMatches) {

0 commit comments

Comments
 (0)