diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index bdd164a2f0131..22f23e4c94e2d 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -62,6 +62,7 @@ class SIPeepholeSDWA { std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; + MachineInstr *createSDWAVersion(MachineInstr &MI); bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; @@ -85,11 +86,18 @@ class SIPeepholeSDWALegacy : public MachineFunctionPass { } }; +using namespace AMDGPU::SDWA; + class SDWAOperand { private: MachineOperand *Target; // Operand that would be used in converted instruction MachineOperand *Replaced; // Operand that would be replace by Target + /// Returns true iff the SDWA selection of this SDWAOperand can be combined + /// with the SDWA selections of its uses in \p MI. + virtual bool canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) = 0; + public: SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) : Target(TargetOp), Replaced(ReplacedOp) { @@ -118,8 +126,6 @@ class SDWAOperand { #endif }; -using namespace AMDGPU::SDWA; - class SDWASrcOperand : public SDWAOperand { private: SdwaSel SrcSel; @@ -131,13 +137,15 @@ class SDWASrcOperand : public SDWAOperand { SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, bool Sext_ = false) - : SDWAOperand(TargetOp, ReplacedOp), - SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} + : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), + Neg(Neg_), Sext(Sext_) {} MachineInstr *potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST, SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + bool canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) override; SdwaSel getSrcSel() const { return SrcSel; } bool getAbs() const { return Abs; } @@ -158,15 +166,16 @@ class SDWADstOperand : public SDWAOperand { DstUnused DstUn; public: - SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) - : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} + : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} MachineInstr *potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST, SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + bool canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) override; SdwaSel getDstSel() const { return DstSel; } DstUnused getDstUnused() const { return DstUn; } @@ -187,6 +196,8 @@ class SDWADstPreserveOperand : public SDWADstOperand { Preserve(PreserveOp) {} bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + bool canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) override; MachineOperand *getPreservedOperand() const { return Preserve; } @@ -314,6 +325,38 @@ static MachineOperand *findSingleRegDef(const MachineOperand *Reg, return nullptr; } +/// Combine an SDWA instruction's existing SDWA selection \p Sel with +/// the SDWA selection \p OperandSel of its operand. If the selections +/// are compatible, return the combined selection, otherwise return a +/// nullopt. +/// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1: +/// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X) +static std::optional combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) { + if (Sel == SdwaSel::DWORD) + return OperandSel; + + if (Sel == OperandSel || OperandSel == SdwaSel::DWORD) + return Sel; + + if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 || + Sel == SdwaSel::BYTE_3) + return {}; + + if (OperandSel == SdwaSel::WORD_0) + return Sel; + + if (OperandSel == SdwaSel::WORD_1) { + if (Sel == SdwaSel::BYTE_0) + return SdwaSel::BYTE_2; + if (Sel == SdwaSel::BYTE_1) + return SdwaSel::BYTE_3; + if (Sel == SdwaSel::WORD_0) + return SdwaSel::WORD_1; + } + + return {}; +} + uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, const MachineOperand *SrcOp) const { uint64_t Mods = 0; @@ -350,7 +393,8 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) // Check that all instructions that use Reg can be converted - if (!isConvertibleToSDWA(UseMI, ST, TII)) + if (!isConvertibleToSDWA(UseMI, ST, TII) || + !canCombineSelections(UseMI, TII)) return nullptr; // Now that it's guaranteed all uses are legal, iterate over the uses again @@ -372,7 +416,9 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, if (!PotentialMO) return nullptr; - return PotentialMO->getParent(); + MachineInstr *Parent = PotentialMO->getParent(); + + return canCombineSelections(*Parent, TII) ? Parent : nullptr; } bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { @@ -451,13 +497,55 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { } copyRegOperand(*Src, *getTargetOperand()); if (!IsPreserveSrc) { - SrcSel->setImm(getSrcSel()); + SdwaSel ExistingSel = static_cast(SrcSel->getImm()); + SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel())); SrcMods->setImm(getSrcMods(TII, Src)); } getTargetOperand()->setIsKill(false); return true; } +/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA +/// instruction \p MI can be combined with the selection \p OpSel. +static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, + AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) { + assert(TII->isSDWA(MI.getOpcode())); + + const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName); + SdwaSel SrcSel = static_cast(SrcSelOp->getImm()); + + return combineSdwaSel(SrcSel, OpSel).has_value(); +} + +/// Verify that \p Op is the same register as the operand of the SDWA +/// instruction \p MI named by \p SrcOpName and that the SDWA +/// selection \p SrcSelOpName can be combined with the \p OpSel. +static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, + AMDGPU::OpName SrcOpName, + AMDGPU::OpName SrcSelOpName, MachineOperand *Op, + SdwaSel OpSel) { + assert(TII->isSDWA(MI.getOpcode())); + + const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName); + if (!Src || !isSameReg(*Src, *Op)) + return true; + + return canCombineOpSel(MI, TII, SrcSelOpName, OpSel); +} + +bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) { + if (!TII->isSDWA(MI.getOpcode())) + return true; + + using namespace AMDGPU; + + return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel, + getReplacedOperand(), getSrcSel()) && + canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel, + getReplacedOperand(), getSrcSel()); +} + MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST, SDWAOperandsMap *PotentialMatches) { @@ -476,7 +564,8 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, return nullptr; } - return PotentialMO->getParent(); + MachineInstr *Parent = PotentialMO->getParent(); + return canCombineSelections(*Parent, TII) ? Parent : nullptr; } bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { @@ -498,7 +587,10 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { copyRegOperand(*Operand, *getTargetOperand()); MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); assert(DstSel); - DstSel->setImm(getDstSel()); + + SdwaSel ExistingSel = static_cast(DstSel->getImm()); + DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value()); + MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); assert(DstUnused); DstUnused->setImm(getDstUnused()); @@ -509,6 +601,14 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return true; } +bool SDWADstOperand::canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) { + if (!TII->isSDWA(MI.getOpcode())) + return true; + + return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel()); +} + bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // MI should be moved right before v_or_b32. @@ -538,6 +638,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, return SDWADstOperand::convertToSDWA(MI, TII); } +bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI, + const SIInstrInfo *TII) { + return SDWADstOperand::canCombineSelections(MI, TII); +} + std::optional SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { if (Op.isImm()) { @@ -962,11 +1067,8 @@ bool isConvertibleToSDWA(MachineInstr &MI, const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); - if (TII->isSDWA(Opc)) { - // FIXME: Reenable after fixing selection handling. - // Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll - return false; - } + if (TII->isSDWA(Opc)) + return true; // Check if this instruction has opcode that supports SDWA if (AMDGPU::getSDWAOp(Opc) == -1) @@ -1024,21 +1126,13 @@ bool isConvertibleToSDWA(MachineInstr &MI, } } // namespace -bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, - const SDWAOperandsVector &SDWAOperands) { - - LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); - - // Convert to sdwa - int SDWAOpcode; +MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); - if (TII->isSDWA(Opcode)) { - SDWAOpcode = Opcode; - } else { - SDWAOpcode = AMDGPU::getSDWAOp(Opcode); - if (SDWAOpcode == -1) - SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); - } + assert(!TII->isSDWA(Opcode)); + + int SDWAOpcode = AMDGPU::getSDWAOp(Opcode); + if (SDWAOpcode == -1) + SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); assert(SDWAOpcode != -1); const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode); @@ -1172,6 +1266,24 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); } + return SDWAInst.getInstr(); +} + +bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, + const SDWAOperandsVector &SDWAOperands) { + LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); + + MachineInstr *SDWAInst; + if (TII->isSDWA(MI.getOpcode())) { + // Clone the instruction to allow revoking changes + // made to MI during the processing of the operands + // if the conversion fails. + SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI); + MI.getParent()->insert(MI.getIterator(), SDWAInst); + } else { + SDWAInst = createSDWAVersion(MI); + } + // Apply all sdwa operand patterns. bool Converted = false; for (auto &Operand : SDWAOperands) { @@ -1190,19 +1302,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, Converted |= Operand->convertToSDWA(*SDWAInst, TII); } - if (Converted) { - ConvertedInstructions.push_back(SDWAInst); - for (MachineOperand &MO : SDWAInst->uses()) { - if (!MO.isReg()) - continue; - - MRI->clearKillFlags(MO.getReg()); - } - } else { + if (!Converted) { SDWAInst->eraseFromParent(); return false; } + ConvertedInstructions.push_back(SDWAInst); + for (MachineOperand &MO : SDWAInst->uses()) { + if (!MO.isReg()) + continue; + + MRI->clearKillFlags(MO.getReg()); + } LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); ++NumSDWAInstructionsPeepholed; @@ -1285,10 +1396,11 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) { + MachineInstr *PotentialMI = + Operand->potentialToConvert(TII, ST, &PotentialMatches); + + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) PotentialMatches[PotentialMI].push_back(Operand.get()); - } } for (auto &PotentialPair : PotentialMatches) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 08184e700c1a4..4bfd29430ff1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -280,9 +280,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -300,8 +299,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -441,8 +439,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -612,11 +609,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 94f943af2532a..5673a6c6e869d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -281,9 +281,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -301,8 +300,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -442,8 +440,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -613,11 +610,9 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 2389924b82484..d9158e3558395 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -297,8 +297,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -418,8 +417,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -562,11 +560,9 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -764,20 +760,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 34d36581a21db..1fd139b06417f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -291,8 +291,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -410,8 +409,7 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -550,11 +548,9 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -748,20 +744,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s3, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 8 ; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: s_lshl_b32 s0, s4, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index ba2694fca99fa..a029164b7acd8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -6374,10 +6374,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -6603,10 +6601,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -7022,9 +7018,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8 -; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 ; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 @@ -7370,10 +7364,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -7632,10 +7624,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 @@ -7899,10 +7889,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v1, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 @@ -8161,10 +8149,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 37fc59c664a24..0589b6abea26d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -14358,10 +14358,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14550,10 +14548,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14756,10 +14752,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14939,10 +14933,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p ; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15124,10 +15116,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15327,10 +15317,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15523,10 +15511,8 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15713,10 +15699,8 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -15903,10 +15887,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16086,10 +16068,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt ; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16273,10 +16253,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -16456,10 +16434,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__ ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 096fda1710928..b0c76ecd30fbd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -12090,10 +12090,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) # ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12313,10 +12311,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12554,10 +12550,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12765,10 +12759,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 { ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12978,10 +12970,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13212,10 +13202,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13439,10 +13427,8 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -13657,10 +13643,8 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 904ef8a4b6579..69f4cd4323d99 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -15399,10 +15399,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -15633,10 +15631,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -15867,10 +15863,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16085,10 +16079,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16297,10 +16289,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16510,10 +16500,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16752,10 +16740,8 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -16971,10 +16957,8 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17214,10 +17198,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17454,10 +17436,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17682,10 +17662,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -17896,10 +17874,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -18138,10 +18114,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -18378,10 +18352,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index c131921c83fff..36852816eaea1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -12412,10 +12412,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12691,10 +12689,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -12970,10 +12966,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13242,10 +13236,8 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13508,10 +13500,8 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1 ; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -13775,10 +13765,8 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1 ; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -14062,10 +14050,8 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -14335,10 +14321,8 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace( ; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 0693119af4b31..75e72a72bebb1 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2518,17 +2518,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 -; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 -; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v9 +; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2547,17 +2546,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6 +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v9 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 50b6ad9f0cb37..6d11fecae5dc7 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -5024,10 +5024,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5249,10 +5247,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5468,10 +5464,8 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5684,10 +5678,8 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_add_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index fffdc16e1a501..c8848a4d89f10 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -5517,10 +5517,8 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -5773,10 +5771,8 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6020,10 +6016,8 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -6264,10 +6258,8 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 8c3758daacb9c..312dfa3717c77 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -592,8 +592,7 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-dst.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-dst.mir new file mode 100644 index 0000000000000..6fc7c6debc472 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-dst.mir @@ -0,0 +1,309 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -simplify-mir -run-pass=si-peephole-sdwa -o - %s | FileCheck %s + +# Test the combination of SDWA selections in si-peephole-sdwa. In each +# example, the SDWA destination selection specified on the first instruction +# must be combined with the destination selection that the pass determines +# for the operand, i.e. the second instruction. In the cases where +# this is not possible, no conversion should occur. + +--- +name: op_select_word_1_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 5, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 6, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 5, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 5, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 5, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 4, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 3, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 16, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 3, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 16, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 3, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 16, %2, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 3, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 6, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 5, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 24, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 5, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 4, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 24, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 4, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 3, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 3, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 24, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 24, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e32 24, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 2, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = V_LSHLREV_B32_e32 24, %2, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 2, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir new file mode 100644 index 0000000000000..14ba8fccb172d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel-src.mir @@ -0,0 +1,1089 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -simplify-mir -run-pass=si-peephole-sdwa -o - %s | FileCheck %s + +# Test the combination of SDWA selections in si-peephole-sdwa. In each +# example, the SDWA source selection specified on the last instruction +# must be combined with the source selection that the pass determines +# for this operand, i.e. the second instruction. In the cases where +# this is not possible, no conversion should occur, i.e. the last +# instruction in the output mir should still use the second +# instruction with the same source selection. + +--- +name: op_select_byte0_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte0_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_0_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_0_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 0, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_0_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_0_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 0, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_0_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_0_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 0, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_0_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_0_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 0, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_0_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_0_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 0, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_0_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_0_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 255, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 255, %2, implicit $exec /* Select Byte_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 0, 0, implicit $exec + + S_ENDPGM 0 + +... + +--- +name: op_select_word_0_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_0_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_0_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_0_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_0_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_AND_B32_e64_]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_0_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_0_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_0_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_AND_B32_e64 65535, %2, implicit $exec /* Select Word_0 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 0, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B16_e32_]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B16_e32_]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B16_e32_]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B16_e32_]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_1_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_1_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B16_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e32 8, [[V_LSHRREV_B32_sdwa]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_LSHRREV_B16_e32_]], 0, 1, 0, 6, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B16_e32 8, %2, implicit $exec /* Select BYTE_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 0, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_2_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_2_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 8, implicit $exec /* Select BYTE_2 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 0, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_byte_3_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_byte_3_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 24, 8, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 24, 8, implicit $exec /* Select BYTE_3 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 0, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 0, [[V_BFE_I32_e64_]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_word_1_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_word_1_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 16, 16, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 16, 16, implicit $exec /* Select WORD_1 */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 0, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_dword +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_dword + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 6, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 6, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_word_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_word_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 5, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 5, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_word_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_word_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 4, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 4, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_byte_3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_byte_3 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 3, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 3, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_byte_2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_byte_2 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 2, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 2, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_byte_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_byte_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 1, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 1, implicit $exec + + S_ENDPGM 0 +... + +--- +name: op_select_dword_instr_select_byte_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: op_select_dword_instr_select_byte_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[COPY]], 0, [[COPY]], 0, 1, 0, 5, 0, implicit $exec + ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[V_LSHRREV_B32_sdwa]], 0, 32, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_sdwa1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_sdwa 0, [[V_LSHRREV_B32_sdwa]], 1, [[V_LSHRREV_B32_sdwa]], 0, 1, 0, 6, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_sdwa 0, %1, 0, %1, 0, 1, 0, 5, 0, implicit $exec + %3:vgpr_32 = V_BFE_I32_e64 %2, 0, 32, implicit $exec /* Select DWORD */ + %4:vgpr_32 = V_LSHRREV_B32_sdwa 0, %2, 0, %3, 0, 1, 0, 6, 0, implicit $exec + + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll index 6eae905278f3e..8f984bfd4d7f7 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll @@ -32,17 +32,15 @@ define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: ds_write_b32 v1, v1 ; CHECK-NEXT: .LBB0_2: ; %bb20 -; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CHECK-NEXT: s_mov_b32 s0, exec_lo -; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0 -; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 -; CHECK-NEXT: s_cbranch_execz .LBB0_4 -; CHECK-NEXT: ; %bb.3: ; %bb11 -; CHECK-NEXT: v_mov_b32_e32 v1, 2 -; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cmp_ne_u16_sdwa s0, v0, v1 src0_sel:WORD_1 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s1, s0 +; CHECK-NEXT: s_xor_b32 s1, exec_lo, s1 +; CHECK-NEXT: ; %bb.3: ; %bb11 +; CHECK-NEXT: v_mov_b32_e32 v2, 2 +; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; CHECK-NEXT: ds_write_b32 v0, v1 offset:84 -; CHECK-NEXT: .LBB0_4: ; %bb14 +; CHECK-NEXT: ; %bb.4: ; %bb14 ; CHECK-NEXT: s_endpgm bb: %call = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir index cc2c8b3940d78..acad03d6d8960 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir @@ -1,56 +1,124 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CHECK %s - -# Currently the conversions in si-peephole-sdwa are disabled on preexisting sdwa instructions. -# If they are reenabled, the code matches this pattern instead of the corresponding pattern -# for V_LSHLREV_B32_sdwa further below: -# [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, %{{[0-9]+}}, 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 5, implicit $exec - -# TODO Implement a fix for the incorrect sdwa selection +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck %s --- name: sdwa_opsel_hazard body: | ; CHECK-LABEL: name: sdwa_opsel_hazard ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[DEF1]], [[DEF2]], 0, 0, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF undef [[DEF]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 undef %5, 255, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; CHECK-NEXT: [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef %5, 0, 6, 0, 6, 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0 + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 255, implicit $exec + ; CHECK-NEXT: [[V_AND_B32_sdwa:%[0-9]+]]:vgpr_32 = V_AND_B32_sdwa 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, [[V_MOV_B32_e32_]], 0, 6, 0, 5, 6, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_1]], 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed undef %9, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32 = SI_IF killed undef [[SI_IF1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[SI_IF3:%[0-9]+]]:sreg_32 = SI_IF undef [[DEF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.8(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec + ; CHECK-NEXT: [[SI_IF4:%[0-9]+]]:sreg_32 = SI_IF killed undef [[SI_IF]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.9: + ; CHECK-NEXT: successors: %bb.10(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.10: + ; CHECK-NEXT: S_ENDPGM 0 bb.0: - successors: %bb.2(0x40000000) + successors: %bb.7(0x40000000), %bb.8(0x40000000) + liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6 + %0:sreg_32 = IMPLICIT_DEF %1:sreg_64_xexec_xnull = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %1, %2, 0, 0, implicit $exec - S_BRANCH %bb.2 + %4:sreg_32 = SI_IF undef %0, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.7 bb.1: + successors: %bb.2(0x80000000) + %5:vgpr_32 = V_AND_B32_e64 undef %6, 255, implicit $exec %7:vgpr_32 = V_LSHLREV_B32_e64 2, killed undef %5, implicit $exec - S_ENDPGM 0 bb.2: - successors: %bb.1(0x40000000) + successors: %bb.3(0x40000000), %bb.4(0x40000000) - %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec + %8:sreg_32 = SI_IF killed undef %9, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4(0x80000000) + + bb.4: + successors: %bb.5(0x40000000), %bb.6(0x40000000) + + %10:sreg_32 = SI_IF killed undef %8, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.5 + + bb.5: + successors: %bb.6(0x80000000) + bb.6: + successors: %bb.9(0x40000000), %bb.10(0x40000000) + + %11:sreg_32 = SI_IF undef %0, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.9 + + bb.7: + successors: %bb.8(0x80000000) + + bb.8: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec + %9:sreg_32 = SI_IF killed undef %4, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.1 + bb.9: + successors: %bb.10(0x80000000) + + bb.10: + S_ENDPGM 0 + ... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir index aaa32d871148b..62538120f8451 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir @@ -138,8 +138,7 @@ body: | --- # GCN-LABEL: {{^}}name: vop2_instructions -# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec -# GFX1010: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec +# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index c027600a8af67..e2854df2468b3 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -147,15 +147,14 @@ body: | --- # GCN-LABEL: {{^}}name: vop2_instructions -# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec -# VI: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec + +# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec # VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec -# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec -# GFX9: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec +# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec # GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir index 467bc77c18577..ffbd2d092b5d8 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir @@ -37,10 +37,9 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec - ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec - ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec - ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0) + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 %2 = COPY $sgpr30_sgpr31 @@ -146,7 +145,7 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[FLAT_LOAD_DWORD]], implicit $exec ; SDWA-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 65535 ; SDWA-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[FLAT_LOAD_DWORD]], killed [[S_MOV_B32_]], implicit $exec - ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[V_AND_B32_e64_]](tied-def 0) + ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[FLAT_LOAD_DWORD]](tied-def 0) ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_MOV_B32_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: S_ENDPGM 0 %2 = COPY $sgpr30_sgpr31 @@ -181,17 +180,15 @@ body: | ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec - ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec ; SDWA-NEXT: {{ $}} ; SDWA-NEXT: bb.1: ; SDWA-NEXT: successors: %bb.2(0x80000000) ; SDWA-NEXT: {{ $}} - ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 6, 0, 1, 3, implicit $mode, implicit $exec - ; SDWA-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[V_MUL_F32_sdwa]], implicit $exec + ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec ; SDWA-NEXT: {{ $}} ; SDWA-NEXT: bb.2: - ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_LSHLREV_B32_e64_]], implicit $exec - ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0) + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 934d9efba4656..2d84e87722951 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -1230,8 +1230,7 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0 ; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1346,8 +1345,7 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff ; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0 ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ;