Skip to content

Commit 99d450e

Browse files
committed
Revert "[AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (#123942)"
This reverts commit 6fdaaaf because it breaks check-llvm; see the comment on #123942.
1 parent: 0c71fdd; commit: 99d450e

19 files changed, +110 -400 lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

+2 -5
Original file line number | Diff line number | Diff line change
@@ -962,11 +962,8 @@ bool isConvertibleToSDWA(MachineInstr &MI,
962962
const SIInstrInfo* TII) {
963963
// Check if this is already an SDWA instruction
964964
unsigned Opc = MI.getOpcode();
965-
if (TII->isSDWA(Opc)) {
966-
// FIXME: Reenable after fixing selection handling.
967-
// Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
968-
return false;
969-
}
965+
if (TII->isSDWA(Opc))
966+
return true;
970967

971968
// Check if this instruction has opcode that supports SDWA
972969
if (AMDGPU::getSDWAOp(Opc) == -1)

llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll

+5 -10
Original file line number | Diff line number | Diff line change
@@ -280,9 +280,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
280280
; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
281281
; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
282282
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
283-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
284283
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285-
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
284+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286285
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
287286
; GFX8-NEXT: s_setpc_b64 s[30:31]
288287
;
@@ -300,8 +299,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
300299
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
301300
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
302301
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
303-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
302+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305303
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
306304
; GFX9-NEXT: s_setpc_b64 s[30:31]
307305
;
@@ -441,8 +439,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
441439
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
442440
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
443441
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
444-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
445-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
442+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
446443
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
447444
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
448445
; GFX9-NEXT: ; return to shader part epilog
@@ -612,11 +609,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
612609
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613610
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
614611
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
615-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
616-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
612+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617613
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
618-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
619-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
614+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
620615
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
621616
; GFX8-NEXT: s_setpc_b64 s[30:31]
622617
;

llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll

+5 -10
Original file line number | Diff line number | Diff line change
@@ -281,9 +281,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
281281
; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
282282
; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
283283
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
284-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
285284
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
286-
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
285+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
287286
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
288287
; GFX8-NEXT: s_setpc_b64 s[30:31]
289288
;
@@ -301,8 +300,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
301300
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
302301
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
303302
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
304-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
305-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
303+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
306304
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
307305
; GFX9-NEXT: s_setpc_b64 s[30:31]
308306
;
@@ -442,8 +440,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
442440
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
443441
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
444442
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
445-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
446-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
443+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
447444
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
448445
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
449446
; GFX9-NEXT: ; return to shader part epilog
@@ -613,11 +610,9 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
613610
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
614611
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
615612
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
617-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
613+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
618614
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
619-
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
620-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
615+
; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
621616
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
622617
; GFX8-NEXT: s_setpc_b64 s[30:31]
623618
;

llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll

+10 -16
Original file line number | Diff line number | Diff line change
@@ -224,8 +224,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
224224
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
225225
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
226226
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
227-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
228-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
227+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
229228
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
230229
; GFX9-NEXT: s_setpc_b64 s[30:31]
231230
;
@@ -330,8 +329,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
330329
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
331330
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
332331
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
333-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
334-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
332+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
335333
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
336334
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
337335
; GFX9-NEXT: ; return to shader part epilog
@@ -453,11 +451,9 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
453451
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
454452
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
455453
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
456-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
457-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
454+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
458455
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
459-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
460-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
456+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
461457
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
462458
; GFX8-NEXT: s_setpc_b64 s[30:31]
463459
;
@@ -622,20 +618,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
622618
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
623619
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
624620
; GFX8-NEXT: v_mov_b32_e32 v2, s1
621+
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
625622
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
626623
; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
627-
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
628-
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
629-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
630624
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
631625
; GFX8-NEXT: v_mov_b32_e32 v3, s1
632-
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
633-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
626+
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
627+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
634628
; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
635-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
636629
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
637-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
638-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
630+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
631+
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
632+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
639633
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
640634
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
641635
; GFX8-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll

+10 -16
Original file line number | Diff line number | Diff line change
@@ -218,8 +218,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
218218
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
219219
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
220220
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
221-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
222-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
221+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
223222
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
224223
; GFX9-NEXT: s_setpc_b64 s[30:31]
225224
;
@@ -322,8 +321,7 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
322321
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
323322
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
324323
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
325-
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
326-
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
324+
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
327325
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
328326
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
329327
; GFX9-NEXT: ; return to shader part epilog
@@ -441,11 +439,9 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
441439
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
442440
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
443441
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
444-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
445-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
442+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
446443
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
447-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
448-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
444+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
449445
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
450446
; GFX8-NEXT: s_setpc_b64 s[30:31]
451447
;
@@ -606,20 +602,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
606602
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
607603
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
608604
; GFX8-NEXT: v_mov_b32_e32 v2, s1
605+
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
609606
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610607
; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
611-
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
612-
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
613-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
614608
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
615609
; GFX8-NEXT: v_mov_b32_e32 v3, s1
616-
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
617-
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
610+
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
611+
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
618612
; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
619-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
620613
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
621-
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
622-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
614+
; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
615+
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
616+
; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
623617
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
624618
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
625619
; GFX8-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

+7 -21
Original file line number | Diff line number | Diff line change
@@ -6398,10 +6398,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
63986398
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
63996399
; GFX8-NEXT: s_waitcnt vmcnt(0)
64006400
; GFX8-NEXT: v_mov_b32_e32 v5, v0
6401-
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
6402-
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6401+
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
64036402
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
6404-
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
64056403
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
64066404
; GFX8-NEXT: v_mov_b32_e32 v0, v4
64076405
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -6627,10 +6625,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
66276625
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
66286626
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
66296627
; GFX8-NEXT: s_waitcnt vmcnt(0)
6630-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
6631-
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
6628+
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
66326629
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
6633-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
66346630
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
66356631
; GFX8-NEXT: v_mov_b32_e32 v5, v2
66366632
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7048,9 +7044,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
70487044
; GFX8-NEXT: ; =>This Loop Header: Depth=1
70497045
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
70507046
; GFX8-NEXT: s_waitcnt vmcnt(0)
7051-
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
7052-
; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7053-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
7047+
; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
70547048
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
70557049
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
70567050
; GFX8-NEXT: v_mov_b32_e32 v6, v7
@@ -7396,10 +7390,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
73967390
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
73977391
; GFX8-NEXT: s_waitcnt vmcnt(0)
73987392
; GFX8-NEXT: v_mov_b32_e32 v5, v0
7399-
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
7400-
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7393+
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
74017394
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
7402-
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
74037395
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
74047396
; GFX8-NEXT: v_mov_b32_e32 v0, v4
74057397
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -7658,10 +7650,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
76587650
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
76597651
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
76607652
; GFX8-NEXT: s_waitcnt vmcnt(0)
7661-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
7662-
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7653+
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
76637654
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
7664-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
76657655
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
76667656
; GFX8-NEXT: v_mov_b32_e32 v5, v2
76677657
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7925,10 +7915,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
79257915
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
79267916
; GFX8-NEXT: s_waitcnt vmcnt(0)
79277917
; GFX8-NEXT: v_mov_b32_e32 v5, v0
7928-
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
7929-
; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7918+
; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
79307919
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
7931-
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
79327920
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
79337921
; GFX8-NEXT: v_mov_b32_e32 v0, v4
79347922
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -8187,10 +8175,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
81878175
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
81888176
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
81898177
; GFX8-NEXT: s_waitcnt vmcnt(0)
8190-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
8191-
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
8178+
; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
81928179
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
8193-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
81948180
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
81958181
; GFX8-NEXT: v_mov_b32_e32 v5, v2
81968182
; GFX8-NEXT: v_mov_b32_e32 v4, v1

0 commit comments

Comments
 (0)