-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LoongArch] Fix broadcast load with extension. #155960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-loongarch Author: None (tangaac) Changes: PR #135896 introduces [x]vldrepl instructions without handling extension. Full diff: https://github.com/llvm/llvm-project/pull/155960.diff 3 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ffb6c2980026f..478c335c3f07e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2471,8 +2471,9 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
if (!IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
return SDValue();
- if (IsIdeneity) {
+ if (IsIdeneity && ISD::isNON_EXTLoad(IdentitySrc.getNode())) {
auto *LN = cast<LoadSDNode>(IdentitySrc);
+
SDVTList Tys =
LN->isIndexed()
? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
index 976924bdca686..89592a0886cc1 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
@@ -18,6 +18,32 @@ define <4 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst) {
ret <4 x i64> %tmp2
}
+define <16 x i16> @should_not_be_optimized_sext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_sext_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = sext i8 %tmp to i16
+ %tmp2 = insertelement <16 x i16> zeroinitializer, i16 %tmp1, i32 0
+ %tmp3 = shufflevector <16 x i16> %tmp2, <16 x i16> poison, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @should_not_be_optimized_zext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_zext_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.bu $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = zext i8 %tmp to i16
+ %tmp2 = insertelement <16 x i16> zeroinitializer, i16 %tmp1, i32 0
+ %tmp3 = shufflevector <16 x i16> %tmp2, <16 x i16> poison, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp3
+}
+
define <4 x i64> @xvldrepl_d_unaligned_offset(ptr %ptr) {
; CHECK-LABEL: xvldrepl_d_unaligned_offset:
; CHECK: # %bb.0:
@@ -34,7 +60,8 @@ define <4 x i64> @xvldrepl_d_unaligned_offset(ptr %ptr) {
define <32 x i8> @xvldrepl_b(ptr %ptr) {
; CHECK-LABEL: xvldrepl_b:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvldrepl.b $xr0, $a0, 0
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
; CHECK-NEXT: ret
%tmp = load i8, ptr %ptr
%tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
@@ -45,7 +72,8 @@ define <32 x i8> @xvldrepl_b(ptr %ptr) {
define <32 x i8> @xvldrepl_b_offset(ptr %ptr) {
; CHECK-LABEL: xvldrepl_b_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvldrepl.b $xr0, $a0, 33
+; CHECK-NEXT: ld.b $a0, $a0, 33
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
; CHECK-NEXT: ret
%p = getelementptr i8, ptr %ptr, i64 33
%tmp = load i8, ptr %p
@@ -58,7 +86,8 @@ define <32 x i8> @xvldrepl_b_offset(ptr %ptr) {
define <16 x i16> @xvldrepl_h(ptr %ptr) {
; CHECK-LABEL: xvldrepl_h:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvldrepl.h $xr0, $a0, 0
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
; CHECK-NEXT: ret
%tmp = load i16, ptr %ptr
%tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
@@ -69,7 +98,8 @@ define <16 x i16> @xvldrepl_h(ptr %ptr) {
define <16 x i16> @xvldrepl_h_offset(ptr %ptr) {
; CHECK-LABEL: xvldrepl_h_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvldrepl.h $xr0, $a0, 66
+; CHECK-NEXT: ld.h $a0, $a0, 66
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
; CHECK-NEXT: ret
%p = getelementptr i16, ptr %ptr, i64 33
%tmp = load i16, ptr %p
@@ -81,7 +111,8 @@ define <16 x i16> @xvldrepl_h_offset(ptr %ptr) {
define <8 x i32> @xvldrepl_w(ptr %ptr) {
; CHECK-LABEL: xvldrepl_w:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvldrepl.w $xr0, $a0, 0
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0
; CHECK-NEXT: ret
%tmp = load i32, ptr %ptr
%tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -92,7 +123,8 @@ define <8 x i32> @xvldrepl_w(ptr %ptr) {
define <8 x i32> @xvldrepl_w_offset(ptr %ptr) {
; CHECK-LABEL: xvldrepl_w_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: xvldrepl.w $xr0, $a0, 132
+; CHECK-NEXT: ld.w $a0, $a0, 132
+; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0
; CHECK-NEXT: ret
%p = getelementptr i32, ptr %ptr, i64 33
%tmp = load i32, ptr %p
diff --git a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
index c46747ef30509..a8cddbf9e6400 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
@@ -18,6 +18,32 @@ define <2 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst){
ret <2 x i64> %tmp2
}
+define <8 x i16> @should_not_be_optimized_sext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_sext_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = sext i8 %tmp to i16
+ %tmp2 = insertelement <8 x i16> zeroinitializer, i16 %tmp1, i32 0
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @should_not_be_optimized_zext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_zext_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.bu $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = zext i8 %tmp to i16
+ %tmp2 = insertelement <8 x i16> zeroinitializer, i16 %tmp1, i32 0
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp3
+}
+
define <2 x i64> @vldrepl_d_unaligned_offset(ptr %ptr) {
; CHECK-LABEL: vldrepl_d_unaligned_offset:
; CHECK: # %bb.0:
@@ -34,7 +60,8 @@ define <2 x i64> @vldrepl_d_unaligned_offset(ptr %ptr) {
define <16 x i8> @vldrepl_b(ptr %ptr) {
; CHECK-LABEL: vldrepl_b:
; CHECK: # %bb.0:
-; CHECK-NEXT: vldrepl.b $vr0, $a0, 0
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
; CHECK-NEXT: ret
%tmp = load i8, ptr %ptr
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
@@ -45,7 +72,8 @@ define <16 x i8> @vldrepl_b(ptr %ptr) {
define <16 x i8> @vldrepl_b_offset(ptr %ptr) {
; CHECK-LABEL: vldrepl_b_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: vldrepl.b $vr0, $a0, 33
+; CHECK-NEXT: ld.b $a0, $a0, 33
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
; CHECK-NEXT: ret
%p = getelementptr i8, ptr %ptr, i64 33
%tmp = load i8, ptr %p
@@ -58,7 +86,8 @@ define <16 x i8> @vldrepl_b_offset(ptr %ptr) {
define <8 x i16> @vldrepl_h(ptr %ptr) {
; CHECK-LABEL: vldrepl_h:
; CHECK: # %bb.0:
-; CHECK-NEXT: vldrepl.h $vr0, $a0, 0
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
; CHECK-NEXT: ret
%tmp = load i16, ptr %ptr
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
@@ -69,7 +98,8 @@ define <8 x i16> @vldrepl_h(ptr %ptr) {
define <8 x i16> @vldrepl_h_offset(ptr %ptr) {
; CHECK-LABEL: vldrepl_h_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: vldrepl.h $vr0, $a0, 66
+; CHECK-NEXT: ld.h $a0, $a0, 66
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
; CHECK-NEXT: ret
%p = getelementptr i16, ptr %ptr, i64 33
%tmp = load i16, ptr %p
@@ -81,7 +111,8 @@ define <8 x i16> @vldrepl_h_offset(ptr %ptr) {
define <4 x i32> @vldrepl_w(ptr %ptr) {
; CHECK-LABEL: vldrepl_w:
; CHECK: # %bb.0:
-; CHECK-NEXT: vldrepl.w $vr0, $a0, 0
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.w $vr0, $a0
; CHECK-NEXT: ret
%tmp = load i32, ptr %ptr
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -92,7 +123,8 @@ define <4 x i32> @vldrepl_w(ptr %ptr) {
define <4 x i32> @vldrepl_w_offset(ptr %ptr) {
; CHECK-LABEL: vldrepl_w_offset:
; CHECK: # %bb.0:
-; CHECK-NEXT: vldrepl.w $vr0, $a0, 132
+; CHECK-NEXT: ld.w $a0, $a0, 132
+; CHECK-NEXT: vreplgr2vr.w $vr0, $a0
; CHECK-NEXT: ret
%p = getelementptr i32, ptr %ptr, i64 33
%tmp = load i32, ptr %p
@@ -169,3 +201,4 @@ define <2 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
%tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
ret <2 x double> %tmp2
}
+
Suggested change:

    auto *LN = cast<LoadSDNode>(IdentitySrc);
    auto ExtType = LN->getExtensionType();

    if (IsIdeneity && (ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD)) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you think it makes sense to precisely match the sizes of VT.getVectorElementType() and LN->getMemoryVT()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It matches the case like this.
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t16: i64,ch = load<(load (s8) from %ir.ptr), anyext from i8> t0, t2, undef:i64
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t5: i64,ch = load<(load (s64) from %ir.ptr)> t0, t2, undef:i64
The first dump is an anyext load from i8 (which must be rejected); the second is a plain non-extending i64 load (which can still be optimized).
PR #135896 introduces [x]vldrepl instructions without handling extension.
This patch will fix that.