Skip to content

Commit be99e2e

Browse files
committed
[VectorCombine] Fix the type used in foldShuffleOfIntrinsics Cost.
The shuffle result needn't have twice as many elements as the original vectors, so the intermediate type used between the shuffle and the intrinsic should use the number of elements of ShuffleDstTy. I found this when looking at shuffle costs and do not have a test where it alters the output, but I have added some cases where the shuffle output is not twice the size of the input.
1 parent 8286378 commit be99e2e

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -2376,7 +2376,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
23762376
} else {
23772377
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
23782378
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
2379-
VecTy->getNumElements() * 2));
2379+
ShuffleDstTy->getNumElements()));
23802380
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
23812381
VecTy, OldMask, CostKind);
23822382
}

llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll

+34
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,20 @@ entry:
6969
ret <8 x i1> %4
7070
}
7171

72+
define <2 x i1> @test4b(<4 x float> %0, <4 x float> %1) {
73+
; CHECK-LABEL: @test4b(
74+
; CHECK-NEXT: entry:
75+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <2 x i32> <i32 0, i32 4>
76+
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[TMP2]], i32 0)
77+
; CHECK-NEXT: ret <2 x i1> [[TMP3]]
78+
;
79+
entry:
80+
%2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
81+
%3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
82+
%4 = shufflevector <4 x i1> %2, <4 x i1> %3, <2 x i32> <i32 0, i32 4>
83+
ret <2 x i1> %4
84+
}
85+
7286
define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
7387
; CHECK-LABEL: @test5(
7488
; CHECK-NEXT: entry:
@@ -84,6 +98,26 @@ entry:
8498
ret <8 x float> %6
8599
}
86100

101+
define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4 x float> %a2, <4 x float> %b2, <4 x float> %c2) {
102+
; SSE-LABEL: @test6(
103+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], <2 x i32> <i32 0, i32 4>
104+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B1:%.*]], <4 x float> [[B2:%.*]], <2 x i32> <i32 0, i32 4>
105+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[C1:%.*]], <4 x float> [[C2:%.*]], <2 x i32> <i32 0, i32 4>
106+
; SSE-NEXT: [[S:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
107+
; SSE-NEXT: ret <2 x float> [[S]]
108+
;
109+
; AVX-LABEL: @test6(
110+
; AVX-NEXT: [[F1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A1:%.*]], <4 x float> [[B1:%.*]], <4 x float> [[C1:%.*]])
111+
; AVX-NEXT: [[F2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A2:%.*]], <4 x float> [[B2:%.*]], <4 x float> [[C2:%.*]])
112+
; AVX-NEXT: [[S:%.*]] = shufflevector <4 x float> [[F1]], <4 x float> [[F2]], <2 x i32> <i32 0, i32 4>
113+
; AVX-NEXT: ret <2 x float> [[S]]
114+
;
115+
%f1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1)
116+
%f2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a2, <4 x float> %b2, <4 x float> %c2)
117+
%s = shufflevector <4 x float> %f1, <4 x float> %f2, <2 x i32> <i32 0, i32 4>
118+
ret <2 x float> %s
119+
}
120+
87121
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
88122
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
89123
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)

0 commit comments

Comments
 (0)