From be99e2e8a2b8ff49d8de081f77cbfcb45d865876 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 7 May 2025 07:34:04 +0100
Subject: [PATCH] [VectorCombine] Fix the type used in foldShuffleOfIntrinsics
 Cost.

The shuffle result needn't have twice as many elements as its input vectors, so
the intermediate type used between the shuffle and the intrinsic should use the
ShuffleDstTy number of elements.

I found this when looking at shuffle costs and do not have a test where it
alters the output, but have added some cases where the shuffle output is not
twice the size of the input.
---
 .../Transforms/Vectorize/VectorCombine.cpp    |  2 +-
 .../X86/shuffle-of-intrinsics.ll              | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e58789b5d5641..c7d221e8d1e5c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2376,7 +2376,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
     } else {
       auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
       NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
-                                               VecTy->getNumElements() * 2));
+                                               ShuffleDstTy->getNumElements()));
       NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
                                     VecTy, OldMask, CostKind);
     }
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
index 2dc76cbfdda41..e64e7807b7d4a 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -69,6 +69,20 @@ entry:
   ret <8 x i1> %4
 }
 
+define <2 x i1> @test4b(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @test4b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[TMP2]], i32 0)
+; CHECK-NEXT:    ret <2 x i1> [[TMP3]]
+;
+entry:
+  %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+  %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+  %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <2 x i32> <i32 0, i32 4>
+  ret <2 x i1> %4
+}
+
 define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:  entry:
@@ -84,6 +98,26 @@ entry:
   ret <8 x float> %6
 }
 
+define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4 x float> %a2, <4 x float> %b2, <4 x float> %c2) {
+; SSE-LABEL: @test6(
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B1:%.*]], <4 x float> [[B2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[C1:%.*]], <4 x float> [[C2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT:    [[S:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
+; SSE-NEXT:    ret <2 x float> [[S]]
+;
+; AVX-LABEL: @test6(
+; AVX-NEXT:    [[F1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A1:%.*]], <4 x float> [[B1:%.*]], <4 x float> [[C1:%.*]])
+; AVX-NEXT:    [[F2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A2:%.*]], <4 x float> [[B2:%.*]], <4 x float> [[C2:%.*]])
+; AVX-NEXT:    [[S:%.*]] = shufflevector <4 x float> [[F1]], <4 x float> [[F2]], <2 x i32> <i32 0, i32 4>
+; AVX-NEXT:    ret <2 x float> [[S]]
+;
+  %f1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1)
+  %f2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a2, <4 x float> %b2, <4 x float> %c2)
+  %s = shufflevector <4 x float> %f1, <4 x float> %f2, <2 x i32> <i32 0, i32 4>
+  ret <2 x float> %s
+}
+
 declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
 declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
 declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)