llvm · bob80905 · Sep 4, 2025 · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025
diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
@@ -0,0 +1,178 @@
+#--- source.hlsl
+StructuredBuffer<half4> In  : register(t0); // input: each thread reads In[tid.x]
+RWStructuredBuffer<half4> Out1 : register(u1); // scalar overload result; only .x is written
+RWStructuredBuffer<half4> Out2 : register(u2); // half2 overload result; only .xy is written
+RWStructuredBuffer<half4> Out3 : register(u3); // half3 overload result; only .xyz is written
+RWStructuredBuffer<half4> Out4 : register(u4); // half4 overload result; all components written
+RWStructuredBuffer<half4> Out5 : register(u5); // constant folding: literal argument, element 0 only
+
+[numthreads(4,1,1)] // 4 threads; NOTE(review): expected values assume all 4 run in one wave - confirm per target
+void main(uint3 tid : SV_GroupThreadID)
+{
+    half4 v = In[tid.x]; // per-thread input element
+
+    half s1 = WaveActiveMax( v.x ); // unguarded: reached by all 4 threads
+    half s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; // selected for threads 0-2, 0 for thread 3
+    half s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; // selected for threads 0-1, 0 otherwise
+    half s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; // selected for thread 0 only, 0 otherwise
+
+    half2 v2_1 = WaveActiveMax( v.xy ); // same four guard patterns, half2 overload
+    half2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : half2(0,0);
+    half2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : half2(0,0);
+    half2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : half2(0,0);
+
+    half3 v3_1 = WaveActiveMax( v.xyz ); // same four guard patterns, half3 overload
+    half3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
+    half3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
+    half3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
+
+    half4 v4_1 = WaveActiveMax( v ); // same four guard patterns, half4 overload
+    half4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : half4(0,0,0,0);
+    half4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : half4(0,0,0,0);
+    half4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : half4(0,0,0,0);
+
+    half scalars[4] = { s4, s3, s2, s1 }; // slot i holds the result guarded by tid.x < i+1 (slot 3 is the unguarded one)
+    half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; // same slot layout as scalars
+    half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; // same slot layout as scalars
+    half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    // same slot layout as scalars
+
+    Out1[tid.x].x   = scalars[tid.x]; // thread i reads slot i, whose guard it satisfies, so never the 0 fallback
+    Out2[tid.x].xy  = vec2s[tid.x]; // .zw are not written here
+    Out3[tid.x].xyz = vec3s[tid.x]; // .w is not written here
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // constant folding case: the argument is a literal, identical on every thread
+    Out5[0] = WaveActiveMax(half4(1,2,3,4)); // every thread stores to element 0
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float16
+    Stride: 8 # one half4 element = 4 x 2 bytes
+    # Half bit patterns, one half4 per thread: (1, 10, 100, 1000), (2, 20, 200, 2000), (3, 30, 300, 3000), (4, 40, 400, 4000)
+    Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
+  - Name: Out1
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32  # 4 elements x 8 bytes, one element per thread
+  - Name: Out2
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out3
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out4
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out5
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 8 # a single half4 element
+  - Name: ExpectedOut1 # element i: .x = max of In.x over threads 0..i; .yzw keep their zero init
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut2 # element i: .xy = per-component max over threads 0..i; .zw keep their zero init
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ]
+  - Name: ExpectedOut3 # element i: .xyz = per-component max over threads 0..i; .w keeps its zero init
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ]
+  - Name: ExpectedOut4 # identical to In: every component grows with the thread index, so the max over threads 0..i is thread i's value
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
+  - Name: ExpectedOut5 # the literal (1, 2, 3, 4) as half bit patterns
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# Expected failure with Clang, tracked by https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Expected failure on Metal, tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test
@@ -0,0 +1,177 @@
+#--- source.hlsl
+StructuredBuffer<float4> In  : register(t0); // input: each thread reads In[tid.x]
+RWStructuredBuffer<float4> Out1 : register(u1); // scalar overload result; only .x is written
+RWStructuredBuffer<float4> Out2 : register(u2); // float2 overload result; only .xy is written
+RWStructuredBuffer<float4> Out3 : register(u3); // float3 overload result; only .xyz is written
+RWStructuredBuffer<float4> Out4 : register(u4); // float4 overload result; all components written
+RWStructuredBuffer<float4> Out5 : register(u5); // constant folding: literal argument, element 0 only
+
+[numthreads(4,1,1)] // 4 threads; NOTE(review): expected values assume all 4 run in one wave - confirm per target
+void main(uint3 tid : SV_GroupThreadID)
+{
+    float4 v = In[tid.x]; // per-thread input element
+
+    float s1 = WaveActiveMax( v.x ); // unguarded: reached by all 4 threads
+    float s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; // selected for threads 0-2, 0 for thread 3
+    float s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; // selected for threads 0-1, 0 otherwise
+    float s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; // selected for thread 0 only, 0 otherwise
+
+    float2 v2_1 = WaveActiveMax( v.xy ); // same four guard patterns, float2 overload
+    float2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : float2(0,0);
+    float2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : float2(0,0);
+    float2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : float2(0,0);
+
+    float3 v3_1 = WaveActiveMax( v.xyz ); // same four guard patterns, float3 overload
+    float3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
+    float3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
+    float3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
+
+    float4 v4_1 = WaveActiveMax( v ); // same four guard patterns, float4 overload
+    float4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : float4(0,0,0,0);
+    float4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : float4(0,0,0,0);
+    float4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : float4(0,0,0,0);
+
+    float scalars[4] = { s4, s3, s2, s1 }; // slot i holds the result guarded by tid.x < i+1 (slot 3 is the unguarded one)
+    float2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; // same slot layout as scalars
+    float3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; // same slot layout as scalars
+    float4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    // same slot layout as scalars
+
+    Out1[tid.x].x   = scalars[tid.x]; // thread i reads slot i, whose guard it satisfies, so never the 0 fallback
+    Out2[tid.x].xy  = vec2s[tid.x]; // .zw are not written here
+    Out3[tid.x].xyz = vec3s[tid.x]; // .w is not written here
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // constant folding case: the argument is a literal, identical on every thread
+    Out5[0] = WaveActiveMax(float4(1,2,3,4)); // every thread stores to element 0
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In # one float4 per thread; every component grows with the thread index
+    Format: Float32
+    Stride: 16 # one float4 element = 4 x 4 bytes
+    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+  - Name: Out1
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64  # 4 elements x 16 bytes, one element per thread
+  - Name: Out2
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out3
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out4
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out5
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 16 # a single float4 element
+  - Name: ExpectedOut1 # element i: .x = max of In.x over threads 0..i; .yzw keep their zero init
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ]
+  - Name: ExpectedOut2 # element i: .xy = per-component max over threads 0..i; .zw keep their zero init
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ]
+  - Name: ExpectedOut3 # element i: .xyz = per-component max over threads 0..i; .w keeps its zero init
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ]
+  - Name: ExpectedOut4 # identical to In: the max over threads 0..i is thread i's own value
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+  - Name: ExpectedOut5 # the literal float4(1,2,3,4) passed to WaveActiveMax in the shader
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 2.0, 3.0, 4.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# Expected failure with Clang, tracked by https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o