From dba563fa030b2e3cf80899d0db3dce53540f6327 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Thu, 4 Sep 2025 15:32:58 -0700 Subject: [PATCH 01/11] fp16 seems to work --- test/WaveOps/WaveActiveMax.fp16.test | 177 +++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 test/WaveOps/WaveActiveMax.fp16.test diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test new file mode 100644 index 000000000..dc0dde2b1 --- /dev/null +++ b/test/WaveOps/WaveActiveMax.fp16.test @@ -0,0 +1,177 @@ +#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test half2 +RWStructuredBuffer Out3 : register(u3); // test half3 +RWStructuredBuffer Out4 : register(u4); // test half4 +RWStructuredBuffer Out5 : register(u5); // constant folding + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + half4 v = In[tid.x]; + + half s1 = WaveActiveMax( v.x ); + half s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; + half s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; + half s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; + + half2 v2_1 = WaveActiveMax( v.xy ); + half2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : half2(0,0); + half2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : half2(0,0); + half2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : half2(0,0); + + half3 v3_1 = WaveActiveMax( v.xyz ); + half3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : half3(0,0,0); + half3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : half3(0,0,0); + half3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : half3(0,0,0); + + half4 v4_1 = WaveActiveMax( v ); + half4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : half4(0,0,0,0); + half4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : half4(0,0,0,0); + half4 v4_4 = tid.x < 1 ? 
WaveActiveMax( v ) : half4(0,0,0,0); + + half scalars[4] = { s4, s3, s2, s1 }; + half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; + half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; + half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; + + Out1[tid.x].x = scalars[tid.x]; + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + + // constant folding case + Out5[0] = WaveActiveMax(half4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Float16 + Stride: 8 + # 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 + Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ] + - Name: Out1 + Format: Float16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out2 + Format: Float16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out3 + Format: Float16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out4 + Format: Float16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out5 + Format: Float16 + Stride: 8 + ZeroInitSize: 8 + - Name: ExpectedOut1 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ] + - Name: ExpectedOut2 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ] + - Name: ExpectedOut3 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ] + - Name: ExpectedOut4 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ] + - Name: ExpectedOut5 + Format: Float16 + Stride: 8 + Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: 
ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + +... 
+#--- end + +# XFAIL: Clang + + + +# RUN: split-file %s %t +# RUN: %if !Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl %} +# RUN: %if Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -fspv-target-env=vulkan1.1 -Fo %t.o %t/source.hlsl %} +# RUN: %offloader %t/pipeline.yaml %t.o From 8fec3b6c3ebfe097ddb1bc71f6da9526c8c5582c Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 5 Sep 2025 12:07:42 -0700 Subject: [PATCH 02/11] add uint variants --- test/WaveOps/WaveActiveMax.fp16.test | 3 +- test/WaveOps/WaveActiveMax.fp32.test | 175 ++++++++++++++ test/WaveOps/WaveActiveMax.fp64.test | 177 ++++++++++++++ test/WaveOps/WaveActiveMax.int16.test | 327 ++++++++++++++++++++++++++ test/WaveOps/WaveActiveMax.int32.test | 327 ++++++++++++++++++++++++++ test/WaveOps/WaveActiveMax.int64.test | 327 ++++++++++++++++++++++++++ 6 files changed, 1334 insertions(+), 2 deletions(-) create mode 100644 test/WaveOps/WaveActiveMax.fp32.test create mode 100644 test/WaveOps/WaveActiveMax.fp64.test create mode 100644 test/WaveOps/WaveActiveMax.int16.test create mode 100644 test/WaveOps/WaveActiveMax.int32.test create mode 100644 test/WaveOps/WaveActiveMax.int64.test diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test index dc0dde2b1..41e80e27f 100644 --- a/test/WaveOps/WaveActiveMax.fp16.test +++ b/test/WaveOps/WaveActiveMax.fp16.test @@ -172,6 +172,5 @@ DescriptorSets: # RUN: split-file %s %t -# RUN: %if !Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl %} -# RUN: %if Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -fspv-target-env=vulkan1.1 -Fo %t.o %t/source.hlsl %} +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test new file mode 100644 index 000000000..9aeba5943 --- /dev/null +++ b/test/WaveOps/WaveActiveMax.fp32.test @@ -0,0 +1,175 @@ 
+#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test float2 +RWStructuredBuffer Out3 : register(u3); // test float3 +RWStructuredBuffer Out4 : register(u4); // test float4 +RWStructuredBuffer Out5 : register(u5); // constant folding + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + float4 v = In[tid.x]; + + float s1 = WaveActiveMax( v.x ); + float s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; + float s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; + float s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; + + float2 v2_1 = WaveActiveMax( v.xy ); + float2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : float2(0,0); + float2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : float2(0,0); + float2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : float2(0,0); + + float3 v3_1 = WaveActiveMax( v.xyz ); + float3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : float3(0,0,0); + float3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : float3(0,0,0); + float3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : float3(0,0,0); + + float4 v4_1 = WaveActiveMax( v ); + float4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : float4(0,0,0,0); + float4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : float4(0,0,0,0); + float4 v4_4 = tid.x < 1 ? 
WaveActiveMax( v ) : float4(0,0,0,0); + + float scalars[4] = { s4, s3, s2, s1 }; + float2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; + float3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; + float4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; + + Out1[tid.x].x = scalars[tid.x]; + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + + // constant folding case + Out5[0] = WaveActiveMax(float4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Float32 + Stride: 16 + Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + - Name: Out1 + Format: Float32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out2 + Format: Float32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out3 + Format: Float32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out4 + Format: Float32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out5 + Format: Float32 + Stride: 16 + ZeroInitSize: 16 + - Name: ExpectedOut1 + Format: Float32 + Stride: 16 + Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ] + - Name: ExpectedOut2 + Format: Float32 + Stride: 16 + Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ] + - Name: ExpectedOut3 + Format: Float32 + Stride: 16 + Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ] + - Name: ExpectedOut4 + Format: Float32 + Stride: 16 + Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + - Name: ExpectedOut5 + Format: Float32 + Stride: 16 + Data: [ 1.0, 2.0, 3.0, 4.0 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 
+ - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + +... +#--- end + +# XFAIL: Clang + + + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test new file mode 100644 index 000000000..feac35dea --- /dev/null +++ b/test/WaveOps/WaveActiveMax.fp64.test @@ -0,0 +1,177 @@ +#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test double2 +RWStructuredBuffer Out3 : register(u3); // test double3 +RWStructuredBuffer Out4 : register(u4); // test double4 +RWStructuredBuffer Out5 : register(u5); // constant folding + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + double4 v = In[tid.x]; + + double s1 = WaveActiveMax( v.x ); + double s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; + double s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; + double s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; + + double2 v2_1 = WaveActiveMax( v.xy ); + double2 v2_2 = tid.x < 3 ? 
WaveActiveMax( v.xy ) : double2(0,0); + double2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : double2(0,0); + double2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : double2(0,0); + + double3 v3_1 = WaveActiveMax( v.xyz ); + double3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : double3(0,0,0); + double3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : double3(0,0,0); + double3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : double3(0,0,0); + + double4 v4_1 = WaveActiveMax( v ); + double4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : double4(0,0,0,0); + double4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : double4(0,0,0,0); + double4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : double4(0,0,0,0); + + double scalars[4] = { s4, s3, s2, s1 }; + double2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; + double3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; + double4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; + + Out1[tid.x].x = scalars[tid.x]; + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + + // constant folding case + Out5[0] = WaveActiveMax(double4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Float64 + Stride: 32 + Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + - Name: Out1 + Format: Float64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out2 + Format: Float64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out3 + Format: Float64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out4 + Format: Float64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out5 + Format: Float64 + Stride: 32 + ZeroInitSize: 32 + - Name: ExpectedOut1 + Format: Float64 + Stride: 32 + Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ] + - Name: ExpectedOut2 + Format: Float64 + Stride: 32 + Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ] + - Name: ExpectedOut3 + Format: 
Float64 + Stride: 32 + Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ] + - Name: ExpectedOut4 + Format: Float64 + Stride: 32 + Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + - Name: ExpectedOut5 + Format: Float64 + Stride: 32 + Data: [ 1.0, 2.0, 3.0, 4.0 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + +... 
+#--- end + +# REQUIRES: Double + +# Bug https://github.com/llvm/llvm-project/issues/156775 +# XFAIL: Clang + + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test new file mode 100644 index 000000000..81546cc91 --- /dev/null +++ b/test/WaveOps/WaveActiveMax.int16.test @@ -0,0 +1,327 @@ +#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int16_t2 +RWStructuredBuffer Out3 : register(u3); // test int16_t3 +RWStructuredBuffer Out4 : register(u4); // test int16_t4 +RWStructuredBuffer Out5 : register(u5); // constant folding + +// uints +StructuredBuffer UIn : register(t6); +RWStructuredBuffer UOut1 : register(u7); +RWStructuredBuffer UOut2 : register(u8); +RWStructuredBuffer UOut3 : register(u9); +RWStructuredBuffer UOut4 : register(u10); +RWStructuredBuffer UOut5 : register(u11); + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + int16_t4 v = In[tid.x]; + + int16_t s1 = WaveActiveMax( v.x ); + int16_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; + int16_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; + int16_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; + + int16_t2 v2_1 = WaveActiveMax( v.xy ); + int16_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int16_t2(0,0); + int16_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int16_t2(0,0); + int16_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int16_t2(0,0); + + int16_t3 v3_1 = WaveActiveMax( v.xyz ); + int16_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0); + int16_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0); + int16_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0); + + int16_t4 v4_1 = WaveActiveMax( v ); + int16_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int16_t4(0,0,0,0); + int16_t4 v4_3 = tid.x < 2 ? 
WaveActiveMax( v ) : int16_t4(0,0,0,0); + int16_t4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int16_t4(0,0,0,0); + + int16_t scalars[4] = { s4, s3, s2, s1 }; + int16_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; + int16_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; + int16_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; + + Out1[tid.x].x = scalars[tid.x]; + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + + // constant folding case + Out5[0] = WaveActiveMax(int16_t4(1,2,3,4)); + + // UINT case + + uint16_t4 uv = UIn[tid.x]; + + uint16_t us1 = WaveActiveMax( uv.x ); + uint16_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; + uint16_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; + uint16_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; + + uint16_t2 uv2_1 = WaveActiveMax( uv.xy ); + uint16_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0); + uint16_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0); + uint16_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0); + + uint16_t3 uv3_1 = WaveActiveMax( uv.xyz ); + uint16_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0); + uint16_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0); + uint16_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0); + + uint16_t4 uv4_1 = WaveActiveMax( uv ); + uint16_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0); + uint16_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0); + uint16_t4 uv4_4 = tid.x < 1 ? 
WaveActiveMax( uv ) : uint16_t4(0,0,0,0); + + uint16_t uscalars[4] = { us4, us3, us2, us1 }; + uint16_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 }; + uint16_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 }; + uint16_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 }; + + UOut1[tid.x].x = uscalars[tid.x]; + UOut2[tid.x].xy = uvec2s[tid.x]; + UOut3[tid.x].xyz = uvec3s[tid.x]; + UOut4[tid.x] = uvec4s[tid.x]; + + // constant folding case + UOut5[0] = WaveActiveMax(uint16_t4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int16 + Stride: 8 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: Out1 + Format: Int16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out2 + Format: Int16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out3 + Format: Int16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out4 + Format: Int16 + Stride: 8 + ZeroInitSize: 32 + - Name: Out5 + Format: Int16 + Stride: 8 + ZeroInitSize: 8 + - Name: ExpectedOut1 + Format: Int16 + Stride: 8 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int16 + Stride: 8 + Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + - Name: ExpectedOut3 + Format: Int16 + Stride: 8 + Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + - Name: ExpectedOut4 + Format: Int16 + Stride: 8 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: ExpectedOut5 + Format: Int16 + Stride: 8 + Data: [ 1, 2, 3, 4 ] + - Name: UIn + Format: UInt16 + Stride: 2 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: UOut1 + Format: UInt16 + Stride: 8 + ZeroInitSize: 32 + - Name: UOut2 + Format: UInt16 + Stride: 8 + ZeroInitSize: 32 + - Name: UOut3 + Format: UInt16 + Stride: 8 + ZeroInitSize: 32 + - Name: UOut4 + Format: UInt16 + Stride: 8 + ZeroInitSize: 32 + - Name: UOut5 + Format: UInt16 + 
Stride: 8 + ZeroInitSize: 8 + - Name: UExpectedOut1 + Format: UInt16 + Stride: 8 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt16 + Stride: 8 + Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt16 + Stride: 8 + Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + - Name: UExpectedOut4 + Format: UInt16 + Stride: 8 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: UExpectedOut5 + Format: UInt16 + Stride: 2 + Data: [ 1, 2, 3, 4 ] + +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: UExpectedOut5 + Rule: BufferExact + Actual: UOut5 + Expected: UExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer 
+ DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: UOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + +... +#--- end + +# XFAIL: Clang + + + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test new file mode 100644 index 000000000..4d1204bd0 --- /dev/null +++ b/test/WaveOps/WaveActiveMax.int32.test @@ -0,0 +1,327 @@ +#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int2 +RWStructuredBuffer Out3 : register(u3); // test int3 +RWStructuredBuffer Out4 : register(u4); // test int4 +RWStructuredBuffer Out5 : register(u5); // constant folding + +// uints +StructuredBuffer UIn : register(t6); +RWStructuredBuffer UOut1 : register(u7); +RWStructuredBuffer UOut2 : register(u8); +RWStructuredBuffer UOut3 : register(u9); +RWStructuredBuffer UOut4 : register(u10); +RWStructuredBuffer UOut5 : register(u11); + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + int4 v = In[tid.x]; + + int s1 = WaveActiveMax( v.x ); 
+ int s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; + int s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; + int s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; + + int2 v2_1 = WaveActiveMax( v.xy ); + int2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int2(0,0); + int2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int2(0,0); + int2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int2(0,0); + + int3 v3_1 = WaveActiveMax( v.xyz ); + int3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int3(0,0,0); + int3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int3(0,0,0); + int3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int3(0,0,0); + + int4 v4_1 = WaveActiveMax( v ); + int4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int4(0,0,0,0); + int4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int4(0,0,0,0); + int4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int4(0,0,0,0); + + int scalars[4] = { s4, s3, s2, s1 }; + int2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; + int3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; + int4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; + + Out1[tid.x].x = scalars[tid.x]; + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + + // constant folding case + Out5[0] = WaveActiveMax(int4(1,2,3,4)); + + // UINT case + + uint4 uv = UIn[tid.x]; + + uint us1 = WaveActiveMax( uv.x ); + uint us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; + uint us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; + uint us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; + + uint2 uv2_1 = WaveActiveMax( uv.xy ); + uint2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint2(0,0); + uint2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint2(0,0); + uint2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint2(0,0); + + uint3 uv3_1 = WaveActiveMax( uv.xyz ); + uint3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0); + uint3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0); + uint3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0); + + uint4 uv4_1 = WaveActiveMax( uv ); + uint4 uv4_2 = tid.x < 3 ? 
WaveActiveMax( uv ) : uint4(0,0,0,0); + uint4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint4(0,0,0,0); + uint4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint4(0,0,0,0); + + uint uscalars[4] = { us4, us3, us2, us1 }; + uint2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 }; + uint3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 }; + uint4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 }; + + UOut1[tid.x].x = uscalars[tid.x]; + UOut2[tid.x].xy = uvec2s[tid.x]; + UOut3[tid.x].xyz = uvec3s[tid.x]; + UOut4[tid.x] = uvec4s[tid.x]; + + // constant folding case + UOut5[0] = WaveActiveMax(uint4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int32 + Stride: 16 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: Out1 + Format: Int32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out2 + Format: Int32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out3 + Format: Int32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out4 + Format: Int32 + Stride: 16 + ZeroInitSize: 64 + - Name: Out5 + Format: Int32 + Stride: 16 + ZeroInitSize: 16 + - Name: ExpectedOut1 + Format: Int32 + Stride: 16 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int32 + Stride: 16 + Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + - Name: ExpectedOut3 + Format: Int32 + Stride: 16 + Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + - Name: ExpectedOut4 + Format: Int32 + Stride: 16 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: ExpectedOut5 + Format: Int32 + Stride: 16 + Data: [ 1, 2, 3, 4 ] + - Name: UIn + Format: UInt32 + Stride: 16 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: UOut1 + Format: UInt32 + Stride: 16 + ZeroInitSize: 64 + - Name: UOut2 + Format: UInt32 + Stride: 16 + ZeroInitSize: 64 + - Name: UOut3 + Format: UInt32 + 
Stride: 16 + ZeroInitSize: 64 + - Name: UOut4 + Format: UInt32 + Stride: 16 + ZeroInitSize: 64 + - Name: UOut5 + Format: UInt32 + Stride: 16 + ZeroInitSize: 16 + - Name: UExpectedOut1 + Format: UInt32 + Stride: 16 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt32 + Stride: 16 + Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt32 + Stride: 16 + Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + - Name: UExpectedOut4 + Format: UInt32 + Stride: 16 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: UExpectedOut5 + Format: UInt32 + Stride: 4 + Data: [ 1, 2, 3, 4 ] + +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: UExpectedOut5 + Rule: BufferExact + Actual: UOut5 + Expected: UExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + 
Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: UOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + +... 
+#--- end + +# XFAIL: Clang + + + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test new file mode 100644 index 000000000..e6956cec6 --- /dev/null +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -0,0 +1,327 @@ +#--- source.hlsl +StructuredBuffer In : register(t0); +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int64_t2 +RWStructuredBuffer Out3 : register(u3); // test int64_t3 +RWStructuredBuffer Out4 : register(u4); // test int64_t4 +RWStructuredBuffer Out5 : register(u5); // constant folding + +// uint64_ts +StructuredBuffer UIn : register(t6); +RWStructuredBuffer UOut1 : register(u7); +RWStructuredBuffer UOut2 : register(u8); +RWStructuredBuffer UOut3 : register(u9); +RWStructuredBuffer UOut4 : register(u10); +RWStructuredBuffer UOut5 : register(u11); + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) // runs WaveActiveMax on 1- to 4-wide int64_t and uint64_t values, unguarded and behind tid.x < 3/2/1 guards, and stores one result per thread +{ + int64_t4 v = In[tid.x]; // each thread loads the In element at its own tid.x + + int64_t s1 = WaveActiveMax( v.x ); // unguarded call + int64_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; // guarded call: threads with tid.x >= 3 get 0 instead + int64_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; + int64_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; + + int64_t2 v2_1 = WaveActiveMax( v.xy ); + int64_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int64_t2(0,0); + int64_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int64_t2(0,0); + int64_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int64_t2(0,0); + + int64_t3 v3_1 = WaveActiveMax( v.xyz ); + int64_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0); + int64_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0); + int64_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0); + + int64_t4 v4_1 = WaveActiveMax( v ); + int64_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int64_t4(0,0,0,0); + int64_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int64_t4(0,0,0,0); + int64_t4 v4_4 = tid.x < 1 ? 
WaveActiveMax( v ) : int64_t4(0,0,0,0); + + int64_t scalars[4] = { s4, s3, s2, s1 }; // index 0 holds the tid.x < 1 result, index 3 the unguarded one + int64_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; + int64_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; + int64_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; + + Out1[tid.x].x = scalars[tid.x]; // each thread stores the entry selected by its own tid.x; only .x is written here + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + + // constant folding case + Out5[0] = WaveActiveMax(int64_t4(1,2,3,4)); + + // UINT64_t case + + uint64_t4 uv = UIn[tid.x]; + + uint64_t us1 = WaveActiveMax( uv.x ); + uint64_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; + uint64_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; + uint64_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; + + uint64_t2 uv2_1 = WaveActiveMax( uv.xy ); + uint64_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0); + uint64_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0); + uint64_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0); + + uint64_t3 uv3_1 = WaveActiveMax( uv.xyz ); + uint64_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0); + uint64_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0); + uint64_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0); + + uint64_t4 uv4_1 = WaveActiveMax( uv ); + uint64_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0); + uint64_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0); + uint64_t4 uv4_4 = tid.x < 1 ? 
WaveActiveMax( uv ) : uint64_t4(0,0,0,0); + + uint64_t uscalars[4] = { us4, us3, us2, us1 }; // same ordering as the signed arrays: index 0 holds the tid.x < 1 result + uint64_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 }; + uint64_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 }; + uint64_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 }; + + UOut1[tid.x].x = uscalars[tid.x]; + UOut2[tid.x].xy = uvec2s[tid.x]; + UOut3[tid.x].xyz = uvec3s[tid.x]; + UOut4[tid.x] = uvec4s[tid.x]; + + // constant folding case + UOut5[0] = WaveActiveMax(uint64_t4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [1, 1, 1] +Buffers: + - Name: In + Format: Int64 + Stride: 32 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: Out1 + Format: Int64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out2 + Format: Int64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out3 + Format: Int64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out4 + Format: Int64 + Stride: 32 + ZeroInitSize: 128 + - Name: Out5 + Format: Int64 + Stride: 32 + ZeroInitSize: 32 + - Name: ExpectedOut1 + Format: Int64 + Stride: 32 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int64 + Stride: 32 + Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + - Name: ExpectedOut3 + Format: Int64 + Stride: 32 + Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + - Name: ExpectedOut4 + Format: Int64 + Stride: 32 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: ExpectedOut5 + Format: Int64 + Stride: 8 + Data: [ 1, 2, 3, 4 ] + - Name: UIn + Format: UInt64 + Stride: 32 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: UOut1 + Format: UInt64 + Stride: 32 + ZeroInitSize: 128 + - Name: UOut2 + Format: UInt64 + Stride: 32 + ZeroInitSize: 128 + - Name: UOut3 + Format: UInt64 + Stride: 32 + ZeroInitSize: 128 + - Name: UOut4 + Format: UInt64 + Stride: 32 + ZeroInitSize: 128 + - Name: 
UOut5 + Format: UInt64 + Stride: 32 + ZeroInitSize: 32 + - Name: UExpectedOut1 + Format: UInt64 + Stride: 32 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt64 + Stride: 32 + Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt64 + Stride: 32 + Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + - Name: UExpectedOut4 + Format: UInt64 + Stride: 32 + Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + - Name: UExpectedOut5 + Format: UInt64 + Stride: 8 + Data: [ 1, 2, 3, 4 ] + +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: UExpectedOut5 + Rule: BufferExact + Actual: UOut5 + Expected: UExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: 
Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: UOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + +... +#--- end + +# XFAIL: Clang + + + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o From f810b53bd2a6cde32acabdef3a5cd215ebbda6b1 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 5 Sep 2025 12:26:19 -0700 Subject: [PATCH 03/11] add metal xfail, binding offset errors --- test/WaveOps/WaveActiveMax.fp16.test | 3 ++- test/WaveOps/WaveActiveMax.fp32.test | 3 ++- test/WaveOps/WaveActiveMax.fp64.test | 2 ++ test/WaveOps/WaveActiveMax.int16.test | 3 ++- test/WaveOps/WaveActiveMax.int32.test | 3 ++- test/WaveOps/WaveActiveMax.int64.test | 3 ++- 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test index 41e80e27f..8c2dfc122 100644 --- a/test/WaveOps/WaveActiveMax.fp16.test +++ b/test/WaveOps/WaveActiveMax.fp16.test @@ -169,7 +169,8 @@ DescriptorSets: # XFAIL: Clang - +# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target 
-enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test index 9aeba5943..6155e821f 100644 --- a/test/WaveOps/WaveActiveMax.fp32.test +++ b/test/WaveOps/WaveActiveMax.fp32.test @@ -168,7 +168,8 @@ DescriptorSets: # XFAIL: Clang - +# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test index feac35dea..043a9e99a 100644 --- a/test/WaveOps/WaveActiveMax.fp64.test +++ b/test/WaveOps/WaveActiveMax.fp64.test @@ -171,6 +171,8 @@ DescriptorSets: # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang +# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test index 81546cc91..4a9cb7c7d 100644 --- a/test/WaveOps/WaveActiveMax.int16.test +++ b/test/WaveOps/WaveActiveMax.int16.test @@ -320,7 +320,8 @@ DescriptorSets: # XFAIL: Clang - +# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test index 4d1204bd0..80b9a88aa 100644 --- a/test/WaveOps/WaveActiveMax.int32.test +++ b/test/WaveOps/WaveActiveMax.int32.test @@ -320,7 +320,8 @@ DescriptorSets: # XFAIL: Clang - +# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index e6956cec6..caf7a3b7d 100644 --- a/test/WaveOps/WaveActiveMax.int64.test 
+++ b/test/WaveOps/WaveActiveMax.int64.test @@ -320,7 +320,8 @@ DescriptorSets: # XFAIL: Clang - +# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# XFAIL: Metal # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl From 69c5020aeabe9f7e7c1aae70a2d7bf31654b9c8d Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 5 Sep 2025 12:52:38 -0700 Subject: [PATCH 04/11] add XFAIL for warp --- test/WaveOps/WaveActiveMax.int64.test | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index caf7a3b7d..f7f769129 100644 --- a/test/WaveOps/WaveActiveMax.int64.test +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -320,9 +320,12 @@ DescriptorSets: # XFAIL: Clang -# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# Bug https://github.com/llvm/offload-test-suite/issues/393 # XFAIL: Metal +# Bug https://github.com/llvm/offload-test-suite/issues/430 +# XFAIL: Warp + # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl # RUN: %offloader %t/pipeline.yaml %t.o From c27263399294fe456ef1624501c241c7456606b7 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 5 Sep 2025 13:35:56 -0700 Subject: [PATCH 05/11] change case of WARP --- test/WaveOps/WaveActiveMax.int64.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index f7f769129..5a3ae612c 100644 --- a/test/WaveOps/WaveActiveMax.int64.test +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -324,7 +324,7 @@ DescriptorSets: # XFAIL: Metal # Bug https://github.com/llvm/offload-test-suite/issues/430 -# XFAIL: Warp +# XFAIL: WARP # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl From 601f2fa6979da2e7ba9132743533f39e1894e877 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 5 Sep 2025 14:08:05 -0700 Subject: [PATCH 06/11] try directx-warp 
--- test/WaveOps/WaveActiveMax.int64.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index 5a3ae612c..60cb7fda2 100644 --- a/test/WaveOps/WaveActiveMax.int64.test +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -324,7 +324,7 @@ DescriptorSets: # XFAIL: Metal # Bug https://github.com/llvm/offload-test-suite/issues/430 -# XFAIL: WARP +# XFAIL: DirectX-WARP # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl From 31759d74c9498c515cb2d85707c5625036f3cb76 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Mon, 8 Sep 2025 10:28:56 -0700 Subject: [PATCH 07/11] add Bug to clang xfails --- test/WaveOps/WaveActiveMax.fp16.test | 3 ++- test/WaveOps/WaveActiveMax.fp32.test | 1 + test/WaveOps/WaveActiveMax.int16.test | 1 + test/WaveOps/WaveActiveMax.int32.test | 1 + test/WaveOps/WaveActiveMax.int64.test | 1 + 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test index 8c2dfc122..a79eb90ca 100644 --- a/test/WaveOps/WaveActiveMax.fp16.test +++ b/test/WaveOps/WaveActiveMax.fp16.test @@ -167,9 +167,10 @@ DescriptorSets: ... #--- end +# Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang -# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# Bug https://github.com/llvm/offload-test-suite/issues/393 # XFAIL: Metal # RUN: split-file %s %t diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test index 6155e821f..4b7fcaf7f 100644 --- a/test/WaveOps/WaveActiveMax.fp32.test +++ b/test/WaveOps/WaveActiveMax.fp32.test @@ -166,6 +166,7 @@ DescriptorSets: ... 
#--- end +# Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang # Tracked by https://github.com/llvm/offload-test-suite/issues/393 diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test index 4a9cb7c7d..d10207fdd 100644 --- a/test/WaveOps/WaveActiveMax.int16.test +++ b/test/WaveOps/WaveActiveMax.int16.test @@ -318,6 +318,7 @@ DescriptorSets: ... #--- end +# Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang # Tracked by https://github.com/llvm/offload-test-suite/issues/393 diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test index 80b9a88aa..d8b5a9e85 100644 --- a/test/WaveOps/WaveActiveMax.int32.test +++ b/test/WaveOps/WaveActiveMax.int32.test @@ -318,6 +318,7 @@ DescriptorSets: ... #--- end +# Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang # Tracked by https://github.com/llvm/offload-test-suite/issues/393 diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index 60cb7fda2..ebe252ca9 100644 --- a/test/WaveOps/WaveActiveMax.int64.test +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -318,6 +318,7 @@ DescriptorSets: ... 
#--- end +# Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang # Bug https://github.com/llvm/offload-test-suite/issues/393 From 302183977b818b547b15cf4cfff585765dfe4f86 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Mon, 8 Sep 2025 16:58:32 -0700 Subject: [PATCH 08/11] add bug instead of tracked by --- test/WaveOps/WaveActiveMax.fp64.test | 2 +- test/WaveOps/WaveActiveMax.int16.test | 2 +- test/WaveOps/WaveActiveMax.int32.test | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test index 043a9e99a..50f9ed873 100644 --- a/test/WaveOps/WaveActiveMax.fp64.test +++ b/test/WaveOps/WaveActiveMax.fp64.test @@ -171,7 +171,7 @@ DescriptorSets: # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang -# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# Bug https://github.com/llvm/offload-test-suite/issues/393 # XFAIL: Metal # RUN: split-file %s %t diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test index d10207fdd..496fb1591 100644 --- a/test/WaveOps/WaveActiveMax.int16.test +++ b/test/WaveOps/WaveActiveMax.int16.test @@ -321,7 +321,7 @@ DescriptorSets: # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang -# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# Bug https://github.com/llvm/offload-test-suite/issues/393 # XFAIL: Metal # RUN: split-file %s %t diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test index d8b5a9e85..0aa6a43ba 100644 --- a/test/WaveOps/WaveActiveMax.int32.test +++ b/test/WaveOps/WaveActiveMax.int32.test @@ -321,7 +321,7 @@ DescriptorSets: # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang -# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# Bug https://github.com/llvm/offload-test-suite/issues/393 # XFAIL: Metal # RUN: split-file %s %t From 
3b234aadd2163022d237967bc55885295c2a04bc Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Thu, 25 Sep 2025 17:09:59 -0700 Subject: [PATCH 09/11] make fp16 test more robust, addressing Tex --- test/WaveOps/WaveActiveMax.fp16.test | 601 +++++++++++++++++++++++++-- 1 file changed, 555 insertions(+), 46 deletions(-) diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test index a79eb90ca..08b17a417 100644 --- a/test/WaveOps/WaveActiveMax.fp16.test +++ b/test/WaveOps/WaveActiveMax.fp16.test @@ -1,50 +1,43 @@ #--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 16 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + StructuredBuffer In : register(t0); -RWStructuredBuffer Out1 : register(u1); // test scalar -RWStructuredBuffer Out2 : register(u2); // test half2 +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test half2 RWStructuredBuffer Out3 : register(u3); // test half3 RWStructuredBuffer Out4 : register(u4); // test half4 RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + -[numthreads(4,1,1)] +[numthreads(NUM_THREADS,1,1)] void main(uint3 tid : SV_GroupThreadID) { - half4 v = In[tid.x]; - - half s1 = WaveActiveMax( v.x ); - half s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; - half s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; - half s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; - - half2 v2_1 = WaveActiveMax( v.xy ); - half2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : half2(0,0); - half2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : half2(0,0); - half2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : half2(0,0); - - half3 v3_1 = WaveActiveMax( v.xyz ); - half3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : half3(0,0,0); - half3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : half3(0,0,0); - half3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : half3(0,0,0); - - half4 v4_1 = WaveActiveMax( v ); - half4 v4_2 = tid.x < 3 ? 
WaveActiveMax( v ) : half4(0,0,0,0); - half4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : half4(0,0,0,0); - half4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : half4(0,0,0,0); - - half scalars[4] = { s4, s3, s2, s1 }; - half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; - half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; - half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; - - Out1[tid.x].x = scalars[tid.x]; - Out2[tid.x].xy = vec2s[tid.x]; - Out3[tid.x].xyz = vec3s[tid.x]; - Out4[tid.x] = vec4s[tid.x]; + for (int ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // index of the first element belonging to this value set + for (int MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + half4 v = In[ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; // ValueSetOffset is already scaled by ValueSet; scaling it again was only correct for ValueSet 0 and 1 + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { // threads whose mask entry is 0 skip the wave ops and write nothing + Out1[OutIdx] = WaveActiveMax( v.x ); + Out2[OutIdx].xy = WaveActiveMax( v.xy ); + Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); + Out4[OutIdx] = WaveActiveMax( v ); + } + } + } // constant folding case Out5[0] = WaveActiveMax(half4(1,2,3,4)); } + //--- pipeline.yaml --- @@ -56,44 +49,553 @@ Buffers: - Name: In Format: Float16 Stride: 8 - # 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 - Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ] + # 2 value sets + # For each value set, + # and for each specific one of the 16 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 0x2000, 0x2200, 0x2400, 0x2800, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 0x2A00, 0x2C00, 0x2E00, 0x3000, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 0x3200, 0x3400, 0x3600, 0x3800, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x2200, 0x2400, 
0x2800, 0x2A00, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 0x2C00, 0x2E00, 0x3000, 0x3200, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x2000, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x2E00, 0x3000, 0x3200, 0x3400, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x3B00, 0x3BC0, 0x2000, 0x2200, + 0x2800, 0x2A00, 0x2C00, 0x2E00, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x2000, 0x2200, 0x2400, + 0x2A00, 0x2C00, 0x2E00, 0x3000, + 0x3200, 0x3400, 0x3600, 0x3800, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x2000, 0x2200, 0x2400, 0x2800, + 0x2C00, 0x2E00, 0x3000, 0x3200, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x2000, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2E00, 0x3000, 0x3200, 0x3400, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x3B00, 0x3BC0, 0x2000, 0x2200, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x2000, 0x2200, 0x2400, + 0x2800, 0x2A00, 0x2C00, 0x2E00, + 0x3200, 0x3400, 0x3600, 0x3800, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x2000, 0x2200, 0x2400, 0x2800, + 0x2A00, 0x2C00, 0x2E00, 0x3000, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x2000, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2C00, 0x2E00, 0x3000, 0x3200, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x3B00, 0x3BC0, 0x2000, 0x2200, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x2E00, 0x3000, 0x3200, 0x3400, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x2000, 0x2200, 0x2400, + 0x2800, 0x2A00, 0x2C00, 0x2E00, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x2000, 0x2200, 0x2400, 0x2800, + 0x2A00, 0x2C00, 0x2E00, 0x3000, + 0x3200, 0x3400, 0x3600, 0x3800, + 0x3A00, 0x3B00, 0x3BC0, 0x2000, + 0x2200, 0x2400, 0x2800, 0x2A00, + 0x2C00, 0x2E00, 0x3000, 0x3200, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x3B00, 0x3BC0, 0x2000, 0x2200, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x2E00, 0x3000, 0x3200, 0x3400, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x3BC0, 0x2000, 0x2200, 0x2400, + 
0x2800, 0x2A00, 0x2C00, 0x2E00, + 0x3000, 0x3200, 0x3400, 0x3600, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 0x3000, 0x2E00, 0x2C00, 0x2A00, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x3200, 0x3000, 0x2E00, 0x2C00, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x2000, 0x3BC0, 0x3B00, 0x3A00, + 0x2C00, 0x2A00, 0x2800, 0x2400, + 0x3400, 0x3200, 0x3000, 0x2E00, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x2200, 0x2000, 0x3BC0, 0x3B00, + 0x2E00, 0x2C00, 0x2A00, 0x2800, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x2400, 0x2200, 0x2000, 0x3BC0, + 0x3000, 0x2E00, 0x2C00, 0x2A00, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x2800, 0x2400, 0x2200, 0x2000, + 0x3200, 0x3000, 0x2E00, 0x2C00, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x2000, 0x3BC0, 0x3B00, 0x3A00, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x3400, 0x3200, 0x3000, 0x2E00, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x2200, 0x2000, 0x3BC0, 0x3B00, + 0x2C00, 0x2A00, 0x2800, 0x2400, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x2400, 0x2200, 0x2000, 0x3BC0, + 0x2E00, 0x2C00, 0x2A00, 0x2800, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x2800, 0x2400, 0x2200, 0x2000, + 0x3000, 0x2E00, 0x2C00, 0x2A00, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x2000, 0x3BC0, 0x3B00, 0x3A00, + 0x2A00, 0x2800, 0x2400, 0x2200, + 0x3200, 0x3000, 0x2E00, 0x2C00, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x2200, 0x2000, 0x3BC0, 0x3B00, + 0x2C00, 0x2A00, 0x2800, 0x2400, + 0x3400, 0x3200, 0x3000, 0x2E00, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x2400, 0x2200, 0x2000, 0x3BC0, + 0x2E00, 0x2C00, 0x2A00, 0x2800, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x2800, 0x2400, 0x2200, 0x2000, + 0x3000, 0x2E00, 0x2C00, 0x2A00, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x2000, 0x3BC0, 0x3B00, 0x3A00, + 0x2A00, 0x2800, 0x2400, 
0x2200, + 0x3200, 0x3000, 0x2E00, 0x2C00, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x2200, 0x2000, 0x3BC0, 0x3B00, + 0x2C00, 0x2A00, 0x2800, 0x2400, + 0x3400, 0x3200, 0x3000, 0x2E00, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x2400, 0x2200, 0x2000, 0x3BC0, + 0x2E00, 0x2C00, 0x2A00, 0x2800, + 0x3600, 0x3400, 0x3200, 0x3000, + 0x3B00, 0x3A00, 0x3900, 0x3800 ] + - Name: Out1 Format: Float16 - Stride: 8 - ZeroInitSize: 32 + Stride: 2 + # 1 half is 2 bytes, * 4 halves for 4 threads, * 16 thread masks, * 2 value sets + ZeroInitSize: 256 - Name: Out2 Format: Float16 - Stride: 8 - ZeroInitSize: 32 + Stride: 4 + ZeroInitSize: 512 - Name: Out3 Format: Float16 Stride: 8 - ZeroInitSize: 32 + ZeroInitSize: 1024 - Name: Out4 Format: Float16 Stride: 8 - ZeroInitSize: 32 + ZeroInitSize: 1024 - Name: Out5 Format: Float16 Stride: 8 ZeroInitSize: 8 + - Name: Masks + Format: Int32 + Stride: 16 # one MaskStruct element: NUM_THREADS (4) ints * 4 bytes each + # 16 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 0 0 0 1 + # 0 0 1 0 + # 0 0 1 1 + # 0 1 0 0 + # 0 1 0 1 + # 0 1 1 0 + # 0 1 1 1 + # 1 0 0 0 + # 1 0 0 1 + # 1 0 1 0 + # 1 0 1 1 + # 1 1 0 0 + # 1 1 0 1 + # 1 1 1 0 + # 1 1 1 1 + Data: [ + 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1] - Name: ExpectedOut1 Format: Float16 Stride: 8 - Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ] + # 2 value sets, 16 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x3A00, + 0x0, 0x0, 0x3600, 0x0, + 0x0, 0x0, 0x3BC0, 0x3BC0, + 0x0, 0x3200, 0x0, 0x0, + 0x0, 0x3400, 0x0, 0x3400, + 0x0, 0x3B00, 0x3B00, 0x0, + 0x0, 0x3BC0, 0x3BC0, 0x3BC0, + 0x3200, 0x0, 0x0, 0x0, + 0x3400, 0x0, 0x0, 0x3400, + 0x3600, 0x0, 0x3600, 0x0, + 0x3800, 0x0, 0x3800, 0x3800, + 0x3900, 0x3900, 0x0, 0x0, + 0x3A00, 0x3A00, 0x0, 0x3A00, + 0x3B00, 0x3B00, 0x3B00, 0x0, + 0x3BC0, 0x3BC0, 
0x3BC0, 0x3BC0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x2000, + 0x0, 0x0, 0x3A00, 0x0, + 0x0, 0x0, 0x3B00, 0x3B00, + 0x0, 0x3800, 0x0, 0x0, + 0x0, 0x3900, 0x0, 0x3900, + 0x0, 0x3A00, 0x3A00, 0x0, + 0x0, 0x3B00, 0x3B00, 0x3B00, + 0x3800, 0x0, 0x0, 0x0, + 0x3900, 0x0, 0x0, 0x3900, + 0x3A00, 0x0, 0x3A00, 0x0, + 0x3B00, 0x0, 0x3B00, 0x3B00, + 0x3BC0, 0x3BC0, 0x0, 0x0, + 0x3900, 0x3900, 0x0, 0x3900, + 0x3400, 0x3400, 0x3400, 0x0, + 0x3B00, 0x3B00, 0x3B00, 0x3B00 ] - Name: ExpectedOut2 Format: Float16 Stride: 8 - Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ] + # 2 value sets, 16 masks per value set, 4 threads per mask, 2 result values per thread + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3A00, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3800, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3BC0, 0x3900, + 0x0, 0x0, 0x3200, 0x3400, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3400, 0x3600, + 0x0, 0x0, 0x3400, 0x3600, + 0x0, 0x0, 0x3B00, 0x3BC0, + 0x3B00, 0x3BC0, 0x0, 0x0, + 0x0, 0x0, 0x3BC0, 0x3900, + 0x3BC0, 0x3900, 0x3BC0, 0x3900, + 0x3200, 0x3400, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x0, 0x0, + 0x0, 0x0, 0x3400, 0x3600, + 0x3600, 0x3800, 0x0, 0x0, + 0x3600, 0x3800, 0x0, 0x0, + 0x3800, 0x3900, 0x0, 0x0, + 0x3800, 0x3900, 0x3800, 0x3900, + 0x3900, 0x3A00, 0x3900, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3A00, 0x3B00, + 0x0, 0x0, 0x3A00, 0x3B00, + 0x3B00, 0x3BC0, 0x3B00, 0x3BC0, + 0x3B00, 0x3BC0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3BC0, 0x3900, + 0x3BC0, 0x3900, 0x3BC0, 0x3900, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x2000, 0x3BC0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3B00, 0x3A00, + 0x0, 0x0, 0x3800, 0x3600, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3900, 0x3800, + 0x0, 0x0, 0x3900, 0x3800, + 0x0, 0x0, 0x3A00, 0x3900, + 0x3A00, 0x3900, 0x0, 0x0, + 0x0, 0x0, 
0x3B00, 0x3A00, + 0x3B00, 0x3A00, 0x3B00, 0x3A00, + 0x3800, 0x3600, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x0, 0x0, + 0x0, 0x0, 0x3900, 0x3800, + 0x3A00, 0x3900, 0x0, 0x0, + 0x3A00, 0x3900, 0x0, 0x0, + 0x3B00, 0x3A00, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3B00, 0x3A00, + 0x3BC0, 0x3B00, 0x3BC0, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3900, 0x3BC0, + 0x0, 0x0, 0x3900, 0x3BC0, + 0x3400, 0x3200, 0x3400, 0x3200, + 0x3400, 0x3200, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3B00, 0x3A00, + 0x3B00, 0x3A00, 0x3B00, 0x3A00 ] - Name: ExpectedOut3 Format: Float16 Stride: 8 - Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ] + # 2 value sets, 16 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3800, 0x3900, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3200, 0x3400, 0x3600, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3BC0, 0x3900, 0x0, + 0x3B00, 0x3BC0, 0x3900, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3200, 0x3400, 0x3600, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x0, + 0x3600, 0x3800, 0x3900, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3800, 0x3900, 0x0, + 0x0, 0x0, 
0x0, 0x0, + 0x3800, 0x3900, 0x3A00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3900, 0x3A00, 0x0, + 0x3800, 0x3900, 0x3A00, 0x0, + 0x3900, 0x3A00, 0x3B00, 0x0, + 0x3900, 0x3A00, 0x3B00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x3B00, 0x3BC0, 0x3200, 0x0, + 0x3B00, 0x3BC0, 0x3200, 0x0, + 0x3B00, 0x3BC0, 0x3200, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2000, 0x3BC0, 0x3B00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3600, 0x3400, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x3BC0, 0x0, + 0x3A00, 0x3900, 0x3BC0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3800, 0x3600, 0x3400, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x0, + 0x3A00, 0x3900, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3BC0, 0x3B00, 0x3A00, 0x0, + 0x3BC0, 0x3B00, 0x3A00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x0, 
0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x3400, 0x3200, 0x3BC0, 0x0, + 0x3400, 0x3200, 0x3BC0, 0x0, + 0x3400, 0x3200, 0x3BC0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0 ] - Name: ExpectedOut4 Format: Float16 Stride: 8 - Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ] + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x2000, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x3200, 0x3400, 0x3600, 0x3800, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3BC0, 0x3900, 0x3A00, + 0x3B00, 0x3BC0, 0x3900, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3200, 0x3400, 0x3600, 0x3800, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3400, 0x3600, 0x3800, 0x3900, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x3600, 0x3800, 0x3900, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x3900, 0x3A00, 0x3B00, 0x3BC0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x3900, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 
0x3BC0, 0x3900, + 0x3B00, 0x3BC0, 0x3200, 0x3400, + 0x3B00, 0x3BC0, 0x3200, 0x3400, + 0x3B00, 0x3BC0, 0x3200, 0x3400, + 0x0, 0x0, 0x0, 0x0, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x3BC0, 0x3900, 0x3A00, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x2000, 0x3BC0, 0x3B00, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x3BC0, 0x3B00, + 0x3A00, 0x3900, 0x3BC0, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x3800, 0x3600, 0x3400, 0x3200, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3800, 0x3600, 0x3400, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3900, 0x3800, 0x3600, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x3BC0, 0x3B00, 0x3A00, 0x3900, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x3A00, + 0x3900, 0x3BC0, 0x3B00, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x3A00, + 0x3400, 0x3200, 0x3BC0, 0x3B00, + 0x3400, 0x3200, 0x3BC0, 0x3B00, + 0x3400, 0x3200, 0x3BC0, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0, 
+ 0x3B00, 0x3A00, 0x3900, 0x3BC0, + 0x3B00, 0x3A00, 0x3900, 0x3BC0 ] - Name: ExpectedOut5 Format: Float16 Stride: 8 @@ -163,6 +665,13 @@ DescriptorSets: Space: 0 VulkanBinding: Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 ... #--- end From ab089ec96cd72bf74c32b3269a5cf0de2525b8be Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 26 Sep 2025 13:54:41 -0700 Subject: [PATCH 10/11] use proper filter syntax --- test/WaveOps/WaveActiveMax.int64.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index ebe252ca9..418727387 100644 --- a/test/WaveOps/WaveActiveMax.int64.test +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -325,7 +325,7 @@ DescriptorSets: # XFAIL: Metal # Bug https://github.com/llvm/offload-test-suite/issues/430 -# XFAIL: DirectX-WARP +# XFAIL: DirectX && WARP # RUN: split-file %s %t # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl From 0bf8a4337dcb1d394bae66449067875848fe72a5 Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Fri, 3 Oct 2025 18:24:29 -0700 Subject: [PATCH 11/11] simplify to 4 masks and apply changes to all tests --- test/WaveOps/WaveActiveMax.fp16.test | 560 +++++--------------------- test/WaveOps/WaveActiveMax.fp32.test | 236 ++++++++--- test/WaveOps/WaveActiveMax.fp64.test | 250 +++++++++--- test/WaveOps/WaveActiveMax.int16.test | 368 ++++++++--------- test/WaveOps/WaveActiveMax.int32.test | 370 ++++++++--------- test/WaveOps/WaveActiveMax.int64.test | 372 +++++++++-------- 6 files changed, 1009 insertions(+), 1147 deletions(-) diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test index 08b17a417..80a1cda8a 100644 --- a/test/WaveOps/WaveActiveMax.fp16.test +++ b/test/WaveOps/WaveActiveMax.fp16.test @@ -1,6 +1,6 @@ #--- source.hlsl #define VALUE_SETS 2 -#define NUM_MASKS 16 +#define NUM_MASKS 4 #define NUM_THREADS 4 
struct MaskStruct { @@ -51,7 +51,7 @@ Buffers: Stride: 8 # 2 value sets # For each value set, - # and for each specific one of the 16 thread masks in that value set, + # and for each specific one of the 4 thread masks in that value set, # and for each of the 4 threads in that thread mask, # there will be a unique set of 4 values, such that # none of the other threads in that thread mask share any values @@ -72,54 +72,6 @@ Buffers: 0x3000, 0x3200, 0x3400, 0x3600, 0x3800, 0x3900, 0x3A00, 0x3B00, 0x3BC0, 0x2000, 0x2200, 0x2400, - 0x2A00, 0x2C00, 0x2E00, 0x3000, - 0x3200, 0x3400, 0x3600, 0x3800, - 0x3900, 0x3A00, 0x3B00, 0x3BC0, - 0x2000, 0x2200, 0x2400, 0x2800, - 0x2C00, 0x2E00, 0x3000, 0x3200, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x3A00, 0x3B00, 0x3BC0, 0x2000, - 0x2200, 0x2400, 0x2800, 0x2A00, - 0x2E00, 0x3000, 0x3200, 0x3400, - 0x3600, 0x3800, 0x3900, 0x3A00, - 0x3B00, 0x3BC0, 0x2000, 0x2200, - 0x2400, 0x2800, 0x2A00, 0x2C00, - 0x3000, 0x3200, 0x3400, 0x3600, - 0x3800, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x2000, 0x2200, 0x2400, - 0x2800, 0x2A00, 0x2C00, 0x2E00, - 0x3200, 0x3400, 0x3600, 0x3800, - 0x3900, 0x3A00, 0x3B00, 0x3BC0, - 0x2000, 0x2200, 0x2400, 0x2800, - 0x2A00, 0x2C00, 0x2E00, 0x3000, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x3A00, 0x3B00, 0x3BC0, 0x2000, - 0x2200, 0x2400, 0x2800, 0x2A00, - 0x2C00, 0x2E00, 0x3000, 0x3200, - 0x3600, 0x3800, 0x3900, 0x3A00, - 0x3B00, 0x3BC0, 0x2000, 0x2200, - 0x2400, 0x2800, 0x2A00, 0x2C00, - 0x2E00, 0x3000, 0x3200, 0x3400, - 0x3800, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x2000, 0x2200, 0x2400, - 0x2800, 0x2A00, 0x2C00, 0x2E00, - 0x3000, 0x3200, 0x3400, 0x3600, - 0x3900, 0x3A00, 0x3B00, 0x3BC0, - 0x2000, 0x2200, 0x2400, 0x2800, - 0x2A00, 0x2C00, 0x2E00, 0x3000, - 0x3200, 0x3400, 0x3600, 0x3800, - 0x3A00, 0x3B00, 0x3BC0, 0x2000, - 0x2200, 0x2400, 0x2800, 0x2A00, - 0x2C00, 0x2E00, 0x3000, 0x3200, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x3B00, 0x3BC0, 0x2000, 0x2200, - 0x2400, 0x2800, 0x2A00, 0x2C00, - 0x2E00, 0x3000, 0x3200, 0x3400, - 
0x3600, 0x3800, 0x3900, 0x3A00, - 0x3BC0, 0x2000, 0x2200, 0x2400, - 0x2800, 0x2A00, 0x2C00, 0x2E00, - 0x3000, 0x3200, 0x3400, 0x3600, - 0x3800, 0x3900, 0x3A00, 0x3B00, 0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values 0x3000, 0x2E00, 0x2C00, 0x2A00, 0x3800, 0x3600, 0x3400, 0x3200, @@ -135,467 +87,143 @@ Buffers: 0x2E00, 0x2C00, 0x2A00, 0x2800, 0x3600, 0x3400, 0x3200, 0x3000, 0x3B00, 0x3A00, 0x3900, 0x3800, - 0x2400, 0x2200, 0x2000, 0x3BC0, - 0x3000, 0x2E00, 0x2C00, 0x2A00, - 0x3800, 0x3600, 0x3400, 0x3200, - 0x3BC0, 0x3B00, 0x3A00, 0x3900, - 0x2800, 0x2400, 0x2200, 0x2000, - 0x3200, 0x3000, 0x2E00, 0x2C00, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x2000, 0x3BC0, 0x3B00, 0x3A00, - 0x2A00, 0x2800, 0x2400, 0x2200, - 0x3400, 0x3200, 0x3000, 0x2E00, - 0x3A00, 0x3900, 0x3800, 0x3600, - 0x2200, 0x2000, 0x3BC0, 0x3B00, - 0x2C00, 0x2A00, 0x2800, 0x2400, - 0x3600, 0x3400, 0x3200, 0x3000, - 0x3B00, 0x3A00, 0x3900, 0x3800, - 0x2400, 0x2200, 0x2000, 0x3BC0, - 0x2E00, 0x2C00, 0x2A00, 0x2800, - 0x3800, 0x3600, 0x3400, 0x3200, - 0x3BC0, 0x3B00, 0x3A00, 0x3900, - 0x2800, 0x2400, 0x2200, 0x2000, - 0x3000, 0x2E00, 0x2C00, 0x2A00, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x2000, 0x3BC0, 0x3B00, 0x3A00, - 0x2A00, 0x2800, 0x2400, 0x2200, - 0x3200, 0x3000, 0x2E00, 0x2C00, - 0x3A00, 0x3900, 0x3800, 0x3600, - 0x2200, 0x2000, 0x3BC0, 0x3B00, - 0x2C00, 0x2A00, 0x2800, 0x2400, - 0x3400, 0x3200, 0x3000, 0x2E00, - 0x3B00, 0x3A00, 0x3900, 0x3800, - 0x2400, 0x2200, 0x2000, 0x3BC0, - 0x2E00, 0x2C00, 0x2A00, 0x2800, - 0x3600, 0x3400, 0x3200, 0x3000, - 0x3BC0, 0x3B00, 0x3A00, 0x3900, - 0x2800, 0x2400, 0x2200, 0x2000, - 0x3000, 0x2E00, 0x2C00, 0x2A00, - 0x3800, 0x3600, 0x3400, 0x3200, - 0x2000, 0x3BC0, 0x3B00, 0x3A00, - 0x2A00, 0x2800, 0x2400, 0x2200, - 0x3200, 0x3000, 0x2E00, 0x2C00, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x2200, 0x2000, 0x3BC0, 0x3B00, - 0x2C00, 0x2A00, 0x2800, 0x2400, - 0x3400, 0x3200, 0x3000, 0x2E00, - 0x3A00, 0x3900, 0x3800, 0x3600, - 
0x2400, 0x2200, 0x2000, 0x3BC0, - 0x2E00, 0x2C00, 0x2A00, 0x2800, - 0x3600, 0x3400, 0x3200, 0x3000, - 0x3B00, 0x3A00, 0x3900, 0x3800 ] + 0x2400, 0x2200, 0x2000, 0x3BC0 ] - Name: Out1 Format: Float16 Stride: 2 - # 1 half is 2 bytes, * 4 halves for 4 threads, * 16 thread masks, * 2 value sets - ZeroInitSize: 256 + # 1 half is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets + ZeroInitSize: 64 - Name: Out2 Format: Float16 Stride: 4 - ZeroInitSize: 512 + ZeroInitSize: 128 - Name: Out3 Format: Float16 Stride: 8 - ZeroInitSize: 1024 + ZeroInitSize: 256 - Name: Out4 Format: Float16 Stride: 8 - ZeroInitSize: 1024 + ZeroInitSize: 256 - Name: Out5 Format: Float16 Stride: 8 ZeroInitSize: 8 - Name: Masks Format: Int32 - Stride: 8 - # 16 active mask sets for threads 0, 1, 2, 3: + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: # 0 0 0 0 - # 0 0 0 1 - # 0 0 1 0 - # 0 0 1 1 - # 0 1 0 0 - # 0 1 0 1 - # 0 1 1 0 - # 0 1 1 1 + # 1 1 1 1 # 1 0 0 0 - # 1 0 0 1 - # 1 0 1 0 - # 1 0 1 1 - # 1 1 0 0 - # 1 1 0 1 - # 1 1 1 0 - # 1 1 1 1 + # 0 1 1 0 Data: [ - 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1] + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] - Name: ExpectedOut1 Format: Float16 Stride: 8 - # 2 value sets, 16 masks per value set, 4 threads per mask, 1 result value per thread + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread Data: [ 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x3A00, - 0x0, 0x0, 0x3600, 0x0, - 0x0, 0x0, 0x3BC0, 0x3BC0, - 0x0, 0x3200, 0x0, 0x0, - 0x0, 0x3400, 0x0, 0x3400, - 0x0, 0x3B00, 0x3B00, 0x0, - 0x0, 0x3BC0, 0x3BC0, 0x3BC0, - 0x3200, 0x0, 0x0, 0x0, - 0x3400, 0x0, 0x0, 0x3400, - 0x3600, 0x0, 0x3600, 0x0, - 0x3800, 0x0, 0x3800, 0x3800, - 0x3900, 0x3900, 0x0, 0x0, - 0x3A00, 0x3A00, 0x0, 0x3A00, - 0x3B00, 0x3B00, 0x3B00, 0x0, - 0x3BC0, 0x3BC0, 0x3BC0, 0x3BC0, - 
0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x2000, - 0x0, 0x0, 0x3A00, 0x0, - 0x0, 0x0, 0x3B00, 0x3B00, - 0x0, 0x3800, 0x0, 0x0, - 0x0, 0x3900, 0x0, 0x3900, - 0x0, 0x3A00, 0x3A00, 0x0, - 0x0, 0x3B00, 0x3B00, 0x3B00, - 0x3800, 0x0, 0x0, 0x0, - 0x3900, 0x0, 0x0, 0x3900, - 0x3A00, 0x0, 0x3A00, 0x0, - 0x3B00, 0x0, 0x3B00, 0x3B00, - 0x3BC0, 0x3BC0, 0x0, 0x0, - 0x3900, 0x3900, 0x0, 0x3900, - 0x3400, 0x3400, 0x3400, 0x0, - 0x3B00, 0x3B00, 0x3B00, 0x3B00 ] + 0x3A00, 0x3A00, 0x3A00, 0x3A00, + 0x2400, 0x0, 0x0, 0x0, + 0x0, 0x3800, 0x3800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3900, 0x3900, 0x3900, + 0x2C00, 0x0, 0x0, 0x0, + 0x0, 0x3B00, 0x3B00, 0x0 ] - Name: ExpectedOut2 Format: Float16 Stride: 8 - # 2 value sets, 16 masks per value set, 4 threads per mask, 1 result value per thread + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread Data: [ 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x3A00, 0x3B00, - 0x0, 0x0, 0x0, 0x0, - 0x3600, 0x3800, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3BC0, 0x3900, - 0x0, 0x0, 0x3200, 0x3400, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x3400, 0x3600, - 0x0, 0x0, 0x3400, 0x3600, - 0x0, 0x0, 0x3B00, 0x3BC0, - 0x3B00, 0x3BC0, 0x0, 0x0, - 0x0, 0x0, 0x3BC0, 0x3900, - 0x3BC0, 0x3900, 0x3BC0, 0x3900, - 0x3200, 0x3400, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x0, 0x0, - 0x0, 0x0, 0x3400, 0x3600, - 0x3600, 0x3800, 0x0, 0x0, - 0x3600, 0x3800, 0x0, 0x0, - 0x3800, 0x3900, 0x0, 0x0, - 0x3800, 0x3900, 0x3800, 0x3900, - 0x3900, 0x3A00, 0x3900, 0x3A00, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3B00, 0x3A00, 0x3B00, - 0x0, 0x0, 0x3A00, 0x3B00, - 0x3B00, 0x3BC0, 0x3B00, 0x3BC0, - 0x3B00, 0x3BC0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3BC0, 0x3900, - 0x3BC0, 0x3900, 0x3BC0, 0x3900, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x2000, 0x3BC0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3B00, 0x3A00, - 0x0, 0x0, 0x3800, 0x3600, - 
0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x3900, 0x3800, - 0x0, 0x0, 0x3900, 0x3800, - 0x0, 0x0, 0x3A00, 0x3900, - 0x3A00, 0x3900, 0x0, 0x0, - 0x0, 0x0, 0x3B00, 0x3A00, - 0x3B00, 0x3A00, 0x3B00, 0x3A00, - 0x3800, 0x3600, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x0, 0x0, - 0x0, 0x0, 0x3900, 0x3800, - 0x3A00, 0x3900, 0x0, 0x0, - 0x3A00, 0x3900, 0x0, 0x0, - 0x3B00, 0x3A00, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3B00, 0x3A00, - 0x3BC0, 0x3B00, 0x3BC0, 0x3B00, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3BC0, 0x3900, 0x3BC0, - 0x0, 0x0, 0x3900, 0x3BC0, - 0x3400, 0x3200, 0x3400, 0x3200, - 0x3400, 0x3200, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3B00, 0x3A00, - 0x3B00, 0x3A00, 0x3B00, 0x3A00 ] + 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3A00, 0x3B00, + 0x3A00, 0x3B00, 0x3A00, 0x3B00, + 0x2400, 0x2800, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3800, 0x3900, + 0x3800, 0x3900, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3900, 0x3BC0, + 0x3900, 0x3BC0, 0x3900, 0x3BC0, + 0x2C00, 0x2A00, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x3B00, 0x3A00, + 0x3B00, 0x3A00, 0x0, 0x0 ] - Name: ExpectedOut3 Format: Float16 Stride: 8 - # 2 value sets, 16 masks per value set, 4 threads per mask, 4 result values per thread + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec - Data: [ 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3B00, 0x3BC0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3600, 0x3800, 0x3900, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3200, 0x3400, 0x3600, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x0, - 
0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3BC0, 0x3900, 0x0, - 0x3B00, 0x3BC0, 0x3900, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3200, 0x3400, 0x3600, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x0, - 0x3600, 0x3800, 0x3900, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3600, 0x3800, 0x3900, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3800, 0x3900, 0x3A00, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3800, 0x3900, 0x3A00, 0x0, - 0x3800, 0x3900, 0x3A00, 0x0, - 0x3900, 0x3A00, 0x3B00, 0x0, - 0x3900, 0x3A00, 0x3B00, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3B00, 0x3BC0, 0x0, - 0x3A00, 0x3B00, 0x3BC0, 0x0, - 0x0, 0x0, 0x0, 0x0, + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3A00, 0x3B00, 0x3BC0, 0x0, - 0x3B00, 0x3BC0, 0x3200, 0x0, - 0x3B00, 0x3BC0, 0x3200, 0x0, - 0x3B00, 0x3BC0, 0x3200, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x2000, 0x3BC0, 0x3B00, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x3800, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3800, 0x3600, 0x3400, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x3BC0, 0x0, - 0x3A00, 0x3900, 0x3BC0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 
0x3900, 0x0, - 0x3800, 0x3600, 0x3400, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x0, - 0x3A00, 0x3900, 0x3800, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x3800, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3BC0, 0x3B00, 0x3A00, 0x0, - 0x3BC0, 0x3B00, 0x3A00, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3BC0, 0x3B00, 0x0, - 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x0, + 0x2400, 0x2800, 0x2A00, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3900, 0x3A00, 0x0, + 0x3800, 0x3900, 0x3A00, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3900, 0x3BC0, 0x3B00, 0x0, - 0x3400, 0x3200, 0x3BC0, 0x0, - 0x3400, 0x3200, 0x3BC0, 0x0, - 0x3400, 0x3200, 0x3BC0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x0 ] + 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x0, + 0x2C00, 0x2A00, 0x2800, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x0, + 0x0, 0x0, 0x0, 0x0 ] - Name: ExpectedOut4 Format: Float16 Stride: 8 - Data: [ 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3B00, 0x3BC0, 0x2000, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3600, 0x3800, 0x3900, 0x3A00, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x3900, 0x3A00, 
0x3B00, - 0x0, 0x0, 0x0, 0x0, - 0x3200, 0x3400, 0x3600, 0x3800, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3BC0, 0x3900, 0x3A00, - 0x3B00, 0x3BC0, 0x3900, 0x3A00, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3200, 0x3400, 0x3600, 0x3800, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3400, 0x3600, 0x3800, 0x3900, - 0x3600, 0x3800, 0x3900, 0x3A00, - 0x0, 0x0, 0x0, 0x0, - 0x3600, 0x3800, 0x3900, 0x3A00, - 0x0, 0x0, 0x0, 0x0, - 0x3800, 0x3900, 0x3A00, 0x3B00, - 0x0, 0x0, 0x0, 0x0, - 0x3800, 0x3900, 0x3A00, 0x3B00, - 0x3800, 0x3900, 0x3A00, 0x3B00, - 0x3900, 0x3A00, 0x3B00, 0x3BC0, - 0x3900, 0x3A00, 0x3B00, 0x3BC0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3B00, 0x3BC0, 0x3900, - 0x3A00, 0x3B00, 0x3BC0, 0x3900, - 0x0, 0x0, 0x0, 0x0, + Data: [ 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x3A00, 0x3B00, 0x3BC0, 0x3900, - 0x3B00, 0x3BC0, 0x3200, 0x3400, - 0x3B00, 0x3BC0, 0x3200, 0x3400, - 0x3B00, 0x3BC0, 0x3200, 0x3400, - 0x0, 0x0, 0x0, 0x0, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x3BC0, 0x3900, 0x3A00, 0x3B00, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x2000, 0x3BC0, 0x3B00, 0x3A00, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x3800, 0x3600, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x0, 0x0, 0x0, 0x0, - 0x3800, 0x3600, 0x3400, 0x3200, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 
0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x3BC0, 0x3B00, - 0x3A00, 0x3900, 0x3BC0, 0x3B00, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3800, 0x3600, 0x3400, 0x3200, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3800, 0x3600, 0x3400, - 0x3A00, 0x3900, 0x3800, 0x3600, - 0x0, 0x0, 0x0, 0x0, - 0x3A00, 0x3900, 0x3800, 0x3600, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x3800, - 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x3800, - 0x3B00, 0x3A00, 0x3900, 0x3800, - 0x3BC0, 0x3B00, 0x3A00, 0x3900, - 0x3BC0, 0x3B00, 0x3A00, 0x3900, - 0x0, 0x0, 0x0, 0x0, + 0x3A00, 0x3B00, 0x3BC0, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x3900, + 0x3A00, 0x3B00, 0x3BC0, 0x3900, + 0x2400, 0x2800, 0x2A00, 0x2C00, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x3900, 0x3BC0, 0x3B00, 0x3A00, - 0x3900, 0x3BC0, 0x3B00, 0x3A00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x3800, 0x3900, 0x3A00, 0x3B00, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3900, 0x3BC0, 0x3B00, 0x3A00, + 0x3900, 0x3BC0, 0x3B00, 0x3A00, + 0x3900, 0x3BC0, 0x3B00, 0x3A00, 0x3900, 0x3BC0, 0x3B00, 0x3A00, - 0x3400, 0x3200, 0x3BC0, 0x3B00, - 0x3400, 0x3200, 0x3BC0, 0x3B00, - 0x3400, 0x3200, 0x3BC0, 0x3B00, + 0x2C00, 0x2A00, 0x2800, 0x2400, 0x0, 0x0, 0x0, 0x0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0, - 0x3B00, 0x3A00, 0x3900, 0x3BC0 ] + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x3B00, 0x3A00, 0x3900, 0x3800, + 0x0, 0x0, 0x0, 0x0 ] - Name: ExpectedOut5 Format: Float16 Stride: 8 diff --git 
a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test index 4b7fcaf7f..5f378766d 100644 --- a/test/WaveOps/WaveActiveMax.fp32.test +++ b/test/WaveOps/WaveActiveMax.fp32.test @@ -1,50 +1,43 @@ #--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + float mask[NUM_THREADS]; +}; + StructuredBuffer In : register(t0); -RWStructuredBuffer Out1 : register(u1); // test scalar -RWStructuredBuffer Out2 : register(u2); // test float2 +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test float2 RWStructuredBuffer Out3 : register(u3); // test float3 RWStructuredBuffer Out4 : register(u4); // test float4 RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); + -[numthreads(4,1,1)] +[numthreads(NUM_THREADS,1,1)] void main(uint3 tid : SV_GroupThreadID) { - float4 v = In[tid.x]; - - float s1 = WaveActiveMax( v.x ); - float s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; - float s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; - float s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; - - float2 v2_1 = WaveActiveMax( v.xy ); - float2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : float2(0,0); - float2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : float2(0,0); - float2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : float2(0,0); - - float3 v3_1 = WaveActiveMax( v.xyz ); - float3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : float3(0,0,0); - float3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : float3(0,0,0); - float3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : float3(0,0,0); - - float4 v4_1 = WaveActiveMax( v ); - float4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : float4(0,0,0,0); - float4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : float4(0,0,0,0); - float4 v4_4 = tid.x < 1 ? 
WaveActiveMax( v ) : float4(0,0,0,0); - - float scalars[4] = { s4, s3, s2, s1 }; - float2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; - float3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; - float4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; - - Out1[tid.x].x = scalars[tid.x]; - Out2[tid.x].xy = vec2s[tid.x]; - Out3[tid.x].xyz = vec3s[tid.x]; - Out4[tid.x] = vec4s[tid.x]; + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (float MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + float4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveActiveMax( v.x ); + Out2[OutIdx].xy = WaveActiveMax( v.xy ); + Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); + Out4[OutIdx] = WaveActiveMax( v ); + } + } + } // constant folding case - Out5[0] = WaveActiveMax(float4(1,2,3,4)); + Out5[0] = WaveActiveMax(float4(1.5,2.5,3.5,4.5)); } + //--- pipeline.yaml --- @@ -56,47 +49,185 @@ Buffers: - Name: In Format: Float32 Stride: 16 - Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9.5, 10.5, 11.5, 12.5, + 13.5, 14.5, 15.5, 16.5, + 2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6.5, 7.5, 8.5, 9.5, + 10.5, 11.5, 12.5, 13.5, + 14.5, 15.5, 16.5, 1.5, + 3.5, 4.5, 5.5, 6.5, + 7.5, 8.5, 9.5, 10.5, + 11.5, 
12.5, 13.5, 14.5, + 15.5, 16.5, 1.5, 2.5, + 4.5, 5.5, 6.5, 7.5, + 8.5, 9.5, 10.5, 11.5, + 12.5, 13.5, 14.5, 15.5, + 16.5, 1.5, 2.5, 3.5, + 4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8.5, 7.5, 6.5, 5.5, + 12.5, 11.5, 10.5, 9.5, + 16.5, 15.5, 14.5, 13.5, + 5.5, 4.5, 3.5, 2.5, + 9.5, 8.5, 7.5, 6.5, + 13.5, 12.5, 11.5, 10.5, + 1.5, 16.5, 15.5, 14.5, + 6.5, 5.5, 4.5, 3.5, + 10.5, 9.5, 8.5, 7.5, + 14.5, 13.5, 12.5, 11.5, + 2.5, 1.5, 16.5, 15.5, + 7.5, 6.5, 5.5, 4.5, + 11.5, 10.5, 9.5, 8.5, + 15.5, 14.5, 13.5, 12.5, + 3.5, 2.5, 1.5, 16 ] + - Name: Out1 Format: Float32 - Stride: 16 - ZeroInitSize: 64 + Stride: 4 + # 1 float is 4 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets + ZeroInitSize: 128 - Name: Out2 Format: Float32 - Stride: 16 - ZeroInitSize: 64 + Stride: 8 + ZeroInitSize: 256 - Name: Out3 Format: Float32 Stride: 16 - ZeroInitSize: 64 + ZeroInitSize: 512 - Name: Out4 Format: Float32 Stride: 16 - ZeroInitSize: 64 + ZeroInitSize: 512 - Name: Out5 Format: Float32 Stride: 16 ZeroInitSize: 16 + - Name: Masks + Format: Float32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] - Name: ExpectedOut1 Format: Float32 Stride: 16 - Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 14.5, 14.5, 14.5, 14.5, + 3.5, 0, 0, 0, + 0, 12.5, 12.5, 0, + 0, 0, 0, 0, + 13.5, 13.5, 13.5, 13.5, + 6.5, 0, 0, 0, + 0, 15.5, 15.5, 0 ] - Name: ExpectedOut2 Format: Float32 Stride: 16 - Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ] + # 2 value sets.5, 4 masks per value set, 4 threads per mask.5, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 14.5, 15.5, 14.5, 15.5, + 14.5, 15.5, 14.5, 15.5, + 3.5, 4.5, 0, 
0, + 0, 0, 0, 0, + 0, 0, 12.5, 13.5, + 12.5, 13.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13.5, 16.5, 13.5, 16.5, + 13.5, 16.5, 13.5, 16.5, + 6.5, 5.5, 0, 0,, + 0, 0, 0, 0, + 0, 0,, 15.5, 14.5, + 15.5, 14.5, 0, 0 ] - Name: ExpectedOut3 Format: Float32 Stride: 16 - Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ] + # 2 value sets.5, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14.5, 15.5, 16.5, 0, + 14.5, 15.5, 16.5, 0, + 14.5, 15.5, 16.5, 0, + 14.5, 15.5, 16.5, 0, + 3.5, 4.5, 5.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12.5, 13.5, 14.5, 0, + 12.5, 13.5, 14.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13.5, 16.5, 15.5, 0, + 13.5, 16.5, 15.5, 0, + 13.5, 16.5, 15.5, 0, + 13.5, 16.5, 15.5, 0, + 6.5, 5.5, 4.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15.5, 14.5, 13.5, 0, + 15.5, 14.5, 13.5, 0, + 0, 0, 0, 0 ] - Name: ExpectedOut4 Format: Float32 Stride: 16 - Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14.5, 15.5, 16.5, 13.5, + 14.5, 15.5, 16.5, 13.5, + 14.5, 15.5, 16.5, 13.5, + 14.5, 15.5, 16.5, 13.5, + 3.5, 4.5, 5.5, 6.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12.5, 13.5, 14.5, 15.5, + 12.5, 13.5, 14.5, 15.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13.5, 16.5, 15.5, 14.5, + 13.5, 16.5, 15.5, 14.5, + 13.5, 16.5, 15.5, 14.5, + 13.5, 16.5, 15.5, 14.5, + 6.5, 5.5, 4.5, 3.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15.5, 14.5, 13.5, 12.5, + 15.5, 14.5, 13.5, 12.5, + 0, 0, 0, 0 ] - Name: ExpectedOut5 Format: Float32 - Stride: 16 - Data: [ 1.0, 2.0, 3.0, 4.0 ] + Stride: 8 + 
Data: [ 1.5, 2.5, 3.5, 4.5 ] Results: - Result: ExpectedOut1 Rule: BufferExact @@ -162,6 +293,13 @@ DescriptorSets: Space: 0 VulkanBinding: Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 ... #--- end @@ -169,7 +307,7 @@ DescriptorSets: # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang -# Tracked by https://github.com/llvm/offload-test-suite/issues/393 +# Bug https://github.com/llvm/offload-test-suite/issues/393 # XFAIL: Metal # RUN: split-file %s %t diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test index 50f9ed873..91f5b98f8 100644 --- a/test/WaveOps/WaveActiveMax.fp64.test +++ b/test/WaveOps/WaveActiveMax.fp64.test @@ -1,50 +1,43 @@ #--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + double mask[NUM_THREADS]; +}; + StructuredBuffer In : register(t0); -RWStructuredBuffer Out1 : register(u1); // test scalar -RWStructuredBuffer Out2 : register(u2); // test double2 +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test double2 RWStructuredBuffer Out3 : register(u3); // test double3 RWStructuredBuffer Out4 : register(u4); // test double4 RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); -[numthreads(4,1,1)] + +[numthreads(NUM_THREADS,1,1)] void main(uint3 tid : SV_GroupThreadID) { - double4 v = In[tid.x]; - - double s1 = WaveActiveMax( v.x ); - double s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; - double s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; - double s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; - - double2 v2_1 = WaveActiveMax( v.xy ); - double2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : double2(0,0); - double2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : double2(0,0); - double2 v2_4 = tid.x < 1 ? 
WaveActiveMax( v.xy ) : double2(0,0); - - double3 v3_1 = WaveActiveMax( v.xyz ); - double3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : double3(0,0,0); - double3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : double3(0,0,0); - double3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : double3(0,0,0); - - double4 v4_1 = WaveActiveMax( v ); - double4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : double4(0,0,0,0); - double4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : double4(0,0,0,0); - double4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : double4(0,0,0,0); - - double scalars[4] = { s4, s3, s2, s1 }; - double2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; - double3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; - double4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; - - Out1[tid.x].x = scalars[tid.x]; - Out2[tid.x].xy = vec2s[tid.x]; - Out3[tid.x].xyz = vec3s[tid.x]; - Out4[tid.x] = vec4s[tid.x]; + for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (double MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + double4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveActiveMax( v.x ); + Out2[OutIdx].xy = WaveActiveMax( v.xy ); + Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); + Out4[OutIdx] = WaveActiveMax( v ); + } + } + } // constant folding case - Out5[0] = WaveActiveMax(double4(1,2,3,4)); + Out5[0] = WaveActiveMax(double4(1.5,2.5,3.5,4.5)); } + //--- pipeline.yaml --- @@ -55,48 +48,186 @@ Shaders: Buffers: - Name: In Format: Float64 - Stride: 32 - Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + Stride: 16 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none 
of the other threads in that thread mask share any values + Data: [ + 1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9.5, 10.5, 11.5, 12.5, + 13.5, 14.5, 15.5, 16.5, + 2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6.5, 7.5, 8.5, 9.5, + 10.5, 11.5, 12.5, 13.5, + 14.5, 15.5, 16.5, 1.5, + 3.5, 4.5, 5.5, 6.5, + 7.5, 8.5, 9.5, 10.5, + 11.5, 12.5, 13.5, 14.5, + 15.5, 16.5, 1.5, 2.5, + 4.5, 5.5, 6.5, 7.5, + 8.5, 9.5, 10.5, 11.5, + 12.5, 13.5, 14.5, 15.5, + 16.5, 1.5, 2.5, 3.5, + 4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8.5, 7.5, 6.5, 5.5, + 12.5, 11.5, 10.5, 9.5, + 16.5, 15.5, 14.5, 13.5, + 5.5, 4.5, 3.5, 2.5, + 9.5, 8.5, 7.5, 6.5, + 13.5, 12.5, 11.5, 10.5, + 1.5, 16.5, 15.5, 14.5, + 6.5, 5.5, 4.5, 3.5, + 10.5, 9.5, 8.5, 7.5, + 14.5, 13.5, 12.5, 11.5, + 2.5, 1.5, 16.5, 15.5, + 7.5, 6.5, 5.5, 4.5, + 11.5, 10.5, 9.5, 8.5, + 15.5, 14.5, 13.5, 12.5, + 3.5, 2.5, 1.5, 16 ] + - Name: Out1 Format: Float64 - Stride: 32 - ZeroInitSize: 128 + Stride: 4 + # 1 double is 8 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets + ZeroInitSize: 256 - Name: Out2 Format: Float64 - Stride: 32 - ZeroInitSize: 128 + Stride: 8 + ZeroInitSize: 512 - Name: Out3 Format: Float64 - Stride: 32 - ZeroInitSize: 128 + Stride: 16 + ZeroInitSize: 1024 - Name: Out4 Format: Float64 - Stride: 32 - ZeroInitSize: 128 + Stride: 16 + ZeroInitSize: 1024 - Name: Out5 Format: Float64 Stride: 32 ZeroInitSize: 32 + - Name: Masks + Format: Float64 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] - Name: ExpectedOut1 Format: Float64 - Stride: 32 - Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ] + Stride: 16 + # 2 
value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 14.5, 14.5, 14.5, 14.5, + 3.5, 0, 0, 0, + 0, 12.5, 12.5, 0, + 0, 0, 0, 0, + 13.5, 13.5, 13.5, 13.5, + 6.5, 0, 0, 0, + 0, 15.5, 15.5, 0 ] - Name: ExpectedOut2 Format: Float64 - Stride: 32 - Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ] + Stride: 16 + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 14.5, 15.5, 14.5, 15.5, + 14.5, 15.5, 14.5, 15.5, + 3.5, 4.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 12.5, 13.5, + 12.5, 13.5, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13.5, 16.5, 13.5, 16.5, + 13.5, 16.5, 13.5, 16.5, + 6.5, 5.5, 0, 0,, + 0, 0, 0, 0, + 0, 0,, 15.5, 14.5, + 15.5, 14.5, 0, 0 ] - Name: ExpectedOut3 Format: Float64 - Stride: 32 - Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ] + Stride: 16 + # 2 value sets.5, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned.5, so the 3 result values are placed doubleo a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14.5, 15.5, 16.5, 0, + 14.5, 15.5, 16.5, 0, + 14.5, 15.5, 16.5, 0, + 14.5, 15.5, 16.5, 0, + 3.5, 4.5, 5.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12.5, 13.5, 14.5, 0, + 12.5, 13.5, 14.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13.5, 16.5, 15.5, 0, + 13.5, 16.5, 15.5, 0, + 13.5, 16.5, 15.5, 0, + 13.5, 16.5, 15.5, 0, + 6.5, 5.5, 4.5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15.5, 14.5, 13.5, 0, + 15.5, 14.5, 13.5, 0, + 0, 0, 0, 0 ] - Name: ExpectedOut4 Format: Float64 - Stride: 32 - Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ] + Stride: 16 + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14.5, 15.5, 16.5, 13.5, 
+ 14.5, 15.5, 16.5, 13.5, + 14.5, 15.5, 16.5, 13.5, + 14.5, 15.5, 16.5, 13.5, + 3.5, 4.5, 5.5, 6.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12.5, 13.5, 14.5, 15.5, + 12.5, 13.5, 14.5, 15.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13.5, 16.5, 15.5, 14.5, + 13.5, 16.5, 15.5, 14.5, + 13.5, 16.5, 15.5, 14.5, + 13.5, 16.5, 15.5, 14.5, + 6.5, 5.5, 4.5, 3.5, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15.5, 14.5, 13.5, 12.5, + 15.5, 14.5, 13.5, 12.5, + 0, 0, 0, 0 ] - Name: ExpectedOut5 Format: Float64 - Stride: 32 - Data: [ 1.0, 2.0, 3.0, 4.0 ] + Stride: 8 + Data: [ 1.5, 2.5, 3.5, 4.5 ] Results: - Result: ExpectedOut1 Rule: BufferExact @@ -162,12 +293,17 @@ DescriptorSets: Space: 0 VulkanBinding: Binding: 5 + - Name: Masks + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 ... #--- end -# REQUIRES: Double - # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test index 496fb1591..1a8689905 100644 --- a/test/WaveOps/WaveActiveMax.int16.test +++ b/test/WaveOps/WaveActiveMax.int16.test @@ -1,95 +1,43 @@ #--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + StructuredBuffer In : register(t0); -RWStructuredBuffer Out1 : register(u1); // test scalar -RWStructuredBuffer Out2 : register(u2); // test int16_t2 +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int16_t2 RWStructuredBuffer Out3 : register(u3); // test int16_t3 RWStructuredBuffer Out4 : register(u4); // test int16_t4 RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); -// uints -StructuredBuffer UIn : register(t6); -RWStructuredBuffer UOut1 : register(u7); -RWStructuredBuffer UOut2 : register(u8); -RWStructuredBuffer UOut3 
: register(u9); -RWStructuredBuffer UOut4 : register(u10); -RWStructuredBuffer UOut5 : register(u11); -[numthreads(4,1,1)] +[numthreads(NUM_THREADS,1,1)] void main(uint3 tid : SV_GroupThreadID) { - int16_t4 v = In[tid.x]; - - int16_t s1 = WaveActiveMax( v.x ); - int16_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; - int16_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; - int16_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; - - int16_t2 v2_1 = WaveActiveMax( v.xy ); - int16_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int16_t2(0,0); - int16_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int16_t2(0,0); - int16_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int16_t2(0,0); - - int16_t3 v3_1 = WaveActiveMax( v.xyz ); - int16_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0); - int16_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0); - int16_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0); - - int16_t4 v4_1 = WaveActiveMax( v ); - int16_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int16_t4(0,0,0,0); - int16_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int16_t4(0,0,0,0); - int16_t4 v4_4 = tid.x < 1 ? 
WaveActiveMax( v ) : int16_t4(0,0,0,0); - - int16_t scalars[4] = { s4, s3, s2, s1 }; - int16_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; - int16_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; - int16_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; - - Out1[tid.x].x = scalars[tid.x]; - Out2[tid.x].xy = vec2s[tid.x]; - Out3[tid.x].xyz = vec3s[tid.x]; - Out4[tid.x] = vec4s[tid.x]; + for (int ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (int MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int16_t4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveActiveMax( v.x ); + Out2[OutIdx].xy = WaveActiveMax( v.xy ); + Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); + Out4[OutIdx] = WaveActiveMax( v ); + } + } + } // constant folding case Out5[0] = WaveActiveMax(int16_t4(1,2,3,4)); - - // UINT case - - uint16_t4 uv = UIn[tid.x]; - - uint16_t us1 = WaveActiveMax( uv.x ); - uint16_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; - uint16_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; - uint16_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; - - uint16_t2 uv2_1 = WaveActiveMax( uv.xy ); - uint16_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0); - uint16_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0); - uint16_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0); - - uint16_t3 uv3_1 = WaveActiveMax( uv.xyz ); - uint16_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0); - uint16_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0); - uint16_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0); - - uint16_t4 uv4_1 = WaveActiveMax( uv ); - uint16_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0); - uint16_t4 uv4_3 = tid.x < 2 ? 
WaveActiveMax( uv ) : uint16_t4(0,0,0,0); - uint16_t4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0); - - uint16_t uscalars[4] = { us4, us3, us2, us1 }; - uint16_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 }; - uint16_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 }; - uint16_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 }; - - UOut1[tid.x].x = uscalars[tid.x]; - UOut2[tid.x].xy = uvec2s[tid.x]; - UOut3[tid.x].xyz = uvec3s[tid.x]; - UOut4[tid.x] = uvec4s[tid.x]; - - // constant folding case - UOut5[0] = WaveActiveMax(uint16_t4(1,2,3,4)); } + //--- pipeline.yaml --- @@ -100,93 +48,186 @@ Shaders: Buffers: - Name: In Format: Int16 - Stride: 8 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + Stride: 8 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9, 10, 11, 12, + 13, 14, 15, 16, + 2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6, 7, 8, 9, + 10, 11, 12, 13, + 14, 15, 16, 1, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + 15, 16, 1, 2, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 1, 2, 3, + 4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8, 7, 6, 5, + 12, 11, 10, 9, + 16, 15, 14, 13, + 5, 4, 3, 2, + 9, 8, 7, 6, + 13, 12, 11, 10, + 1, 16, 15, 14, + 6, 5, 4, 3, + 10, 9, 8, 7, + 14, 13, 12, 11, + 2, 1, 16, 15, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 3, 2, 1, 16 ] + - Name: Out1 Format: Int16 - Stride: 8 - ZeroInitSize: 32 + Stride: 2 + # 1 int16_t is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value 
sets + ZeroInitSize: 64 - Name: Out2 Format: Int16 - Stride: 8 - ZeroInitSize: 32 + Stride: 4 + ZeroInitSize: 128 - Name: Out3 Format: Int16 Stride: 8 - ZeroInitSize: 32 + ZeroInitSize: 256 - Name: Out4 Format: Int16 Stride: 8 - ZeroInitSize: 32 + ZeroInitSize: 256 - Name: Out5 Format: Int16 Stride: 8 ZeroInitSize: 8 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] - Name: ExpectedOut1 Format: Int16 Stride: 8 - Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 14, 14, 14, 14, + 3, 0, 0, 0, + 0, 12, 12, 0, + 0, 0, 0, 0, + 13, 13, 13, 13, + 6, 0, 0, 0, + 0, 15, 15, 0 ] - Name: ExpectedOut2 Format: Int16 Stride: 8 - Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 14, 15, + 14, 15, 14, 15, + 3, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 12, 13, + 12, 13, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 13, 16, + 13, 16, 13, 16, + 6, 5, 0, 0, + 0, 0, 0, 0, + 0, 0, 15, 14, + 15, 14, 0, 0 ] - Name: ExpectedOut3 Format: Int16 Stride: 8 - Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 3, 4, 5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12, 13, 14, 0, + 12, 13, 14, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 6, 5, 4, 0, + 0, 0, 0, 0, + 0, 
0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15, 14, 13, 0, + 15, 14, 13, 0, + 0, 0, 0, 0 ] - Name: ExpectedOut4 Format: Int16 Stride: 8 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 16, 13, + 14, 15, 16, 13, + 14, 15, 16, 13, + 14, 15, 16, 13, + 3, 4, 5, 6, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12, 13, 14, 15, + 12, 13, 14, 15, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 15, 14, + 13, 16, 15, 14, + 13, 16, 15, 14, + 13, 16, 15, 14, + 6, 5, 4, 3, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15, 14, 13, 12, + 15, 14, 13, 12, + 0, 0, 0, 0 ] - Name: ExpectedOut5 Format: Int16 Stride: 8 Data: [ 1, 2, 3, 4 ] - - Name: UIn - Format: UInt16 - Stride: 2 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] - - Name: UOut1 - Format: UInt16 - Stride: 8 - ZeroInitSize: 32 - - Name: UOut2 - Format: UInt16 - Stride: 8 - ZeroInitSize: 32 - - Name: UOut3 - Format: UInt16 - Stride: 8 - ZeroInitSize: 32 - - Name: UOut4 - Format: UInt16 - Stride: 8 - ZeroInitSize: 32 - - Name: UOut5 - Format: UInt16 - Stride: 8 - ZeroInitSize: 8 - - Name: UExpectedOut1 - Format: UInt16 - Stride: 8 - Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] - - Name: UExpectedOut2 - Format: UInt16 - Stride: 8 - Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] - - Name: UExpectedOut3 - Format: UInt16 - Stride: 8 - Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] - - Name: UExpectedOut4 - Format: UInt16 - Stride: 8 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] - - Name: UExpectedOut5 - Format: UInt16 - Stride: 2 - Data: [ 1, 2, 3, 4 ] - Results: - Result: ExpectedOut1 Rule: BufferExact @@ -208,26 +249,6 @@ Results: Rule: BufferExact Actual: Out5 Expected: ExpectedOut5 - - Result: UExpectedOut1 - Rule: BufferExact - Actual: UOut1 - 
Expected: UExpectedOut1 - - Result: UExpectedOut2 - Rule: BufferExact - Actual: UOut2 - Expected: UExpectedOut2 - - Result: UExpectedOut3 - Rule: BufferExact - Actual: UOut3 - Expected: UExpectedOut3 - - Result: UExpectedOut4 - Rule: BufferExact - Actual: UOut4 - Expected: UExpectedOut4 - - Result: UExpectedOut5 - Rule: BufferExact - Actual: UOut5 - Expected: UExpectedOut5 DescriptorSets: - Resources: - Name: In @@ -272,48 +293,13 @@ DescriptorSets: Space: 0 VulkanBinding: Binding: 5 - - Name: UIn + - Name: Masks Kind: StructuredBuffer DirectXBinding: Register: 6 Space: 0 VulkanBinding: Binding: 6 - - Name: UOut1 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 7 - Space: 0 - VulkanBinding: - Binding: 7 - - Name: UOut2 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 8 - Space: 0 - VulkanBinding: - Binding: 8 - - Name: UOut3 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 9 - Space: 0 - VulkanBinding: - Binding: 9 - - Name: UOut4 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 10 - Space: 0 - VulkanBinding: - Binding: 10 - - Name: UOut5 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 11 - Space: 0 - VulkanBinding: - Binding: 11 ... 
#--- end diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test index 0aa6a43ba..721d6c426 100644 --- a/test/WaveOps/WaveActiveMax.int32.test +++ b/test/WaveOps/WaveActiveMax.int32.test @@ -1,95 +1,43 @@ #--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + StructuredBuffer In : register(t0); -RWStructuredBuffer Out1 : register(u1); // test scalar -RWStructuredBuffer Out2 : register(u2); // test int2 +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int2 RWStructuredBuffer Out3 : register(u3); // test int3 RWStructuredBuffer Out4 : register(u4); // test int4 RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); -// uints -StructuredBuffer UIn : register(t6); -RWStructuredBuffer UOut1 : register(u7); -RWStructuredBuffer UOut2 : register(u8); -RWStructuredBuffer UOut3 : register(u9); -RWStructuredBuffer UOut4 : register(u10); -RWStructuredBuffer UOut5 : register(u11); -[numthreads(4,1,1)] +[numthreads(NUM_THREADS,1,1)] void main(uint3 tid : SV_GroupThreadID) { - int4 v = In[tid.x]; - - int s1 = WaveActiveMax( v.x ); - int s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; - int s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; - int s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; - - int2 v2_1 = WaveActiveMax( v.xy ); - int2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int2(0,0); - int2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int2(0,0); - int2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int2(0,0); - - int3 v3_1 = WaveActiveMax( v.xyz ); - int3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int3(0,0,0); - int3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int3(0,0,0); - int3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int3(0,0,0); - - int4 v4_1 = WaveActiveMax( v ); - int4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int4(0,0,0,0); - int4 v4_3 = tid.x < 2 ? 
WaveActiveMax( v ) : int4(0,0,0,0); - int4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int4(0,0,0,0); - - int scalars[4] = { s4, s3, s2, s1 }; - int2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; - int3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; - int4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; - - Out1[tid.x].x = scalars[tid.x]; - Out2[tid.x].xy = vec2s[tid.x]; - Out3[tid.x].xyz = vec3s[tid.x]; - Out4[tid.x] = vec4s[tid.x]; + for (int ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (int MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveActiveMax( v.x ); + Out2[OutIdx].xy = WaveActiveMax( v.xy ); + Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); + Out4[OutIdx] = WaveActiveMax( v ); + } + } + } // constant folding case Out5[0] = WaveActiveMax(int4(1,2,3,4)); - - // UINT case - - uint4 uv = UIn[tid.x]; - - uint us1 = WaveActiveMax( uv.x ); - uint us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; - uint us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; - uint us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; - - uint2 uv2_1 = WaveActiveMax( uv.xy ); - uint2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint2(0,0); - uint2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint2(0,0); - uint2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint2(0,0); - - uint3 uv3_1 = WaveActiveMax( uv.xyz ); - uint3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0); - uint3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0); - uint3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0); - - uint4 uv4_1 = WaveActiveMax( uv ); - uint4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint4(0,0,0,0); - uint4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint4(0,0,0,0); - uint4 uv4_4 = tid.x < 1 ? 
WaveActiveMax( uv ) : uint4(0,0,0,0); - - uint uscalars[4] = { us4, us3, us2, us1 }; - uint2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 }; - uint3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 }; - uint4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 }; - - UOut1[tid.x].x = uscalars[tid.x]; - UOut2[tid.x].xy = uvec2s[tid.x]; - UOut3[tid.x].xyz = uvec3s[tid.x]; - UOut4[tid.x] = uvec4s[tid.x]; - - // constant folding case - UOut5[0] = WaveActiveMax(uint4(1,2,3,4)); } + //--- pipeline.yaml --- @@ -100,93 +48,186 @@ Shaders: Buffers: - Name: In Format: Int32 - Stride: 16 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + Stride: 16 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9, 10, 11, 12, + 13, 14, 15, 16, + 2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6, 7, 8, 9, + 10, 11, 12, 13, + 14, 15, 16, 1, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + 15, 16, 1, 2, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 1, 2, 3, + 4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8, 7, 6, 5, + 12, 11, 10, 9, + 16, 15, 14, 13, + 5, 4, 3, 2, + 9, 8, 7, 6, + 13, 12, 11, 10, + 1, 16, 15, 14, + 6, 5, 4, 3, + 10, 9, 8, 7, + 14, 13, 12, 11, + 2, 1, 16, 15, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, + 3, 2, 1, 16 ] + - Name: Out1 Format: Int32 - Stride: 16 - ZeroInitSize: 64 + Stride: 4 + # 1 int is 4 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets + ZeroInitSize: 128 - Name: Out2 Format: Int32 - Stride: 16 - ZeroInitSize: 64 + Stride: 8 + 
ZeroInitSize: 256 - Name: Out3 Format: Int32 Stride: 16 - ZeroInitSize: 64 + ZeroInitSize: 512 - Name: Out4 Format: Int32 Stride: 16 - ZeroInitSize: 64 + ZeroInitSize: 512 - Name: Out5 Format: Int32 Stride: 16 ZeroInitSize: 16 + - Name: Masks + Format: Int32 + Stride: 16 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] - Name: ExpectedOut1 Format: Int32 Stride: 16 - Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 14, 14, 14, 14, + 3, 0, 0, 0, + 0, 12, 12, 0, + 0, 0, 0, 0, + 13, 13, 13, 13, + 6, 0, 0, 0, + 0, 15, 15, 0 ] - Name: ExpectedOut2 Format: Int32 Stride: 16 - Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 14, 15, + 14, 15, 14, 15, + 3, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 12, 13, + 12, 13, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 13, 16, + 13, 16, 13, 16, + 6, 5, 0, 0, + 0, 0, 0, 0, + 0, 0, 15, 14, + 15, 14, 0, 0 ] - Name: ExpectedOut3 Format: Int32 Stride: 16 - Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 3, 4, 5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12, 13, 14, 0, + 12, 13, 14, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 6, 5, 4, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15, 14, 13, 0, + 15, 14, 13, 0, + 0, 0, 0, 0 ] - 
Name: ExpectedOut4 Format: Int32 Stride: 16 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 16, 13, + 14, 15, 16, 13, + 14, 15, 16, 13, + 14, 15, 16, 13, + 3, 4, 5, 6, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12, 13, 14, 15, + 12, 13, 14, 15, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 15, 14, + 13, 16, 15, 14, + 13, 16, 15, 14, + 13, 16, 15, 14, + 6, 5, 4, 3, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15, 14, 13, 12, + 15, 14, 13, 12, + 0, 0, 0, 0 ] - Name: ExpectedOut5 Format: Int32 - Stride: 16 - Data: [ 1, 2, 3, 4 ] - - Name: UIn - Format: UInt32 - Stride: 16 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] - - Name: UOut1 - Format: UInt32 - Stride: 16 - ZeroInitSize: 64 - - Name: UOut2 - Format: UInt32 - Stride: 16 - ZeroInitSize: 64 - - Name: UOut3 - Format: UInt32 - Stride: 16 - ZeroInitSize: 64 - - Name: UOut4 - Format: UInt32 - Stride: 16 - ZeroInitSize: 64 - - Name: UOut5 - Format: UInt32 - Stride: 16 - ZeroInitSize: 16 - - Name: UExpectedOut1 - Format: UInt32 - Stride: 16 - Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] - - Name: UExpectedOut2 - Format: UInt32 - Stride: 16 - Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] - - Name: UExpectedOut3 - Format: UInt32 - Stride: 16 - Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] - - Name: UExpectedOut4 - Format: UInt32 - Stride: 16 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] - - Name: UExpectedOut5 - Format: UInt32 - Stride: 4 + Stride: 8 Data: [ 1, 2, 3, 4 ] - Results: - Result: ExpectedOut1 Rule: BufferExact @@ -208,26 +249,6 @@ Results: Rule: BufferExact Actual: Out5 Expected: ExpectedOut5 - - Result: UExpectedOut1 - Rule: BufferExact - Actual: UOut1 - Expected: UExpectedOut1 - - Result: UExpectedOut2 - Rule: 
BufferExact - Actual: UOut2 - Expected: UExpectedOut2 - - Result: UExpectedOut3 - Rule: BufferExact - Actual: UOut3 - Expected: UExpectedOut3 - - Result: UExpectedOut4 - Rule: BufferExact - Actual: UOut4 - Expected: UExpectedOut4 - - Result: UExpectedOut5 - Rule: BufferExact - Actual: UOut5 - Expected: UExpectedOut5 DescriptorSets: - Resources: - Name: In @@ -272,48 +293,13 @@ DescriptorSets: Space: 0 VulkanBinding: Binding: 5 - - Name: UIn + - Name: Masks Kind: StructuredBuffer DirectXBinding: Register: 6 Space: 0 VulkanBinding: Binding: 6 - - Name: UOut1 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 7 - Space: 0 - VulkanBinding: - Binding: 7 - - Name: UOut2 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 8 - Space: 0 - VulkanBinding: - Binding: 8 - - Name: UOut3 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 9 - Space: 0 - VulkanBinding: - Binding: 9 - - Name: UOut4 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 10 - Space: 0 - VulkanBinding: - Binding: 10 - - Name: UOut5 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 11 - Space: 0 - VulkanBinding: - Binding: 11 ... 
#--- end diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test index 418727387..a0b56a1eb 100644 --- a/test/WaveOps/WaveActiveMax.int64.test +++ b/test/WaveOps/WaveActiveMax.int64.test @@ -1,95 +1,43 @@ #--- source.hlsl +#define VALUE_SETS 2 +#define NUM_MASKS 4 +#define NUM_THREADS 4 + +struct MaskStruct { + int mask[NUM_THREADS]; +}; + StructuredBuffer In : register(t0); -RWStructuredBuffer Out1 : register(u1); // test scalar -RWStructuredBuffer Out2 : register(u2); // test int64_t2 +RWStructuredBuffer Out1 : register(u1); // test scalar +RWStructuredBuffer Out2 : register(u2); // test int64_t2 RWStructuredBuffer Out3 : register(u3); // test int64_t3 RWStructuredBuffer Out4 : register(u4); // test int64_t4 RWStructuredBuffer Out5 : register(u5); // constant folding +StructuredBuffer Masks : register(t6); -// uint64_ts -StructuredBuffer UIn : register(t6); -RWStructuredBuffer UOut1 : register(u7); -RWStructuredBuffer UOut2 : register(u8); -RWStructuredBuffer UOut3 : register(u9); -RWStructuredBuffer UOut4 : register(u10); -RWStructuredBuffer UOut5 : register(u11); -[numthreads(4,1,1)] +[numthreads(NUM_THREADS,1,1)] void main(uint3 tid : SV_GroupThreadID) { - int64_t4 v = In[tid.x]; - - int64_t s1 = WaveActiveMax( v.x ); - int64_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; - int64_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; - int64_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; - - int64_t2 v2_1 = WaveActiveMax( v.xy ); - int64_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int64_t2(0,0); - int64_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int64_t2(0,0); - int64_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int64_t2(0,0); - - int64_t3 v3_1 = WaveActiveMax( v.xyz ); - int64_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0); - int64_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0); - int64_t3 v3_4 = tid.x < 1 ? 
WaveActiveMax( v.xyz ) : int64_t3(0,0,0); - - int64_t4 v4_1 = WaveActiveMax( v ); - int64_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int64_t4(0,0,0,0); - int64_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int64_t4(0,0,0,0); - int64_t4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int64_t4(0,0,0,0); - - int64_t scalars[4] = { s4, s3, s2, s1 }; - int64_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 }; - int64_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 }; - int64_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 }; - - Out1[tid.x].x = scalars[tid.x]; - Out2[tid.x].xy = vec2s[tid.x]; - Out3[tid.x].xyz = vec3s[tid.x]; - Out4[tid.x] = vec4s[tid.x]; + for (int ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { + const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; + for (int MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { + int64_t4 v = In[ValueSet * ValueSetOffset + MaskIdx * NUM_THREADS + tid.x]; + const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; + if (Masks[MaskIdx].mask[tid.x]) { + Out1[OutIdx] = WaveActiveMax( v.x ); + Out2[OutIdx].xy = WaveActiveMax( v.xy ); + Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); + Out4[OutIdx] = WaveActiveMax( v ); + } + } + } // constant folding case Out5[0] = WaveActiveMax(int64_t4(1,2,3,4)); - - // UINT64_t case - - uint64_t4 uv = UIn[tid.x]; - - uint64_t us1 = WaveActiveMax( uv.x ); - uint64_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; - uint64_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; - uint64_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; - - uint64_t2 uv2_1 = WaveActiveMax( uv.xy ); - uint64_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0); - uint64_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0); - uint64_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0); - - uint64_t3 uv3_1 = WaveActiveMax( uv.xyz ); - uint64_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0); - uint64_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0); - uint64_t3 uv3_4 = tid.x < 1 ? 
WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0); - - uint64_t4 uv4_1 = WaveActiveMax( uv ); - uint64_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0); - uint64_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0); - uint64_t4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0); - - uint64_t uscalars[4] = { us4, us3, us2, us1 }; - uint64_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 }; - uint64_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 }; - uint64_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 }; - - UOut1[tid.x].x = uscalars[tid.x]; - UOut2[tid.x].xy = uvec2s[tid.x]; - UOut3[tid.x].xyz = uvec3s[tid.x]; - UOut4[tid.x] = uvec4s[tid.x]; - - // constant folding case - UOut5[0] = WaveActiveMax(uint64_t4(1,2,3,4)); } + //--- pipeline.yaml --- @@ -100,93 +48,186 @@ Shaders: Buffers: - Name: In Format: Int64 - Stride: 32 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + Stride: 32 + # 2 value sets + # For each value set, + # and for each specific one of the 4 thread masks in that value set, + # and for each of the 4 threads in that thread mask, + # there will be a unique set of 4 values, such that + # none of the other threads in that thread mask share any values + Data: [ + 1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values + 5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values + 9, 10, 11, 12, + 13, 14, 15, 16, + 2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values + 6, 7, 8, 9, + 10, 11, 12, 13, + 14, 15, 16, 1, + 3, 4, 5, 6, + 7, 8, 9, 10, + 11, 12, 13, 14, + 15, 16, 1, 2, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + 16, 1, 2, 3, + 4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values + 8, 7, 6, 5, + 12, 11, 10, 9, + 16, 15, 14, 13, + 5, 4, 3, 2, + 9, 8, 7, 6, + 13, 12, 11, 10, + 1, 16, 15, 14, + 6, 5, 4, 3, + 10, 9, 8, 7, + 14, 13, 12, 11, + 2, 1, 16, 15, + 7, 6, 5, 4, + 11, 
10, 9, 8, + 15, 14, 13, 12, + 3, 2, 1, 16 ] + - Name: Out1 Format: Int64 - Stride: 32 - ZeroInitSize: 128 + Stride: 8 + # 1 int is 8 bytes, * 4 ints for 4 threads, * 4 thread masks, * 2 value sets + ZeroInitSize: 256 - Name: Out2 Format: Int64 - Stride: 32 - ZeroInitSize: 128 + Stride: 16 + ZeroInitSize: 512 - Name: Out3 Format: Int64 Stride: 32 - ZeroInitSize: 128 + ZeroInitSize: 1024 - Name: Out4 Format: Int64 Stride: 32 - ZeroInitSize: 128 + ZeroInitSize: 1024 - Name: Out5 Format: Int64 Stride: 32 ZeroInitSize: 32 + - Name: Masks + Format: Int64 + Stride: 8 + # 4 active mask sets for threads 0, 1, 2, 3: + # 0 0 0 0 + # 1 1 1 1 + # 1 0 0 0 + # 0 1 1 0 + Data: [ + 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0] - Name: ExpectedOut1 Format: Int64 Stride: 32 - Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 14, 14, 14, 14, + 3, 0, 0, 0, + 0, 12, 12, 0, + 0, 0, 0, 0, + 13, 13, 13, 13, + 6, 0, 0, 0, + 0, 15, 15, 0 ] - Name: ExpectedOut2 Format: Int64 Stride: 32 - Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 14, 15, + 14, 15, 14, 15, + 3, 4, 0, 0, + 0, 0, 0, 0, + 0, 0, 12, 13, + 12, 13, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 13, 16, + 13, 16, 13, 16, + 6, 5, 0, 0, + 0, 0, 0, 0, + 0, 0, 15, 14, + 15, 14, 0, 0 ] - Name: ExpectedOut3 Format: Int64 Stride: 32 - Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] + # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread + # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 14, 15, 16, 0, + 3, 4, 5, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, 
+ 12, 13, 14, 0, + 12, 13, 14, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 13, 16, 15, 0, + 6, 5, 4, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15, 14, 13, 0, + 15, 14, 13, 0, + 0, 0, 0, 0 ] - Name: ExpectedOut4 Format: Int64 Stride: 32 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] + Data: [ 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 14, 15, 16, 13, + 14, 15, 16, 13, + 14, 15, 16, 13, + 14, 15, 16, 13, + 3, 4, 5, 6, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 12, 13, 14, 15, + 12, 13, 14, 15, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 13, 16, 15, 14, + 13, 16, 15, 14, + 13, 16, 15, 14, + 13, 16, 15, 14, + 6, 5, 4, 3, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 15, 14, 13, 12, + 15, 14, 13, 12, + 0, 0, 0, 0 ] - Name: ExpectedOut5 Format: Int64 - Stride: 8 + Stride: 16 Data: [ 1, 2, 3, 4 ] - - Name: UIn - Format: UInt64 - Stride: 32 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] - - Name: UOut1 - Format: UInt64 - Stride: 32 - ZeroInitSize: 128 - - Name: UOut2 - Format: UInt64 - Stride: 32 - ZeroInitSize: 128 - - Name: UOut3 - Format: UInt64 - Stride: 32 - ZeroInitSize: 128 - - Name: UOut4 - Format: UInt64 - Stride: 32 - ZeroInitSize: 128 - - Name: UOut5 - Format: UInt64 - Stride: 32 - ZeroInitSize: 32 - - Name: UExpectedOut1 - Format: UInt64 - Stride: 32 - Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ] - - Name: UExpectedOut2 - Format: UInt64 - Stride: 32 - Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ] - - Name: UExpectedOut3 - Format: UInt64 - Stride: 32 - Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ] - - Name: UExpectedOut4 - Format: UInt64 - Stride: 32 - Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ] - - Name: UExpectedOut5 - Format: UInt64 - 
Stride: 8 - Data: [ 1, 2, 3, 4 ] - Results: - Result: ExpectedOut1 Rule: BufferExact @@ -208,26 +249,6 @@ Results: Rule: BufferExact Actual: Out5 Expected: ExpectedOut5 - - Result: UExpectedOut1 - Rule: BufferExact - Actual: UOut1 - Expected: UExpectedOut1 - - Result: UExpectedOut2 - Rule: BufferExact - Actual: UOut2 - Expected: UExpectedOut2 - - Result: UExpectedOut3 - Rule: BufferExact - Actual: UOut3 - Expected: UExpectedOut3 - - Result: UExpectedOut4 - Rule: BufferExact - Actual: UOut4 - Expected: UExpectedOut4 - - Result: UExpectedOut5 - Rule: BufferExact - Actual: UOut5 - Expected: UExpectedOut5 DescriptorSets: - Resources: - Name: In @@ -272,52 +293,19 @@ DescriptorSets: Space: 0 VulkanBinding: Binding: 5 - - Name: UIn + - Name: Masks Kind: StructuredBuffer DirectXBinding: Register: 6 Space: 0 VulkanBinding: Binding: 6 - - Name: UOut1 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 7 - Space: 0 - VulkanBinding: - Binding: 7 - - Name: UOut2 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 8 - Space: 0 - VulkanBinding: - Binding: 8 - - Name: UOut3 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 9 - Space: 0 - VulkanBinding: - Binding: 9 - - Name: UOut4 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 10 - Space: 0 - VulkanBinding: - Binding: 10 - - Name: UOut5 - Kind: RWStructuredBuffer - DirectXBinding: - Register: 11 - Space: 0 - VulkanBinding: - Binding: 11 ... #--- end +# REQUIRES: Int64 + # Bug https://github.com/llvm/llvm-project/issues/156775 # XFAIL: Clang