From dba563fa030b2e3cf80899d0db3dce53540f6327 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Thu, 4 Sep 2025 15:32:58 -0700
Subject: [PATCH 01/11] fp16 seems to work

---
 test/WaveOps/WaveActiveMax.fp16.test | 177 +++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 test/WaveOps/WaveActiveMax.fp16.test
diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
new file mode 100644
index 000000000..dc0dde2b1
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -0,0 +1,177 @@
+#--- source.hlsl
+StructuredBuffer<half4> In  : register(t0);
+RWStructuredBuffer<half4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<half4> Out2 : register(u2); // test half2
+RWStructuredBuffer<half4> Out3 : register(u3); // test half3
+RWStructuredBuffer<half4> Out4 : register(u4); // test half4
+RWStructuredBuffer<half4> Out5 : register(u5); // constant folding
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    half4 v = In[tid.x]; // one input element per thread; the group has 4 threads
+    // Scalar case: s1 is unguarded, s2..s4 put the call under a tid.x < 3/2/1 guard.
+    half s1 = WaveActiveMax( v.x );
+    half s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
+    half s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
+    half s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
+    // half2 case: same guard pattern applied to the .xy components.
+    half2 v2_1 = WaveActiveMax( v.xy );
+    half2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : half2(0,0);
+    half2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : half2(0,0);
+    half2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : half2(0,0);
+    // half3 case: same guard pattern applied to the .xyz components.
+    half3 v3_1 = WaveActiveMax( v.xyz );
+    half3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
+    half3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
+    half3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
+    // half4 case: same guard pattern applied to the whole vector.
+    half4 v4_1 = WaveActiveMax( v );
+    half4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : half4(0,0,0,0);
+    half4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : half4(0,0,0,0);
+    half4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : half4(0,0,0,0);
+    // Tables list the narrowest guard first: thread i picks the result guarded by tid.x < i + 1 (index 3 is the unguarded one).
+    half scalars[4] = { s4, s3, s2, s1 };
+    half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
+    half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
+    half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
+    // Inputs grow with the thread index, so ExpectedOut* in pipeline.yaml equals each thread's own input; unwritten components stay at their zero-init value.
+    Out1[tid.x].x   = scalars[tid.x];
+    Out2[tid.x].xy  = vec2s[tid.x];
+    Out3[tid.x].xyz = vec3s[tid.x];
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // Constant-folding case: the argument is a literal, identical on every thread, so the expected result is the literal itself.
+    Out5[0] = WaveActiveMax(half4(1,2,3,4));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float16
+    Stride: 8
+    # 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000
+    Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
+  - Name: Out1
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32  
+  - Name: Out2
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out3
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out4
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out5
+    Format: Float16
+    Stride: 8
+    ZeroInitSize: 8
+  - Name: ExpectedOut1
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut2
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ]
+  - Name: ExpectedOut3
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ]
+  - Name: ExpectedOut4
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
+  - Name: ExpectedOut5
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3C00, 0x4000, 0x4200, 0x4400 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# XFAIL: Clang
+# REQUIRES: Half
+
+
+# RUN: split-file %s %t
+# RUN: %if !Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl %}
+# RUN: %if Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -fspv-target-env=vulkan1.1 -Fo %t.o %t/source.hlsl %}
+# RUN: %offloader %t/pipeline.yaml %t.o 

From 8fec3b6c3ebfe097ddb1bc71f6da9526c8c5582c Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 5 Sep 2025 12:07:42 -0700
Subject: [PATCH 02/11] add uint variants

---
 test/WaveOps/WaveActiveMax.fp16.test  |   3 +-
 test/WaveOps/WaveActiveMax.fp32.test  | 175 ++++++++++++++
 test/WaveOps/WaveActiveMax.fp64.test  | 177 ++++++++++++++
 test/WaveOps/WaveActiveMax.int16.test | 327 ++++++++++++++++++++++++++
 test/WaveOps/WaveActiveMax.int32.test | 327 ++++++++++++++++++++++++++
 test/WaveOps/WaveActiveMax.int64.test | 327 ++++++++++++++++++++++++++
 6 files changed, 1334 insertions(+), 2 deletions(-)
 create mode 100644 test/WaveOps/WaveActiveMax.fp32.test
 create mode 100644 test/WaveOps/WaveActiveMax.fp64.test
 create mode 100644 test/WaveOps/WaveActiveMax.int16.test
 create mode 100644 test/WaveOps/WaveActiveMax.int32.test
 create mode 100644 test/WaveOps/WaveActiveMax.int64.test

diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
index dc0dde2b1..41e80e27f 100644
--- a/test/WaveOps/WaveActiveMax.fp16.test
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -172,6 +172,5 @@ DescriptorSets:
 
 
 # RUN: split-file %s %t
-# RUN: %if !Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl %}
-# RUN: %if Vulkan %{ %dxc_target -enable-16bit-types -T cs_6_5 -fspv-target-env=vulkan1.1 -Fo %t.o %t/source.hlsl %}
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
 # RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test
new file mode 100644
index 000000000..9aeba5943
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.fp32.test
@@ -0,0 +1,175 @@
+#--- source.hlsl
+StructuredBuffer<float4> In  : register(t0);
+RWStructuredBuffer<float4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<float4> Out2 : register(u2); // test float2
+RWStructuredBuffer<float4> Out3 : register(u3); // test float3
+RWStructuredBuffer<float4> Out4 : register(u4); // test float4
+RWStructuredBuffer<float4> Out5 : register(u5); // constant folding
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    float4 v = In[tid.x]; // one input element per thread; the group has 4 threads
+    // Scalar case: s1 is unguarded, s2..s4 put the call under a tid.x < 3/2/1 guard.
+    float s1 = WaveActiveMax( v.x );
+    float s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
+    float s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
+    float s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
+    // float2 case: same guard pattern applied to the .xy components.
+    float2 v2_1 = WaveActiveMax( v.xy );
+    float2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : float2(0,0);
+    float2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : float2(0,0);
+    float2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : float2(0,0);
+    // float3 case: same guard pattern applied to the .xyz components.
+    float3 v3_1 = WaveActiveMax( v.xyz );
+    float3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
+    float3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
+    float3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
+    // float4 case: same guard pattern applied to the whole vector.
+    float4 v4_1 = WaveActiveMax( v );
+    float4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : float4(0,0,0,0);
+    float4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : float4(0,0,0,0);
+    float4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : float4(0,0,0,0);
+    // Tables list the narrowest guard first: thread i picks the result guarded by tid.x < i + 1 (index 3 is the unguarded one).
+    float scalars[4] = { s4, s3, s2, s1 };
+    float2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
+    float3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
+    float4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
+    // Inputs grow with the thread index, so ExpectedOut* in pipeline.yaml equals each thread's own input; unwritten components stay at their zero-init value.
+    Out1[tid.x].x   = scalars[tid.x];
+    Out2[tid.x].xy  = vec2s[tid.x];
+    Out3[tid.x].xyz = vec3s[tid.x];
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // Constant-folding case: the argument is a literal, identical on every thread, so the expected result is the literal itself.
+    Out5[0] = WaveActiveMax(float4(1,2,3,4));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+  - Name: Out1
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64  
+  - Name: Out2
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out3
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out4
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out5
+    Format: Float32
+    Stride: 16
+    ZeroInitSize: 16
+  - Name: ExpectedOut1
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ]
+  - Name: ExpectedOut2
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ]
+  - Name: ExpectedOut3
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ]
+  - Name: ExpectedOut4
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+  - Name: ExpectedOut5
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 2.0, 3.0, 4.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# XFAIL: Clang
+
+
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test
new file mode 100644
index 000000000..feac35dea
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.fp64.test
@@ -0,0 +1,177 @@
+#--- source.hlsl
+StructuredBuffer<double4> In  : register(t0);
+RWStructuredBuffer<double4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<double4> Out2 : register(u2); // test double2
+RWStructuredBuffer<double4> Out3 : register(u3); // test double3
+RWStructuredBuffer<double4> Out4 : register(u4); // test double4
+RWStructuredBuffer<double4> Out5 : register(u5); // constant folding
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    double4 v = In[tid.x]; // one input element per thread; the group has 4 threads
+    // Scalar case: s1 is unguarded, s2..s4 put the call under a tid.x < 3/2/1 guard.
+    double s1 = WaveActiveMax( v.x );
+    double s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
+    double s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
+    double s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
+    // double2 case: same guard pattern applied to the .xy components.
+    double2 v2_1 = WaveActiveMax( v.xy );
+    double2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : double2(0,0);
+    double2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : double2(0,0);
+    double2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : double2(0,0);
+    // double3 case: same guard pattern applied to the .xyz components.
+    double3 v3_1 = WaveActiveMax( v.xyz );
+    double3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : double3(0,0,0);
+    double3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : double3(0,0,0);
+    double3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : double3(0,0,0);
+    // double4 case: same guard pattern applied to the whole vector.
+    double4 v4_1 = WaveActiveMax( v );
+    double4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : double4(0,0,0,0);
+    double4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : double4(0,0,0,0);
+    double4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : double4(0,0,0,0);
+    // Tables list the narrowest guard first: thread i picks the result guarded by tid.x < i + 1 (index 3 is the unguarded one).
+    double scalars[4] = { s4, s3, s2, s1 };
+    double2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
+    double3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
+    double4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
+    // Inputs grow with the thread index, so ExpectedOut* in pipeline.yaml equals each thread's own input; unwritten components stay at their zero-init value.
+    Out1[tid.x].x   = scalars[tid.x];
+    Out2[tid.x].xy  = vec2s[tid.x];
+    Out3[tid.x].xyz = vec3s[tid.x];
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // Constant-folding case: the argument is a literal, identical on every thread, so the expected result is the literal itself.
+    Out5[0] = WaveActiveMax(double4(1,2,3,4));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+  - Name: Out1
+    Format: Float64
+    Stride: 32
+    ZeroInitSize: 128  
+  - Name: Out2
+    Format: Float64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: Out3
+    Format: Float64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: Out4
+    Format: Float64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: Out5
+    Format: Float64
+    Stride: 32
+    ZeroInitSize: 32
+  - Name: ExpectedOut1
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ]
+  - Name: ExpectedOut2
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ]
+  - Name: ExpectedOut3
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ]
+  - Name: ExpectedOut4
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+  - Name: ExpectedOut5
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 2.0, 3.0, 4.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# REQUIRES: Double
+
+# Bug https://github.com/llvm/llvm-project/issues/156775
+# XFAIL: Clang
+
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test
new file mode 100644
index 000000000..81546cc91
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.int16.test
@@ -0,0 +1,327 @@
+#--- source.hlsl
+StructuredBuffer<int16_t4> In  : register(t0);
+RWStructuredBuffer<int16_t4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<int16_t4> Out2 : register(u2); // test int16_t2
+RWStructuredBuffer<int16_t4> Out3 : register(u3); // test int16_t3
+RWStructuredBuffer<int16_t4> Out4 : register(u4); // test int16_t4
+RWStructuredBuffer<int16_t4> Out5 : register(u5); // constant folding
+
+// uints
+StructuredBuffer<uint16_t4> UIn  : register(t6);
+RWStructuredBuffer<uint16_t4> UOut1 : register(u7);
+RWStructuredBuffer<uint16_t4> UOut2 : register(u8);
+RWStructuredBuffer<uint16_t4> UOut3 : register(u9);
+RWStructuredBuffer<uint16_t4> UOut4 : register(u10);
+RWStructuredBuffer<uint16_t4> UOut5 : register(u11);
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    int16_t4 v = In[tid.x]; // one signed input element per thread; the group has 4 threads
+    // Scalar case: s1 is unguarded, s2..s4 put the call under a tid.x < 3/2/1 guard.
+    int16_t s1 = WaveActiveMax( v.x );
+    int16_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
+    int16_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
+    int16_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
+    // int16_t2 case: same guard pattern applied to the .xy components.
+    int16_t2 v2_1 = WaveActiveMax( v.xy );
+    int16_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int16_t2(0,0);
+    int16_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int16_t2(0,0);
+    int16_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int16_t2(0,0);
+    // int16_t3 case: same guard pattern applied to the .xyz components.
+    int16_t3 v3_1 = WaveActiveMax( v.xyz );
+    int16_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0);
+    int16_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0);
+    int16_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0);
+    // int16_t4 case: same guard pattern applied to the whole vector.
+    int16_t4 v4_1 = WaveActiveMax( v );
+    int16_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int16_t4(0,0,0,0);
+    int16_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int16_t4(0,0,0,0);
+    int16_t4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int16_t4(0,0,0,0);
+    // Tables list the narrowest guard first: thread i picks the result guarded by tid.x < i + 1 (index 3 is the unguarded one).
+    int16_t scalars[4] = { s4, s3, s2, s1 };
+    int16_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
+    int16_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
+    int16_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
+    // Inputs grow with the thread index, so ExpectedOut* in pipeline.yaml equals each thread's own input; unwritten components stay at their zero-init value.
+    Out1[tid.x].x   = scalars[tid.x];
+    Out2[tid.x].xy  = vec2s[tid.x];
+    Out3[tid.x].xyz = vec3s[tid.x];
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // Constant-folding case: the argument is a literal, identical on every thread, so the expected result is the literal itself.
+    Out5[0] = WaveActiveMax(int16_t4(1,2,3,4));
+
+    // Unsigned case: repeats the whole sequence above with uint16_t, reading UIn and writing UOut1..UOut5.
+
+    uint16_t4 uv = UIn[tid.x];
+    // Scalar case: us1 is unguarded, us2..us4 put the call under a tid.x < 3/2/1 guard.
+    uint16_t us1 = WaveActiveMax( uv.x );
+    uint16_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0;
+    uint16_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0;
+    uint16_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0;
+    // uint16_t2 case: same guard pattern applied to the .xy components.
+    uint16_t2 uv2_1 = WaveActiveMax( uv.xy );
+    uint16_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0);
+    uint16_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0);
+    uint16_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0);
+    // uint16_t3 case: same guard pattern applied to the .xyz components.
+    uint16_t3 uv3_1 = WaveActiveMax( uv.xyz );
+    uint16_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0);
+    uint16_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0);
+    uint16_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0);
+    // uint16_t4 case: same guard pattern applied to the whole vector.
+    uint16_t4 uv4_1 = WaveActiveMax( uv );
+    uint16_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0);
+    uint16_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0);
+    uint16_t4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0);
+    // Same table ordering as the signed case: narrowest guard first, unguarded result last.
+    uint16_t uscalars[4] = { us4, us3, us2, us1 };
+    uint16_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 };
+    uint16_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 };
+    uint16_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 };    
+    // Each thread writes only its own element of the unsigned outputs.
+    UOut1[tid.x].x   = uscalars[tid.x];
+    UOut2[tid.x].xy  = uvec2s[tid.x];
+    UOut3[tid.x].xyz = uvec3s[tid.x];
+    UOut4[tid.x]     = uvec4s[tid.x];
+
+    // Constant-folding case for the unsigned overload.
+    UOut5[0] = WaveActiveMax(uint16_t4(1,2,3,4));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Int16
+    Stride: 8    
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: Out1
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 32  
+  - Name: Out2
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out3
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out4
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: Out5
+    Format: Int16
+    Stride: 8
+    ZeroInitSize: 8
+  - Name: ExpectedOut1
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+  - Name: ExpectedOut4
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: ExpectedOut5
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 2, 3, 4 ]
+  - Name: UIn
+    Format: UInt16
+    Stride: 8 # one uint16_t4 element = 4 x 2 bytes, matching the signed In buffer
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: UOut1
+    Format: UInt16
+    Stride: 8
+    ZeroInitSize: 32  
+  - Name: UOut2
+    Format: UInt16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: UOut3
+    Format: UInt16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: UOut4
+    Format: UInt16
+    Stride: 8
+    ZeroInitSize: 32
+  - Name: UOut5
+    Format: UInt16
+    Stride: 8
+    ZeroInitSize: 8
+  - Name: UExpectedOut1
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: UExpectedOut5
+    Format: UInt16
+    Stride: 8 # one uint16_t4 element, same stride as UOut5 and ExpectedOut5
+    Data: [ 1, 2, 3, 4 ]
+
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: UExpectedOut5
+    Rule: BufferExact
+    Actual: UOut5
+    Expected: UExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: UOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+
+...
+#--- end
+
+# XFAIL: Clang
+# REQUIRES: Int16
+
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test
new file mode 100644
index 000000000..4d1204bd0
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.int32.test
@@ -0,0 +1,327 @@
+#--- source.hlsl
+StructuredBuffer<int4> In  : register(t0);
+RWStructuredBuffer<int4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<int4> Out2 : register(u2); // test int2
+RWStructuredBuffer<int4> Out3 : register(u3); // test int3
+RWStructuredBuffer<int4> Out4 : register(u4); // test int4
+RWStructuredBuffer<int4> Out5 : register(u5); // constant folding
+
+// uints
+StructuredBuffer<uint4> UIn  : register(t6);
+RWStructuredBuffer<uint4> UOut1 : register(u7);
+RWStructuredBuffer<uint4> UOut2 : register(u8);
+RWStructuredBuffer<uint4> UOut3 : register(u9);
+RWStructuredBuffer<uint4> UOut4 : register(u10);
+RWStructuredBuffer<uint4> UOut5 : register(u11);
+
+[numthreads(4,1,1)] // one group of 4 threads, so tid.x is 0..3
+void main(uint3 tid : SV_GroupThreadID)
+{ // Exercises WaveActiveMax on int and uint inputs of 1 to 4 components.
+    int4 v = In[tid.x]; // each thread reads its own int4 element
+
+    int s1 = WaveActiveMax( v.x );                 // unguarded call
+    int s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; // call selected only for tid.x 0..2
+    int s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; // call selected only for tid.x 0..1
+    int s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; // call selected only for tid.x 0
+
+    int2 v2_1 = WaveActiveMax( v.xy ); // same four guards, two components
+    int2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int2(0,0);
+    int2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int2(0,0);
+    int2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int2(0,0);
+
+    int3 v3_1 = WaveActiveMax( v.xyz ); // same four guards, three components
+    int3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int3(0,0,0);
+    int3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int3(0,0,0);
+    int3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int3(0,0,0);
+
+    int4 v4_1 = WaveActiveMax( v ); // same four guards, all four components
+    int4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int4(0,0,0,0);
+    int4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int4(0,0,0,0);
+    int4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int4(0,0,0,0);
+
+    int scalars[4] = { s4, s3, s2, s1 }; // indexed by tid.x below: thread 0 takes s4, thread 3 takes s1
+    int2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
+    int3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
+    int4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
+
+    Out1[tid.x].x   = scalars[tid.x]; // only the named components are stored; the rest of the element is not written
+    Out2[tid.x].xy  = vec2s[tid.x];
+    Out3[tid.x].xyz = vec3s[tid.x];
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // constant folding case: the argument is a literal and every thread stores to element 0
+    Out5[0] = WaveActiveMax(int4(1,2,3,4));
+
+    // UINT case: the same sequence on uint data, read from UIn and written to UOut1..UOut5
+
+    uint4 uv = UIn[tid.x]; // each thread reads its own uint4 element
+
+    uint us1 = WaveActiveMax( uv.x );                 // unguarded call
+    uint us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; // call selected only for tid.x 0..2
+    uint us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; // call selected only for tid.x 0..1
+    uint us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; // call selected only for tid.x 0
+
+    uint2 uv2_1 = WaveActiveMax( uv.xy ); // same four guards, two components
+    uint2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint2(0,0);
+    uint2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint2(0,0);
+    uint2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint2(0,0);
+
+    uint3 uv3_1 = WaveActiveMax( uv.xyz ); // same four guards, three components
+    uint3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0);
+    uint3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0);
+    uint3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0);
+
+    uint4 uv4_1 = WaveActiveMax( uv ); // same four guards, all four components
+    uint4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint4(0,0,0,0);
+    uint4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint4(0,0,0,0);
+    uint4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint4(0,0,0,0);
+
+    uint uscalars[4] = { us4, us3, us2, us1 }; // indexed by tid.x below: thread 0 takes us4, thread 3 takes us1
+    uint2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 };
+    uint3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 };
+    uint4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 };    
+
+    UOut1[tid.x].x   = uscalars[tid.x]; // only the named components are stored; the rest of the element is not written
+    UOut2[tid.x].xy  = uvec2s[tid.x];
+    UOut3[tid.x].xyz = uvec3s[tid.x];
+    UOut4[tid.x]     = uvec4s[tid.x];
+
+    // constant folding case: the argument is a literal and every thread stores to element 0
+    UOut5[0] = WaveActiveMax(uint4(1,2,3,4));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Int32
+    Stride: 16    
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: Out1
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 64  
+  - Name: Out2
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out3
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out4
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: Out5
+    Format: Int32
+    Stride: 16
+    ZeroInitSize: 16
+  - Name: ExpectedOut1
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+  - Name: ExpectedOut4
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: ExpectedOut5
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 2, 3, 4 ]
+  - Name: UIn
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: UOut1
+    Format: UInt32
+    Stride: 16
+    ZeroInitSize: 64  
+  - Name: UOut2
+    Format: UInt32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: UOut3
+    Format: UInt32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: UOut4
+    Format: UInt32
+    Stride: 16
+    ZeroInitSize: 64
+  - Name: UOut5
+    Format: UInt32
+    Stride: 16
+    ZeroInitSize: 16
+  - Name: UExpectedOut1
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: UExpectedOut5
+    Format: UInt32
+    Stride: 16 # one uint4 element, same stride as UOut5 and ExpectedOut5
+    Data: [ 1, 2, 3, 4 ]
+
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: UExpectedOut5
+    Rule: BufferExact
+    Actual: UOut5
+    Expected: UExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: UOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+
+...
+#--- end
+
+# XFAIL: Clang
+
+
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
new file mode 100644
index 000000000..e6956cec6
--- /dev/null
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -0,0 +1,327 @@
+#--- source.hlsl
+StructuredBuffer<int64_t4> In  : register(t0);
+RWStructuredBuffer<int64_t4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<int64_t4> Out2 : register(u2); // test int64_t2
+RWStructuredBuffer<int64_t4> Out3 : register(u3); // test int64_t3
+RWStructuredBuffer<int64_t4> Out4 : register(u4); // test int64_t4
+RWStructuredBuffer<int64_t4> Out5 : register(u5); // constant folding
+
+// uint64_ts
+StructuredBuffer<uint64_t4> UIn  : register(t6);
+RWStructuredBuffer<uint64_t4> UOut1 : register(u7);
+RWStructuredBuffer<uint64_t4> UOut2 : register(u8);
+RWStructuredBuffer<uint64_t4> UOut3 : register(u9);
+RWStructuredBuffer<uint64_t4> UOut4 : register(u10);
+RWStructuredBuffer<uint64_t4> UOut5 : register(u11);
+
+[numthreads(4,1,1)] // one group of 4 threads, so tid.x is 0..3
+void main(uint3 tid : SV_GroupThreadID)
+{ // Exercises WaveActiveMax on int64_t and uint64_t inputs of 1 to 4 components.
+    int64_t4 v = In[tid.x]; // each thread reads its own int64_t4 element
+
+    int64_t s1 = WaveActiveMax( v.x );                 // unguarded call
+    int64_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0; // call selected only for tid.x 0..2
+    int64_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0; // call selected only for tid.x 0..1
+    int64_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0; // call selected only for tid.x 0
+
+    int64_t2 v2_1 = WaveActiveMax( v.xy ); // same four guards, two components
+    int64_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int64_t2(0,0);
+    int64_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int64_t2(0,0);
+    int64_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int64_t2(0,0);
+
+    int64_t3 v3_1 = WaveActiveMax( v.xyz ); // same four guards, three components
+    int64_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0);
+    int64_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0);
+    int64_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0);
+
+    int64_t4 v4_1 = WaveActiveMax( v ); // same four guards, all four components
+    int64_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int64_t4(0,0,0,0);
+    int64_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int64_t4(0,0,0,0);
+    int64_t4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int64_t4(0,0,0,0);
+
+    int64_t scalars[4] = { s4, s3, s2, s1 }; // indexed by tid.x below: thread 0 takes s4, thread 3 takes s1
+    int64_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
+    int64_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
+    int64_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
+
+    Out1[tid.x].x   = scalars[tid.x]; // only the named components are stored; the rest of the element is not written
+    Out2[tid.x].xy  = vec2s[tid.x];
+    Out3[tid.x].xyz = vec3s[tid.x];
+    Out4[tid.x]     = vec4s[tid.x];
+
+    // constant folding case: the argument is a literal and every thread stores to element 0
+    Out5[0] = WaveActiveMax(int64_t4(1,2,3,4));
+
+    // UINT64_t case: the same sequence on uint64_t data, read from UIn and written to UOut1..UOut5
+
+    uint64_t4 uv = UIn[tid.x]; // each thread reads its own uint64_t4 element
+
+    uint64_t us1 = WaveActiveMax( uv.x );                 // unguarded call
+    uint64_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0; // call selected only for tid.x 0..2
+    uint64_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0; // call selected only for tid.x 0..1
+    uint64_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0; // call selected only for tid.x 0
+
+    uint64_t2 uv2_1 = WaveActiveMax( uv.xy ); // same four guards, two components
+    uint64_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0);
+    uint64_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0);
+    uint64_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0);
+
+    uint64_t3 uv3_1 = WaveActiveMax( uv.xyz ); // same four guards, three components
+    uint64_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0);
+    uint64_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0);
+    uint64_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0);
+
+    uint64_t4 uv4_1 = WaveActiveMax( uv ); // same four guards, all four components
+    uint64_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0);
+    uint64_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0);
+    uint64_t4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0);
+
+    uint64_t uscalars[4] = { us4, us3, us2, us1 }; // indexed by tid.x below: thread 0 takes us4, thread 3 takes us1
+    uint64_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 };
+    uint64_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 };
+    uint64_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 };    
+
+    UOut1[tid.x].x   = uscalars[tid.x]; // only the named components are stored; the rest of the element is not written
+    UOut2[tid.x].xy  = uvec2s[tid.x];
+    UOut3[tid.x].xyz = uvec3s[tid.x];
+    UOut4[tid.x]     = uvec4s[tid.x];
+
+    // constant folding case: the argument is a literal and every thread stores to element 0
+    UOut5[0] = WaveActiveMax(uint64_t4(1,2,3,4));
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [1, 1, 1]
+Buffers:
+  - Name: In
+    Format: Int64
+    Stride: 32    
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: Out1
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 128  
+  - Name: Out2
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: Out3
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: Out4
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: Out5
+    Format: Int64
+    Stride: 32
+    ZeroInitSize: 32
+  - Name: ExpectedOut1
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+  - Name: ExpectedOut4
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: ExpectedOut5
+    Format: Int64
+    Stride: 32 # one int64_t4 element, same stride as Out5
+    Data: [ 1, 2, 3, 4 ]
+  - Name: UIn
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: UOut1
+    Format: UInt64
+    Stride: 32
+    ZeroInitSize: 128  
+  - Name: UOut2
+    Format: UInt64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: UOut3
+    Format: UInt64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: UOut4
+    Format: UInt64
+    Stride: 32
+    ZeroInitSize: 128
+  - Name: UOut5
+    Format: UInt64
+    Stride: 32
+    ZeroInitSize: 32
+  - Name: UExpectedOut1
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+  - Name: UExpectedOut5
+    Format: UInt64
+    Stride: 32 # one uint64_t4 element, same stride as UOut5
+    Data: [ 1, 2, 3, 4 ]
+
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: UExpectedOut5
+    Rule: BufferExact
+    Actual: UOut5
+    Expected: UExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: UOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+
+...
+#--- end
+
+# XFAIL: Clang
+
+
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 

From f810b53bd2a6cde32acabdef3a5cd215ebbda6b1 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 5 Sep 2025 12:26:19 -0700
Subject: [PATCH 03/11] add metal xfail, binding offset errors

---
 test/WaveOps/WaveActiveMax.fp16.test  | 3 ++-
 test/WaveOps/WaveActiveMax.fp32.test  | 3 ++-
 test/WaveOps/WaveActiveMax.fp64.test  | 2 ++
 test/WaveOps/WaveActiveMax.int16.test | 3 ++-
 test/WaveOps/WaveActiveMax.int32.test | 3 ++-
 test/WaveOps/WaveActiveMax.int64.test | 3 ++-
 6 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
index 41e80e27f..8c2dfc122 100644
--- a/test/WaveOps/WaveActiveMax.fp16.test
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -169,7 +169,8 @@ DescriptorSets:
 
 # XFAIL: Clang
 
-
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test
index 9aeba5943..6155e821f 100644
--- a/test/WaveOps/WaveActiveMax.fp32.test
+++ b/test/WaveOps/WaveActiveMax.fp32.test
@@ -168,7 +168,8 @@ DescriptorSets:
 
 # XFAIL: Clang
 
-
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test
index feac35dea..043a9e99a 100644
--- a/test/WaveOps/WaveActiveMax.fp64.test
+++ b/test/WaveOps/WaveActiveMax.fp64.test
@@ -171,6 +171,8 @@ DescriptorSets:
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test
index 81546cc91..4a9cb7c7d 100644
--- a/test/WaveOps/WaveActiveMax.int16.test
+++ b/test/WaveOps/WaveActiveMax.int16.test
@@ -320,7 +320,8 @@ DescriptorSets:
 
 # XFAIL: Clang
 
-
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test
index 4d1204bd0..80b9a88aa 100644
--- a/test/WaveOps/WaveActiveMax.int32.test
+++ b/test/WaveOps/WaveActiveMax.int32.test
@@ -320,7 +320,8 @@ DescriptorSets:
 
 # XFAIL: Clang
 
-
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index e6956cec6..caf7a3b7d 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -320,7 +320,8 @@ DescriptorSets:
 
 # XFAIL: Clang
 
-
+# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# XFAIL: Metal
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl

From 69c5020aeabe9f7e7c1aae70a2d7bf31654b9c8d Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 5 Sep 2025 12:52:38 -0700
Subject: [PATCH 04/11] add XFAIL for warp

---
 test/WaveOps/WaveActiveMax.int64.test | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index caf7a3b7d..f7f769129 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -320,9 +320,12 @@ DescriptorSets:
 
 # XFAIL: Clang
 
-# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# Bug https://github.com/llvm/offload-test-suite/issues/393
 # XFAIL: Metal
 
+# Bug https://github.com/llvm/offload-test-suite/issues/430
+# XFAIL: Warp
+
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
 # RUN: %offloader %t/pipeline.yaml %t.o 

From c27263399294fe456ef1624501c241c7456606b7 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 5 Sep 2025 13:35:56 -0700
Subject: [PATCH 05/11] change case of WARP

---
 test/WaveOps/WaveActiveMax.int64.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index f7f769129..5a3ae612c 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -324,7 +324,7 @@ DescriptorSets:
 # XFAIL: Metal
 
 # Bug https://github.com/llvm/offload-test-suite/issues/430
-# XFAIL: Warp
+# XFAIL: WARP
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl

From 601f2fa6979da2e7ba9132743533f39e1894e877 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 5 Sep 2025 14:08:05 -0700
Subject: [PATCH 06/11] try directx-warp

---
 test/WaveOps/WaveActiveMax.int64.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index 5a3ae612c..60cb7fda2 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -324,7 +324,7 @@ DescriptorSets:
 # XFAIL: Metal
 
 # Bug https://github.com/llvm/offload-test-suite/issues/430
-# XFAIL: WARP
+# XFAIL: DirectX-WARP
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl

From 31759d74c9498c515cb2d85707c5625036f3cb76 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Mon, 8 Sep 2025 10:28:56 -0700
Subject: [PATCH 07/11] add Bug to clang xfails

---
 test/WaveOps/WaveActiveMax.fp16.test  | 3 ++-
 test/WaveOps/WaveActiveMax.fp32.test  | 1 +
 test/WaveOps/WaveActiveMax.int16.test | 1 +
 test/WaveOps/WaveActiveMax.int32.test | 1 +
 test/WaveOps/WaveActiveMax.int64.test | 1 +
 5 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
index 8c2dfc122..a79eb90ca 100644
--- a/test/WaveOps/WaveActiveMax.fp16.test
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -167,9 +167,10 @@ DescriptorSets:
 ...
 #--- end
 
+# Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
-# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# Bug https://github.com/llvm/offload-test-suite/issues/393
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test
index 6155e821f..4b7fcaf7f 100644
--- a/test/WaveOps/WaveActiveMax.fp32.test
+++ b/test/WaveOps/WaveActiveMax.fp32.test
@@ -166,6 +166,7 @@ DescriptorSets:
 ...
 #--- end
 
+# Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
 # Tracked by https://github.com/llvm/offload-test-suite/issues/393
diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test
index 4a9cb7c7d..d10207fdd 100644
--- a/test/WaveOps/WaveActiveMax.int16.test
+++ b/test/WaveOps/WaveActiveMax.int16.test
@@ -318,6 +318,7 @@ DescriptorSets:
 ...
 #--- end
 
+# Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
 # Tracked by https://github.com/llvm/offload-test-suite/issues/393
diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test
index 80b9a88aa..d8b5a9e85 100644
--- a/test/WaveOps/WaveActiveMax.int32.test
+++ b/test/WaveOps/WaveActiveMax.int32.test
@@ -318,6 +318,7 @@ DescriptorSets:
 ...
 #--- end
 
+# Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
 # Tracked by https://github.com/llvm/offload-test-suite/issues/393
diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index 60cb7fda2..ebe252ca9 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -318,6 +318,7 @@ DescriptorSets:
 ...
 #--- end
 
+# Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
 # Bug https://github.com/llvm/offload-test-suite/issues/393

From 302183977b818b547b15cf4cfff585765dfe4f86 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Mon, 8 Sep 2025 16:58:32 -0700
Subject: [PATCH 08/11] add bug instead of tracked by

---
 test/WaveOps/WaveActiveMax.fp64.test  | 2 +-
 test/WaveOps/WaveActiveMax.int16.test | 2 +-
 test/WaveOps/WaveActiveMax.int32.test | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test
index 043a9e99a..50f9ed873 100644
--- a/test/WaveOps/WaveActiveMax.fp64.test
+++ b/test/WaveOps/WaveActiveMax.fp64.test
@@ -171,7 +171,7 @@ DescriptorSets:
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
-# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# Bug https://github.com/llvm/offload-test-suite/issues/393
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test
index d10207fdd..496fb1591 100644
--- a/test/WaveOps/WaveActiveMax.int16.test
+++ b/test/WaveOps/WaveActiveMax.int16.test
@@ -321,7 +321,7 @@ DescriptorSets:
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
-# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# Bug https://github.com/llvm/offload-test-suite/issues/393
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test
index d8b5a9e85..0aa6a43ba 100644
--- a/test/WaveOps/WaveActiveMax.int32.test
+++ b/test/WaveOps/WaveActiveMax.int32.test
@@ -321,7 +321,7 @@ DescriptorSets:
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
-# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# Bug https://github.com/llvm/offload-test-suite/issues/393
 # XFAIL: Metal
 
 # RUN: split-file %s %t

From 3b234aadd2163022d237967bc55885295c2a04bc Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Thu, 25 Sep 2025 17:09:59 -0700
Subject: [PATCH 09/11] make fp16 test more robust, addressing Tex

---
 test/WaveOps/WaveActiveMax.fp16.test | 601 +++++++++++++++++++++++++--
 1 file changed, 555 insertions(+), 46 deletions(-)

diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
index a79eb90ca..08b17a417 100644
--- a/test/WaveOps/WaveActiveMax.fp16.test
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -1,50 +1,43 @@
 #--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 16
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS];
+};
+
 StructuredBuffer<half4> In  : register(t0);
-RWStructuredBuffer<half4> Out1 : register(u1); // test scalar
-RWStructuredBuffer<half4> Out2 : register(u2); // test half2
+RWStructuredBuffer<half> Out1 : register(u1);  // test scalar
+RWStructuredBuffer<half2> Out2 : register(u2); // test half2
 RWStructuredBuffer<half4> Out3 : register(u3); // test half3
 RWStructuredBuffer<half4> Out4 : register(u4); // test half4
 RWStructuredBuffer<half4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6);
+
 
-[numthreads(4,1,1)]
+[numthreads(NUM_THREADS,1,1)]
 void main(uint3 tid : SV_GroupThreadID)
 {
-    half4 v = In[tid.x];
-
-    half s1 = WaveActiveMax( v.x );
-    half s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
-    half s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
-    half s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
-
-    half2 v2_1 = WaveActiveMax( v.xy );
-    half2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : half2(0,0);
-    half2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : half2(0,0);
-    half2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : half2(0,0);
-
-    half3 v3_1 = WaveActiveMax( v.xyz );
-    half3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
-    half3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
-    half3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : half3(0,0,0);
-
-    half4 v4_1 = WaveActiveMax( v );
-    half4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : half4(0,0,0,0);
-    half4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : half4(0,0,0,0);
-    half4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : half4(0,0,0,0);
-
-    half scalars[4] = { s4, s3, s2, s1 };
-    half2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
-    half3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
-    half4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
-
-    Out1[tid.x].x   = scalars[tid.x];
-    Out2[tid.x].xy  = vec2s[tid.x];
-    Out3[tid.x].xyz = vec3s[tid.x];
-    Out4[tid.x]     = vec4s[tid.x];
+    for (int ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { // each value set is a full pass over all masks
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // first slot of this value set
+        for (int MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint SlotIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // [value set][mask][thread]
+            half4 v = In[SlotIdx]; // ValueSetOffset already carries the ValueSet factor; do not scale it again
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask call the wave op and store
+                Out1[SlotIdx] = WaveActiveMax( v.x );
+                Out2[SlotIdx].xy = WaveActiveMax( v.xy );
+                Out3[SlotIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[SlotIdx] = WaveActiveMax( v );
+            }
+        }
+    }
 
     // constant folding case
     Out5[0] = WaveActiveMax(half4(1,2,3,4));
 }
 
+
 //--- pipeline.yaml
 
 ---
@@ -56,44 +49,553 @@ Buffers:
   - Name: In
     Format: Float16
     Stride: 8
-    # 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000
-    Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 16 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    0x2000, 0x2200, 0x2400, 0x2800, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    0x2A00, 0x2C00, 0x2E00, 0x3000, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    0x3200, 0x3400, 0x3600, 0x3800,
+    0x3900, 0x3A00, 0x3B00, 0x3BC0,
+    0x2200, 0x2400, 0x2800, 0x2A00, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    0x2C00, 0x2E00, 0x3000, 0x3200,
+    0x3400, 0x3600, 0x3800, 0x3900,
+    0x3A00, 0x3B00, 0x3BC0, 0x2000,
+    0x2400, 0x2800, 0x2A00, 0x2C00,
+    0x2E00, 0x3000, 0x3200, 0x3400,
+    0x3600, 0x3800, 0x3900, 0x3A00,
+    0x3B00, 0x3BC0, 0x2000, 0x2200,
+    0x2800, 0x2A00, 0x2C00, 0x2E00,
+    0x3000, 0x3200, 0x3400, 0x3600,
+    0x3800, 0x3900, 0x3A00, 0x3B00,
+    0x3BC0, 0x2000, 0x2200, 0x2400,
+    0x2A00, 0x2C00, 0x2E00, 0x3000,
+    0x3200, 0x3400, 0x3600, 0x3800,
+    0x3900, 0x3A00, 0x3B00, 0x3BC0,
+    0x2000, 0x2200, 0x2400, 0x2800,
+    0x2C00, 0x2E00, 0x3000, 0x3200,
+    0x3400, 0x3600, 0x3800, 0x3900,
+    0x3A00, 0x3B00, 0x3BC0, 0x2000,
+    0x2200, 0x2400, 0x2800, 0x2A00,
+    0x2E00, 0x3000, 0x3200, 0x3400,
+    0x3600, 0x3800, 0x3900, 0x3A00,
+    0x3B00, 0x3BC0, 0x2000, 0x2200,
+    0x2400, 0x2800, 0x2A00, 0x2C00,
+    0x3000, 0x3200, 0x3400, 0x3600,
+    0x3800, 0x3900, 0x3A00, 0x3B00,
+    0x3BC0, 0x2000, 0x2200, 0x2400,
+    0x2800, 0x2A00, 0x2C00, 0x2E00,
+    0x3200, 0x3400, 0x3600, 0x3800,
+    0x3900, 0x3A00, 0x3B00, 0x3BC0,
+    0x2000, 0x2200, 0x2400, 0x2800,
+    0x2A00, 0x2C00, 0x2E00, 0x3000,
+    0x3400, 0x3600, 0x3800, 0x3900,
+    0x3A00, 0x3B00, 0x3BC0, 0x2000,
+    0x2200, 0x2400, 0x2800, 0x2A00,
+    0x2C00, 0x2E00, 0x3000, 0x3200,
+    0x3600, 0x3800, 0x3900, 0x3A00,
+    0x3B00, 0x3BC0, 0x2000, 0x2200,
+    0x2400, 0x2800, 0x2A00, 0x2C00,
+    0x2E00, 0x3000, 0x3200, 0x3400,
+    0x3800, 0x3900, 0x3A00, 0x3B00,
+    0x3BC0, 0x2000, 0x2200, 0x2400,
+    0x2800, 0x2A00, 0x2C00, 0x2E00,
+    0x3000, 0x3200, 0x3400, 0x3600,
+    0x3900, 0x3A00, 0x3B00, 0x3BC0,
+    0x2000, 0x2200, 0x2400, 0x2800,
+    0x2A00, 0x2C00, 0x2E00, 0x3000,
+    0x3200, 0x3400, 0x3600, 0x3800,
+    0x3A00, 0x3B00, 0x3BC0, 0x2000,
+    0x2200, 0x2400, 0x2800, 0x2A00,
+    0x2C00, 0x2E00, 0x3000, 0x3200,
+    0x3400, 0x3600, 0x3800, 0x3900,
+    0x3B00, 0x3BC0, 0x2000, 0x2200,
+    0x2400, 0x2800, 0x2A00, 0x2C00,
+    0x2E00, 0x3000, 0x3200, 0x3400,
+    0x3600, 0x3800, 0x3900, 0x3A00,
+    0x3BC0, 0x2000, 0x2200, 0x2400,
+    0x2800, 0x2A00, 0x2C00, 0x2E00,
+    0x3000, 0x3200, 0x3400, 0x3600,
+    0x3800, 0x3900, 0x3A00, 0x3B00,
+    0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    0x3000, 0x2E00, 0x2C00, 0x2A00, 
+    0x3800, 0x3600, 0x3400, 0x3200, 
+    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
+    0x2A00, 0x2800, 0x2400, 0x2200, 
+    0x3200, 0x3000, 0x2E00, 0x2C00, 
+    0x3900, 0x3800, 0x3600, 0x3400, 
+    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
+    0x2C00, 0x2A00, 0x2800, 0x2400, 
+    0x3400, 0x3200, 0x3000, 0x2E00, 
+    0x3A00, 0x3900, 0x3800, 0x3600, 
+    0x2200, 0x2000, 0x3BC0, 0x3B00, 
+    0x2E00, 0x2C00, 0x2A00, 0x2800, 
+    0x3600, 0x3400, 0x3200, 0x3000, 
+    0x3B00, 0x3A00, 0x3900, 0x3800, 
+    0x2400, 0x2200, 0x2000, 0x3BC0, 
+    0x3000, 0x2E00, 0x2C00, 0x2A00, 
+    0x3800, 0x3600, 0x3400, 0x3200, 
+    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
+    0x2800, 0x2400, 0x2200, 0x2000, 
+    0x3200, 0x3000, 0x2E00, 0x2C00, 
+    0x3900, 0x3800, 0x3600, 0x3400, 
+    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
+    0x2A00, 0x2800, 0x2400, 0x2200, 
+    0x3400, 0x3200, 0x3000, 0x2E00, 
+    0x3A00, 0x3900, 0x3800, 0x3600, 
+    0x2200, 0x2000, 0x3BC0, 0x3B00, 
+    0x2C00, 0x2A00, 0x2800, 0x2400, 
+    0x3600, 0x3400, 0x3200, 0x3000, 
+    0x3B00, 0x3A00, 0x3900, 0x3800, 
+    0x2400, 0x2200, 0x2000, 0x3BC0, 
+    0x2E00, 0x2C00, 0x2A00, 0x2800, 
+    0x3800, 0x3600, 0x3400, 0x3200, 
+    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
+    0x2800, 0x2400, 0x2200, 0x2000, 
+    0x3000, 0x2E00, 0x2C00, 0x2A00, 
+    0x3900, 0x3800, 0x3600, 0x3400, 
+    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
+    0x2A00, 0x2800, 0x2400, 0x2200, 
+    0x3200, 0x3000, 0x2E00, 0x2C00, 
+    0x3A00, 0x3900, 0x3800, 0x3600, 
+    0x2200, 0x2000, 0x3BC0, 0x3B00, 
+    0x2C00, 0x2A00, 0x2800, 0x2400, 
+    0x3400, 0x3200, 0x3000, 0x2E00, 
+    0x3B00, 0x3A00, 0x3900, 0x3800, 
+    0x2400, 0x2200, 0x2000, 0x3BC0, 
+    0x2E00, 0x2C00, 0x2A00, 0x2800, 
+    0x3600, 0x3400, 0x3200, 0x3000, 
+    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
+    0x2800, 0x2400, 0x2200, 0x2000, 
+    0x3000, 0x2E00, 0x2C00, 0x2A00, 
+    0x3800, 0x3600, 0x3400, 0x3200, 
+    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
+    0x2A00, 0x2800, 0x2400, 0x2200, 
+    0x3200, 0x3000, 0x2E00, 0x2C00, 
+    0x3900, 0x3800, 0x3600, 0x3400, 
+    0x2200, 0x2000, 0x3BC0, 0x3B00, 
+    0x2C00, 0x2A00, 0x2800, 0x2400, 
+    0x3400, 0x3200, 0x3000, 0x2E00, 
+    0x3A00, 0x3900, 0x3800, 0x3600, 
+    0x2400, 0x2200, 0x2000, 0x3BC0, 
+    0x2E00, 0x2C00, 0x2A00, 0x2800, 
+    0x3600, 0x3400, 0x3200, 0x3000, 
+    0x3B00, 0x3A00, 0x3900, 0x3800 ]
+
   - Name: Out1
     Format: Float16
-    Stride: 8
-    ZeroInitSize: 32  
+    Stride: 2
+    # 1 half is 2 bytes, * 4 halves for 4 threads, * 16 thread masks, * 2 value sets
+    ZeroInitSize: 256  
   - Name: Out2
     Format: Float16
-    Stride: 8
-    ZeroInitSize: 32
+    Stride: 4
+    ZeroInitSize: 512
   - Name: Out3
     Format: Float16
     Stride: 8
-    ZeroInitSize: 32
+    ZeroInitSize: 1024
   - Name: Out4
     Format: Float16
     Stride: 8
-    ZeroInitSize: 32
+    ZeroInitSize: 1024
   - Name: Out5
     Format: Float16
     Stride: 8
     ZeroInitSize: 8
+  - Name: Masks
+    Format: Int32
+    Stride: 8
+    # 16 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 0 0 0 1
+    # 0 0 1 0
+    # 0 0 1 1
+    # 0 1 0 0
+    # 0 1 0 1
+    # 0 1 1 0
+    # 0 1 1 1
+    # 1 0 0 0
+    # 1 0 0 1
+    # 1 0 1 0
+    # 1 0 1 1
+    # 1 1 0 0
+    # 1 1 0 1
+    # 1 1 1 0
+    # 1 1 1 1
+    Data: [ 
+    0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1]
   - Name: ExpectedOut1
     Format: Float16
     Stride: 8
-    Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4200, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0 ]
+    # 2 value sets, 16 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0x0, 0x0, 0x0, 0x0, 
+             0x0, 0x0, 0x0, 0x3A00, 
+             0x0, 0x0, 0x3600, 0x0, 
+             0x0, 0x0, 0x3BC0, 0x3BC0, 
+             0x0, 0x3200, 0x0, 0x0, 
+             0x0, 0x3400, 0x0, 0x3400, 
+             0x0, 0x3B00, 0x3B00, 0x0, 
+             0x0, 0x3BC0, 0x3BC0, 0x3BC0, 
+             0x3200, 0x0, 0x0, 0x0, 
+             0x3400, 0x0, 0x0, 0x3400, 
+             0x3600, 0x0, 0x3600, 0x0,
+             0x3800, 0x0, 0x3800, 0x3800, 
+             0x3900, 0x3900, 0x0, 0x0,
+             0x3A00, 0x3A00, 0x0, 0x3A00, 
+             0x3B00, 0x3B00, 0x3B00, 0x0,
+             0x3BC0, 0x3BC0, 0x3BC0, 0x3BC0, 
+             0x0, 0x0, 0x0, 0x0,
+             0x0, 0x0, 0x0, 0x2000, 
+             0x0, 0x0, 0x3A00, 0x0, 
+             0x0, 0x0, 0x3B00, 0x3B00, 
+             0x0, 0x3800, 0x0, 0x0, 
+             0x0, 0x3900, 0x0, 0x3900, 
+             0x0, 0x3A00, 0x3A00, 0x0, 
+             0x0, 0x3B00, 0x3B00, 0x3B00, 
+             0x3800, 0x0, 0x0, 0x0, 
+             0x3900, 0x0, 0x0, 0x3900, 
+             0x3A00, 0x0, 0x3A00, 0x0, 
+             0x3B00, 0x0, 0x3B00, 0x3B00, 
+             0x3BC0, 0x3BC0, 0x0, 0x0, 
+             0x3900, 0x3900, 0x0, 0x3900, 
+             0x3400, 0x3400, 0x3400, 0x0, 
+             0x3B00, 0x3B00, 0x3B00, 0x3B00 ]
   - Name: ExpectedOut2
     Format: Float16
     Stride: 8
-    Data: [ 0x3c00, 0x4900, 0x0, 0x0, 0x4000, 0x4d00, 0x0, 0x0, 0x4200, 0x4f80, 0x0, 0x0, 0x4400, 0x5100, 0x0, 0x0 ]
+    # 2 value sets, 16 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x3A00, 0x3B00, 
+            0x0, 0x0, 0x0, 0x0,
+            0x3600, 0x3800, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3BC0, 0x3900,
+            0x0, 0x0, 0x3200, 0x3400,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x3400, 0x3600,
+            0x0, 0x0, 0x3400, 0x3600,
+            0x0, 0x0, 0x3B00, 0x3BC0,
+            0x3B00, 0x3BC0, 0x0, 0x0,
+            0x0, 0x0, 0x3BC0, 0x3900,
+            0x3BC0, 0x3900, 0x3BC0, 0x3900,
+            0x3200, 0x3400, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x0, 0x0,
+            0x0, 0x0, 0x3400, 0x3600,
+            0x3600, 0x3800, 0x0, 0x0,
+            0x3600, 0x3800, 0x0, 0x0,
+            0x3800, 0x3900, 0x0, 0x0,
+            0x3800, 0x3900, 0x3800, 0x3900,
+            0x3900, 0x3A00, 0x3900, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3A00, 0x3B00,
+            0x0, 0x0, 0x3A00, 0x3B00,
+            0x3B00, 0x3BC0, 0x3B00, 0x3BC0,
+            0x3B00, 0x3BC0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3BC0, 0x3900,
+            0x3BC0, 0x3900, 0x3BC0, 0x3900,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x2000, 0x3BC0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3B00, 0x3A00,
+            0x0, 0x0, 0x3800, 0x3600,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x3900, 0x3800,
+            0x0, 0x0, 0x3900, 0x3800,
+            0x0, 0x0, 0x3A00, 0x3900,
+            0x3A00, 0x3900, 0x0, 0x0,
+            0x0, 0x0, 0x3B00, 0x3A00,
+            0x3B00, 0x3A00, 0x3B00, 0x3A00,
+            0x3800, 0x3600, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x0, 0x0,
+            0x0, 0x0, 0x3900, 0x3800,
+            0x3A00, 0x3900, 0x0, 0x0,
+            0x3A00, 0x3900, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3B00, 0x3A00,
+            0x3BC0, 0x3B00, 0x3BC0, 0x3B00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3BC0, 0x3900, 0x3BC0,
+            0x0, 0x0, 0x3900, 0x3BC0,
+            0x3400, 0x3200, 0x3400, 0x3200,
+            0x3400, 0x3200, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3B00, 0x3A00,
+            0x3B00, 0x3A00, 0x3B00, 0x3A00 ]
   - Name: ExpectedOut3
     Format: Float16
     Stride: 8
-    Data: [ 0x3c00, 0x4900, 0x5640, 0x0, 0x4000, 0x4d00, 0x5a40, 0x0, 0x4200, 0x4f80, 0x5cb0, 0x0, 0x4400, 0x5100, 0x5e40, 0x0 ]
+    # 2 value sets, 16 masks per value set, 4 threads per mask, 4 stored values per thread
+    # Note: vecs of 3 must be aligned, so each thread's 3 result values are stored in a 4-element vec whose last element stays 0x0
+    Data: [ 0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3600, 0x3800, 0x3900, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3200, 0x3400, 0x3600, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3BC0, 0x3900, 0x0,
+            0x3B00, 0x3BC0, 0x3900, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3200, 0x3400, 0x3600, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x0,
+            0x3600, 0x3800, 0x3900, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3600, 0x3800, 0x3900, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3800, 0x3900, 0x3A00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3800, 0x3900, 0x3A00, 0x0,
+            0x3800, 0x3900, 0x3A00, 0x0,
+            0x3900, 0x3A00, 0x3B00, 0x0,
+            0x3900, 0x3A00, 0x3B00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x0,
+            0x3B00, 0x3BC0, 0x3200, 0x0,
+            0x3B00, 0x3BC0, 0x3200, 0x0,
+            0x3B00, 0x3BC0, 0x3200, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x2000, 0x3BC0, 0x3B00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x3800, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3800, 0x3600, 0x3400, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x3BC0, 0x0,
+            0x3A00, 0x3900, 0x3BC0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3800, 0x3600, 0x3400, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x0,
+            0x3A00, 0x3900, 0x3800, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x3800, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3BC0, 0x3B00, 0x3A00, 0x0,
+            0x3BC0, 0x3B00, 0x3A00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3BC0, 0x3B00, 0x0,
+            0x3900, 0x3BC0, 0x3B00, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3BC0, 0x3B00, 0x0,
+            0x3400, 0x3200, 0x3BC0, 0x0,
+            0x3400, 0x3200, 0x3BC0, 0x0,
+            0x3400, 0x3200, 0x3BC0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x0 ]
   - Name: ExpectedOut4
     Format: Float16
     Stride: 8
-    Data: [ 0x3c00, 0x4900, 0x5640, 0x63d0, 0x4000, 0x4d00, 0x5a40, 0x67d0, 0x4200, 0x4f80, 0x5cb0, 0x69dc, 0x4400, 0x5100, 0x5e40, 0x6bd0 ]
+    Data: [ 0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x2000,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3600, 0x3800, 0x3900, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3200, 0x3400, 0x3600, 0x3800,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x3900,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x3900,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3BC0, 0x3900, 0x3A00,
+            0x3B00, 0x3BC0, 0x3900, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3200, 0x3400, 0x3600, 0x3800,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x3900,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3400, 0x3600, 0x3800, 0x3900,
+            0x3600, 0x3800, 0x3900, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3600, 0x3800, 0x3900, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3800, 0x3900, 0x3A00, 0x3B00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3800, 0x3900, 0x3A00, 0x3B00,
+            0x3800, 0x3900, 0x3A00, 0x3B00,
+            0x3900, 0x3A00, 0x3B00, 0x3BC0,
+            0x3900, 0x3A00, 0x3B00, 0x3BC0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x3900,
+            0x3A00, 0x3B00, 0x3BC0, 0x3900,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x3900,
+            0x3B00, 0x3BC0, 0x3200, 0x3400,
+            0x3B00, 0x3BC0, 0x3200, 0x3400,
+            0x3B00, 0x3BC0, 0x3200, 0x3400,
+            0x0, 0x0, 0x0, 0x0,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x3BC0, 0x3900, 0x3A00, 0x3B00,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x2000, 0x3BC0, 0x3B00, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x3800, 0x3600,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3800, 0x3600, 0x3400, 0x3200,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x3400,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x3400,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x3BC0, 0x3B00,
+            0x3A00, 0x3900, 0x3BC0, 0x3B00,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3800, 0x3600, 0x3400, 0x3200,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x3400,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3800, 0x3600, 0x3400,
+            0x3A00, 0x3900, 0x3800, 0x3600,
+            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3900, 0x3800, 0x3600,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x3800,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x3800,
+            0x3B00, 0x3A00, 0x3900, 0x3800,
+            0x3BC0, 0x3B00, 0x3A00, 0x3900,
+            0x3BC0, 0x3B00, 0x3A00, 0x3900,
+            0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3BC0, 0x3B00, 0x3A00,
+            0x3900, 0x3BC0, 0x3B00, 0x3A00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3900, 0x3BC0, 0x3B00, 0x3A00,
+            0x3400, 0x3200, 0x3BC0, 0x3B00,
+            0x3400, 0x3200, 0x3BC0, 0x3B00,
+            0x3400, 0x3200, 0x3BC0, 0x3B00,
+            0x0, 0x0, 0x0, 0x0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0,
+            0x3B00, 0x3A00, 0x3900, 0x3BC0 ]
   - Name: ExpectedOut5
     Format: Float16
     Stride: 8
@@ -163,6 +665,13 @@ DescriptorSets:
         Space: 0
       VulkanBinding:
         Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
 
 ...
 #--- end

From ab089ec96cd72bf74c32b3269a5cf0de2525b8be Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 26 Sep 2025 13:54:41 -0700
Subject: [PATCH 10/11] use proper filter syntax

---
 test/WaveOps/WaveActiveMax.int64.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index ebe252ca9..418727387 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -325,7 +325,7 @@ DescriptorSets:
 # XFAIL: Metal
 
 # Bug https://github.com/llvm/offload-test-suite/issues/430
-# XFAIL: DirectX-WARP
+# XFAIL: DirectX && WARP
 
 # RUN: split-file %s %t
 # RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl

From 0bf8a4337dcb1d394bae66449067875848fe72a5 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Fri, 3 Oct 2025 18:24:29 -0700
Subject: [PATCH 11/11] simplify to 4 masks and apply changes to all tests

---
 test/WaveOps/WaveActiveMax.fp16.test  | 560 +++++---------------------
 test/WaveOps/WaveActiveMax.fp32.test  | 236 ++++++++---
 test/WaveOps/WaveActiveMax.fp64.test  | 250 +++++++++---
 test/WaveOps/WaveActiveMax.int16.test | 368 ++++++++---------
 test/WaveOps/WaveActiveMax.int32.test | 370 ++++++++---------
 test/WaveOps/WaveActiveMax.int64.test | 372 +++++++++--------
 6 files changed, 1009 insertions(+), 1147 deletions(-)

diff --git a/test/WaveOps/WaveActiveMax.fp16.test b/test/WaveOps/WaveActiveMax.fp16.test
index 08b17a417..80a1cda8a 100644
--- a/test/WaveOps/WaveActiveMax.fp16.test
+++ b/test/WaveOps/WaveActiveMax.fp16.test
@@ -1,6 +1,6 @@
 #--- source.hlsl
 #define VALUE_SETS 2
-#define NUM_MASKS 16
+#define NUM_MASKS 4
 #define NUM_THREADS 4
 
 struct MaskStruct {
@@ -51,7 +51,7 @@ Buffers:
     Stride: 8
     # 2 value sets
     # For each value set, 
-    # and for each specific one of the 16 thread masks in that value set,
+    # and for each specific one of the 4 thread masks in that value set,
     # and for each of the 4 threads in that thread mask,
     # there will be a unique set of 4 values, such that 
     # none of the other threads in that thread mask share any values
@@ -72,54 +72,6 @@ Buffers:
     0x3000, 0x3200, 0x3400, 0x3600,
     0x3800, 0x3900, 0x3A00, 0x3B00,
     0x3BC0, 0x2000, 0x2200, 0x2400,
-    0x2A00, 0x2C00, 0x2E00, 0x3000,
-    0x3200, 0x3400, 0x3600, 0x3800,
-    0x3900, 0x3A00, 0x3B00, 0x3BC0,
-    0x2000, 0x2200, 0x2400, 0x2800,
-    0x2C00, 0x2E00, 0x3000, 0x3200,
-    0x3400, 0x3600, 0x3800, 0x3900,
-    0x3A00, 0x3B00, 0x3BC0, 0x2000,
-    0x2200, 0x2400, 0x2800, 0x2A00,
-    0x2E00, 0x3000, 0x3200, 0x3400,
-    0x3600, 0x3800, 0x3900, 0x3A00,
-    0x3B00, 0x3BC0, 0x2000, 0x2200,
-    0x2400, 0x2800, 0x2A00, 0x2C00,
-    0x3000, 0x3200, 0x3400, 0x3600,
-    0x3800, 0x3900, 0x3A00, 0x3B00,
-    0x3BC0, 0x2000, 0x2200, 0x2400,
-    0x2800, 0x2A00, 0x2C00, 0x2E00,
-    0x3200, 0x3400, 0x3600, 0x3800,
-    0x3900, 0x3A00, 0x3B00, 0x3BC0,
-    0x2000, 0x2200, 0x2400, 0x2800,
-    0x2A00, 0x2C00, 0x2E00, 0x3000,
-    0x3400, 0x3600, 0x3800, 0x3900,
-    0x3A00, 0x3B00, 0x3BC0, 0x2000,
-    0x2200, 0x2400, 0x2800, 0x2A00,
-    0x2C00, 0x2E00, 0x3000, 0x3200,
-    0x3600, 0x3800, 0x3900, 0x3A00,
-    0x3B00, 0x3BC0, 0x2000, 0x2200,
-    0x2400, 0x2800, 0x2A00, 0x2C00,
-    0x2E00, 0x3000, 0x3200, 0x3400,
-    0x3800, 0x3900, 0x3A00, 0x3B00,
-    0x3BC0, 0x2000, 0x2200, 0x2400,
-    0x2800, 0x2A00, 0x2C00, 0x2E00,
-    0x3000, 0x3200, 0x3400, 0x3600,
-    0x3900, 0x3A00, 0x3B00, 0x3BC0,
-    0x2000, 0x2200, 0x2400, 0x2800,
-    0x2A00, 0x2C00, 0x2E00, 0x3000,
-    0x3200, 0x3400, 0x3600, 0x3800,
-    0x3A00, 0x3B00, 0x3BC0, 0x2000,
-    0x2200, 0x2400, 0x2800, 0x2A00,
-    0x2C00, 0x2E00, 0x3000, 0x3200,
-    0x3400, 0x3600, 0x3800, 0x3900,
-    0x3B00, 0x3BC0, 0x2000, 0x2200,
-    0x2400, 0x2800, 0x2A00, 0x2C00,
-    0x2E00, 0x3000, 0x3200, 0x3400,
-    0x3600, 0x3800, 0x3900, 0x3A00,
-    0x3BC0, 0x2000, 0x2200, 0x2400,
-    0x2800, 0x2A00, 0x2C00, 0x2E00,
-    0x3000, 0x3200, 0x3400, 0x3600,
-    0x3800, 0x3900, 0x3A00, 0x3B00,
     0x2800, 0x2400, 0x2200, 0x2000, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
     0x3000, 0x2E00, 0x2C00, 0x2A00, 
     0x3800, 0x3600, 0x3400, 0x3200, 
@@ -135,467 +87,143 @@ Buffers:
     0x2E00, 0x2C00, 0x2A00, 0x2800, 
     0x3600, 0x3400, 0x3200, 0x3000, 
     0x3B00, 0x3A00, 0x3900, 0x3800, 
-    0x2400, 0x2200, 0x2000, 0x3BC0, 
-    0x3000, 0x2E00, 0x2C00, 0x2A00, 
-    0x3800, 0x3600, 0x3400, 0x3200, 
-    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
-    0x2800, 0x2400, 0x2200, 0x2000, 
-    0x3200, 0x3000, 0x2E00, 0x2C00, 
-    0x3900, 0x3800, 0x3600, 0x3400, 
-    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
-    0x2A00, 0x2800, 0x2400, 0x2200, 
-    0x3400, 0x3200, 0x3000, 0x2E00, 
-    0x3A00, 0x3900, 0x3800, 0x3600, 
-    0x2200, 0x2000, 0x3BC0, 0x3B00, 
-    0x2C00, 0x2A00, 0x2800, 0x2400, 
-    0x3600, 0x3400, 0x3200, 0x3000, 
-    0x3B00, 0x3A00, 0x3900, 0x3800, 
-    0x2400, 0x2200, 0x2000, 0x3BC0, 
-    0x2E00, 0x2C00, 0x2A00, 0x2800, 
-    0x3800, 0x3600, 0x3400, 0x3200, 
-    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
-    0x2800, 0x2400, 0x2200, 0x2000, 
-    0x3000, 0x2E00, 0x2C00, 0x2A00, 
-    0x3900, 0x3800, 0x3600, 0x3400, 
-    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
-    0x2A00, 0x2800, 0x2400, 0x2200, 
-    0x3200, 0x3000, 0x2E00, 0x2C00, 
-    0x3A00, 0x3900, 0x3800, 0x3600, 
-    0x2200, 0x2000, 0x3BC0, 0x3B00, 
-    0x2C00, 0x2A00, 0x2800, 0x2400, 
-    0x3400, 0x3200, 0x3000, 0x2E00, 
-    0x3B00, 0x3A00, 0x3900, 0x3800, 
-    0x2400, 0x2200, 0x2000, 0x3BC0, 
-    0x2E00, 0x2C00, 0x2A00, 0x2800, 
-    0x3600, 0x3400, 0x3200, 0x3000, 
-    0x3BC0, 0x3B00, 0x3A00, 0x3900, 
-    0x2800, 0x2400, 0x2200, 0x2000, 
-    0x3000, 0x2E00, 0x2C00, 0x2A00, 
-    0x3800, 0x3600, 0x3400, 0x3200, 
-    0x2000, 0x3BC0, 0x3B00, 0x3A00, 
-    0x2A00, 0x2800, 0x2400, 0x2200, 
-    0x3200, 0x3000, 0x2E00, 0x2C00, 
-    0x3900, 0x3800, 0x3600, 0x3400, 
-    0x2200, 0x2000, 0x3BC0, 0x3B00, 
-    0x2C00, 0x2A00, 0x2800, 0x2400, 
-    0x3400, 0x3200, 0x3000, 0x2E00, 
-    0x3A00, 0x3900, 0x3800, 0x3600, 
-    0x2400, 0x2200, 0x2000, 0x3BC0, 
-    0x2E00, 0x2C00, 0x2A00, 0x2800, 
-    0x3600, 0x3400, 0x3200, 0x3000, 
-    0x3B00, 0x3A00, 0x3900, 0x3800 ]
+    0x2400, 0x2200, 0x2000, 0x3BC0 ]
 
   - Name: Out1
     Format: Float16
     Stride: 2
-    # 1 half is 2 bytes, * 4 halves for 4 threads, * 16 thread masks, * 2 value sets
-    ZeroInitSize: 256  
+    # 1 half is 2 bytes, * 4 halves for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 64  
   - Name: Out2
     Format: Float16
     Stride: 4
-    ZeroInitSize: 512
+    ZeroInitSize: 128
   - Name: Out3
     Format: Float16
     Stride: 8
-    ZeroInitSize: 1024
+    ZeroInitSize: 256
   - Name: Out4
     Format: Float16
     Stride: 8
-    ZeroInitSize: 1024
+    ZeroInitSize: 256
   - Name: Out5
     Format: Float16
     Stride: 8
     ZeroInitSize: 8
   - Name: Masks
     Format: Int32
-    Stride: 8
-    # 16 active mask sets for threads 0, 1, 2, 3:
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
     # 0 0 0 0
-    # 0 0 0 1
-    # 0 0 1 0
-    # 0 0 1 1
-    # 0 1 0 0
-    # 0 1 0 1
-    # 0 1 1 0
-    # 0 1 1 1
+    # 1 1 1 1    
     # 1 0 0 0
-    # 1 0 0 1
-    # 1 0 1 0
-    # 1 0 1 1
-    # 1 1 0 0
-    # 1 1 0 1
-    # 1 1 1 0
-    # 1 1 1 1
+    # 0 1 1 0
     Data: [ 
-    0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1]
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
   - Name: ExpectedOut1
     Format: Float16
     Stride: 8
-    # 2 value sets, 16 masks per value set, 4 threads per mask, 1 result value per thread
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
     Data:  [ 0x0, 0x0, 0x0, 0x0, 
-             0x0, 0x0, 0x0, 0x3A00, 
-             0x0, 0x0, 0x3600, 0x0, 
-             0x0, 0x0, 0x3BC0, 0x3BC0, 
-             0x0, 0x3200, 0x0, 0x0, 
-             0x0, 0x3400, 0x0, 0x3400, 
-             0x0, 0x3B00, 0x3B00, 0x0, 
-             0x0, 0x3BC0, 0x3BC0, 0x3BC0, 
-             0x3200, 0x0, 0x0, 0x0, 
-             0x3400, 0x0, 0x0, 0x3400, 
-             0x3600, 0x0, 0x3600, 0x0,
-             0x3800, 0x0, 0x3800, 0x3800, 
-             0x3900, 0x3900, 0x0, 0x0,
-             0x3A00, 0x3A00, 0x0, 0x3A00, 
-             0x3B00, 0x3B00, 0x3B00, 0x0,
-             0x3BC0, 0x3BC0, 0x3BC0, 0x3BC0, 
-             0x0, 0x0, 0x0, 0x0,
-             0x0, 0x0, 0x0, 0x2000, 
-             0x0, 0x0, 0x3A00, 0x0, 
-             0x0, 0x0, 0x3B00, 0x3B00, 
-             0x0, 0x3800, 0x0, 0x0, 
-             0x0, 0x3900, 0x0, 0x3900, 
-             0x0, 0x3A00, 0x3A00, 0x0, 
-             0x0, 0x3B00, 0x3B00, 0x3B00, 
-             0x3800, 0x0, 0x0, 0x0, 
-             0x3900, 0x0, 0x0, 0x3900, 
-             0x3A00, 0x0, 0x3A00, 0x0, 
-             0x3B00, 0x0, 0x3B00, 0x3B00, 
-             0x3BC0, 0x3BC0, 0x0, 0x0, 
-             0x3900, 0x3900, 0x0, 0x3900, 
-             0x3400, 0x3400, 0x3400, 0x0, 
-             0x3B00, 0x3B00, 0x3B00, 0x3B00 ]
+    0x3A00, 0x3A00, 0x3A00, 0x3A00,
+    0x2400, 0x0, 0x0, 0x0, 
+    0x0, 0x3800, 0x3800, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x3900, 0x3900, 0x3900, 0x3900, 
+    0x2C00, 0x0, 0x0, 0x0, 
+    0x0, 0x3B00, 0x3B00, 0x0 ]
   - Name: ExpectedOut2
     Format: Float16
     Stride: 8
-    # 2 value sets, 16 masks per value set, 4 threads per mask, 1 result value per thread
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
     Data: [ 0x0, 0x0, 0x0, 0x0, 
-            0x0, 0x0, 0x0, 0x0, 
-            0x0, 0x0, 0x0, 0x0, 
-            0x0, 0x0, 0x3A00, 0x3B00, 
-            0x0, 0x0, 0x0, 0x0,
-            0x3600, 0x3800, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3BC0, 0x3900,
-            0x0, 0x0, 0x3200, 0x3400,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x3400, 0x3600,
-            0x0, 0x0, 0x3400, 0x3600,
-            0x0, 0x0, 0x3B00, 0x3BC0,
-            0x3B00, 0x3BC0, 0x0, 0x0,
-            0x0, 0x0, 0x3BC0, 0x3900,
-            0x3BC0, 0x3900, 0x3BC0, 0x3900,
-            0x3200, 0x3400, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x0, 0x0,
-            0x0, 0x0, 0x3400, 0x3600,
-            0x3600, 0x3800, 0x0, 0x0,
-            0x3600, 0x3800, 0x0, 0x0,
-            0x3800, 0x3900, 0x0, 0x0,
-            0x3800, 0x3900, 0x3800, 0x3900,
-            0x3900, 0x3A00, 0x3900, 0x3A00,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3B00, 0x3A00, 0x3B00,
-            0x0, 0x0, 0x3A00, 0x3B00,
-            0x3B00, 0x3BC0, 0x3B00, 0x3BC0,
-            0x3B00, 0x3BC0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3BC0, 0x3900,
-            0x3BC0, 0x3900, 0x3BC0, 0x3900,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x2000, 0x3BC0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3B00, 0x3A00,
-            0x0, 0x0, 0x3800, 0x3600,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x3900, 0x3800,
-            0x0, 0x0, 0x3900, 0x3800,
-            0x0, 0x0, 0x3A00, 0x3900,
-            0x3A00, 0x3900, 0x0, 0x0,
-            0x0, 0x0, 0x3B00, 0x3A00,
-            0x3B00, 0x3A00, 0x3B00, 0x3A00,
-            0x3800, 0x3600, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x0, 0x0,
-            0x0, 0x0, 0x3900, 0x3800,
-            0x3A00, 0x3900, 0x0, 0x0,
-            0x3A00, 0x3900, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3B00, 0x3A00,
-            0x3BC0, 0x3B00, 0x3BC0, 0x3B00,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3BC0, 0x3900, 0x3BC0,
-            0x0, 0x0, 0x3900, 0x3BC0,
-            0x3400, 0x3200, 0x3400, 0x3200,
-            0x3400, 0x3200, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3B00, 0x3A00,
-            0x3B00, 0x3A00, 0x3B00, 0x3A00 ]
+    0x0, 0x0, 0x0, 0x0, 
+    0x3A00, 0x3B00, 0x3A00, 0x3B00, 
+    0x3A00, 0x3B00, 0x3A00, 0x3B00, 
+    0x2400, 0x2800, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x0, 0x0, 0x3800, 0x3900, 
+    0x3800, 0x3900, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 
+    0x3900, 0x3BC0, 0x3900, 0x3BC0,
+    0x3900, 0x3BC0, 0x3900, 0x3BC0, 
+    0x2C00, 0x2A00, 0x0, 0x0, 
+    0x0, 0x0, 0x0, 0x0, 
+    0x0, 0x0, 0x3B00, 0x3A00, 
+    0x3B00, 0x3A00, 0x0, 0x0 ]
   - Name: ExpectedOut3
     Format: Float16
     Stride: 8
-    # 2 value sets, 16 masks per value set, 4 threads per mask, 4 result values per thread
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
     # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
-    Data: [ 0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3B00, 0x3BC0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3600, 0x3800, 0x3900, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3200, 0x3400, 0x3600, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3BC0, 0x3900, 0x0,
-            0x3B00, 0x3BC0, 0x3900, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3200, 0x3400, 0x3600, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x0,
-            0x3600, 0x3800, 0x3900, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3600, 0x3800, 0x3900, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3800, 0x3900, 0x3A00, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3800, 0x3900, 0x3A00, 0x0,
-            0x3800, 0x3900, 0x3A00, 0x0,
-            0x3900, 0x3A00, 0x3B00, 0x0,
-            0x3900, 0x3A00, 0x3B00, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3B00, 0x3BC0, 0x0,
-            0x3A00, 0x3B00, 0x3BC0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
+    Data: [ 0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
             0x3A00, 0x3B00, 0x3BC0, 0x0,
-            0x3B00, 0x3BC0, 0x3200, 0x0,
-            0x3B00, 0x3BC0, 0x3200, 0x0,
-            0x3B00, 0x3BC0, 0x3200, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x2000, 0x3BC0, 0x3B00, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x3800, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3800, 0x3600, 0x3400, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x3BC0, 0x0,
-            0x3A00, 0x3900, 0x3BC0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3800, 0x3600, 0x3400, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x0,
-            0x3A00, 0x3900, 0x3800, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x3800, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3BC0, 0x3B00, 0x3A00, 0x0,
-            0x3BC0, 0x3B00, 0x3A00, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3BC0, 0x3B00, 0x0,
-            0x3900, 0x3BC0, 0x3B00, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x0, 
+            0x3A00, 0x3B00, 0x3BC0, 0x0, 
+            0x3A00, 0x3B00, 0x3BC0, 0x0, 
+            0x2400, 0x2800, 0x2A00, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3800, 0x3900, 0x3A00, 0x0, 
+            0x3800, 0x3900, 0x3A00, 0x0, 
             0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
             0x3900, 0x3BC0, 0x3B00, 0x0,
-            0x3400, 0x3200, 0x3BC0, 0x0,
-            0x3400, 0x3200, 0x3BC0, 0x0,
-            0x3400, 0x3200, 0x3BC0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x0 ]
+            0x3900, 0x3BC0, 0x3B00, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x0, 
+            0x2C00, 0x2A00, 0x2800, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3B00, 0x3A00, 0x3900, 0x0, 
+            0x3B00, 0x3A00, 0x3900, 0x0, 
+            0x0, 0x0, 0x0, 0x0 ]
   - Name: ExpectedOut4
     Format: Float16
     Stride: 8
-    Data: [ 0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3B00, 0x3BC0, 0x2000,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3600, 0x3800, 0x3900, 0x3A00,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x0, 0x0, 0x0, 0x0,
-            0x3200, 0x3400, 0x3600, 0x3800,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x3900,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x3900,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3BC0, 0x3900, 0x3A00,
-            0x3B00, 0x3BC0, 0x3900, 0x3A00,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3200, 0x3400, 0x3600, 0x3800,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x3900,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3400, 0x3600, 0x3800, 0x3900,
-            0x3600, 0x3800, 0x3900, 0x3A00,
-            0x0, 0x0, 0x0, 0x0,
-            0x3600, 0x3800, 0x3900, 0x3A00,
-            0x0, 0x0, 0x0, 0x0,
-            0x3800, 0x3900, 0x3A00, 0x3B00,
-            0x0, 0x0, 0x0, 0x0,
-            0x3800, 0x3900, 0x3A00, 0x3B00,
-            0x3800, 0x3900, 0x3A00, 0x3B00,
-            0x3900, 0x3A00, 0x3B00, 0x3BC0,
-            0x3900, 0x3A00, 0x3B00, 0x3BC0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3B00, 0x3BC0, 0x3900,
-            0x3A00, 0x3B00, 0x3BC0, 0x3900,
-            0x0, 0x0, 0x0, 0x0,
+    Data: [ 0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
             0x3A00, 0x3B00, 0x3BC0, 0x3900,
-            0x3B00, 0x3BC0, 0x3200, 0x3400,
-            0x3B00, 0x3BC0, 0x3200, 0x3400,
-            0x3B00, 0x3BC0, 0x3200, 0x3400,
-            0x0, 0x0, 0x0, 0x0,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x3BC0, 0x3900, 0x3A00, 0x3B00,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x2000, 0x3BC0, 0x3B00, 0x3A00,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x3800, 0x3600,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3800, 0x3600, 0x3400, 0x3200,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x3400,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x3400,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x3BC0, 0x3B00,
-            0x3A00, 0x3900, 0x3BC0, 0x3B00,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3800, 0x3600, 0x3400, 0x3200,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x3400,
-            0x0, 0x0, 0x0, 0x0,
-            0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3800, 0x3600, 0x3400,
-            0x3A00, 0x3900, 0x3800, 0x3600,
-            0x0, 0x0, 0x0, 0x0,
-            0x3A00, 0x3900, 0x3800, 0x3600,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x3800,
-            0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x3800,
-            0x3B00, 0x3A00, 0x3900, 0x3800,
-            0x3BC0, 0x3B00, 0x3A00, 0x3900,
-            0x3BC0, 0x3B00, 0x3A00, 0x3900,
-            0x0, 0x0, 0x0, 0x0,
+            0x3A00, 0x3B00, 0x3BC0, 0x3900, 
+            0x3A00, 0x3B00, 0x3BC0, 0x3900, 
+            0x3A00, 0x3B00, 0x3BC0, 0x3900, 
+            0x2400, 0x2800, 0x2A00, 0x2C00, 
+            0x0, 0x0, 0x0, 0x0, 
             0x0, 0x0, 0x0, 0x0,
-            0x3900, 0x3BC0, 0x3B00, 0x3A00,
-            0x3900, 0x3BC0, 0x3B00, 0x3A00,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3800, 0x3900, 0x3A00, 0x3B00, 
+            0x3800, 0x3900, 0x3A00, 0x3B00, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
             0x0, 0x0, 0x0, 0x0,
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00, 
+            0x3900, 0x3BC0, 0x3B00, 0x3A00, 
             0x3900, 0x3BC0, 0x3B00, 0x3A00,
-            0x3400, 0x3200, 0x3BC0, 0x3B00,
-            0x3400, 0x3200, 0x3BC0, 0x3B00,
-            0x3400, 0x3200, 0x3BC0, 0x3B00,
+            0x2C00, 0x2A00, 0x2800, 0x2400,
             0x0, 0x0, 0x0, 0x0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0,
-            0x3B00, 0x3A00, 0x3900, 0x3BC0 ]
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x0, 0x0, 0x0, 0x0, 
+            0x3B00, 0x3A00, 0x3900, 0x3800, 
+            0x3B00, 0x3A00, 0x3900, 0x3800, 
+            0x0, 0x0, 0x0, 0x0 ]
   - Name: ExpectedOut5
     Format: Float16
     Stride: 8
diff --git a/test/WaveOps/WaveActiveMax.fp32.test b/test/WaveOps/WaveActiveMax.fp32.test
index 4b7fcaf7f..5f378766d 100644
--- a/test/WaveOps/WaveActiveMax.fp32.test
+++ b/test/WaveOps/WaveActiveMax.fp32.test
@@ -1,50 +1,43 @@
 #--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    float mask[NUM_THREADS]; // one entry per thread in the group; main() tests it as a boolean (nonzero = participate)
+};
+
 StructuredBuffer<float4> In  : register(t0);
-RWStructuredBuffer<float4> Out1 : register(u1); // test scalar
-RWStructuredBuffer<float4> Out2 : register(u2); // test float2
+RWStructuredBuffer<float> Out1 : register(u1);  // test scalar
+RWStructuredBuffer<float2> Out2 : register(u2); // test float2
 RWStructuredBuffer<float4> Out3 : register(u3); // test float3
 RWStructuredBuffer<float4> Out4 : register(u4); // test float4
 RWStructuredBuffer<float4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6);
+
 
-[numthreads(4,1,1)]
+[numthreads(NUM_THREADS,1,1)]
 void main(uint3 tid : SV_GroupThreadID)
 {
-    float4 v = In[tid.x];
-
-    float s1 = WaveActiveMax( v.x );
-    float s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
-    float s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
-    float s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
-
-    float2 v2_1 = WaveActiveMax( v.xy );
-    float2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : float2(0,0);
-    float2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : float2(0,0);
-    float2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : float2(0,0);
-
-    float3 v3_1 = WaveActiveMax( v.xyz );
-    float3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
-    float3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
-    float3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : float3(0,0,0);
-
-    float4 v4_1 = WaveActiveMax( v );
-    float4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : float4(0,0,0,0);
-    float4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : float4(0,0,0,0);
-    float4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : float4(0,0,0,0);
-
-    float scalars[4] = { s4, s3, s2, s1 };
-    float2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
-    float3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
-    float4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
-
-    Out1[tid.x].x   = scalars[tid.x];
-    Out2[tid.x].xy  = vec2s[tid.x];
-    Out3[tid.x].xyz = vec3s[tid.x];
-    Out4[tid.x]     = vec4s[tid.x];
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // index of this value set's first element
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // In and Out* share the [value set][mask][thread] layout
+            float4 v = In[OutIdx]; // ValueSetOffset already includes the ValueSet factor; do not multiply by ValueSet again
+            if (Masks[MaskIdx].mask[tid.x]) { // threads with a zero mask entry skip the wave ops and write nothing
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
 
     // constant folding case
-    Out5[0] = WaveActiveMax(float4(1,2,3,4));
+    Out5[0] = WaveActiveMax(float4(1.5,2.5,3.5,4.5));
 }
 
+
 //--- pipeline.yaml
 
 ---
@@ -56,47 +49,185 @@ Buffers:
   - Name: In
     Format: Float32
     Stride: 16
-    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9.5, 10.5, 11.5, 12.5,
+    13.5, 14.5, 15.5, 16.5,
+    2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6.5, 7.5, 8.5, 9.5,
+    10.5, 11.5, 12.5, 13.5,
+    14.5, 15.5, 16.5, 1.5,
+    3.5, 4.5, 5.5, 6.5,
+    7.5, 8.5, 9.5, 10.5,
+    11.5, 12.5, 13.5, 14.5,
+    15.5, 16.5, 1.5, 2.5,
+    4.5, 5.5, 6.5, 7.5,
+    8.5, 9.5, 10.5, 11.5,
+    12.5, 13.5, 14.5, 15.5,
+    16.5, 1.5, 2.5, 3.5,
+    4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8.5, 7.5, 6.5, 5.5, 
+    12.5, 11.5, 10.5, 9.5, 
+    16.5, 15.5, 14.5, 13.5, 
+    5.5, 4.5, 3.5, 2.5, 
+    9.5, 8.5, 7.5, 6.5, 
+    13.5, 12.5, 11.5, 10.5, 
+    1.5, 16.5, 15.5, 14.5, 
+    6.5, 5.5, 4.5, 3.5, 
+    10.5, 9.5, 8.5, 7.5, 
+    14.5, 13.5, 12.5, 11.5, 
+    2.5, 1.5, 16.5, 15.5, 
+    7.5, 6.5, 5.5, 4.5, 
+    11.5, 10.5, 9.5, 8.5, 
+    15.5, 14.5, 13.5, 12.5, 
+    3.5, 2.5, 1.5, 16.5 ]
+
   - Name: Out1
     Format: Float32
-    Stride: 16
-    ZeroInitSize: 64  
+    Stride: 4
+    # 1 float is 4 bytes, * 4 floats for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 128  
   - Name: Out2
     Format: Float32
-    Stride: 16
-    ZeroInitSize: 64
+    Stride: 8
+    ZeroInitSize: 256
   - Name: Out3
     Format: Float32
     Stride: 16
-    ZeroInitSize: 64
+    ZeroInitSize: 512
   - Name: Out4
     Format: Float32
     Stride: 16
-    ZeroInitSize: 64
+    ZeroInitSize: 512
   - Name: Out5
     Format: Float32
     Stride: 16
     ZeroInitSize: 16
+  - Name: Masks
+    Format: Float32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
   - Name: ExpectedOut1
     Format: Float32
     Stride: 16
-    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14.5, 14.5, 14.5, 14.5,
+    3.5, 0, 0, 0, 
+    0, 12.5, 12.5, 0, 
+    0, 0, 0, 0, 
+    13.5, 13.5, 13.5, 13.5, 
+    6.5, 0, 0, 0, 
+    0, 15.5, 15.5, 0 ]
   - Name: ExpectedOut2
     Format: Float32
     Stride: 16
-    Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0,  
+    14.5, 15.5, 14.5, 15.5, 
+    14.5, 15.5, 14.5, 15.5, 
+    3.5, 4.5, 0, 0, 
+    0, 0, 0, 0,  
+    0, 0, 12.5, 13.5, 
+    12.5, 13.5, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13.5, 16.5, 13.5, 16.5,
+    13.5, 16.5, 13.5, 16.5, 
+    6.5, 5.5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15.5, 14.5, 
+    15.5, 14.5, 0, 0 ]
   - Name: ExpectedOut3
     Format: Float32
     Stride: 16
-    Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 0,
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            3.5, 4.5, 5.5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            12.5, 13.5, 14.5, 0, 
+            12.5, 13.5, 14.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            13.5, 16.5, 15.5, 0,
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            6.5, 5.5, 4.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 0, 
+            15.5, 14.5, 13.5, 0, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut4
     Format: Float32
     Stride: 16
-    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+    Data: [ 0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 13.5,
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            3.5, 4.5, 5.5, 6.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            12.5, 13.5, 14.5, 15.5, 
+            12.5, 13.5, 14.5, 15.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5,
+            6.5, 5.5, 4.5, 3.5,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 12.5, 
+            15.5, 14.5, 13.5, 12.5, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut5
     Format: Float32
-    Stride: 16
-    Data: [ 1.0, 2.0, 3.0, 4.0 ]
+    Stride: 16
+    Data: [ 1.5, 2.5, 3.5, 4.5 ]
 Results:
   - Result: ExpectedOut1
     Rule: BufferExact
@@ -162,6 +293,13 @@ DescriptorSets:
         Space: 0
       VulkanBinding:
         Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
 
 ...
 #--- end
@@ -169,7 +307,7 @@ DescriptorSets:
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
-# Tracked by https://github.com/llvm/offload-test-suite/issues/393
+# Bug https://github.com/llvm/offload-test-suite/issues/393
 # XFAIL: Metal
 
 # RUN: split-file %s %t
diff --git a/test/WaveOps/WaveActiveMax.fp64.test b/test/WaveOps/WaveActiveMax.fp64.test
index 50f9ed873..91f5b98f8 100644
--- a/test/WaveOps/WaveActiveMax.fp64.test
+++ b/test/WaveOps/WaveActiveMax.fp64.test
@@ -1,50 +1,43 @@
 #--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    double mask[NUM_THREADS];
+};
+
 StructuredBuffer<double4> In  : register(t0);
-RWStructuredBuffer<double4> Out1 : register(u1); // test scalar
-RWStructuredBuffer<double4> Out2 : register(u2); // test double2
+RWStructuredBuffer<double> Out1 : register(u1);  // test scalar
+RWStructuredBuffer<double2> Out2 : register(u2); // test double2
 RWStructuredBuffer<double4> Out3 : register(u3); // test double3
 RWStructuredBuffer<double4> Out4 : register(u4); // test double4
 RWStructuredBuffer<double4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6);
 
-[numthreads(4,1,1)]
+
+[numthreads(NUM_THREADS,1,1)]
 void main(uint3 tid : SV_GroupThreadID)
 {
-    double4 v = In[tid.x];
-
-    double s1 = WaveActiveMax( v.x );
-    double s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
-    double s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
-    double s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
-
-    double2 v2_1 = WaveActiveMax( v.xy );
-    double2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : double2(0,0);
-    double2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : double2(0,0);
-    double2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : double2(0,0);
-
-    double3 v3_1 = WaveActiveMax( v.xyz );
-    double3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : double3(0,0,0);
-    double3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : double3(0,0,0);
-    double3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : double3(0,0,0);
-
-    double4 v4_1 = WaveActiveMax( v );
-    double4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : double4(0,0,0,0);
-    double4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : double4(0,0,0,0);
-    double4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : double4(0,0,0,0);
-
-    double scalars[4] = { s4, s3, s2, s1 };
-    double2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
-    double3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
-    double4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
-
-    Out1[tid.x].x   = scalars[tid.x];
-    Out2[tid.x].xy  = vec2s[tid.x];
-    Out3[tid.x].xyz = vec3s[tid.x];
-    Out4[tid.x]     = vec4s[tid.x];
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // first element of this value set
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) { // integer counter: it indexes Masks/In/Out
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // [value set][mask][thread]
+            double4 v = In[OutIdx]; // In uses the same layout as the Out buffers
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask execute the wave ops
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
 
     // constant folding case
-    Out5[0] = WaveActiveMax(double4(1,2,3,4));
+    Out5[0] = WaveActiveMax(double4(1.5,2.5,3.5,4.5));
 }
 
+
 //--- pipeline.yaml
 
 ---
@@ -55,48 +48,186 @@ Shaders:
 Buffers:
   - Name: In
     Format: Float64
-    Stride: 32
-    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+    Stride: 32
+    # 2 value sets
+    # For each value set,
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1.5, 2.5, 3.5, 4.5, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5.5, 6.5, 7.5, 8.5, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9.5, 10.5, 11.5, 12.5,
+    13.5, 14.5, 15.5, 16.5,
+    2.5, 3.5, 4.5, 5.5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6.5, 7.5, 8.5, 9.5,
+    10.5, 11.5, 12.5, 13.5,
+    14.5, 15.5, 16.5, 1.5,
+    3.5, 4.5, 5.5, 6.5,
+    7.5, 8.5, 9.5, 10.5,
+    11.5, 12.5, 13.5, 14.5,
+    15.5, 16.5, 1.5, 2.5,
+    4.5, 5.5, 6.5, 7.5,
+    8.5, 9.5, 10.5, 11.5,
+    12.5, 13.5, 14.5, 15.5,
+    16.5, 1.5, 2.5, 3.5,
+    4.5, 3.5, 2.5, 1.5, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8.5, 7.5, 6.5, 5.5,
+    12.5, 11.5, 10.5, 9.5,
+    16.5, 15.5, 14.5, 13.5,
+    5.5, 4.5, 3.5, 2.5,
+    9.5, 8.5, 7.5, 6.5,
+    13.5, 12.5, 11.5, 10.5,
+    1.5, 16.5, 15.5, 14.5,
+    6.5, 5.5, 4.5, 3.5,
+    10.5, 9.5, 8.5, 7.5,
+    14.5, 13.5, 12.5, 11.5,
+    2.5, 1.5, 16.5, 15.5,
+    7.5, 6.5, 5.5, 4.5,
+    11.5, 10.5, 9.5, 8.5,
+    15.5, 14.5, 13.5, 12.5,
+    3.5, 2.5, 1.5, 16.5 ]
+
   - Name: Out1
     Format: Float64
-    Stride: 32
-    ZeroInitSize: 128  
+    Stride: 8
+    # 1 double is 8 bytes, * 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 256
   - Name: Out2
     Format: Float64
-    Stride: 32
-    ZeroInitSize: 128
+    Stride: 16 # one double2 element: 2 * 8 bytes
+    ZeroInitSize: 512
   - Name: Out3
     Format: Float64
-    Stride: 32
-    ZeroInitSize: 128
+    Stride: 32 # one double4 element: 4 * 8 bytes
+    ZeroInitSize: 1024
   - Name: Out4
     Format: Float64
-    Stride: 32
-    ZeroInitSize: 128
+    Stride: 32 # one double4 element: 4 * 8 bytes
+    ZeroInitSize: 1024
   - Name: Out5
     Format: Float64
     Stride: 32
     ZeroInitSize: 32
+  - Name: Masks
+    Format: Float64
+    Stride: 32 # one MaskStruct element: NUM_THREADS (4) doubles * 8 bytes
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
   - Name: ExpectedOut1
     Format: Float64
-    Stride: 32
-    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0 ]
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14.5, 14.5, 14.5, 14.5,
+    3.5, 0, 0, 0, 
+    0, 12.5, 12.5, 0, 
+    0, 0, 0, 0, 
+    13.5, 13.5, 13.5, 13.5, 
+    6.5, 0, 0, 0, 
+    0, 15.5, 15.5, 0 ]
   - Name: ExpectedOut2
     Format: Float64
-    Stride: 32
-    Data: [ 1.0, 10.0, 0.0, 0.0, 2.0, 20.0, 0.0, 0.0, 3.0, 30.0, 0.0, 0.0, 4.0, 40.0, 0.0, 0.0 ]
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0,
+    0, 0, 0, 0,
+    14.5, 15.5, 14.5, 15.5,
+    14.5, 15.5, 14.5, 15.5,
+    3.5, 4.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 12.5, 13.5,
+    12.5, 13.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 0, 0,
+    13.5, 16.5, 13.5, 16.5,
+    13.5, 16.5, 13.5, 16.5,
+    6.5, 5.5, 0, 0,
+    0, 0, 0, 0,
+    0, 0, 15.5, 14.5,
+    15.5, 14.5, 0, 0 ]
   - Name: ExpectedOut3
     Format: Float64
-    Stride: 32
-    Data: [ 1.0, 10.0, 100.0, 0.0, 2.0, 20.0, 200.0, 0.0, 3.0, 30.0, 300.0, 0.0, 4.0, 40.0, 400.0, 0.0 ]
+    Stride: 16
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 0,
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            14.5, 15.5, 16.5, 0, 
+            3.5, 4.5, 5.5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            12.5, 13.5, 14.5, 0, 
+            12.5, 13.5, 14.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,  
+            13.5, 16.5, 15.5, 0,
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            13.5, 16.5, 15.5, 0, 
+            6.5, 5.5, 4.5, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 0, 
+            15.5, 14.5, 13.5, 0, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut4
     Format: Float64
-    Stride: 32
-    Data: [ 1.0, 10.0, 100.0, 1000.0, 2.0, 20.0, 200.0, 2000.0, 3.0, 30.0, 300.0, 3000.0, 4.0, 40.0, 400.0, 4000.0 ]
+    Stride: 16
+    Data: [ 0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            14.5, 15.5, 16.5, 13.5,
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            14.5, 15.5, 16.5, 13.5, 
+            3.5, 4.5, 5.5, 6.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            12.5, 13.5, 14.5, 15.5, 
+            12.5, 13.5, 14.5, 15.5, 
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5, 
+            13.5, 16.5, 15.5, 14.5,
+            6.5, 5.5, 4.5, 3.5,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            15.5, 14.5, 13.5, 12.5, 
+            15.5, 14.5, 13.5, 12.5, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut5
     Format: Float64
-    Stride: 32
-    Data: [ 1.0, 2.0, 3.0, 4.0 ]
+    Stride: 8
+    Data: [ 1.5, 2.5, 3.5, 4.5 ]
 Results:
   - Result: ExpectedOut1
     Rule: BufferExact
@@ -162,12 +293,19 @@ DescriptorSets:
         Space: 0
       VulkanBinding:
         Binding: 5
+    - Name: Masks
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
 
 ...
 #--- end
 
 # REQUIRES: Double
 
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang
 
diff --git a/test/WaveOps/WaveActiveMax.int16.test b/test/WaveOps/WaveActiveMax.int16.test
index 496fb1591..1a8689905 100644
--- a/test/WaveOps/WaveActiveMax.int16.test
+++ b/test/WaveOps/WaveActiveMax.int16.test
@@ -1,95 +1,43 @@
 #--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS];
+};
+
 StructuredBuffer<int16_t4> In  : register(t0);
-RWStructuredBuffer<int16_t4> Out1 : register(u1); // test scalar
-RWStructuredBuffer<int16_t4> Out2 : register(u2); // test int16_t2
+RWStructuredBuffer<int16_t> Out1 : register(u1);  // test scalar
+RWStructuredBuffer<int16_t2> Out2 : register(u2); // test int16_t2
 RWStructuredBuffer<int16_t4> Out3 : register(u3); // test int16_t3
 RWStructuredBuffer<int16_t4> Out4 : register(u4); // test int16_t4
 RWStructuredBuffer<int16_t4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6);
 
-// uints
-StructuredBuffer<uint16_t4> UIn  : register(t6);
-RWStructuredBuffer<uint16_t4> UOut1 : register(u7);
-RWStructuredBuffer<uint16_t4> UOut2 : register(u8);
-RWStructuredBuffer<uint16_t4> UOut3 : register(u9);
-RWStructuredBuffer<uint16_t4> UOut4 : register(u10);
-RWStructuredBuffer<uint16_t4> UOut5 : register(u11);
 
-[numthreads(4,1,1)]
+[numthreads(NUM_THREADS,1,1)]
 void main(uint3 tid : SV_GroupThreadID)
 {
-    int16_t4 v = In[tid.x];
-
-    int16_t s1 = WaveActiveMax( v.x );
-    int16_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
-    int16_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
-    int16_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
-
-    int16_t2 v2_1 = WaveActiveMax( v.xy );
-    int16_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int16_t2(0,0);
-    int16_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int16_t2(0,0);
-    int16_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int16_t2(0,0);
-
-    int16_t3 v3_1 = WaveActiveMax( v.xyz );
-    int16_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0);
-    int16_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0);
-    int16_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int16_t3(0,0,0);
-
-    int16_t4 v4_1 = WaveActiveMax( v );
-    int16_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int16_t4(0,0,0,0);
-    int16_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int16_t4(0,0,0,0);
-    int16_t4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int16_t4(0,0,0,0);
-
-    int16_t scalars[4] = { s4, s3, s2, s1 };
-    int16_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
-    int16_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
-    int16_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
-
-    Out1[tid.x].x   = scalars[tid.x];
-    Out2[tid.x].xy  = vec2s[tid.x];
-    Out3[tid.x].xyz = vec3s[tid.x];
-    Out4[tid.x]     = vec4s[tid.x];
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // first element of this value set
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // [value set][mask][thread]
+            int16_t4 v = In[OutIdx]; // In uses the same layout as the Out buffers
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask execute the wave ops
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
 
     // constant folding case
     Out5[0] = WaveActiveMax(int16_t4(1,2,3,4));
-
-    // UINT case
-
-    uint16_t4 uv = UIn[tid.x];
-
-    uint16_t us1 = WaveActiveMax( uv.x );
-    uint16_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0;
-    uint16_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0;
-    uint16_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0;
-
-    uint16_t2 uv2_1 = WaveActiveMax( uv.xy );
-    uint16_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0);
-    uint16_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0);
-    uint16_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint16_t2(0,0);
-
-    uint16_t3 uv3_1 = WaveActiveMax( uv.xyz );
-    uint16_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0);
-    uint16_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0);
-    uint16_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint16_t3(0,0,0);
-
-    uint16_t4 uv4_1 = WaveActiveMax( uv );
-    uint16_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0);
-    uint16_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0);
-    uint16_t4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint16_t4(0,0,0,0);
-
-    uint16_t uscalars[4] = { us4, us3, us2, us1 };
-    uint16_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 };
-    uint16_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 };
-    uint16_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 };    
-
-    UOut1[tid.x].x   = uscalars[tid.x];
-    UOut2[tid.x].xy  = uvec2s[tid.x];
-    UOut3[tid.x].xyz = uvec3s[tid.x];
-    UOut4[tid.x]     = uvec4s[tid.x];
-
-    // constant folding case
-    UOut5[0] = WaveActiveMax(uint16_t4(1,2,3,4));
 }
 
+
 //--- pipeline.yaml
 
 ---
@@ -100,93 +48,186 @@ Shaders:
 Buffers:
   - Name: In
     Format: Int16
-    Stride: 8    
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+    Stride: 8
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9, 10, 11, 12,
+    13, 14, 15, 16,
+    2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6, 7, 8, 9,
+    10, 11, 12, 13,
+    14, 15, 16, 1,
+    3, 4, 5, 6,
+    7, 8, 9, 10,
+    11, 12, 13, 14,
+    15, 16, 1, 2,
+    4, 5, 6, 7,
+    8, 9, 10, 11,
+    12, 13, 14, 15,
+    16, 1, 2, 3,
+    4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8, 7, 6, 5, 
+    12, 11, 10, 9, 
+    16, 15, 14, 13, 
+    5, 4, 3, 2, 
+    9, 8, 7, 6, 
+    13, 12, 11, 10, 
+    1, 16, 15, 14, 
+    6, 5, 4, 3, 
+    10, 9, 8, 7, 
+    14, 13, 12, 11, 
+    2, 1, 16, 15, 
+    7, 6, 5, 4, 
+    11, 10, 9, 8, 
+    15, 14, 13, 12, 
+    3, 2, 1, 16 ]
+
   - Name: Out1
     Format: Int16
-    Stride: 8
-    ZeroInitSize: 32  
+    Stride: 2
+    # 1 int16_t is 2 bytes, * 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 64  
   - Name: Out2
     Format: Int16
-    Stride: 8
-    ZeroInitSize: 32
+    Stride: 4
+    ZeroInitSize: 128
   - Name: Out3
     Format: Int16
     Stride: 8
-    ZeroInitSize: 32
+    ZeroInitSize: 256
   - Name: Out4
     Format: Int16
     Stride: 8
-    ZeroInitSize: 32
+    ZeroInitSize: 256
   - Name: Out5
     Format: Int16
     Stride: 8
     ZeroInitSize: 8
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
   - Name: ExpectedOut1
     Format: Int16
     Stride: 8
-    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14, 14, 14, 14,
+    3, 0, 0, 0, 
+    0, 12, 12, 0, 
+    0, 0, 0, 0, 
+    13, 13, 13, 13, 
+    6, 0, 0, 0, 
+    0, 15, 15, 0 ]
   - Name: ExpectedOut2
     Format: Int16
     Stride: 8
-    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0, 
+    14, 15, 14, 15, 
+    14, 15, 14, 15, 
+    3, 4, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 12, 13, 
+    12, 13, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13, 16, 13, 16,
+    13, 16, 13, 16, 
+    6, 5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15, 14, 
+    15, 14, 0, 0 ]
   - Name: ExpectedOut3
     Format: Int16
     Stride: 8
-    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 0,
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            3, 4, 5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 0, 
+            12, 13, 14, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 0,
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            6, 5, 4, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 0, 
+            15, 14, 13, 0, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut4
     Format: Int16
     Stride: 8
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 13,
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            3, 4, 5, 6, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 15, 
+            12, 13, 14, 15, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14,
+            6, 5, 4, 3,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 12, 
+            15, 14, 13, 12, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut5
     Format: Int16
     Stride: 8
     Data: [ 1, 2, 3, 4 ]
-  - Name: UIn
-    Format: UInt16
-    Stride: 2
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
-  - Name: UOut1
-    Format: UInt16
-    Stride: 8
-    ZeroInitSize: 32  
-  - Name: UOut2
-    Format: UInt16
-    Stride: 8
-    ZeroInitSize: 32
-  - Name: UOut3
-    Format: UInt16
-    Stride: 8
-    ZeroInitSize: 32
-  - Name: UOut4
-    Format: UInt16
-    Stride: 8
-    ZeroInitSize: 32
-  - Name: UOut5
-    Format: UInt16
-    Stride: 8
-    ZeroInitSize: 8
-  - Name: UExpectedOut1
-    Format: UInt16
-    Stride: 8
-    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
-  - Name: UExpectedOut2
-    Format: UInt16
-    Stride: 8
-    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
-  - Name: UExpectedOut3
-    Format: UInt16
-    Stride: 8
-    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
-  - Name: UExpectedOut4
-    Format: UInt16
-    Stride: 8
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
-  - Name: UExpectedOut5
-    Format: UInt16
-    Stride: 2
-    Data: [ 1, 2, 3, 4 ]
-
 Results:
   - Result: ExpectedOut1
     Rule: BufferExact
@@ -208,26 +249,6 @@ Results:
     Rule: BufferExact
     Actual: Out5
     Expected: ExpectedOut5
-  - Result: UExpectedOut1
-    Rule: BufferExact
-    Actual: UOut1
-    Expected: UExpectedOut1
-  - Result: UExpectedOut2
-    Rule: BufferExact
-    Actual: UOut2
-    Expected: UExpectedOut2
-  - Result: UExpectedOut3
-    Rule: BufferExact
-    Actual: UOut3
-    Expected: UExpectedOut3
-  - Result: UExpectedOut4
-    Rule: BufferExact
-    Actual: UOut4
-    Expected: UExpectedOut4
-  - Result: UExpectedOut5
-    Rule: BufferExact
-    Actual: UOut5
-    Expected: UExpectedOut5
 DescriptorSets:
   - Resources:
     - Name: In
@@ -272,48 +293,13 @@ DescriptorSets:
         Space: 0
       VulkanBinding:
         Binding: 5
-    - Name: UIn
+    - Name: Masks
       Kind: StructuredBuffer
       DirectXBinding:
         Register: 6
         Space: 0
       VulkanBinding:
         Binding: 6
-    - Name: UOut1
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 7
-        Space: 0
-      VulkanBinding:
-        Binding: 7
-    - Name: UOut2
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 8
-        Space: 0
-      VulkanBinding:
-        Binding: 8
-    - Name: UOut3
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 9
-        Space: 0
-      VulkanBinding:
-        Binding: 9
-    - Name: UOut4
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 10
-        Space: 0
-      VulkanBinding:
-        Binding: 10
-    - Name: UOut5
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 11
-        Space: 0
-      VulkanBinding:
-        Binding: 11
 
 ...
 #--- end
diff --git a/test/WaveOps/WaveActiveMax.int32.test b/test/WaveOps/WaveActiveMax.int32.test
index 0aa6a43ba..721d6c426 100644
--- a/test/WaveOps/WaveActiveMax.int32.test
+++ b/test/WaveOps/WaveActiveMax.int32.test
@@ -1,95 +1,43 @@
 #--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS];
+};
+
 StructuredBuffer<int4> In  : register(t0);
-RWStructuredBuffer<int4> Out1 : register(u1); // test scalar
-RWStructuredBuffer<int4> Out2 : register(u2); // test int2
+RWStructuredBuffer<int> Out1 : register(u1);  // test scalar
+RWStructuredBuffer<int2> Out2 : register(u2); // test int2
 RWStructuredBuffer<int4> Out3 : register(u3); // test int3
 RWStructuredBuffer<int4> Out4 : register(u4); // test int4
 RWStructuredBuffer<int4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6);
 
-// uints
-StructuredBuffer<uint4> UIn  : register(t6);
-RWStructuredBuffer<uint4> UOut1 : register(u7);
-RWStructuredBuffer<uint4> UOut2 : register(u8);
-RWStructuredBuffer<uint4> UOut3 : register(u9);
-RWStructuredBuffer<uint4> UOut4 : register(u10);
-RWStructuredBuffer<uint4> UOut5 : register(u11);
 
-[numthreads(4,1,1)]
+[numthreads(NUM_THREADS,1,1)]
 void main(uint3 tid : SV_GroupThreadID)
 {
-    int4 v = In[tid.x];
-
-    int s1 = WaveActiveMax( v.x );
-    int s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
-    int s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
-    int s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
-
-    int2 v2_1 = WaveActiveMax( v.xy );
-    int2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int2(0,0);
-    int2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int2(0,0);
-    int2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int2(0,0);
-
-    int3 v3_1 = WaveActiveMax( v.xyz );
-    int3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int3(0,0,0);
-    int3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int3(0,0,0);
-    int3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int3(0,0,0);
-
-    int4 v4_1 = WaveActiveMax( v );
-    int4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int4(0,0,0,0);
-    int4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int4(0,0,0,0);
-    int4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int4(0,0,0,0);
-
-    int scalars[4] = { s4, s3, s2, s1 };
-    int2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
-    int3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
-    int4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
-
-    Out1[tid.x].x   = scalars[tid.x];
-    Out2[tid.x].xy  = vec2s[tid.x];
-    Out3[tid.x].xyz = vec3s[tid.x];
-    Out4[tid.x]     = vec4s[tid.x];
+    for (uint ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) {
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // first element of this value set
+        for (uint MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // [value set][mask][thread]
+            int4 v = In[OutIdx]; // In uses the same layout as the Out buffers
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled by this mask execute the wave ops
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz );
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
 
     // constant folding case
     Out5[0] = WaveActiveMax(int4(1,2,3,4));
-
-    // UINT case
-
-    uint4 uv = UIn[tid.x];
-
-    uint us1 = WaveActiveMax( uv.x );
-    uint us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0;
-    uint us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0;
-    uint us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0;
-
-    uint2 uv2_1 = WaveActiveMax( uv.xy );
-    uint2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint2(0,0);
-    uint2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint2(0,0);
-    uint2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint2(0,0);
-
-    uint3 uv3_1 = WaveActiveMax( uv.xyz );
-    uint3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0);
-    uint3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0);
-    uint3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint3(0,0,0);
-
-    uint4 uv4_1 = WaveActiveMax( uv );
-    uint4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint4(0,0,0,0);
-    uint4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint4(0,0,0,0);
-    uint4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint4(0,0,0,0);
-
-    uint uscalars[4] = { us4, us3, us2, us1 };
-    uint2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 };
-    uint3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 };
-    uint4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 };    
-
-    UOut1[tid.x].x   = uscalars[tid.x];
-    UOut2[tid.x].xy  = uvec2s[tid.x];
-    UOut3[tid.x].xyz = uvec3s[tid.x];
-    UOut4[tid.x]     = uvec4s[tid.x];
-
-    // constant folding case
-    UOut5[0] = WaveActiveMax(uint4(1,2,3,4));
 }
 
+
 //--- pipeline.yaml
 
 ---
@@ -100,93 +48,186 @@ Shaders:
 Buffers:
   - Name: In
     Format: Int32
-    Stride: 16    
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+    Stride: 16
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9, 10, 11, 12,
+    13, 14, 15, 16,
+    2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6, 7, 8, 9,
+    10, 11, 12, 13,
+    14, 15, 16, 1,
+    3, 4, 5, 6,
+    7, 8, 9, 10,
+    11, 12, 13, 14,
+    15, 16, 1, 2,
+    4, 5, 6, 7,
+    8, 9, 10, 11,
+    12, 13, 14, 15,
+    16, 1, 2, 3,
+    4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8, 7, 6, 5, 
+    12, 11, 10, 9, 
+    16, 15, 14, 13, 
+    5, 4, 3, 2, 
+    9, 8, 7, 6, 
+    13, 12, 11, 10, 
+    1, 16, 15, 14, 
+    6, 5, 4, 3, 
+    10, 9, 8, 7, 
+    14, 13, 12, 11, 
+    2, 1, 16, 15, 
+    7, 6, 5, 4, 
+    11, 10, 9, 8, 
+    15, 14, 13, 12, 
+    3, 2, 1, 16 ]
+
   - Name: Out1
     Format: Int32
-    Stride: 16
-    ZeroInitSize: 64  
+    Stride: 4
+    # 1 int is 4 bytes, * 4 ints for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 128  
   - Name: Out2
     Format: Int32
-    Stride: 16
-    ZeroInitSize: 64
+    Stride: 8
+    ZeroInitSize: 256
   - Name: Out3
     Format: Int32
     Stride: 16
-    ZeroInitSize: 64
+    ZeroInitSize: 512
   - Name: Out4
     Format: Int32
     Stride: 16
-    ZeroInitSize: 64
+    ZeroInitSize: 512
   - Name: Out5
     Format: Int32
     Stride: 16
     ZeroInitSize: 16
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1    
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
   - Name: ExpectedOut1
     Format: Int32
     Stride: 16
-    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14, 14, 14, 14,
+    3, 0, 0, 0, 
+    0, 12, 12, 0, 
+    0, 0, 0, 0, 
+    13, 13, 13, 13, 
+    6, 0, 0, 0, 
+    0, 15, 15, 0 ]
   - Name: ExpectedOut2
     Format: Int32
     Stride: 16
-    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0, 
+    14, 15, 14, 15, 
+    14, 15, 14, 15, 
+    3, 4, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 12, 13, 
+    12, 13, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13, 16, 13, 16,
+    13, 16, 13, 16, 
+    6, 5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15, 14, 
+    15, 14, 0, 0 ]
   - Name: ExpectedOut3
     Format: Int32
     Stride: 16
-    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 0,
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            3, 4, 5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 0, 
+            12, 13, 14, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 0,
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            6, 5, 4, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 0, 
+            15, 14, 13, 0, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut4
     Format: Int32
     Stride: 16
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 13,
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            3, 4, 5, 6, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 15, 
+            12, 13, 14, 15, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14,
+            6, 5, 4, 3,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 12, 
+            15, 14, 13, 12, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut5
     Format: Int32
-    Stride: 16
-    Data: [ 1, 2, 3, 4 ]
-  - Name: UIn
-    Format: UInt32
-    Stride: 16
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
-  - Name: UOut1
-    Format: UInt32
-    Stride: 16
-    ZeroInitSize: 64  
-  - Name: UOut2
-    Format: UInt32
-    Stride: 16
-    ZeroInitSize: 64
-  - Name: UOut3
-    Format: UInt32
-    Stride: 16
-    ZeroInitSize: 64
-  - Name: UOut4
-    Format: UInt32
-    Stride: 16
-    ZeroInitSize: 64
-  - Name: UOut5
-    Format: UInt32
-    Stride: 16
-    ZeroInitSize: 16
-  - Name: UExpectedOut1
-    Format: UInt32
-    Stride: 16
-    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
-  - Name: UExpectedOut2
-    Format: UInt32
-    Stride: 16
-    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
-  - Name: UExpectedOut3
-    Format: UInt32
-    Stride: 16
-    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
-  - Name: UExpectedOut4
-    Format: UInt32
-    Stride: 16
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
-  - Name: UExpectedOut5
-    Format: UInt32
-    Stride: 4
+    Stride: 8
     Data: [ 1, 2, 3, 4 ]
-
 Results:
   - Result: ExpectedOut1
     Rule: BufferExact
@@ -208,26 +249,6 @@ Results:
     Rule: BufferExact
     Actual: Out5
     Expected: ExpectedOut5
-  - Result: UExpectedOut1
-    Rule: BufferExact
-    Actual: UOut1
-    Expected: UExpectedOut1
-  - Result: UExpectedOut2
-    Rule: BufferExact
-    Actual: UOut2
-    Expected: UExpectedOut2
-  - Result: UExpectedOut3
-    Rule: BufferExact
-    Actual: UOut3
-    Expected: UExpectedOut3
-  - Result: UExpectedOut4
-    Rule: BufferExact
-    Actual: UOut4
-    Expected: UExpectedOut4
-  - Result: UExpectedOut5
-    Rule: BufferExact
-    Actual: UOut5
-    Expected: UExpectedOut5
 DescriptorSets:
   - Resources:
     - Name: In
@@ -272,48 +293,13 @@ DescriptorSets:
         Space: 0
       VulkanBinding:
         Binding: 5
-    - Name: UIn
+    - Name: Masks
       Kind: StructuredBuffer
       DirectXBinding:
         Register: 6
         Space: 0
       VulkanBinding:
         Binding: 6
-    - Name: UOut1
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 7
-        Space: 0
-      VulkanBinding:
-        Binding: 7
-    - Name: UOut2
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 8
-        Space: 0
-      VulkanBinding:
-        Binding: 8
-    - Name: UOut3
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 9
-        Space: 0
-      VulkanBinding:
-        Binding: 9
-    - Name: UOut4
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 10
-        Space: 0
-      VulkanBinding:
-        Binding: 10
-    - Name: UOut5
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 11
-        Space: 0
-      VulkanBinding:
-        Binding: 11
 
 ...
 #--- end
diff --git a/test/WaveOps/WaveActiveMax.int64.test b/test/WaveOps/WaveActiveMax.int64.test
index 418727387..a0b56a1eb 100644
--- a/test/WaveOps/WaveActiveMax.int64.test
+++ b/test/WaveOps/WaveActiveMax.int64.test
@@ -1,95 +1,43 @@
 #--- source.hlsl
+#define VALUE_SETS 2
+#define NUM_MASKS 4
+#define NUM_THREADS 4
+
+struct MaskStruct {
+    int mask[NUM_THREADS];
+};
+
 StructuredBuffer<int64_t4> In  : register(t0);
-RWStructuredBuffer<int64_t4> Out1 : register(u1); // test scalar
-RWStructuredBuffer<int64_t4> Out2 : register(u2); // test int64_t2
+RWStructuredBuffer<int64_t> Out1 : register(u1);  // test scalar
+RWStructuredBuffer<int64_t2> Out2 : register(u2); // test int64_t2
 RWStructuredBuffer<int64_t4> Out3 : register(u3); // test int64_t3
 RWStructuredBuffer<int64_t4> Out4 : register(u4); // test int64_t4
 RWStructuredBuffer<int64_t4> Out5 : register(u5); // constant folding
+StructuredBuffer<MaskStruct> Masks  : register(t6);
 
-// uint64_ts
-StructuredBuffer<uint64_t4> UIn  : register(t6);
-RWStructuredBuffer<uint64_t4> UOut1 : register(u7);
-RWStructuredBuffer<uint64_t4> UOut2 : register(u8);
-RWStructuredBuffer<uint64_t4> UOut3 : register(u9);
-RWStructuredBuffer<uint64_t4> UOut4 : register(u10);
-RWStructuredBuffer<uint64_t4> UOut5 : register(u11);
 
-[numthreads(4,1,1)]
+[numthreads(NUM_THREADS,1,1)]
 void main(uint3 tid : SV_GroupThreadID)
 {
-    int64_t4 v = In[tid.x];
-
-    int64_t s1 = WaveActiveMax( v.x );
-    int64_t s2 = tid.x < 3 ? WaveActiveMax( v.x ) : 0;
-    int64_t s3 = tid.x < 2 ? WaveActiveMax( v.x ) : 0;
-    int64_t s4 = tid.x < 1 ? WaveActiveMax( v.x ) : 0;
-
-    int64_t2 v2_1 = WaveActiveMax( v.xy );
-    int64_t2 v2_2 = tid.x < 3 ? WaveActiveMax( v.xy ) : int64_t2(0,0);
-    int64_t2 v2_3 = tid.x < 2 ? WaveActiveMax( v.xy ) : int64_t2(0,0);
-    int64_t2 v2_4 = tid.x < 1 ? WaveActiveMax( v.xy ) : int64_t2(0,0);
-
-    int64_t3 v3_1 = WaveActiveMax( v.xyz );
-    int64_t3 v3_2 = tid.x < 3 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0);
-    int64_t3 v3_3 = tid.x < 2 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0);
-    int64_t3 v3_4 = tid.x < 1 ? WaveActiveMax( v.xyz ) : int64_t3(0,0,0);
-
-    int64_t4 v4_1 = WaveActiveMax( v );
-    int64_t4 v4_2 = tid.x < 3 ? WaveActiveMax( v ) : int64_t4(0,0,0,0);
-    int64_t4 v4_3 = tid.x < 2 ? WaveActiveMax( v ) : int64_t4(0,0,0,0);
-    int64_t4 v4_4 = tid.x < 1 ? WaveActiveMax( v ) : int64_t4(0,0,0,0);
-
-    int64_t scalars[4] = { s4, s3, s2, s1 };
-    int64_t2 vec2s [4] = { v2_4, v2_3, v2_2, v2_1 };
-    int64_t3 vec3s [4] = { v3_4, v3_3, v3_2, v3_1 };
-    int64_t4 vec4s [4] = { v4_4, v4_3, v4_2, v4_1 };    
-
-    Out1[tid.x].x   = scalars[tid.x];
-    Out2[tid.x].xy  = vec2s[tid.x];
-    Out3[tid.x].xyz = vec3s[tid.x];
-    Out4[tid.x]     = vec4s[tid.x];
+    for (int ValueSet = 0; ValueSet < VALUE_SETS; ValueSet++) { // each value set is NUM_MASKS * NUM_THREADS elements
+        const uint ValueSetOffset = ValueSet * NUM_MASKS * NUM_THREADS; // index of the first element of this value set
+        for (int MaskIdx = 0; MaskIdx < NUM_MASKS; MaskIdx++) {
+            const uint OutIdx = ValueSetOffset + MaskIdx * NUM_THREADS + tid.x; // flat (value set, mask, thread) index
+            int64_t4 v = In[OutIdx]; // input uses the same flat index; do not scale the offset by ValueSet again
+            if (Masks[MaskIdx].mask[tid.x]) { // only threads enabled in this mask take part in the wave op
+                Out1[OutIdx] = WaveActiveMax( v.x );
+                Out2[OutIdx].xy = WaveActiveMax( v.xy );
+                Out3[OutIdx].xyz = WaveActiveMax( v.xyz ); // .w is never written and keeps its zero-init value
+                Out4[OutIdx] = WaveActiveMax( v );
+            }
+        }
+    }
 
     // constant folding case
     Out5[0] = WaveActiveMax(int64_t4(1,2,3,4));
-
-    // UINT64_t case
-
-    uint64_t4 uv = UIn[tid.x];
-
-    uint64_t us1 = WaveActiveMax( uv.x );
-    uint64_t us2 = tid.x < 3 ? WaveActiveMax( uv.x ) : 0;
-    uint64_t us3 = tid.x < 2 ? WaveActiveMax( uv.x ) : 0;
-    uint64_t us4 = tid.x < 1 ? WaveActiveMax( uv.x ) : 0;
-
-    uint64_t2 uv2_1 = WaveActiveMax( uv.xy );
-    uint64_t2 uv2_2 = tid.x < 3 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0);
-    uint64_t2 uv2_3 = tid.x < 2 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0);
-    uint64_t2 uv2_4 = tid.x < 1 ? WaveActiveMax( uv.xy ) : uint64_t2(0,0);
-
-    uint64_t3 uv3_1 = WaveActiveMax( uv.xyz );
-    uint64_t3 uv3_2 = tid.x < 3 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0);
-    uint64_t3 uv3_3 = tid.x < 2 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0);
-    uint64_t3 uv3_4 = tid.x < 1 ? WaveActiveMax( uv.xyz ) : uint64_t3(0,0,0);
-
-    uint64_t4 uv4_1 = WaveActiveMax( uv );
-    uint64_t4 uv4_2 = tid.x < 3 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0);
-    uint64_t4 uv4_3 = tid.x < 2 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0);
-    uint64_t4 uv4_4 = tid.x < 1 ? WaveActiveMax( uv ) : uint64_t4(0,0,0,0);
-
-    uint64_t uscalars[4] = { us4, us3, us2, us1 };
-    uint64_t2 uvec2s [4] = { uv2_4, uv2_3, uv2_2, uv2_1 };
-    uint64_t3 uvec3s [4] = { uv3_4, uv3_3, uv3_2, uv3_1 };
-    uint64_t4 uvec4s [4] = { uv4_4, uv4_3, uv4_2, uv4_1 };    
-
-    UOut1[tid.x].x   = uscalars[tid.x];
-    UOut2[tid.x].xy  = uvec2s[tid.x];
-    UOut3[tid.x].xyz = uvec3s[tid.x];
-    UOut4[tid.x]     = uvec4s[tid.x];
-
-    // constant folding case
-    UOut5[0] = WaveActiveMax(uint64_t4(1,2,3,4));
 }
 
+
 //--- pipeline.yaml
 
 ---
@@ -100,93 +48,186 @@ Shaders:
 Buffers:
   - Name: In
     Format: Int64
-    Stride: 32    
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+    Stride: 32
+    # 2 value sets
+    # For each value set, 
+    # and for each specific one of the 4 thread masks in that value set,
+    # and for each of the 4 threads in that thread mask,
+    # there will be a unique set of 4 values, such that 
+    # none of the other threads in that thread mask share any values
+    Data: [
+    1, 2, 3, 4, # <-- Value set 0, thread mask 0, thread id 0 will read these In values
+    5, 6, 7, 8, # <-- Value set 0, thread mask 0, thread id 1 will read these In values
+    9, 10, 11, 12,
+    13, 14, 15, 16,
+    2, 3, 4, 5, # <-- Value set 0, thread mask 1, thread id 0 will read these In values
+    6, 7, 8, 9,
+    10, 11, 12, 13,
+    14, 15, 16, 1,
+    3, 4, 5, 6,
+    7, 8, 9, 10,
+    11, 12, 13, 14,
+    15, 16, 1, 2,
+    4, 5, 6, 7,
+    8, 9, 10, 11,
+    12, 13, 14, 15,
+    16, 1, 2, 3,
+    4, 3, 2, 1, # <-- Value set 1, thread mask 0, thread id 0 will read these In values
+    8, 7, 6, 5, 
+    12, 11, 10, 9, 
+    16, 15, 14, 13, 
+    5, 4, 3, 2, 
+    9, 8, 7, 6, 
+    13, 12, 11, 10, 
+    1, 16, 15, 14, 
+    6, 5, 4, 3, 
+    10, 9, 8, 7, 
+    14, 13, 12, 11, 
+    2, 1, 16, 15, 
+    7, 6, 5, 4, 
+    11, 10, 9, 8, 
+    15, 14, 13, 12, 
+    3, 2, 1, 16 ]
+
   - Name: Out1
     Format: Int64
-    Stride: 32
-    ZeroInitSize: 128  
+    Stride: 8
+    # 1 int is 8 bytes, * 4 ints for 4 threads, * 4 thread masks, * 2 value sets
+    ZeroInitSize: 256
   - Name: Out2
     Format: Int64
-    Stride: 32
-    ZeroInitSize: 128
+    Stride: 16
+    ZeroInitSize: 512
   - Name: Out3
     Format: Int64
     Stride: 32
-    ZeroInitSize: 128
+    ZeroInitSize: 1024
   - Name: Out4
     Format: Int64
     Stride: 32
-    ZeroInitSize: 128
+    ZeroInitSize: 1024
   - Name: Out5
     Format: Int64
     Stride: 32
     ZeroInitSize: 32
+  - Name: Masks
+    Format: Int32
+    Stride: 16
+    # 4 active mask sets (one MaskStruct of 4 x 32-bit int each) for threads 0, 1, 2, 3:
+    # 0 0 0 0
+    # 1 1 1 1
+    # 1 0 0 0
+    # 0 1 1 0
+    Data: [ 
+    0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0]
   - Name: ExpectedOut1
     Format: Int64
     Stride: 32
-    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 1 result value per thread
+    Data:  [ 0, 0, 0, 0, 
+    14, 14, 14, 14,
+    3, 0, 0, 0, 
+    0, 12, 12, 0, 
+    0, 0, 0, 0, 
+    13, 13, 13, 13, 
+    6, 0, 0, 0, 
+    0, 15, 15, 0 ]
   - Name: ExpectedOut2
     Format: Int64
     Stride: 32
-    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 2 result values per thread
+    Data: [ 0, 0, 0, 0, 
+    0, 0, 0, 0, 
+    14, 15, 14, 15, 
+    14, 15, 14, 15, 
+    3, 4, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 12, 13, 
+    12, 13, 0, 0, 
+    0, 0, 0, 0,
+    0, 0, 0, 0, 
+    13, 16, 13, 16,
+    13, 16, 13, 16, 
+    6, 5, 0, 0, 
+    0, 0, 0, 0, 
+    0, 0, 15, 14, 
+    15, 14, 0, 0 ]
   - Name: ExpectedOut3
     Format: Int64
     Stride: 32
-    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
+    # 2 value sets, 4 masks per value set, 4 threads per mask, 4 result values per thread
+    # Note, vecs of 3 must be aligned, so the 3 result values are placed into a 4 element vec
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 0,
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            14, 15, 16, 0, 
+            3, 4, 5, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 0, 
+            12, 13, 14, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 0,
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            13, 16, 15, 0, 
+            6, 5, 4, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 0, 
+            15, 14, 13, 0, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut4
     Format: Int64
     Stride: 32
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
+    Data: [ 0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            14, 15, 16, 13,
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            14, 15, 16, 13, 
+            3, 4, 5, 6, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            12, 13, 14, 15, 
+            12, 13, 14, 15, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14, 
+            13, 16, 15, 14,
+            6, 5, 4, 3,
+            0, 0, 0, 0,
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            0, 0, 0, 0, 
+            15, 14, 13, 12, 
+            15, 14, 13, 12, 
+            0, 0, 0, 0 ]
   - Name: ExpectedOut5
     Format: Int64
-    Stride: 8
+    Stride: 16
     Data: [ 1, 2, 3, 4 ]
-  - Name: UIn
-    Format: UInt64
-    Stride: 32
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
-  - Name: UOut1
-    Format: UInt64
-    Stride: 32
-    ZeroInitSize: 128  
-  - Name: UOut2
-    Format: UInt64
-    Stride: 32
-    ZeroInitSize: 128
-  - Name: UOut3
-    Format: UInt64
-    Stride: 32
-    ZeroInitSize: 128
-  - Name: UOut4
-    Format: UInt64
-    Stride: 32
-    ZeroInitSize: 128
-  - Name: UOut5
-    Format: UInt64
-    Stride: 32
-    ZeroInitSize: 32
-  - Name: UExpectedOut1
-    Format: UInt64
-    Stride: 32
-    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0 ]
-  - Name: UExpectedOut2
-    Format: UInt64
-    Stride: 32
-    Data: [ 1, 10, 0, 0, 2, 20, 0, 0, 3, 30, 0, 0, 4, 40, 0, 0 ]
-  - Name: UExpectedOut3
-    Format: UInt64
-    Stride: 32
-    Data: [ 1, 10, 100, 0, 2, 20, 200, 0, 3, 30, 300, 0, 4, 40, 400, 0 ]
-  - Name: UExpectedOut4
-    Format: UInt64
-    Stride: 32
-    Data: [ 1, 10, 100, 1000, 2, 20, 200, 2000, 3, 30, 300, 3000, 4, 40, 400, 4000 ]
-  - Name: UExpectedOut5
-    Format: UInt64
-    Stride: 8
-    Data: [ 1, 2, 3, 4 ]
-
 Results:
   - Result: ExpectedOut1
     Rule: BufferExact
@@ -208,26 +249,6 @@ Results:
     Rule: BufferExact
     Actual: Out5
     Expected: ExpectedOut5
-  - Result: UExpectedOut1
-    Rule: BufferExact
-    Actual: UOut1
-    Expected: UExpectedOut1
-  - Result: UExpectedOut2
-    Rule: BufferExact
-    Actual: UOut2
-    Expected: UExpectedOut2
-  - Result: UExpectedOut3
-    Rule: BufferExact
-    Actual: UOut3
-    Expected: UExpectedOut3
-  - Result: UExpectedOut4
-    Rule: BufferExact
-    Actual: UOut4
-    Expected: UExpectedOut4
-  - Result: UExpectedOut5
-    Rule: BufferExact
-    Actual: UOut5
-    Expected: UExpectedOut5
 DescriptorSets:
   - Resources:
     - Name: In
@@ -272,52 +293,19 @@ DescriptorSets:
         Space: 0
       VulkanBinding:
         Binding: 5
-    - Name: UIn
+    - Name: Masks
       Kind: StructuredBuffer
       DirectXBinding:
         Register: 6
         Space: 0
       VulkanBinding:
         Binding: 6
-    - Name: UOut1
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 7
-        Space: 0
-      VulkanBinding:
-        Binding: 7
-    - Name: UOut2
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 8
-        Space: 0
-      VulkanBinding:
-        Binding: 8
-    - Name: UOut3
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 9
-        Space: 0
-      VulkanBinding:
-        Binding: 9
-    - Name: UOut4
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 10
-        Space: 0
-      VulkanBinding:
-        Binding: 10
-    - Name: UOut5
-      Kind: RWStructuredBuffer
-      DirectXBinding:
-        Register: 11
-        Space: 0
-      VulkanBinding:
-        Binding: 11
 
 ...
 #--- end
 
+# REQUIRES: Int64
+
 # Bug https://github.com/llvm/llvm-project/issues/156775
 # XFAIL: Clang