-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[NVPTX] Add support for local volatile memory operations #150099
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
CC: @gonzalobg
@llvm/pr-subscribers-backend-nvptx — Author: Akshay Deodhar (akshayrdeodhar). Changes: Support for local volatile loads/stores in NVPTX. Patch is 25.86 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/150099.diff — 7 files affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 65e7c56774547..84762eb1bf71f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -645,15 +645,17 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
// behavior due to lack of Independent Forward Progress. Lowering these
// to weak memory operations in sm_60- is therefore fine.
- //
// TODO: lower atomic and volatile operations to memory locations
// in local, const, and param to two PTX instructions in sm_70+:
// - the "weak" memory instruction we are currently lowering to, and
// - some other instruction that preserves the side-effect, e.g.,
// a dead dummy volatile load.
- if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
- CodeAddrSpace == NVPTX::AddressSpace::Const ||
- CodeAddrSpace == NVPTX::AddressSpace::Param) {
+
+ if (CodeAddrSpace == NVPTX::AddressSpace::Const ||
+ CodeAddrSpace == NVPTX::AddressSpace::Param ||
+ (CodeAddrSpace == NVPTX::AddressSpace::Local
+ && (!N->isVolatile() || Ordering != AtomicOrdering::NotAtomic))) {
+ // Allow non-atomic local volatile operations
return NVPTX::Ordering::NotAtomic;
}
@@ -677,12 +679,13 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// from .generic, .global, or .shared. The behavior of PTX volatile and PTX
// atomics is undefined if the generic address does not refer to a .global or
// .shared memory location.
- bool AddrGenericOrGlobalOrShared =
+ bool AddrGenericOrGlobalOrSharedorLocal =
(CodeAddrSpace == NVPTX::AddressSpace::Generic ||
CodeAddrSpace == NVPTX::AddressSpace::Global ||
CodeAddrSpace == NVPTX::AddressSpace::Shared ||
- CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
- if (!AddrGenericOrGlobalOrShared)
+ CodeAddrSpace == NVPTX::AddressSpace::SharedCluster ||
+ CodeAddrSpace == NVPTX::AddressSpace::Local);
+ if (!AddrGenericOrGlobalOrSharedorLocal)
return NVPTX::Ordering::NotAtomic;
bool UseRelaxedMMIO =
diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
index ed8f6b4511079..f53fc3a27de15 100644
--- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
+++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll
@@ -85,7 +85,7 @@ define i32 @test_modify_param(ptr byval([10 x i32]) %a, i32 %b, i32 %c ) {
; CHECK-NEXT: mov.b64 %rd1, test_modify_param_param_0;
; CHECK-NEXT: ld.param.b32 %r1, [test_modify_param_param_1];
; CHECK-NEXT: ld.param.b32 %r2, [test_modify_param_param_2];
-; CHECK-NEXT: st.local.b32 [%rd1+2], %r1;
+; CHECK-NEXT: st.volatile.local.b32 [%rd1+2], %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%p2 = getelementptr i8, ptr %a, i32 2
diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
index bac59be5158ea..58ca8d613b09b 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
@@ -2643,9 +2643,9 @@ define void @local_volatile_i8(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i8_param_0];
-; CHECK-NEXT: ld.local.b8 %rs1, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b8 %rs1, [%rd1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.b8 [%rd1], %rs2;
+; CHECK-NEXT: st.volatile.local.b8 [%rd1], %rs2;
; CHECK-NEXT: ret;
%a.load = load volatile i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -2661,9 +2661,9 @@ define void @local_volatile_i16(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i16_param_0];
-; CHECK-NEXT: ld.local.b16 %rs1, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b16 %rs1, [%rd1];
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT: st.local.b16 [%rd1], %rs2;
+; CHECK-NEXT: st.volatile.local.b16 [%rd1], %rs2;
; CHECK-NEXT: ret;
%a.load = load volatile i16, ptr addrspace(5) %a
%a.add = add i16 %a.load, 1
@@ -2679,9 +2679,9 @@ define void @local_volatile_i32(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i32_param_0];
-; CHECK-NEXT: ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b32 %r1, [%rd1];
; CHECK-NEXT: add.s32 %r2, %r1, 1;
-; CHECK-NEXT: st.local.b32 [%rd1], %r2;
+; CHECK-NEXT: st.volatile.local.b32 [%rd1], %r2;
; CHECK-NEXT: ret;
%a.load = load volatile i32, ptr addrspace(5) %a
%a.add = add i32 %a.load, 1
@@ -2696,9 +2696,9 @@ define void @local_volatile_i64(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i64_param_0];
-; CHECK-NEXT: ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b64 %rd2, [%rd1];
; CHECK-NEXT: add.s64 %rd3, %rd2, 1;
-; CHECK-NEXT: st.local.b64 [%rd1], %rd3;
+; CHECK-NEXT: st.volatile.local.b64 [%rd1], %rd3;
; CHECK-NEXT: ret;
%a.load = load volatile i64, ptr addrspace(5) %a
%a.add = add i64 %a.load, 1
@@ -2714,9 +2714,9 @@ define void @local_volatile_float(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_float_param_0];
-; CHECK-NEXT: ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b32 %r1, [%rd1];
; CHECK-NEXT: add.rn.f32 %r2, %r1, 0f3F800000;
-; CHECK-NEXT: st.local.b32 [%rd1], %r2;
+; CHECK-NEXT: st.volatile.local.b32 [%rd1], %r2;
; CHECK-NEXT: ret;
%a.load = load volatile float, ptr addrspace(5) %a
%a.add = fadd float %a.load, 1.
@@ -2731,9 +2731,9 @@ define void @local_volatile_double(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_double_param_0];
-; CHECK-NEXT: ld.local.b64 %rd2, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b64 %rd2, [%rd1];
; CHECK-NEXT: add.rn.f64 %rd3, %rd2, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.b64 [%rd1], %rd3;
+; CHECK-NEXT: st.volatile.local.b64 [%rd1], %rd3;
; CHECK-NEXT: ret;
%a.load = load volatile double, ptr addrspace(5) %a
%a.add = fadd double %a.load, 1.
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
index ed170e92917f5..91a70e4468154 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
@@ -1550,7 +1550,6 @@ define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(
}
;; local statespace
-
; CHECK-LABEL: local_unordered_cluster
define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK-LABEL: local_unordered_cluster(
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
index 68c53cde7f9ac..4423efcca1ff4 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll
@@ -1280,11 +1280,11 @@ define void @local_volatile_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_32xi8_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_32xi8_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <32 x i8>, ptr addrspace(5) %a
store volatile <32 x i8> %a.load, ptr addrspace(5) %b
@@ -1299,11 +1299,11 @@ define void @local_volatile_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xi16_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xi16_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x i16>, ptr addrspace(5) %a
store volatile <16 x i16> %a.load, ptr addrspace(5) %b
@@ -1318,11 +1318,11 @@ define void @local_volatile_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xhalf_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xhalf_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x half>, ptr addrspace(5) %a
store volatile <16 x half> %a.load, ptr addrspace(5) %b
@@ -1337,11 +1337,11 @@ define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b)
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xbfloat_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xbfloat_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x bfloat>, ptr addrspace(5) %a
store volatile <16 x bfloat> %a.load, ptr addrspace(5) %b
@@ -1356,11 +1356,11 @@ define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1];
-; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
-; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
%a.load = load volatile <8 x i32>, ptr addrspace(5) %a
store volatile <8 x i32> %a.load, ptr addrspace(5) %b
@@ -1374,11 +1374,11 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi64_param_0];
-; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_4xi64_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; CHECK-NEXT: st.volatile.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT: st.volatile.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i64>, ptr addrspace(5) %a
store volatile <4 x i64> %a.load, ptr addrspace(5) %b
@@ -1392,11 +1392,11 @@ define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0];
-; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; CHECK-NEXT: st.volatile.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT: st.volatile.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <8 x float>, ptr addrspace(5) %a
store volatile <8 x float> %a.load, ptr addrspace(5) %b
@@ -1410,11 +1410,11 @@ define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xdouble_param_0];
-; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
+; CHECK-NEXT: ld.volatile.local.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v2.b64 {%rd4, %rd5}, [%rd1+16];
; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_4xdouble_param_1];
-; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5};
-; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3};
+; CHECK-NEXT: st.volatile.local.v2.b64 [%rd6+16], {%rd4, %rd5};
+; CHECK-NEXT: st.volatile.local.v2.b64 [%rd6], {%rd2, %rd3};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x double>, ptr addrspace(5) %a
store volatile <4 x double> %a.load, ptr addrspace(5) %b
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
index 7e013390a39db..37e66894110ca 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -2846,10 +2846,10 @@ define void @local_volatile_2xi8(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xi8_param_0];
-; CHECK-NEXT: ld.local.v2.b8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT: st.local.v2.b8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: st.volatile.local.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT: ret;
%a.load = load volatile <2 x i8>, ptr addrspace(5) %a
%a.add = add <2 x i8> %a.load, <i8 1, i8 1>
@@ -2866,7 +2866,7 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi8_param_0];
-; CHECK-NEXT: ld.local.b32 %r1, [%rd1];
+; CHECK-NEXT: ld.volatile.local.b32 %r1, [%rd1];
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
@@ -2886,7 +2886,7 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT: st.local.b32 [%rd1], %r12;
+; CHECK-NEXT: st.volatile.local.b32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i8>, ptr addrspace(5) %a
%a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
@@ -2903,7 +2903,7 @@ define void @local_volatile_8xi8(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi8_param_0];
-; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
@@ -2942,7 +2942,7 @@ define void @local_volatile_8xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
-; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: st.volatile.local.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT: ret;
%a.load = load volatile <8 x i8>, ptr addrspace(5) %a
%a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -2959,7 +2959,7 @@ define void @local_volatile_16xi8(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xi8_param_0];
-; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
@@ -3036,7 +3036,7 @@ define void @local_volatile_16xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
-; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: st.volatile.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT: ret;
%a.load = load volatile <16 x i8>, ptr addrspace(5) %a
%a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -3052,10 +3052,10 @@ define void @local_volatile_2xi16(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xi16_param_0];
-; CHECK-NEXT: ld.local.v2.b16 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT: st.local.v2.b16 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: st.volatile.local.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT: ret;
%a.load = load volatile <2 x i16>, ptr addrspace(5) %a
%a.add = add <2 x i16> %a.load, <i16 1, i16 1>
@@ -3071,12 +3071,12 @@ define void @local_volatile_4xi16(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi16_param_0];
-; CHECK-NEXT: ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: ld.volatile.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT: st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: st.volatile.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i16>, ptr addrspace(5) %a
%a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
@@ -3093,7 +3093,7 @@ define void @local_volatile_8xi16(ptr addrspace(5) %a) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi16_param_0]...
[truncated]
|
You can test this locally with the following command: git-clang-format --diff HEAD~1 HEAD --extensions cpp -- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp. View the diff from clang-format here: diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 84762eb1b..c083a863b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -653,8 +653,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
if (CodeAddrSpace == NVPTX::AddressSpace::Const ||
CodeAddrSpace == NVPTX::AddressSpace::Param ||
- (CodeAddrSpace == NVPTX::AddressSpace::Local
- && (!N->isVolatile() || Ordering != AtomicOrdering::NotAtomic))) {
+ (CodeAddrSpace == NVPTX::AddressSpace::Local &&
+ (!N->isVolatile() || Ordering != AtomicOrdering::NotAtomic))) {
// Allow non-atomic local volatile operations
return NVPTX::Ordering::NotAtomic;
}
|
It seems that `ld.volatile.local`/`st.volatile.local` are not yet accepted by current ptxas [original comment truncated]. Assuming this is something that will be released in a future version of ptxas, I think we should only emit these instructions once ptxas support is available.
Will upstream once PTX support becomes available. |
Support for local volatile loads/stores in NVPTX.