[MLIR][NVVM] Add tcgen05 alloc/dealloc Ops (#125674)

durga4github · web-flow · commit 4287c72404fb · 2025-02-05T16:16:47.000+05:30
PR #124961 adds intrinsics for the tcgen05 alloc/dealloc PTX instructions. This patch adds NVVM Ops for the same. Tests are added to verify the lowering to the corresponding intrinsics in tcgen05-alloc.mlir file. PTX ISA link: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -39,7 +39,11 @@ enum NVVMMemorySpace {
   /// Shared memory space identifier.
   kSharedMemorySpace = 3,
   /// Constant memory space identifier.
-  kConstantMemorySpace = 4
+  kConstantMemorySpace = 4,
+  /// Tensor memory space identifier.
+  /// Tensor memory is available only in arch-accelerated
+  /// variants from sm100 onwards.
+  kTensorMemorySpace = 6
 };
 
 /// Return the element type and number of elements associated with a wmma matrix
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -23,6 +23,7 @@ include "mlir/Interfaces/InferIntRangeInterface.td"
 def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
 def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
 def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
 
 //===----------------------------------------------------------------------===//
 // NVVM dialect definitions
@@ -2592,6 +2593,110 @@ def NVVM_Breakpoint : NVVM_Op<"breakpoint"> {
   let assemblyFormat = "attr-dict";
 }
 
+//===----------------------------------------------------------------------===//
+// NVVM TCGEN05 Ops
+//===----------------------------------------------------------------------===//
+// Num CTAs in a group participating in the TCGEN05 operation.
+// This corresponds to the "cta_group::1", "cta_group::2"
+// modifiers in the PTX instructions.
+def Tcgen05GroupCTA_1 : I32EnumAttrCase<"CTA_1", 0, "cta_1">;
+def Tcgen05GroupCTA_2 : I32EnumAttrCase<"CTA_2", 1, "cta_2">;
+
+def Tcgen05GroupKind : I32EnumAttr<"Tcgen05GroupKind",
+                            "NVVM Tcgen05 group kind",
+  [Tcgen05GroupCTA_1, Tcgen05GroupCTA_2]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::NVVM";
+}
+def Tcgen05GroupKindAttr :
+  EnumAttr<NVVM_Dialect, Tcgen05GroupKind, "tcgen05_group"> {
+  let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
+  let summary = "Tcgen05 alloc operation";
+  let description = [{
+    The `tcgen05.alloc` Op allocates tensor core memory for
+    the amount specified by `nCols` and writes the destination
+    address to the `addr` argument. The `nCols` operand specifies the
+    number of columns to be allocated and it must be a power-of-two.
+    [For more information, refer to the PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+  }];
+
+  let arguments = (ins
+    AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
+    I32:$nCols,
+    DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+  let assemblyFormat = "$addr `,` $nCols attr-dict `:` type(operands)";
+
+  let extraClassDeclaration = [{
+    static llvm::Intrinsic::ID
+      getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                            llvm::SmallVector<llvm::Value *> &args);
+  }];
+  string llvmBuilder = [{
+    llvm::SmallVector<llvm::Value *> args;
+    auto id = NVVM::Tcgen05AllocOp::getIntrinsicIDAndArgs(
+      *op, moduleTranslation, args);
+    createIntrinsicCall(builder, id, args);
+  }];
+}
+
+def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
+  let summary = "Tcgen05 dealloc operation";
+  let description = [{
+    The `tcgen05.dealloc` Op de-allocates the tensor core memory
+    specified by `tmemAddr`, which must be from a previous tensor
+    memory allocation. The `nCols` operand specifies the number
+    of columns to be de-allocated, and it must be a power-of-two.
+    [For more information, refer to the PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+  }];
+
+  let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols,
+    DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+  let assemblyFormat = "$taddr `,` $nCols attr-dict `:` type(operands)";
+
+  let extraClassDeclaration = [{
+    static llvm::Intrinsic::ID
+      getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                            llvm::SmallVector<llvm::Value *> &args);
+  }];
+  string llvmBuilder = [{
+    llvm::SmallVector<llvm::Value *> args;
+    auto id = NVVM::Tcgen05DeallocOp::getIntrinsicIDAndArgs(
+      *op, moduleTranslation, args);
+    createIntrinsicCall(builder, id, args);
+  }];
+}
+
+def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit"> {
+  let summary = "Tcgen05 Op to relinquish the right to allocate";
+  let description = [{
+    The `tcgen05.relinquish_alloc_permit` Op specifies that the CTA
+    of the executing thread is relinquishing the right to allocate
+    Tensor Memory. So, it is illegal for a CTA to perform `tcgen05.alloc`
+    after any of its constituent threads execute `tcgen05.relinquish_alloc_permit`.
+    [For more information, refer to the PTX ISA]
+    (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+  }];
+
+  let arguments = (ins
+    DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+  let assemblyFormat = "attr-dict";
+
+  string llvmBuilder = [{
+    auto id = ($group == NVVM::Tcgen05GroupKind::CTA_1) ?
+      llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg1 :
+      llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg2;
+    createIntrinsicCall(builder, id);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // NVVM target attribute.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1243,6 +1243,47 @@ llvm::Intrinsic::ID CvtFloatToTF32Op::getIntrinsicID(NVVM::FPRoundingMode rnd,
   }
 }
 
+llvm::Intrinsic::ID
+Tcgen05AllocOp::getIntrinsicIDAndArgs(Operation &op,
+                                      LLVM::ModuleTranslation &mt,
+                                      llvm::SmallVector<llvm::Value *> &args) {
+  auto curOp = cast<NVVM::Tcgen05AllocOp>(op);
+  unsigned AS = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
+                    .getAddressSpace();
+  bool isShared = AS == NVVMMemorySpace::kSharedMemorySpace;
+  bool is2CTAMode = curOp.getGroup() == Tcgen05GroupKind::CTA_2;
+
+  llvm::Intrinsic::ID id;
+  if (isShared) {
+    id = is2CTAMode ? llvm::Intrinsic::nvvm_tcgen05_alloc_shared_cg2
+                    : llvm::Intrinsic::nvvm_tcgen05_alloc_shared_cg1;
+  } else {
+    id = is2CTAMode ? llvm::Intrinsic::nvvm_tcgen05_alloc_cg2
+                    : llvm::Intrinsic::nvvm_tcgen05_alloc_cg1;
+  }
+
+  // Fill the Intrinsic Args
+  args.push_back(mt.lookupValue(curOp.getAddr()));
+  args.push_back(mt.lookupValue(curOp.getNCols()));
+
+  return id;
+}
+
+llvm::Intrinsic::ID Tcgen05DeallocOp::getIntrinsicIDAndArgs(
+    Operation &op, LLVM::ModuleTranslation &mt,
+    llvm::SmallVector<llvm::Value *> &args) {
+  auto curOp = cast<NVVM::Tcgen05DeallocOp>(op);
+  auto id = (curOp.getGroup() == Tcgen05GroupKind::CTA_1)
+                ? llvm::Intrinsic::nvvm_tcgen05_dealloc_cg1
+                : llvm::Intrinsic::nvvm_tcgen05_dealloc_cg2;
+
+  // Fill the Intrinsic Args
+  args.push_back(mt.lookupValue(curOp.getTaddr()));
+  args.push_back(mt.lookupValue(curOp.getNCols()));
+
+  return id;
+}
+
 /// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might
 /// have ConstantRangeAttr.
 static void nvvmInferResultRanges(Operation *op, Value result,
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics %s
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc
+llvm.func @llvm_nvvm_tcgen05_alloc(%addr : !llvm.ptr, %ncols : i32) {
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %{{.*}}, i32 %{{.*}})
+  nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr, i32
+
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %{{.*}}, i32 %{{.*}})
+  nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr, i32
+  llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc_shared
+llvm.func @llvm_nvvm_tcgen05_alloc_shared(%addr : !llvm.ptr<3>, %ncols : i32) {
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+  nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr<3>, i32
+
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+  nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<3>, i32
+  llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_dealloc
+llvm.func @llvm_nvvm_tcgen05_dealloc(%addr : !llvm.ptr<6>, %ncols : i32) {
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %{{.*}}, i32 %{{.*}})
+  nvvm.tcgen05.dealloc %addr, %ncols : !llvm.ptr<6>, i32
+
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %{{.*}}, i32 %{{.*}})
+  nvvm.tcgen05.dealloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<6>, i32
+  llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_relinquish_alloc_permit
+llvm.func @llvm_nvvm_tcgen05_relinquish_alloc_permit() {
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
+  nvvm.tcgen05.relinquish_alloc_permit
+
+  // CHECK-LLVM: call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
+  nvvm.tcgen05.relinquish_alloc_permit {group = #nvvm.tcgen05_group<cta_2>}
+  llvm.return
+}