@@ -23,6 +23,7 @@ include "mlir/Interfaces/InferIntRangeInterface.td"
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+ def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;

//===----------------------------------------------------------------------===//
// NVVM dialect definitions
@@ -2592,6 +2593,110 @@ def NVVM_Breakpoint : NVVM_Op<"breakpoint"> {
  let assemblyFormat = "attr-dict";
}

+ //===----------------------------------------------------------------------===//
+ // NVVM TCGEN05 Ops
+ //===----------------------------------------------------------------------===//
+ // Num CTAs in a group participating in the TCGEN05 operation.
+ // This corresponds to the "cta_group::1", "cta_group::2"
+ // modifiers in the PTX instructions.
+ def Tcgen05GroupCTA_1 : I32EnumAttrCase<"CTA_1", 0, "cta_1">;
+ def Tcgen05GroupCTA_2 : I32EnumAttrCase<"CTA_2", 1, "cta_2">;
+
+ def Tcgen05GroupKind : I32EnumAttr<"Tcgen05GroupKind",
+                          "NVVM Tcgen05 group kind",
+                          [Tcgen05GroupCTA_1, Tcgen05GroupCTA_2]> {
+   let genSpecializedAttr = 0;
+   let cppNamespace = "::mlir::NVVM";
+ }
+ def Tcgen05GroupKindAttr :
+   EnumAttr<NVVM_Dialect, Tcgen05GroupKind, "tcgen05_group"> {
+   let assemblyFormat = "`<` $value `>`";
+ }
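Given the "tcgen05_group" mnemonic and the `<` $value `>` assembly format declared above, the attribute can be expected to appear in MLIR assembly roughly as follows (a sketch inferred from the definition, not taken from this patch):

    #nvvm.tcgen05_group<cta_1>
    #nvvm.tcgen05_group<cta_2>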
+
+ def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
+   let summary = "Tcgen05 alloc operation";
+   let description = [{
+     The `tcgen05.alloc` Op allocates tensor core memory for
+     the amount specified by `nCols` and writes the address of
+     the allocated memory to the location pointed to by `addr`.
+     The `nCols` operand specifies the number of columns to be
+     allocated, and it must be a power of two.
+     [For more information, refer to the PTX ISA]
+     (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+   }];
+
+   let arguments = (ins
+     AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
+     I32:$nCols,
+     DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+   let assemblyFormat = "$addr `,` $nCols attr-dict `:` type(operands)";
+
+   let extraClassDeclaration = [{
+     static llvm::Intrinsic::ID
+     getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                           llvm::SmallVector<llvm::Value *> &args);
+   }];
+   string llvmBuilder = [{
+     llvm::SmallVector<llvm::Value *> args;
+     auto id = NVVM::Tcgen05AllocOp::getIntrinsicIDAndArgs(
+         *op, moduleTranslation, args);
+     createIntrinsicCall(builder, id, args);
+   }];
+ }
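A minimal usage sketch for the op defined above, inferred from its assembly format; the SSA names (%addr, %ncols) are hypothetical and not part of this patch:

    // Allocate %ncols columns of Tensor Memory; the address of the allocation
    // is written to the shared-memory location pointed to by %addr.
    nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr<3>, i32

    // Same, but for a 2-CTA group (non-default group attribute printed via attr-dict).
    nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<3>, i32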
+
+ def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
+   let summary = "Tcgen05 dealloc operation";
+   let description = [{
+     The `tcgen05.dealloc` Op de-allocates the tensor core memory
+     specified by `taddr`, which must come from a previous tensor
+     memory allocation. The `nCols` operand specifies the number
+     of columns to be de-allocated, and it must be a power of two.
+     [For more information, refer to the PTX ISA]
+     (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+   }];
+
+   let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols,
+     DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+   let assemblyFormat = "$taddr `,` $nCols attr-dict `:` type(operands)";
+
+   let extraClassDeclaration = [{
+     static llvm::Intrinsic::ID
+     getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+                           llvm::SmallVector<llvm::Value *> &args);
+   }];
+   string llvmBuilder = [{
+     llvm::SmallVector<llvm::Value *> args;
+     auto id = NVVM::Tcgen05DeallocOp::getIntrinsicIDAndArgs(
+         *op, moduleTranslation, args);
+     createIntrinsicCall(builder, id, args);
+   }];
+ }
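A corresponding usage sketch (hypothetical SSA names), where %taddr is a pointer in the Tensor Memory address space (6) obtained from a prior allocation:

    // Free %ncols columns previously obtained from nvvm.tcgen05.alloc.
    nvvm.tcgen05.dealloc %taddr, %ncols : !llvm.ptr<6>, i32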
+
+ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit"> {
+   let summary = "Tcgen05 Op to relinquish the right to allocate";
+   let description = [{
+     The `tcgen05.relinquish_alloc_permit` Op specifies that the CTA
+     of the executing thread is relinquishing the right to allocate
+     Tensor Memory. It is therefore illegal for a CTA to perform `tcgen05.alloc`
+     after any of its constituent threads executes `tcgen05.relinquish_alloc_permit`.
+     [For more information, refer to the PTX ISA]
+     (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+   }];
+
+   let arguments = (ins
+     DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+   let assemblyFormat = "attr-dict";
+
+   string llvmBuilder = [{
+     auto id = ($group == NVVM::Tcgen05GroupKind::CTA_1) ?
+               llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg1 :
+               llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg2;
+     createIntrinsicCall(builder, id);
+   }];
+ }
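Since the op takes no operands and only the optional group attribute, its MLIR form reduces to the mnemonic plus an optional attribute dictionary (sketch inferred from the definition, not from this patch):

    nvvm.tcgen05.relinquish_alloc_permit
    nvvm.tcgen05.relinquish_alloc_permit {group = #nvvm.tcgen05_group<cta_2>}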
+
//===----------------------------------------------------------------------===//
// NVVM target attribute.
//===----------------------------------------------------------------------===//