# tst.py
from cutlass_library.manifest import Manifest
from cutlass_library.generator import GeneratorTarget
from cutlass_library.generator import (
    GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled,
)
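
# Standalone driver for the CUTLASS kernel generator: build a Manifest,
# register a single custom SM100 block-scaled e4m3 GEMM, and emit the
# generated C++ instantiations under the current build directory.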


# Create Options object for manifest configuration
class Options:
    def __init__(self):
        self.kernels = "all"  # Generate all kernels
        self.curr_build_dir = "."
        self.operations = "all"
        self.ignore_kernels = ""
        self.exclude_kernels = ""
        self.kernel_filter_file = None
        self.architectures = "100"  # For SM100
        self.filter_by_cc = True
        self.disable_full_archs_compilation = False
        self.instantiation_level = "max"
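
# These attribute names mirror generator.py's command-line options
# (--kernels, --architectures, --instantiation-level, ...); Manifest reads
# them directly off this namespace. Depending on the cutlass_library
# version, additional attributes may be required.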


def GenerateCustomE4M3Gemm(manifest, cuda_version):
    from cutlass_library import (
        LayoutType,
        MathOperation,
        DataType,
        MathInstruction,
        OpcodeClass,
        TileDescription,
        DynamicClusterShape,
        EpilogueScheduleType,
        TileSchedulerType,
        KernelScheduleType,
    )
    from cutlass_library.generator import (
        CreateGemmUniversal3xOperator,
        CudaToolkitVersionSatisfies,
    )

    # Block-scaled SM100 kernels require CUDA 12.8 or newer.
    if not CudaToolkitVersionSatisfies(cuda_version, 12, 8):
        return

    layouts = [
        [
            [LayoutType.RowMajor, 128],
            [LayoutType.ColumnMajor, 128],
            [LayoutType.RowMajor, 0],
        ]
    ]
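
    # Each inner entry above pairs an operand's layout with its alignment
    # (in elements), ordered [A, B, C]; the 0 for C is taken here to mean
    # "let the generator pick a default" (assumption, based on how the
    # stock generators populate these lists).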

    instruction_sizes = [[128, 128, 32]]  # Single instruction size for simplicity
    min_cc = 100
    max_cc = 100

    math_instructions = []
    for instr_size in instruction_sizes:
        math_instructions.append(
            MathInstruction(
                instr_size,
                DataType.e4m3,  # A type
                DataType.e4m3,  # B type
                DataType.f32,  # Accumulator type
                OpcodeClass.BlockScaledTensorOp,
                MathOperation.multiply_add,
                DataType.ue8m0,  # Per-block scale-factor type
            )
        )
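
    # The math instruction above combines e4m3 inputs with ue8m0 (unsigned,
    # power-of-two) per-block scale factors and f32 accumulation: the
    # MX-style block-scaled FP8 recipe that OpcodeClass.BlockScaledTensorOp
    # targets on SM100.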

    cluster_shapes = [[1, 1, 1]]  # Single cluster shape

    for math_inst in math_instructions:
        tile_descriptions = []
        for cluster_shape in cluster_shapes:
            # DynamicClusterShape is a sentinel; static shapes scale the CTA tile.
            multiplier = (
                (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape
            )
            tile_descriptions.append(
                TileDescription(
                    [
                        math_inst.instruction_shape[0] * multiplier[0],
                        math_inst.instruction_shape[1] * multiplier[1],
                        math_inst.instruction_shape[2] * 4 * multiplier[2],
                    ],
                    0,
                    [4, 1, 1],
                    math_inst,
                    min_cc,
                    max_cc,
                    cluster_shape,
                )
            )
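
        # The TileDescription above packs, following the stock SM100
        # generators: CTA tile shape, stage count (0 is taken here to mean
        # "let the collective builder choose"), warp count, math instruction,
        # min/max compute capability, and cluster shape. The K tile is 4x the
        # instruction K (32 * 4 = 128).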

        data_type = {
            "a_type": math_inst.element_a,
            "b_type": math_inst.element_b,
            "c_type": DataType.void,  # No bias
            "d_type": DataType.bf16,  # Output in bf16
            "acc_type": DataType.f32,  # Accumulate in f32
            "epi_type": DataType.f32,
            "sf_type": math_inst.element_scale_factor,
            "sfd_type": {"type": DataType.void, "vector_size": None, "layout": None},
        }

        CreateGemmUniversal3xOperator(
            manifest,
            layouts,
            tile_descriptions,
            data_type,
            [
                [
                    KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100,
                    EpilogueScheduleType.TmaWarpSpecialized1Sm,
                ]
            ],
            tile_schedulers=[TileSchedulerType.Default],
        )
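

# Optional sanity check: list the operations the generator registered.
# A minimal sketch; `operations_by_name` is assumed from cutlass_library's
# Manifest internals and may differ across versions, hence the getattr
# fallback. Call as `_report_generated_kernels(manifest)` before
# manifest.emit() if desired.
def _report_generated_kernels(manifest):
    ops = getattr(manifest, "operations_by_name", {})  # assumed attribute
    print(f"Registered {len(ops)} kernel(s):")
    for name in sorted(ops):
        print(f"  {name}")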


if __name__ == "__main__":
    args = Options()
    manifest = Manifest(args)
    GenerateCustomE4M3Gemm(manifest, cuda_version="12.8")
    manifest.emit()  # emit() defaults to GeneratorTarget.Library

    # Alternative: generate the full stock SM100 mixed-8-bit block-scaled
    # family instead of the single custom kernel above.
    # args = Options()
    # manifest = Manifest(args)
    # cuda_version = "12.8"
    # GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cuda_version)
    # manifest.emit(GeneratorTarget.Library)