diff --git a/.gitmodules b/.gitmodules index 9f350cc65..09f14ae0b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,6 +16,3 @@ [submodule "submodules/aiter"] path = submodules/aiter url = https://github.com/ROCm/aiter.git -[submodule "submodules/quack"] - path = submodules/quack - url = https://github.com/Dao-AILab/quack.git diff --git a/submodules/quack b/submodules/quack deleted file mode 160000 index a42fef7e5..000000000 --- a/submodules/quack +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a42fef7e5249513441dd08863cf35f94c115dc68 diff --git a/tools/quack/install.py b/tools/quack/install.py index 3de8495f3..8bfe8f23f 100644 --- a/tools/quack/install.py +++ b/tools/quack/install.py @@ -1,4 +1,5 @@ import os +import shutil import subprocess from pathlib import Path @@ -6,9 +7,26 @@ REPO_PATH = Path(os.path.abspath(__file__)).parent.parent.parent CURRENT_DIR = Path(os.path.abspath(__file__)).parent -QUACK_PATH = REPO_PATH.joinpath("submodules", "quack") + +QUACK_REPO = "https://github.com/Dao-AILab/quack.git" +QUACK_SHA = "bceb632dbac9bb0b55d48a7ed3ad204bd952fcb2" + +QUACK_INSTALL_PATH = REPO_PATH.joinpath(".install") def install_quack(): cmd = ["pip", "install", "-e", "."] subprocess.check_call(cmd, cwd=QUACK_PATH) + + +def install_quack(): + QUACK_INSTALL_PATH.mkdir(parents=True, exist_ok=True) + quack_path = QUACK_INSTALL_PATH.joinpath("quack") + if quack_path.exists(): + shutil.rmtree(quack_path) + git_clone_cmd = ["git", "clone", QUACK_REPO] + subprocess.check_call(git_clone_cmd, cwd=QUACK_INSTALL_PATH) + git_checkout_cmd = ["git", "checkout", QUACK_SHA] + subprocess.check_call(git_checkout_cmd, cwd=quack_path) + install_helion_cmd = ["pip", "install", "-e", ".[dev]"] + subprocess.check_call(install_helion_cmd, cwd=quack_path) diff --git a/tritonbench/operators/launch_latency/operator.py b/tritonbench/operators/launch_latency/operator.py index 4b4243a43..c1df7a871 100644 --- a/tritonbench/operators/launch_latency/operator.py +++ b/tritonbench/operators/launch_latency/operator.py @@ -93,6 +93,24 @@ def nop_cutedsl(self, *args): cute_args = cute_args[:-5] return lambda: kernel(*cute_args) + @register_benchmark(enabled=HAS_CUTEDSL) + def nop_cutedsl_tvm_ffi(self, *args): + if len(args) == 0: + kernel = cute.compile(cutedsl_nop_kernel) + return lambda: kernel() + cute_args = [] + for arg in args: + if isinstance(arg, torch.Tensor): + cute_args.append(cute.runtime.from_dlpack(arg, enable_tvm_ffi=True)) + else: + cute_args.append(arg) + kernel = cute.compile( + cutedsl_nop_with_args_kernel, *cute_args, options="--enable-tvm-ffi" + ) + # remove constexpr args + cute_args = cute_args[:-5] + return lambda: kernel(*cute_args) + @register_benchmark(baseline=True) def nop_python_function(self, *args): def nop(): diff --git a/tritonbench/utils/parser.py b/tritonbench/utils/parser.py index 91086e3f5..025c3ddf4 100644 --- a/tritonbench/utils/parser.py +++ b/tritonbench/utils/parser.py @@ -416,4 +416,7 @@ def get_parser(args=None): ) if args.isolate: parser.error("A/B testing is not compatible with --isolate mode") + + if args.metrics and "walltime_kineto_trace" in args.metrics and args.repcnt is None: + parser.error("Walltime Kineto trace requires --repcnt to be specified") return parser