diff --git a/.gitmodules b/.gitmodules
index 9f350cc65..09f14ae0b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -16,6 +16,3 @@
 [submodule "submodules/aiter"]
 	path = submodules/aiter
 	url = https://github.com/ROCm/aiter.git
-[submodule "submodules/quack"]
-	path = submodules/quack
-	url = https://github.com/Dao-AILab/quack.git
diff --git a/submodules/quack b/submodules/quack
deleted file mode 160000
index a42fef7e5..000000000
--- a/submodules/quack
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a42fef7e5249513441dd08863cf35f94c115dc68
diff --git a/tools/quack/install.py b/tools/quack/install.py
index 3de8495f3..8bfe8f23f 100644
--- a/tools/quack/install.py
+++ b/tools/quack/install.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import subprocess
 
 from pathlib import Path
@@ -6,9 +7,31 @@
 
 REPO_PATH = Path(os.path.abspath(__file__)).parent.parent.parent
 CURRENT_DIR = Path(os.path.abspath(__file__)).parent
-QUACK_PATH = REPO_PATH.joinpath("submodules", "quack")
+
+QUACK_REPO = "https://github.com/Dao-AILab/quack.git"
+QUACK_SHA = "bceb632dbac9bb0b55d48a7ed3ad204bd952fcb2"
+
+QUACK_INSTALL_PATH = REPO_PATH.joinpath(".install")
 
 
 def install_quack():
-    cmd = ["pip", "install", "-e", "."]
-    subprocess.check_call(cmd, cwd=QUACK_PATH)
+    """Clone quack at the pinned commit and install it in editable mode.
+
+    The checkout lives in ``QUACK_INSTALL_PATH / "quack"``; any existing
+    directory there is deleted first so every run starts from a fresh clone.
+
+    Raises:
+        subprocess.CalledProcessError: if git or pip exits with a non-zero status.
+    """
+    QUACK_INSTALL_PATH.mkdir(parents=True, exist_ok=True)
+    quack_path = QUACK_INSTALL_PATH.joinpath("quack")
+    if quack_path.exists():
+        shutil.rmtree(quack_path)
+    # Name the destination explicitly instead of relying on git deriving
+    # "quack" from the repository URL.
+    git_clone_cmd = ["git", "clone", QUACK_REPO, str(quack_path)]
+    subprocess.check_call(git_clone_cmd, cwd=QUACK_INSTALL_PATH)
+    git_checkout_cmd = ["git", "checkout", QUACK_SHA]
+    subprocess.check_call(git_checkout_cmd, cwd=quack_path)
+    install_quack_cmd = ["pip", "install", "-e", ".[dev]"]
+    subprocess.check_call(install_quack_cmd, cwd=quack_path)
diff --git a/tritonbench/operators/launch_latency/operator.py b/tritonbench/operators/launch_latency/operator.py
index 4b4243a43..c1df7a871 100644
--- a/tritonbench/operators/launch_latency/operator.py
+++ b/tritonbench/operators/launch_latency/operator.py
@@ -93,6 +93,32 @@ def nop_cutedsl(self, *args):
         cute_args = cute_args[:-5]
         return lambda: kernel(*cute_args)
 
+    @register_benchmark(enabled=HAS_CUTEDSL)
+    def nop_cutedsl_tvm_ffi(self, *args):
+        """Benchmark launching the CuTe DSL no-op kernel with TVM FFI enabled.
+
+        Tensor arguments are wrapped via ``from_dlpack(..., enable_tvm_ffi=True)``
+        and the kernel is compiled with ``--enable-tvm-ffi``; returns a
+        zero-argument callable that launches the compiled kernel.
+        """
+        if not args:
+            # Pass the TVM FFI option here too; otherwise the zero-argument
+            # case is compiled without the option this benchmark exercises.
+            kernel = cute.compile(cutedsl_nop_kernel, options="--enable-tvm-ffi")
+            return lambda: kernel()
+        cute_args = [
+            cute.runtime.from_dlpack(arg, enable_tvm_ffi=True)
+            if isinstance(arg, torch.Tensor)
+            else arg
+            for arg in args
+        ]
+        kernel = cute.compile(
+            cutedsl_nop_with_args_kernel, *cute_args, options="--enable-tvm-ffi"
+        )
+        # remove constexpr args (the last five) before launching
+        cute_args = cute_args[:-5]
+        return lambda: kernel(*cute_args)
+
     @register_benchmark(baseline=True)
     def nop_python_function(self, *args):
         def nop():
diff --git a/tritonbench/utils/parser.py b/tritonbench/utils/parser.py
index 91086e3f5..025c3ddf4 100644
--- a/tritonbench/utils/parser.py
+++ b/tritonbench/utils/parser.py
@@ -416,4 +416,7 @@ def get_parser(args=None):
             )
         if args.isolate:
             parser.error("A/B testing is not compatible with --isolate mode")
+
+    if args.metrics and "walltime_kineto_trace" in args.metrics and args.repcnt is None:
+        parser.error("Walltime Kineto trace requires --repcnt to be specified")
     return parser