1 file changed: tritonbench/operators/decoding_attention (+31 -0)
@@ -72,6 +72,17 @@
     HAS_AITER = False


+# [Optional] flash_fwd cute-DSL backend
+HAS_FLASH_CUTE = True
+try:
+    from flash_attn.cute.interface import (
+        flash_attn_func as flash_attn_cute_func,
+    )
+except (ImportError, IOError, AttributeError):
+    HAS_FLASH_CUTE = False
+    flash_attn_cute_func = None  # Define it as None to avoid NameError
+
+
 def parse_op_args(args: List[str]):
     parser = argparse.ArgumentParser()
     parser.add_argument("--batch", type=int, help="Batch size")
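The guard follows the same optional-dependency pattern used for the AITER backend earlier in this file: probe the import once at module load, record availability in a HAS_* flag, and keep the symbol bound (to None) so later references cannot raise NameError. Below is a minimal standalone sketch of that pattern; it assumes nothing about tritonbench internals, and the _probe_backend / _import_flash_cute helpers are hypothetical, not part of this diff.

# Sketch of the optional-backend probe; helper names here are hypothetical.
from typing import Callable, Optional, Tuple


def _probe_backend(import_fn: Callable[[], Callable]) -> Tuple[bool, Optional[Callable]]:
    """Try to resolve a backend entry point; return (available, fn-or-None)."""
    try:
        return True, import_fn()
    except (ImportError, IOError, AttributeError):
        return False, None


def _import_flash_cute() -> Callable:
    # Same import path as in the hunk above.
    from flash_attn.cute.interface import flash_attn_func
    return flash_attn_func


HAS_FLASH_CUTE, flash_attn_cute_func = _probe_backend(_import_flash_cute)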
@@ -559,6 +570,26 @@ def fbgemm_gqa_fp8kv(
             cache_logical_dtype_int=1,  # FP8 = 1
         )

+
+    @register_benchmark(enabled=HAS_FLASH_CUTE)
+    def flash_cute_dsl(
+        self,
+        q: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+    ) -> Callable:
+        """Flash Attention implementation using cute-DSL backend."""
+        # For GQA, cute-DSL handles the head expansion internally;
+        # we pass the original KV tensors without manual expansion.
+        q_heads = q.shape[2]
+        kv_heads = k_cache.shape[2]
+        return lambda: flash_attn_cute_func(
+            q, k_cache, v_cache,
+            causal=CAUSAL,
+            pack_gqa=(q_heads != kv_heads),
+        )
+
     @register_benchmark(enabled=HAS_AITER)
     def aiter_paged_fp8kv(
         self,
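A usage note on pack_gqa: for grouped-query attention the query tensor carries more heads than the KV cache, and the flag asks the backend to handle that expansion internally instead of the caller repeating KV heads. The sketch below mirrors the call in the hunk above under assumed decode shapes; the batch/sequence sizes, dtype, and device are illustrative, causal=True stands in for the module-level CAUSAL constant, and HAS_FLASH_CUTE / flash_attn_cute_func refer to the import guard added earlier in this diff.

import torch

# Illustrative GQA decode shapes (layout: batch, seqlen, heads, head_dim — heads on
# dim 2, matching how the benchmark reads q.shape[2] and k_cache.shape[2]).
batch, q_len, kv_len, head_dim = 4, 1, 2048, 128
q_heads, kv_heads = 32, 8  # more query heads than KV heads => grouped-query attention

q = torch.randn(batch, q_len, q_heads, head_dim, device="cuda", dtype=torch.bfloat16)
k_cache = torch.randn(batch, kv_len, kv_heads, head_dim, device="cuda", dtype=torch.bfloat16)
v_cache = torch.randn_like(k_cache)

# pack_gqa is only needed when the query/KV head counts differ.
pack_gqa = q.shape[2] != k_cache.shape[2]

if HAS_FLASH_CUTE:
    result = flash_attn_cute_func(q, k_cache, v_cache, causal=True, pack_gqa=pack_gqa)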