vllm-project · WindChimeRan · Apr 20, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/tests/test_paged_deterministic.py b/tests/test_paged_deterministic.py
@@ -121,11 +121,17 @@ def _set_env():
 def vllm_outputs():
     """Run vLLM offline inference once for all prompts.
 
-    Uses max_num_seqs=1 to avoid batch-invariance non-determinism on Metal.
+    Pinned to ``enable_prefix_caching=False`` so the golden token IDs
+    (cache-off reference) remain the invariant under test regardless of
+    upstream default changes.
     """
-    llm = LLM(model=MODEL_NAME, max_model_len=512, max_num_seqs=1)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=512,
+        max_num_seqs=1,
+        enable_prefix_caching=False,
+    )
 
-    # Verify paged KV path is active when requested
     if os.environ.get("VLLM_METAL_USE_PAGED_ATTENTION", "0") == "1":
         runner = llm.llm_engine.model_executor.driver_worker.model_runner
         assert runner._paged_attention_backend is not None, (

diff --git a/tests/test_paged_prefix_caching_e2e.py b/tests/test_paged_prefix_caching_e2e.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+"""End-to-end correctness of paged prefix caching (issue #182).
+
+Fires the deterministic-test prompts twice through ``vllm.LLM`` with
+prefix caching enabled.  The first pass primes the cache; the second
+pass exercises the model_runner's ``start_pos > 0`` path because the
+upstream scheduler reports ``num_computed_tokens > 0``.  The asserted
+token sequence is the existing cache-off golden, so a broken cache-hit
+path surfaces as a token mismatch.
+
+The LLM body runs in a spawned child process (``multiprocessing`` with
+the ``spawn`` start method) so Metal device init happens in a fresh
+interpreter.  This is required on the Metal platform because:
+  - ``fork`` inherits the parent's Metal context and segfaults in the
+    child (Metal is not fork-safe).
+  - Running in the parent pytest process alongside the cache-off
+    baseline fixture in ``test_paged_deterministic`` causes
+    ``kv_budget=0`` — MLX wired buffers aren't released by Python gc.
+"""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+
+import pytest
+
+from tests.test_paged_deterministic import (
+    DEFAULT_PAGED_MEMORY_FRACTION,
+    DEFAULT_USE_PAGED_ATTENTION,
+)
+
+
+def _setenv_default(key: str, default: str) -> None:
+    """Set environment variable ``key`` to ``default`` only if it is unset.
+
+    A value that is already present in ``os.environ`` is left untouched.
+    """
+    os.environ.setdefault(key, default)
+
+
+def _run_prefix_cache_correctness() -> None:
+    """Child-process body of the prefix-caching end-to-end check.
+
+    Applies environment defaults, then (only when paged attention is
+    enabled) generates the prompt set twice with prefix caching turned on
+    and compares the second pass against the golden token IDs.
+
+    ``vllm`` and the golden tables are imported inside this function rather
+    than at module import time, so the engine is only initialised in the
+    process that actually calls it.
+
+    Raises:
+        AssertionError: if, for any prompt, the second-pass token IDs equal
+            neither the MLX golden nor the paged golden sequence.
+    """
+    # Only fill in values the caller has not already set explicitly.
+    for env_key, env_default in (
+        ("VLLM_ENABLE_V1_MULTIPROCESSING", "0"),
+        ("VLLM_METAL_USE_PAGED_ATTENTION", DEFAULT_USE_PAGED_ATTENTION),
+        ("VLLM_METAL_MEMORY_FRACTION", DEFAULT_PAGED_MEMORY_FRACTION),
+    ):
+        _setenv_default(env_key, env_default)
+
+    paged_enabled = os.environ.get("VLLM_METAL_USE_PAGED_ATTENTION", "0") == "1"
+    if not paged_enabled:
+        # Non-paged path: nothing to test, so return before building an engine.
+        return
+
+    from vllm import LLM, SamplingParams
+
+    from tests.test_paged_deterministic import (
+        GOLDEN_MLX,
+        GOLDEN_PAGED,
+        MAX_TOKENS,
+        MODEL_NAME,
+        PROMPTS,
+    )
+
+    engine = LLM(
+        model=MODEL_NAME,
+        max_model_len=512,
+        max_num_seqs=1,
+        enable_prefix_caching=True,
+    )
+    greedy = SamplingParams(temperature=0, max_tokens=MAX_TOKENS)
+
+    # First pass only primes the prefix cache; its outputs are discarded.
+    engine.generate(PROMPTS, greedy)
+    # Second pass is the one under test (cache hits expected).
+    second_pass = engine.generate(PROMPTS, greedy)
+    by_prompt = {result.prompt: result for result in second_pass}
+
+    mismatches = []
+    for prompt in PROMPTS:
+        token_ids = list(by_prompt[prompt].outputs[0].token_ids)
+        mlx_expected = GOLDEN_MLX[prompt]
+        paged_expected = GOLDEN_PAGED[prompt]
+        # Either golden sequence counts as a pass for this prompt.
+        if token_ids == mlx_expected or token_ids == paged_expected:
+            continue
+        mismatches.append(
+            f"  {prompt!r}\n"
+            f"    got:        {token_ids}\n"
+            f"    mlx golden: {mlx_expected}\n"
+            f"    pgd golden: {paged_expected}"
+        )
+
+    if not mismatches:
+        return
+
+    raise AssertionError(
+        "Prefix-cached output matched neither golden set for some prompts:\n"
+        + "\n".join(mismatches)
+    )
+
+
+@pytest.mark.slow
+def test_prefix_cached_matches_golden() -> None:
+    """Run the prefix-cache check in a spawned child and require exit code 0.
+
+    The child is created with the ``spawn`` start method and executes
+    ``_run_prefix_cache_correctness``; any non-zero exit code is reported
+    as a test failure.
+    """
+    spawn_ctx = mp.get_context("spawn")
+    proc = spawn_ctx.Process(target=_run_prefix_cache_correctness)
+    proc.start()
+    proc.join()  # block until the child has finished
+
+    if proc.exitcode == 0:
+        return
+
+    raise AssertionError(
+        f"Prefix-cache e2e test failed in spawned child "
+        f"(exit code: {proc.exitcode})"
+    )
diff --git a/tools/README.md b/tools/README.md
@@ -65,16 +65,18 @@ python -m tools.benchmark.attention_benchmark --mode varlen --q-lens 1,4,16,64 -
 
 ## Prefix Caching Benchmark
 
-Measures TTFT with shared-prefix workloads using `prefix_repetition` dataset.
-Establishes a baseline before prefix caching is implemented (#159).
+Measures TTFT (time to first token), TPOT (time per output token), and E2EL
+(end-to-end latency) on shared-prefix workloads using the upstream
+`prefix_repetition` dataset.  Compare a cache-off baseline against cache-on
+by starting the server with `--no-enable-prefix-caching` or
+`--enable-prefix-caching`, respectively.
 
 **1. Start the server:**
 
 ```bash
 # Adjust MEMORY_FRACTION based on available RAM (lower if OOM).
 VLLM_METAL_USE_PAGED_ATTENTION=1 VLLM_METAL_MEMORY_FRACTION=0.7 \
   vllm serve Qwen/Qwen3-0.6B \
-    --port 8000 --max-model-len 2048 --max-num-seqs 8
+    --port 8000 --max-model-len 2048 --max-num-seqs 8 \
+    --enable-prefix-caching
 ```
 
 **2. Run the benchmark:**
@@ -93,8 +95,8 @@ vllm bench serve \
   --request-rate inf \
   --percentile-metrics ttft,tpot,e2el \
   --metric-percentiles 50,99 \
-  --save-result --label baseline
+  --save-result --label cache-on
 ```
 
-Key metric is **TTFT** — with prefix caching enabled, requests sharing
-the same prefix should show lower TTFT on cache hits.
+For a cache-off baseline, restart the server with
+`--no-enable-prefix-caching` and re-run with `--label baseline`.
diff --git a/vllm_metal/platform.py b/vllm_metal/platform.py
@@ -275,15 +275,6 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                     scheduler_config.max_num_batched_tokens,
                 )
 
-        if config.use_paged_attention and getattr(
-            cache_config, "enable_prefix_caching", False
-        ):
-            # The unified paged path does not yet safely support vLLM core
-            # prefix-cache hits for new requests. Disable the feature at the
-            # platform layer until that path is fully supported.
-            cache_config.enable_prefix_caching = False
-            logger.info("Metal: disabled prefix caching")
-
         # Configure cache — ensure block_size is at least the Metal kernel
         # minimum.  With chunked prefill enabled, upstream may default to
         # block_size=1 for fine-grained scheduling, but our Metal paged