
Commit b288ff4

[vllm] Bug fix for V1 Completion (#735)
1 parent dbe7b4e commit b288ff4

File tree

10 files changed: +217 −3 lines


.gitignore

Lines changed: 3 additions & 0 deletions

@@ -84,6 +84,9 @@ ipython_config.py
 *.distcp
 .metadata

+# Track golden test outputs (these are checked in for correctness tests)
+!tests/integration_tests/fixtures/golden_outputs/*.pt
+
 # mypy
 .mypy_cache/
 .dmypy.json
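
The `!` pattern re-includes the golden `.pt` fixtures that a broader ignore rule elsewhere in the file (not visible in this hunk, but implied by the comment) would otherwise exclude; without it, `git add` would silently skip the baseline files the new correctness test depends on. If in doubt about which rule wins for a given path, running `git check-ignore -v tests/integration_tests/fixtures/golden_outputs/completion_0.pt` from the repository root reports the matching pattern.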

src/forge/actors/vllm/v1/generator.py

Lines changed: 24 additions & 2 deletions

@@ -527,6 +527,24 @@ async def validate_model_params(self, validate_fn):
         logger.info("start validating model parameters.")
         return await self.workers.validate_model_params.call(validate_fn)

+    def _extract_logprobs(self, output) -> torch.Tensor | None:
+        """Extract logprobs from vLLM output as a torch.Tensor.
+
+        Args:
+            output: vLLM CompletionOutput with optional logprobs.
+
+        Returns:
+            torch.Tensor of logprobs for each token, or None if not available.
+        """
+        if output.logprobs is not None:
+            return torch.tensor(
+                [
+                    top_k_dict[token].logprob
+                    for token, top_k_dict in zip(output.token_ids, output.logprobs)
+                ]
+            )
+        return None
+
     def _to_completions(
         self, request_output: RequestOutput, prompt: str
     ) -> list[Completion]:
@@ -553,15 +571,19 @@ def _to_completions(
                 token_ids=torch.tensor(
                     output.token_ids if hasattr(output, "token_ids") else []
                 ),
-                logprobs=(output.logprobs if hasattr(output, "logprobs") else None),
+                logprobs=self._extract_logprobs(output),
                 stop_reason=output.finish_reason,
                 generator_version=self.generator_version,
-                metadata=None,
+                metadata={"num_cached_tokens": request_output.num_cached_tokens},
             )
             completions.append(completion)

         return completions

+    @endpoint
+    async def _reset_prefix_cache(self):
+        await self.llm.reset_prefix_cache()
+

 class _WeightFetcher(ForgeActor):
     """Fetches weights from torchstore and loads them into shared memory.
tests/integration_tests/fixtures/golden_outputs/ (6 new binary .pt fixtures: the golden Completion outputs plus metadata)

3.2 KB   Binary file not shown.
3.14 KB  Binary file not shown.
3.2 KB   Binary file not shown.
3.26 KB  Binary file not shown.
2.58 KB  Binary file not shown.
1.42 KB  Binary file not shown.

tests/integration_tests/generate_golden_outputs.py

Lines changed: 113 additions & 0 deletions

@@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Generate baseline golden output files using the current Generator implementation.
+
+These golden files serve as a baseline for verifying that new implementations
+produce identical outputs. Uses deterministic sampling (temperature=0) for
+reproducibility.
+
+NOTE: Golden output artifacts are checked into git. Keep the number of prompts
+and MAX_TOKENS small to avoid bloating the repository. Current artifacts are
+~20KB total.
+
+Usage:
+    python tests/integration_tests/generate_golden_outputs.py
+
+The script will generate golden files in tests/integration_tests/fixtures/golden_outputs/
+"""
+
+import asyncio
+from pathlib import Path
+
+import torch
+from forge.actors.generator import Generator
+
+
+# Configuration - matches test_vllm_policy_correctness.py
+MODEL_NAME = "facebook/opt-125m"
+MAX_MODEL_LEN = 512
+GPU_MEMORY_UTILIZATION = 0.1
+ENFORCE_EAGER = True
+ENABLE_PREFIX_CACHING = True
+TENSOR_PARALLEL_SIZE = 1
+
+# Deterministic sampling
+MAX_TOKENS = 50
+TEMPERATURE = 0.0
+TOP_P = 1.0
+N_SAMPLES = 1
+
+TEST_PROMPTS = [
+    "Hello, how are you?",
+    "What is 2+2?",
+    "Tell me a joke.",
+    "Explain machine learning briefly.",
+    "What color is the sky?",
+]
+
+
+async def generate_golden_outputs():
+    """Generate golden outputs using the current Generator."""
+    golden_dir = Path(__file__).parent / "fixtures" / "golden_outputs"
+    golden_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Generating golden outputs to: {golden_dir}")
+    print(f"Model: {MODEL_NAME}")
+
+    generator = None
+    try:
+        generator = await Generator.options(
+            procs=1, num_replicas=1, with_gpus=True
+        ).as_service(
+            engine_args={
+                "model": MODEL_NAME,
+                "tensor_parallel_size": TENSOR_PARALLEL_SIZE,
+                "enforce_eager": ENFORCE_EAGER,
+                "max_model_len": MAX_MODEL_LEN,
+                "gpu_memory_utilization": GPU_MEMORY_UTILIZATION,
+                "enable_prefix_caching": ENABLE_PREFIX_CACHING,
+            },
+            sampling_params={
+                "n": N_SAMPLES,
+                "max_tokens": MAX_TOKENS,
+                "temperature": TEMPERATURE,
+                "top_p": TOP_P,
+                "logprobs": 1,
+            },
+        )
+
+        print("Generator ready. Generating outputs...\n")
+
+        for i, prompt in enumerate(TEST_PROMPTS):
+            print(f"[{i + 1}/{len(TEST_PROMPTS)}] Prompt: {prompt[:50]}...")
+
+            result = await generator.generate.route(prompt)
+            completion = result[0]
+
+            # Serialize entire Completion object
+            golden_path = golden_dir / f"completion_{i}.pt"
+            torch.save(completion, golden_path)
+            print(f"  Saved: {golden_path}")
+
+        metadata = {
+            "model": MODEL_NAME,
+            "max_tokens": MAX_TOKENS,
+            "temperature": TEMPERATURE,
+            "prompts": TEST_PROMPTS,
+        }
+        torch.save(metadata, golden_dir / "metadata.pt")
+
+        print("\nGolden output generation complete!")
+
+    finally:
+        if generator is not None:
+            await generator.shutdown()
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_golden_outputs())
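
Since the golden artifacts are checked in, they can be inspected directly without running the generator. A rough sketch of loading them (paths relative to the repository root; the Completion field names are taken from the generator diff above):

from pathlib import Path

import torch

golden_dir = Path("tests/integration_tests/fixtures/golden_outputs")

# metadata.pt records the model, sampling settings, and prompts used for the baseline.
metadata = torch.load(golden_dir / "metadata.pt", weights_only=False)
print(metadata["model"], metadata["max_tokens"], metadata["temperature"])
print(metadata["prompts"])

# Each completion_<i>.pt is a pickled Completion dataclass (tensors included),
# hence weights_only=False; the files come from this repository, not an untrusted source.
golden = torch.load(golden_dir / "completion_0.pt", weights_only=False)
print(golden.token_ids)   # generated token ids as a tensor
print(golden.logprobs)    # per-token logprobs tensor (logprobs=1 was requested)
print(golden.stop_reason, golden.generator_version, golden.metadata)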

tests/integration_tests/test_vllm_policy_correctness.py

Lines changed: 77 additions & 1 deletion

@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 import pytest
+import torch
 from forge.actors.generator import Generator
 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -29,7 +30,7 @@

 @pytest.mark.asyncio
 async def test_same_output():
-    """Compare outputs between vLLM and Generator service"""
+    """Compare outputs between vLLM and Generator service."""
     test_prompts = [
         "Hello, how are you?",
         "What is 2+2?",
@@ -236,3 +237,78 @@ async def test_cache_usage():
     finally:
         if generator is not None:
             await generator.shutdown()
+
+
+@pytest.mark.asyncio
+async def test_generator_matches_golden():
+    """Verify Generator produces identical outputs to baseline golden files.
+
+    Golden files are already committed. Only regenerate when updating baseline:
+        python tests/integration_tests/generate_golden_outputs.py
+    """
+    from dataclasses import fields
+    from pathlib import Path
+
+    def completions_equal(a, b) -> bool:
+        """Compare two Completion objects, handling tensors correctly."""
+        for field in fields(a):
+            val_a = getattr(a, field.name)
+            val_b = getattr(b, field.name)
+            if isinstance(val_a, torch.Tensor) and isinstance(val_b, torch.Tensor):
+                if not torch.equal(val_a, val_b):
+                    return False
+            elif val_a != val_b:
+                return False
+        return True
+
+    golden_dir = Path(__file__).parent / "fixtures" / "golden_outputs"
+    metadata_path = golden_dir / "metadata.pt"
+
+    if not metadata_path.exists():
+        pytest.skip(
+            "Golden files not found. Generate baseline first: "
+            "python tests/integration_tests/generate_golden_outputs.py"
+        )
+
+    metadata = torch.load(metadata_path, weights_only=False)
+    test_prompts = metadata["prompts"]
+
+    generator = None
+    try:
+        generator = await Generator.options(
+            procs=1, num_replicas=1, with_gpus=True
+        ).as_service(
+            engine_args={
+                "model": MODEL_NAME,
+                "tensor_parallel_size": TENSOR_PARALLEL_SIZE,
+                "enforce_eager": ENFORCE_EAGER,
+                "max_model_len": MAX_MODEL_LEN,
+                "gpu_memory_utilization": GPU_MEMORY_UTILIZATION,
+                "enable_prefix_caching": ENABLE_PREFIX_CACHING,
+            },
+            sampling_params={
+                "n": N_SAMPLES,
+                "max_tokens": MAX_TOKENS,
+                "temperature": TEMPERATURE,
+                "top_p": TOP_P,
+                "logprobs": 1,
+            },
+        )
+
+        for i, prompt in enumerate(test_prompts):
+            golden_path = golden_dir / f"completion_{i}.pt"
+            assert golden_path.exists(), f"Golden file not found: {golden_path}"
+
+            golden = torch.load(golden_path, weights_only=False)
+            result = await generator.generate.route(prompt)
+            completion = result[0]
+
+            assert completions_equal(
+                completion, golden
+            ), f"Prompt {i}: completion mismatch"
+
+            print(f"Prompt {i}: PASS")
+
+    finally:
+        if generator is not None:
+            await generator.shutdown()
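
The `completions_equal` helper is needed because `Completion` carries tensor-valued fields, and dataclass equality compares fields with `==`, which for a multi-element tensor yields a boolean tensor that fails when coerced to a single bool. A tiny illustration of the failure mode (the `Fake` dataclass is purely illustrative):

from dataclasses import dataclass

import torch


@dataclass
class Fake:
    token_ids: torch.Tensor


a = Fake(torch.tensor([1, 2, 3]))
b = Fake(torch.tensor([1, 2, 3]))

try:
    a == b  # dataclass __eq__ compares the tensors with ==, then coerces to bool
except RuntimeError as err:
    # PyTorch refuses to convert a multi-element boolean tensor to a single bool.
    print("plain equality fails:", err)

# Tensor-aware comparison, as completions_equal does field by field:
print(torch.equal(a.token_ids, b.token_ids))  # True

To run just this test locally (a GPU is required given `with_gpus=True`), something like `pytest tests/integration_tests/test_vllm_policy_correctness.py::test_generator_matches_golden -s` should work; `-s` surfaces the per-prompt PASS prints.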

0 commit comments