10 changes: 10 additions & 0 deletions docs/source/environments.md
@@ -99,6 +99,16 @@ Use the CLI to quickly test:
vf-eval my-math-env -m gpt-4.1-mini -n 5 # runs a small batch of rollouts; use -h to see options
```

A common workflow is to run an evaluation once, save the results (`-s`), and then iterate on your rubric. You can re-score the saved rollouts using the `--rerun-scoring-from` flag, passing the unique ID of the run.

```bash
# First, run and save the results
vf-eval my-math-env -m gpt-4.1-mini -n 5 -s

# Then, after modifying your rubric, re-score the same rollouts
vf-eval my-math-env -m gpt-4.1-mini --rerun-scoring-from <RUN_ID_FROM_PREVIOUS_STEP>
```
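
Runs saved with `-s` land in an `outputs/evals/<env>--<model>/<run-id>/` directory containing `results.jsonl` (one JSON object per rollout) and `metadata.json`; this layout is assumed from `verifiers/scripts/eval.py`. A quick way to peek at what was cached before editing your rubric:

```python
import json
from pathlib import Path

# Hypothetical path; substitute your environment, model, and run ID.
run_dir = Path("environments/my_math_env/outputs/evals/my-math-env--gpt-4.1-mini/<RUN_ID>")

print(json.loads((run_dir / "metadata.json").read_text()))
with open(run_dir / "results.jsonl") as f:
    for line in f:
        print(sorted(json.loads(line).keys()))  # fields saved for each rollout
```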

Or test programmatically:

```python
7 changes: 7 additions & 0 deletions docs/source/overview.md
@@ -228,6 +228,13 @@ vf-install my-environment-module # from ./environments/my_environment_module
vf-eval my-environment-module -m gpt-5 -n 10 -r 5 -s
```

You can re-run scoring on a previously saved evaluation (one saved with the `-s` flag), which is useful for debugging a rubric without repeating expensive model inference.

```bash
# Re-score a previous run using its unique ID.
vf-eval my-environment-module -m gpt-5 --rerun-scoring-from <RUN_ID> -s
```
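
Re-scoring loads the saved rollouts from `results.jsonl` and passes them back through the environment's rubric; the rollouts themselves are not regenerated. A rough programmatic sketch of the same idea (`load_environment` is assumed here; the `score_rollouts` keywords mirror `verifiers/scripts/eval.py`, and `loaded` stands in for a `GenerateOutputs` rebuilt from a saved `results.jsonl`):

```python
import asyncio

import verifiers as vf

vf_env = vf.load_environment("my-environment-module")

async def rescore(loaded: vf.GenerateOutputs):
    # Only the rubric runs here; prompts and completions come from the saved run.
    return await vf_env.rubric.score_rollouts(
        prompts=loaded.prompt,
        completions=loaded.completion,
        answers=loaded.answer,
        states=loaded.state,
        tasks=loaded.task,
        infos=loaded.info,
        max_concurrent=32,
    )

# scores = asyncio.run(rescore(loaded))
# loaded.reward, loaded.metrics = scores.reward, scores.metrics
```

The eval script also carries any cached `judge_response` back into each rollout's state, so judge-based rubrics can presumably reuse it rather than re-querying the judge.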

We also provide a TUI for browsing locally-cached (with `-s`) eval results:
```bash
vf-tui
2 changes: 2 additions & 0 deletions tests/test_eval_cli.py
@@ -70,6 +70,7 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
rerun_scoring_from=None,
)

sa = captured["sampling_args"]
@@ -118,6 +119,7 @@ def __init__(self, api_key=None, base_url=None):
save_dataset=False,
save_to_hf_hub=False,
hf_hub_dataset_name="",
rerun_scoring_from=None,
)

sa = captured["sampling_args"]
161 changes: 145 additions & 16 deletions verifiers/scripts/eval.py
@@ -1,4 +1,5 @@
import argparse
import asyncio
import importlib
import importlib.util
import json
@@ -22,6 +23,80 @@
logger = logging.getLogger("verifiers.scripts.eval")


def _find_run_path(env_id: str, model: str, run_id: str, env_dir_path: str) -> Path:
"""Locate the path to a specific evaluation run directory."""
env_model_str = f"{env_id}--{model.replace('/', '--')}"
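    # Saved runs live under .../outputs/evals/<env_id>--<model>/<run_id>/ (slashes in
    # the model name become "--"); check the environment module dir, then ./outputs.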

# Check local environment directory first
module_name = env_id.replace("-", "_")
local_env_dir = Path(env_dir_path) / module_name

potential_paths = [
local_env_dir / "outputs" / "evals" / env_model_str / run_id,
Path("./outputs") / "evals" / env_model_str / run_id,
]

for path in potential_paths:
if path.is_dir() and (path / "results.jsonl").exists():
logger.debug(f"Found run directory at: {path}")
return path

raise FileNotFoundError(
f"Could not find run directory for env='{env_id}', model='{model}', run_id='{run_id}'"
)


def _load_results_for_rescoring(results_file: Path) -> vf.GenerateOutputs:
"""Parse a results.jsonl file and load it into a GenerateOutputs object."""
prompts, completions, answers, tasks, infos, states = [], [], [], [], [], []

with open(results_file, "r") as f:
for line in f:
data = json.loads(line)
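            # Fields used below: prompt, completion, answer, task, info,
            # generation_ms, and (optionally) a cached judge_response.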

# --- Robustly deserialize prompt and completion ---
for key, target_list in [("prompt", prompts), ("completion", completions)]:
field_data = data.get(key)
if isinstance(field_data, str):
try:
# Attempt to parse as JSON (for chat format)
deserialized = json.loads(field_data)
target_list.append(deserialized)
except json.JSONDecodeError:
# If it fails, it's a plain string (for completion format)
target_list.append(field_data)
else:
# It's already in the correct list/dict format (or None)
target_list.append(field_data or [])

answers.append(data.get("answer", ""))
tasks.append(data.get("task", "default"))
infos.append(data.get("info", {}))

# Reconstruct state, including cached judge responses if present
state = {
"timing": {
"generation_ms": data.get("generation_ms", 0),
"scoring_ms": 0, # Reset scoring time
"total_ms": data.get("generation_ms", 0),
}
}
if "judge_response" in data:
state["judge_response"] = data["judge_response"]
states.append(state)

return vf.GenerateOutputs(
prompt=prompts,
completion=completions,
answer=answers,
state=states,
info=infos,
task=tasks,
reward=[], # To be populated by re-scoring
metrics={}, # To be populated by re-scoring
)


def eval_environment(
env: str,
env_args: dict,
@@ -40,6 +115,7 @@ def eval_environment(
save_dataset: bool,
save_to_hf_hub: bool,
hf_hub_dataset_name: str,
rerun_scoring_from: str | None,
extra_headers: Dict[str, str],
):
setup_logging("DEBUG" if verbose else "INFO")
@@ -107,21 +183,66 @@ def eval_environment(
if temperature is not None and "temperature" not in merged_sampling_args:
merged_sampling_args["temperature"] = temperature

logger.info(f"Starting evaluation with model: {model}")
logger.info(
f"Configuration: num_examples={num_examples}, rollouts_per_example={rollouts_per_example}, max_concurrent={max_concurrent}"
)
start_time = time.time()
results = vf_env.evaluate(
client=client,
model=model,
sampling_args=merged_sampling_args,
num_examples=num_examples,
rollouts_per_example=rollouts_per_example,
max_concurrent=max_concurrent,
)
end_time = time.time()
logger.info(f"Evaluation completed in {end_time - start_time:.2f} seconds")
if rerun_scoring_from:
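        # Re-scoring path: rollouts are not regenerated; saved outputs are loaded and
        # passed back through the environment's rubric.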
logger.info(f"Attempting to re-run scoring from run ID: {rerun_scoring_from}")
start_time = time.time()

run_path = _find_run_path(env, model, rerun_scoring_from, env_dir_path)
with open(run_path / "metadata.json") as f:
metadata = json.load(f)

num_examples = metadata.get("num_examples", num_examples)
rollouts_per_example = metadata.get(
"rollouts_per_example", rollouts_per_example
)
logger.info(
f"Loaded metadata: num_examples={num_examples}, rollouts_per_example={rollouts_per_example}"
)

loaded_data = _load_results_for_rescoring(run_path / "results.jsonl")

async def rescore_data():
return await vf_env.rubric.score_rollouts(
prompts=loaded_data.prompt,
completions=loaded_data.completion,
answers=loaded_data.answer,
states=loaded_data.state,
tasks=loaded_data.task,
infos=loaded_data.info,
max_concurrent=max_concurrent,
)

        try:
            # An event loop is already running (e.g. inside a notebook): patch it
            # with nest_asyncio so we can block on the scoring coroutine.
            loop = asyncio.get_running_loop()
            import nest_asyncio

            nest_asyncio.apply()
            new_scores = loop.run_until_complete(rescore_data())
        except RuntimeError:
            # No running event loop: score in a fresh one.
            new_scores = asyncio.run(rescore_data())
end_time = time.time()

loaded_data.reward = new_scores.reward
loaded_data.metrics = new_scores.metrics
results = loaded_data
logger.info(f"Re-scoring completed in {end_time - start_time:.2f} seconds")
else:
logger.info(f"Starting evaluation with model: {model}")
logger.info(
f"Configuration: num_examples={num_examples}, rollouts_per_example={rollouts_per_example}, max_concurrent={max_concurrent}"
)
start_time = time.time()
results = vf_env.evaluate(
client=client,
model=model,
sampling_args=merged_sampling_args,
num_examples=num_examples,
rollouts_per_example=rollouts_per_example,
max_concurrent=max_concurrent,
)
end_time = time.time()
logger.info(f"Evaluation completed in {end_time - start_time:.2f} seconds")

print("--- Evaluation ---")
print(f"Environment: {env}")
print(f"Model: {model}")
@@ -335,7 +456,14 @@ def main():
"-D",
type=str,
default="",
help="Name of dataset to save to Hugging Face Hub",
help="Name for dataset on Hugging Face Hub",
)
parser.add_argument(
"--rerun-scoring-from",
"-R",
type=str,
default=None,
        help="Run ID of a previously saved evaluation whose results.jsonl should be re-scored",
)
args = parser.parse_args()

@@ -369,6 +497,7 @@ def main():
save_to_hf_hub=args.save_to_hf_hub,
hf_hub_dataset_name=args.hf_hub_dataset_name,
extra_headers=merged_headers,
rerun_scoring_from=args.rerun_scoring_from,
)

