12 changes: 12 additions & 0 deletions examples/terminal/README.md
@@ -0,0 +1,12 @@
### Terminal-Bench examples

- Requirements: Python >= 3.12
- Install Terminal-Bench:

```bash
pip install terminal-bench
```

After installing, you can run the sample script in this folder to evaluate openai/o4-mini on the terminal-bench-core v0.1.1 dataset with Terminal-Bench's Terminus-1 agent.
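
For example, one way to launch it (a sketch: module-style execution from the repository root is assumed, and `OPENAI_API_KEY` must be set so the LiteLLM-backed engine can reach the OpenAI API):

```bash
export OPENAI_API_KEY=...   # assumed: openai/o4-mini is served via the OpenAI API
python -m examples.terminal.run_terminal
```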


80 changes: 80 additions & 0 deletions examples/terminal/prepare_terminal_data.py
@@ -0,0 +1,80 @@
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional
import yaml

from terminal_bench.dataset.dataset import Dataset

def load_terminal_bench_dataset(
dataset_name: str,
dataset_version: str = "head",
task_ids: Optional[List[str]] = None,
n_tasks: Optional[int] = None,
cache_path: Optional[Path] = None,
local_registry_path: Optional[Path] = None,
) -> List[Dict[str, Any]]:
"""Load Terminal-Bench dataset and convert to minimal rLLM task dicts.

Args:
dataset_name: Dataset registry name.
dataset_version: Concrete version or "head".
task_ids: Optional subset of task IDs to include.
n_tasks: Optional cap on number of tasks.
cache_path: Optional path for dataset cache.
local_registry_path: Optional path to a local registry.

Returns:
List[Dict[str, Any]]: Each dict includes ``task_path``, ``task_id``,
and ``instruction``.
"""
    dataset = Dataset(
        name=dataset_name,
        version=dataset_version,
        task_ids=task_ids,
        n_tasks=n_tasks,
        cache_path=cache_path,  # forward the cache location documented above
        local_registry_path=local_registry_path,
    )

tasks: List[Dict[str, Any]] = []
for task_path in dataset:
task_config = load_task_config(task_path)

task_dict = {
"task_path": str(task_path),
"task_id": task_path.name,
"instruction": task_config["instruction"],
}
tasks.append(task_dict)

return tasks


def load_task_config(task_path: Path) -> Dict[str, Any]:
"""Load and validate task configuration from task.yaml file.

Args:
task_path: Path to a Terminal-Bench task directory.

Returns:
Dict[str, Any]: Parsed YAML mapping.

Raises:
FileNotFoundError: If ``task.yaml`` is missing.
        ValueError: If the file is not a YAML mapping or required fields are missing.
"""
task_yaml_path = task_path / "task.yaml"

if not task_yaml_path.exists():
raise FileNotFoundError(f"task.yaml not found at {task_yaml_path}")

    with open(task_yaml_path) as f:
        config = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty file; guard before key checks.
    if not isinstance(config, dict):
        raise ValueError(f"Expected a YAML mapping in {task_yaml_path}")

# Validate required fields
required_fields = ["instruction"]
for field in required_fields:
if field not in config:
raise ValueError(f"Missing required field '{field}' in {task_yaml_path}")

return config
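
A quick way to smoke-test the loader on its own. This is a minimal sketch: the dataset name and version mirror `run_terminal.py`, and the dict keys follow the docstring above; note that each task's `task.yaml` must carry an `instruction` field or `load_task_config` raises `ValueError`.

```python
from examples.terminal.prepare_terminal_data import load_terminal_bench_dataset

# Fetch two tasks from the terminal-bench-core registry.
tasks = load_terminal_bench_dataset(
    dataset_name="terminal-bench-core",
    dataset_version="0.1.1",
    n_tasks=2,
)
for task in tasks:
    print(task["task_id"], task["task_path"])
    print(task["instruction"][:80])  # preview the first 80 characters
```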
55 changes: 55 additions & 0 deletions examples/terminal/run_terminal.py
@@ -0,0 +1,55 @@
import asyncio
import os

from rllm.engine.agent_workflow_engine import AgentWorkflowEngine
from rllm.integrations.terminal_terminus_1 import TerminalLiteLLMEngine
from rllm.workflows.terminal_workflow import TerminalWorkflow
from rllm.agents.terminal_terminus_agent import TerminalTerminusAgent
from rllm.environments.terminal.terminal_terminus import TerminalTerminusEnv
from examples.terminal.prepare_terminal_data import load_terminal_bench_dataset

if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "true"

dataset_name = "terminal-bench-core"
dataset_version = "0.1.1"

model_name = "openai/o4-mini"
openai_base_url = None
max_turns = 50
max_agent_timeout_sec = 600.0

env_args = {"model_name": model_name, "api_base": openai_base_url, "cleanup": True}
rollout_engine = TerminalLiteLLMEngine(
model=env_args["model_name"], api_base=env_args["api_base"]
)

engine = AgentWorkflowEngine(
workflow_cls=TerminalWorkflow,
workflow_args={
"agent_cls": TerminalTerminusAgent,
"env_cls": TerminalTerminusEnv,
"env_args": env_args,
"max_steps": max_turns,
"global_agent_timeout_sec": max_agent_timeout_sec,
},
rollout_engine=rollout_engine,
n_parallel_tasks=1,
# Terminal-Bench already retries LLM calls 3 times in handle_llm_interaction
retry_limit=1,
)

asyncio.run(engine.initialize_pool())

tasks = load_terminal_bench_dataset(
dataset_name=dataset_name,
dataset_version=dataset_version,
)

print(f"Loaded {len(tasks)} tasks from {dataset_name} {dataset_version}")

episodes = asyncio.run(engine.execute_tasks(tasks=tasks))

total = len(episodes)
correct = sum(ep.is_correct for ep in episodes)
print(f"Accuracy: {correct}/{total} = {correct / total:.3f}")
90 changes: 90 additions & 0 deletions rllm/agents/terminal_terminus_agent.py
@@ -0,0 +1,90 @@
from typing import Any, Dict, List, Optional
import copy

from rllm.agents.agent import Action, BaseAgent, Step, Trajectory


class TerminalTerminusAgent(BaseAgent):
"""Thin agent wrapper; environment handles Terminal-Bench specifics.

Maintains a simple alternating chat message history and mirrors raw
model responses to ``Action`` objects consumed by the environment.
"""

def __init__(self, **kwargs):
"""Initialize internal state."""
self.reset()

def update_from_env(
self,
observation: Any,
reward: float,
done: bool,
info: Dict[str, Any],
**kwargs,
) -> None:
"""Update agent state from an environment transition.

Args:
observation: Latest observation dict from the environment.
reward: Scalar reward from the previous action.
done: Whether the episode has terminated.
info: Auxiliary environment info.
**kwargs: Unused; reserved for extensions.
"""
if self._trajectory.steps:
prior_step = self._trajectory.steps[-1]
prior_step.observation = observation
prior_step.reward = reward
prior_step.done = done
prior_step.info = info

self.messages.append({"role": "user", "content": observation["prompt"]})
self.cur_step = Step(observation=observation)

def update_from_model(self, response: str, **kwargs) -> Action:
"""Record model response and produce an action.

Args:
response: Raw assistant text.
**kwargs: Unused; reserved for extensions.

Returns:
Action: Action object whose ``action`` is the raw response.
"""
        # The step created in update_from_env becomes the newest trajectory entry.
        cur_step = self.cur_step
        self._trajectory.steps.append(cur_step)
        cur_step.model_response = response
        cur_step.action = response

self.messages.append({"role": "assistant", "content": response})
cur_step.chat_completions = copy.deepcopy(self.messages)
self.step += 1
return Action(action=response)

def get_current_state(self) -> Optional[Step]:
"""Return the most recent step in the trajectory.

Returns:
Optional[Step]: Last step if available.
"""
assert self._trajectory.steps, "Trajectory should not be empty when get_current_state is called."
return self._trajectory.steps[-1]

def reset(self) -> None:
"""Reset message history and trajectory."""
self._trajectory = Trajectory()
self.messages = []
self.step = 0

@property
def chat_completions(self) -> List[Dict[str, str]]:
"""OpenAI-style message history consumed by the rollout engine."""
return self.messages

    @property
    def trajectory(self) -> Trajectory:
        """Trajectory accumulated over the current episode."""
        return self._trajectory

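To make the alternating update cycle in the class docstring concrete, here is a minimal sketch of one turn. The observation payload, command text, and reward are illustrative stand-ins for what `TerminalTerminusEnv` actually produces:

```python
from rllm.agents.terminal_terminus_agent import TerminalTerminusAgent

agent = TerminalTerminusAgent()

# Environment -> agent: the observation's "prompt" becomes the next user message.
agent.update_from_env(
    observation={"prompt": "List the files in /tmp."},  # illustrative payload
    reward=0.0,
    done=False,
    info={},
)
assert agent.chat_completions[-1]["role"] == "user"

# Model -> agent: the raw response is recorded as a Step and echoed as an Action.
action = agent.update_from_model("ls /tmp")
assert action.action == "ls /tmp"
assert agent.trajectory.steps[-1].model_response == "ls /tmp"
```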
