12 changes: 12 additions & 0 deletions examples/terminal/README.md
@@ -0,0 +1,12 @@
### Terminal-Bench examples

- Requirements: Python >= 3.12
- Install Terminal-Bench:

```bash
pip install terminal-bench
```

After installing, you can run the sample script in this folder to evaluate openai/o4-mini on the terminal-bench-core v0.1.1 dataset with Terminal-Bench's Terminus-1 agent.
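
For example, one way to launch it (a sketch: module-style execution from the repository root is assumed, and `OPENAI_API_KEY` must be set so the LiteLLM-backed engine can reach the OpenAI API):

```bash
export OPENAI_API_KEY=...   # assumed: openai/o4-mini is served via the OpenAI API
python -m examples.terminal.run_terminal
```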


80 changes: 80 additions & 0 deletions examples/terminal/prepare_terminal_data.py
@@ -0,0 +1,80 @@
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional
import yaml

from terminal_bench.dataset.dataset import Dataset

def load_terminal_bench_dataset(
dataset_name: str,
dataset_version: str = "head",
task_ids: Optional[List[str]] = None,
n_tasks: Optional[int] = None,
cache_path: Optional[Path] = None,
local_registry_path: Optional[Path] = None,
) -> List[Dict[str, Any]]:
"""Load Terminal-Bench dataset and convert to minimal rLLM task dicts.

Args:
dataset_name: Dataset registry name.
dataset_version: Concrete version or "head".
task_ids: Optional subset of task IDs to include.
n_tasks: Optional cap on number of tasks.
cache_path: Optional path for dataset cache.
local_registry_path: Optional path to a local registry.

Returns:
List[Dict[str, Any]]: Each dict includes ``task_path``, ``task_id``,
and ``instruction``.
"""
    dataset = Dataset(
        name=dataset_name,
        version=dataset_version,
        task_ids=task_ids,
        n_tasks=n_tasks,
        cache_path=cache_path,  # forward the cache location documented above
        local_registry_path=local_registry_path,
    )

tasks: List[Dict[str, Any]] = []
for task_path in dataset:
task_config = load_task_config(task_path)

task_dict = {
"task_path": str(task_path),
"task_id": task_path.name,
"instruction": task_config["instruction"],
}
tasks.append(task_dict)

return tasks


def load_task_config(task_path: Path) -> Dict[str, Any]:
"""Load and validate task configuration from task.yaml file.

Args:
task_path: Path to a Terminal-Bench task directory.

Returns:
Dict[str, Any]: Parsed YAML mapping.

Raises:
FileNotFoundError: If ``task.yaml`` is missing.
        ValueError: If the file is not a YAML mapping or required fields are missing.
"""
task_yaml_path = task_path / "task.yaml"

if not task_yaml_path.exists():
raise FileNotFoundError(f"task.yaml not found at {task_yaml_path}")

    with open(task_yaml_path) as f:
        config = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty file; guard before key checks.
    if not isinstance(config, dict):
        raise ValueError(f"Expected a YAML mapping in {task_yaml_path}")

# Validate required fields
required_fields = ["instruction"]
for field in required_fields:
if field not in config:
raise ValueError(f"Missing required field '{field}' in {task_yaml_path}")

return config
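
A quick way to smoke-test the loader on its own. This is a minimal sketch: the dataset name and version mirror `run_terminal.py`, and the dict keys follow the docstring above; note that each task's `task.yaml` must carry an `instruction` field or `load_task_config` raises `ValueError`.

```python
from examples.terminal.prepare_terminal_data import load_terminal_bench_dataset

# Fetch two tasks from the terminal-bench-core registry.
tasks = load_terminal_bench_dataset(
    dataset_name="terminal-bench-core",
    dataset_version="0.1.1",
    n_tasks=2,
)
for task in tasks:
    print(task["task_id"], task["task_path"])
    print(task["instruction"][:80])  # preview the first 80 characters
```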
55 changes: 55 additions & 0 deletions examples/terminal/run_terminal.py
@@ -0,0 +1,55 @@
import asyncio
import os

from rllm.engine.agent_workflow_engine import AgentWorkflowEngine
from rllm.integrations.terminal_terminus_1 import TerminalLiteLLMEngine
from rllm.workflows.terminal_workflow import TerminalWorkflow
from rllm.agents.terminal_terminus_agent import TerminalTerminusAgent
from rllm.environments.terminal.terminal_terminus import TerminalTerminusEnv
from examples.terminal.prepare_terminal_data import load_terminal_bench_dataset

if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "true"

dataset_name = "terminal-bench-core"
dataset_version = "0.1.1"

model_name = "openai/o4-mini"
openai_base_url = None
max_turns = 50
max_agent_timeout_sec = 600.0

env_args = {"model_name": model_name, "api_base": openai_base_url, "cleanup": True}
rollout_engine = TerminalLiteLLMEngine(
model=env_args["model_name"], api_base=env_args["api_base"]
)

engine = AgentWorkflowEngine(
workflow_cls=TerminalWorkflow,
workflow_args={
"agent_cls": TerminalTerminusAgent,
"env_cls": TerminalTerminusEnv,
"env_args": env_args,
"max_steps": max_turns,
"global_agent_timeout_sec": max_agent_timeout_sec,
},
rollout_engine=rollout_engine,
n_parallel_tasks=1,
# Terminal-Bench already retries LLM calls 3 times in handle_llm_interaction
retry_limit=1,
)

asyncio.run(engine.initialize_pool())

tasks = load_terminal_bench_dataset(
dataset_name=dataset_name,
dataset_version=dataset_version,
)

print(f"Loaded {len(tasks)} tasks from {dataset_name} {dataset_version}")

episodes = asyncio.run(engine.execute_tasks(tasks=tasks))

total = len(episodes)
correct = sum(ep.is_correct for ep in episodes)
print(f"Accuracy: {correct}/{total} = {correct / total:.3f}")
90 changes: 90 additions & 0 deletions rllm/agents/terminal_terminus_agent.py
@@ -0,0 +1,90 @@
from typing import Any, Dict, List, Optional
import copy

from rllm.agents.agent import Action, BaseAgent, Step, Trajectory


class TerminalTerminusAgent(BaseAgent):
"""Thin agent wrapper; environment handles Terminal-Bench specifics.

Maintains a simple alternating chat message history and mirrors raw
model responses to ``Action`` objects consumed by the environment.
"""

def __init__(self, **kwargs):
"""Initialize internal state."""
self.reset()

def update_from_env(
self,
observation: Any,
reward: float,
done: bool,
info: Dict[str, Any],
**kwargs,
) -> None:
"""Update agent state from an environment transition.

Args:
observation: Latest observation dict from the environment.
reward: Scalar reward from the previous action.
done: Whether the episode has terminated.
info: Auxiliary environment info.
**kwargs: Unused; reserved for extensions.
"""
if self._trajectory.steps:
prior_step = self._trajectory.steps[-1]
prior_step.observation = observation
prior_step.reward = reward
prior_step.done = done
prior_step.info = info

self.messages.append({"role": "user", "content": observation["prompt"]})
self.cur_step = Step(observation=observation)

def update_from_model(self, response: str, **kwargs) -> Action:
"""Record model response and produce an action.

Args:
response: Raw assistant text.
**kwargs: Unused; reserved for extensions.

Returns:
Action: Action object whose ``action`` is the raw response.
"""
        # The step created in update_from_env becomes the newest trajectory entry.
        cur_step = self.cur_step
        self._trajectory.steps.append(cur_step)
        cur_step.model_response = response
        cur_step.action = response

self.messages.append({"role": "assistant", "content": response})
cur_step.chat_completions = copy.deepcopy(self.messages)
self.step += 1
return Action(action=response)

def get_current_state(self) -> Optional[Step]:
"""Return the most recent step in the trajectory.

Returns:
Optional[Step]: Last step if available.
"""
assert self._trajectory.steps, "Trajectory should not be empty when get_current_state is called."
return self._trajectory.steps[-1]

def reset(self) -> None:
"""Reset message history and trajectory."""
self._trajectory = Trajectory()
self.messages = []
self.step = 0

@property
def chat_completions(self) -> List[Dict[str, str]]:
"""OpenAI-style message history consumed by the rollout engine."""
return self.messages

    @property
    def trajectory(self) -> Trajectory:
        """Trajectory accumulated over the current episode."""
        return self._trajectory

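To make the alternating update cycle in the class docstring concrete, here is a minimal sketch of one turn. The observation payload, command text, and reward are illustrative stand-ins for what `TerminalTerminusEnv` actually produces:

```python
from rllm.agents.terminal_terminus_agent import TerminalTerminusAgent

agent = TerminalTerminusAgent()

# Environment -> agent: the observation's "prompt" becomes the next user message.
agent.update_from_env(
    observation={"prompt": "List the files in /tmp."},  # illustrative payload
    reward=0.0,
    done=False,
    info={},
)
assert agent.chat_completions[-1]["role"] == "user"

# Model -> agent: the raw response is recorded as a Step and echoed as an Action.
action = agent.update_from_model("ls /tmp")
assert action.action == "ls /tmp"
assert agent.trajectory.steps[-1].model_response == "ls /tmp"
```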
