4 changes: 2 additions & 2 deletions environments/math/math.py
@@ -2,7 +2,7 @@


def load_environment(**kwargs) -> vf.Environment:
'''
"""
Loads a custom environment.
'''
"""
raise NotImplementedError("Implement your custom environment here.")
99 changes: 99 additions & 0 deletions integrations/art_framework/README.md
@@ -0,0 +1,99 @@
# art_framework

### Overview
- **Environment ID**: `art_framework`
- **Source Implementation**: [Occupying-Mars/prime-environments](https://github.com/Occupying-Mars/prime-environments/tree/ART-verifier/environments/art_framework)
- **Author**: [@OccupyingM](https://x.com/OccupyingM)
- **Short description**: Universal adapter enabling bidirectional portability between the ART (Agent Reinforcement Trainer) and verifiers ecosystems
- **Tags**: `art`, `framework`, `portability`, `tool-use`, `adapter`, `multi-turn`

### Purpose

This environment provides a portability layer between [OpenPipe's ART framework](https://github.com/OpenPipe/ART) and the verifiers evaluation system. It enables:

1. ART → verifiers: Load any ART task configuration and run it as a verifiers environment
2. verifiers → ART: Export any verifiers ToolEnv to run with ART agents
3. Shared tool definitions: Use the same tool schemas across both frameworks
4. Unified evaluation: Compare agent performance using consistent rubrics
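
Both directions map onto two entry points in this PR: `load_environment` (in `art_framework.py`) and `export_verifiers_env` (in `utils/verifiers_adapter.py`). A minimal round-trip sketch, using the bundled calculator example config and the import paths from the Portability section below:

```python
from art_framework import load_environment
from art_framework.utils.verifiers_adapter import export_verifiers_env

# ART -> verifiers: build a verifiers ToolEnv from an ART task config
env = load_environment(task_config_path="examples/calculator.json")

# verifiers -> ART: write the same environment back out as an ART-style config
export_verifiers_env(env, "exported.json")
```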

### Key Features

- Automatic tool conversion between ART and verifiers tool schemas
- JSON schema validation and strict JSON output (no markdown fences)
- Flexible evaluation: exact match or LLM judge scoring
- Example configs and a simple end-to-end test
- Bidirectional export utilities

### Quickstart

Setup:
```bash
uv run vf-install art_framework

# Set API key if using LLM judge
export OPENAI_API_KEY=sk-your-key
```

Test:
```bash
cd environments/art_framework
uv run python test_env.py
```

Evaluate:
```bash
uv run vf-eval -s art_framework -m gpt-4.1-mini -n 5 -r 3
```

### Environment Arguments

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `task_config_path` | str | `None` | Path to ART task config JSON file |
| `task_config_dict` | dict | `None` | ART config as dictionary (alternative to file path) |
| `dataset` | Dataset | `None` | Custom training dataset (uses examples if None) |
| `eval_dataset` | Dataset | `None` | Custom evaluation dataset |
| `max_turns` | int | `10` | Maximum interaction turns per episode |
| `use_llm_judge` | bool | `False` | Whether to use LLM judge for evaluation |
| `judge_model` | str | `"gpt-4.1-mini"` | Model for LLM judge |
| `judge_client` | OpenAI | `None` | Custom OpenAI client (creates default if None) |
| `judge_api_key_var` | str | `"OPENAI_API_KEY"` | Environment variable for judge API key |
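
These arguments map one-to-one onto `load_environment` in `art_framework.py`. A sketch of enabling the LLM judge (the config path is illustrative, and `OPENAI_API_KEY` must be set as in the Quickstart):

```python
from art_framework import load_environment

env = load_environment(
    task_config_path="examples/calculator.json",  # or pass task_config_dict={...}
    max_turns=10,                 # cap on interaction turns per episode
    use_llm_judge=True,           # score with an LLM judge instead of exact match
    judge_model="gpt-4.1-mini",   # only consulted when use_llm_judge=True
    judge_api_key_var="OPENAI_API_KEY",
)
```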

### ART Task Config Format

```json
{
"name": "task_name",
"tools": [
{
"name": "tool_name",
"description": "What it does",
"parameters": {"type": "object", "properties": {"x": {"type": "number"}}, "required": ["x"]},
"implementation": "lambda x: x"
}
],
"completion_tool_name": "submit_answer",
"system_prompt": "System prompt"
}
```
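
The `implementation` field holds a Python expression string. The adapter utilities (`utils/art_adapter.py`) are not part of this diff, so the sketch below is only an illustration of how such a spec could become a named callable for `vf.ToolEnv`; `_to_callable` is a hypothetical helper, and evaluating config strings assumes the config is trusted:

```python
# Illustrative only: one plausible way an ART tool spec becomes a Python callable.
# The actual art_config_to_tools implementation may differ.
from typing import Any, Callable


def _to_callable(tool_spec: dict[str, Any]) -> Callable:
    fn = eval(tool_spec["implementation"])   # e.g. "lambda a, b: a + b"; trusted config only
    fn.__name__ = tool_spec["name"]          # the name the model uses in tool calls
    fn.__doc__ = tool_spec.get("description", "")
    return fn


add = _to_callable(
    {
        "name": "add",
        "description": "Add two integers",
        "parameters": {
            "type": "object",
            "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
            "required": ["a", "b"],
        },
        "implementation": "lambda a, b: a + b",
    }
)
assert add(2, 3) == 5
```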

### Portability

ART → verifiers:
```bash
uv run vf-eval -s art_framework -a '{"task_config_path": "art_task.json"}'
```

verifiers → ART:
```python
from art_framework.utils.verifiers_adapter import export_verifiers_env
export_verifiers_env(my_env, "exported.json")
```

### Dependencies

- verifiers>=0.1.3
- datasets>=2.19
- pydantic>=2.0.0
- openai>=1.0.0 (optional, for LLM judge)

122 changes: 122 additions & 0 deletions integrations/art_framework/art_framework.py
@@ -0,0 +1,122 @@
import json
from typing import Any, Callable

import verifiers as vf
from datasets import Dataset

from utils.art_adapter import (
art_config_to_tools,
build_dataset_from_art_config,
get_completion_tool_name,
)
from utils.verifiers_adapter import export_verifiers_env


class ARTParser(vf.Parser):
def __init__(self, completion_tool_name: str):
super().__init__()
self.completion_tool_name = completion_tool_name

def parse_answer(self, completion: vf.Messages) -> str | None:
if not isinstance(completion, list):
return super().parse_answer(completion)
# find the last assistant tool-call with completion tool name
for msg in reversed(completion):
if msg.get("role") == "assistant" and msg.get("tool_calls"):
tool_calls = msg["tool_calls"] or []
for tc in tool_calls:
try:
# handle both typed and dict tool-calls
if hasattr(tc, "function"):
name = tc.function.name
args_s = tc.function.arguments
else:
name = tc["function"]["name"]
args_s = tc["function"]["arguments"]
if name == self.completion_tool_name:
args = json.loads(args_s)
# answer field is any single value or "answer"
if isinstance(args, dict):
if "answer" in args:
return str(args["answer"])
# fallback: stringified dict
return json.dumps(args)
return str(args)
except Exception:
continue
return None


def load_environment(
task_config_path: str | None = None,
task_config_dict: dict | None = None,
dataset: Dataset | None = None,
eval_dataset: Dataset | None = None,
max_turns: int = 10,
use_llm_judge: bool = False,
judge_model: str = "gpt-4.1-mini",
judge_client: Any | None = None,
judge_api_key_var: str = "OPENAI_API_KEY",
**kwargs,
) -> vf.Environment:
"""Load ART framework adapter environment.

If no datasets are provided, builds tiny train/eval datasets from the task config examples.
"""

if task_config_path is None and task_config_dict is None:
raise ValueError("Provide task_config_path or task_config_dict")
if task_config_dict is None:
with open(task_config_path, "r") as f: # type: ignore[arg-type]
task_config_dict = json.load(f)

assert isinstance(task_config_dict, dict)
completion_tool_name = get_completion_tool_name(task_config_dict)
tools: list[Callable] = art_config_to_tools(task_config_dict)

# default datasets from config examples if not supplied
if dataset is None or eval_dataset is None:
ds_train, ds_eval = build_dataset_from_art_config(task_config_dict)
dataset = dataset or ds_train
eval_dataset = eval_dataset or ds_eval

parser = ARTParser(completion_tool_name=completion_tool_name)
if use_llm_judge:
rubric = vf.JudgeRubric(
parser=parser, judge_model=judge_model, judge_client=judge_client
)
else:

class ExactMatchRubric(vf.Rubric):
async def correct_answer(
self, parser: vf.Parser, completion: vf.Messages, answer: str, **_: Any
) -> float:
pred = parser.parse_answer(completion) or ""
return 1.0 if str(pred) == str(answer) and pred != "" else 0.0

rubric = ExactMatchRubric(parser=parser)
rubric.add_reward_func(rubric.correct_answer, weight=1.0) # type: ignore

env = vf.ToolEnv(
dataset=dataset,
eval_dataset=eval_dataset,
parser=parser,
rubric=rubric,
tools=tools,
max_turns=max_turns,
env_id="art_framework",
env_args={
"task_config": task_config_dict,
"use_llm_judge": use_llm_judge,
"judge_model": judge_model,
},
**kwargs,
)
return env


__all__ = [
"load_environment",
"ARTParser",
"export_verifiers_env",
]
36 changes: 36 additions & 0 deletions integrations/art_framework/examples/calculator.json
@@ -0,0 +1,36 @@
{
"name": "calculator",
"system_prompt": "Use tools and return via submit_answer.",
"completion_tool_name": "submit_answer",
"tools": [
{
"name": "add",
"description": "Add two integers",
"parameters": {
"type": "object",
"properties": {
"a": {"type": "integer"},
"b": {"type": "integer"}
},
"required": ["a", "b"]
},
"implementation": "lambda a, b: a + b"
},
{
"name": "submit_answer",
"description": "Return final answer",
"parameters": {
"type": "object",
"properties": {"answer": {"type": "string"}},
"required": ["answer"]
},
"implementation": "lambda answer: answer"
}
],
"examples": [
{"question": "Add 2 and 3", "answer": "5"},
{"question": "Add 7 and 3", "answer": "10"},
{"question": "Add 1 and 9", "answer": "10"},
{"question": "Add 4 and 4", "answer": "8"}
]
}
21 changes: 21 additions & 0 deletions integrations/art_framework/pyproject.toml
@@ -0,0 +1,21 @@
[project]
name = "art-framework"
description = "ART <-> verifiers adapter environment"
tags = ["train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.3",
"datasets>=2.19",
"pydantic>=2.0.0",
"openai>=1.0.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["art_framework.py", "utils/*.py", "README.md", "test_env.py"]


81 changes: 81 additions & 0 deletions integrations/art_framework/test_env.py
@@ -0,0 +1,81 @@
import json
from pathlib import Path

import verifiers as vf

from .art_framework import load_environment


def _write_tmp_config(tmp_path: Path) -> str:
cfg = {
"name": "calc",
"system_prompt": "Use tools and return via submit_answer.",
"completion_tool_name": "submit_answer",
"tools": [
{
"name": "add",
"description": "Add two integers",
"parameters": {
"type": "object",
"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
"required": ["a", "b"],
},
"implementation": "lambda a, b: a + b",
},
{
"name": "submit_answer",
"description": "Return final answer",
"parameters": {
"type": "object",
"properties": {"answer": {"type": "string"}},
"required": ["answer"],
},
"implementation": "lambda answer: answer",
},
],
"examples": [
{"question": "Add 2 and 3", "answer": "5"},
{"question": "Add 7 and 3", "answer": "10"},
{"question": "Add 1 and 9", "answer": "10"},
{"question": "Add 4 and 4", "answer": "8"},
],
}
p = tmp_path / "art_task.json"
p.write_text(json.dumps(cfg))
return str(p)


def test_art_tool_conversion_and_parser(tmp_path):
cfg_path = _write_tmp_config(tmp_path)
env = load_environment(task_config_path=cfg_path, max_turns=2)
assert isinstance(env, vf.ToolEnv)
# smoke test: ensure tools exist and parser extracts from completion tool
tool_names = [t.__name__ for t in env.tools] # type: ignore
assert "add" in tool_names and "submit_answer" in tool_names

# construct a fake completion that calls submit_answer
completion = [
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "1",
"type": "function",
"function": {
"name": "submit_answer",
"arguments": json.dumps({"answer": "10"}),
},
}
],
}
]
parsed = env.parser.parse_answer(completion)
assert parsed == "10"

# reward should be 1.0 for matching answer
prompt = [{"role": "user", "content": "What is 7+3?"}]
rs = env.rubric.score_rollout_sync(
prompt=prompt, completion=completion, answer="10", state={}
) # type: ignore[attr-defined]
assert rs.reward == 1.0