4 changes: 2 additions & 2 deletions environments/math/math.py
@@ -2,7 +2,7 @@


def load_environment(**kwargs) -> vf.Environment:
'''
"""
Loads a custom environment.
'''
"""
raise NotImplementedError("Implement your custom environment here.")
99 changes: 99 additions & 0 deletions integrations/art_framework/README.md
@@ -0,0 +1,99 @@
# art_framework

### Overview
- **Environment ID**: `art_framework`
- **Source Implementation**: [Occupying-Mars/prime-environments](https://github.com/Occupying-Mars/prime-environments/tree/ART-verifier/environments/art_framework)
- **Author**: [@OccupyingM](https://x.com/OccupyingM)
- **Short description**: Universal adapter enabling bidirectional portability between the ART (Agent Reinforcement Trainer) and verifiers ecosystems
- **Tags**: `art`, `framework`, `portability`, `tool-use`, `adapter`, `multi-turn`

### Purpose

This environment provides a portability layer between [OpenPipe's ART framework](https://github.com/OpenPipe/ART) and the verifiers evaluation system. It enables:

1. ART → verifiers: Load any ART task configuration and run it as a verifiers environment
2. verifiers → ART: Export any verifiers ToolEnv to run with ART agents
3. Shared tool definitions: Use the same tool schemas across both frameworks
4. Unified evaluation: Compare agent performance using consistent rubrics
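
Both directions map onto two entry points in this PR: `load_environment` (in `art_framework.py`) and `export_verifiers_env` (in `utils/verifiers_adapter.py`). A minimal round-trip sketch, using the bundled calculator example config and the import paths from the Portability section below:

```python
from art_framework import load_environment
from art_framework.utils.verifiers_adapter import export_verifiers_env

# ART -> verifiers: build a verifiers ToolEnv from an ART task config
env = load_environment(task_config_path="examples/calculator.json")

# verifiers -> ART: write the same environment back out as an ART-style config
export_verifiers_env(env, "exported.json")
```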

### Key Features

- Automatic tool conversion between ART and verifiers tool schemas
- JSON schema validation and strict JSON output (no markdown fences)
- Flexible evaluation: exact match or LLM judge scoring
- Example configs and a simple end-to-end test
- Bidirectional export utilities

### Quickstart

Setup:
```bash
uv run vf-install art_framework

# Set API key if using LLM judge
export OPENAI_API_KEY=sk-your-key
```

Test:
```bash
cd environments/art_framework
uv run python test_env.py
```

Evaluate:
```bash
uv run vf-eval -s art_framework -m gpt-4.1-mini -n 5 -r 3
```

### Environment Arguments

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `task_config_path` | str | `None` | Path to ART task config JSON file |
| `task_config_dict` | dict | `None` | ART config as dictionary (alternative to file path) |
| `dataset` | Dataset | `None` | Custom training dataset (uses examples if None) |
| `eval_dataset` | Dataset | `None` | Custom evaluation dataset |
| `max_turns` | int | `10` | Maximum interaction turns per episode |
| `use_llm_judge` | bool | `False` | Whether to use LLM judge for evaluation |
| `judge_model` | str | `"gpt-4.1-mini"` | Model for LLM judge |
| `judge_client` | OpenAI | `None` | Custom OpenAI client (creates default if None) |
| `judge_api_key_var` | str | `"OPENAI_API_KEY"` | Environment variable for judge API key |
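
These arguments map one-to-one onto `load_environment` in `art_framework.py`. A sketch of enabling the LLM judge (the config path is illustrative, and `OPENAI_API_KEY` must be set as in the Quickstart):

```python
from art_framework import load_environment

env = load_environment(
    task_config_path="examples/calculator.json",  # or pass task_config_dict={...}
    max_turns=10,                 # cap on interaction turns per episode
    use_llm_judge=True,           # score with an LLM judge instead of exact match
    judge_model="gpt-4.1-mini",   # only consulted when use_llm_judge=True
    judge_api_key_var="OPENAI_API_KEY",
)
```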

### ART Task Config Format

```json
{
"name": "task_name",
"tools": [
{
"name": "tool_name",
"description": "What it does",
"parameters": {"type": "object", "properties": {"x": {"type": "number"}}, "required": ["x"]},
"implementation": "lambda x: x"
}
],
"completion_tool_name": "submit_answer",
"system_prompt": "System prompt"
}
```
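
The `implementation` field holds a Python expression string. The adapter utilities (`utils/art_adapter.py`) are not part of this diff, so the sketch below is only an illustration of how such a spec could become a named callable for `vf.ToolEnv`; `_to_callable` is a hypothetical helper, and evaluating config strings assumes the config is trusted:

```python
# Illustrative only: one plausible way an ART tool spec becomes a Python callable.
# The actual art_config_to_tools implementation may differ.
from typing import Any, Callable


def _to_callable(tool_spec: dict[str, Any]) -> Callable:
    fn = eval(tool_spec["implementation"])   # e.g. "lambda a, b: a + b"; trusted config only
    fn.__name__ = tool_spec["name"]          # the name the model uses in tool calls
    fn.__doc__ = tool_spec.get("description", "")
    return fn


add = _to_callable(
    {
        "name": "add",
        "description": "Add two integers",
        "parameters": {
            "type": "object",
            "properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
            "required": ["a", "b"],
        },
        "implementation": "lambda a, b: a + b",
    }
)
assert add(2, 3) == 5
```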

### Portability

ART → verifiers:
```bash
uv run vf-eval -s art_framework -a '{"task_config_path": "art_task.json"}'
```

verifiers → ART:
```python
from art_framework.utils.verifiers_adapter import export_verifiers_env
export_verifiers_env(my_env, "exported.json")
```

### Dependencies

- verifiers>=0.1.3
- datasets>=2.19
- pydantic>=2.0.0
- openai>=1.0.0 (optional, for LLM judge)

122 changes: 122 additions & 0 deletions integrations/art_framework/art_framework.py
@@ -0,0 +1,122 @@
import json
from typing import Any, Callable

import verifiers as vf
from datasets import Dataset

from utils.art_adapter import (
art_config_to_tools,
build_dataset_from_art_config,
get_completion_tool_name,
)
from utils.verifiers_adapter import export_verifiers_env


class ARTParser(vf.Parser):
def __init__(self, completion_tool_name: str):
super().__init__()
self.completion_tool_name = completion_tool_name

def parse_answer(self, completion: vf.Messages) -> str | None:
if not isinstance(completion, list):
return super().parse_answer(completion)
# find the last assistant tool-call with completion tool name
for msg in reversed(completion):
if msg.get("role") == "assistant" and msg.get("tool_calls"):
tool_calls = msg["tool_calls"] or []
for tc in tool_calls:
try:
# handle both typed and dict tool-calls
if hasattr(tc, "function"):
name = tc.function.name
args_s = tc.function.arguments
else:
name = tc["function"]["name"]
args_s = tc["function"]["arguments"]
if name == self.completion_tool_name:
args = json.loads(args_s)
# answer field is any single value or "answer"
if isinstance(args, dict):
if "answer" in args:
return str(args["answer"])
# fallback: stringified dict
return json.dumps(args)
return str(args)
except Exception:
continue
return None


def load_environment(
task_config_path: str | None = None,
task_config_dict: dict | None = None,
dataset: Dataset | None = None,
eval_dataset: Dataset | None = None,
max_turns: int = 10,
use_llm_judge: bool = False,
judge_model: str = "gpt-4.1-mini",
judge_client: Any | None = None,
judge_api_key_var: str = "OPENAI_API_KEY",
**kwargs,
) -> vf.Environment:
"""Load ART framework adapter environment.

If no datasets are provided, builds tiny train/eval datasets from the task config examples.
"""

if task_config_path is None and task_config_dict is None:
raise ValueError("Provide task_config_path or task_config_dict")
if task_config_dict is None:
with open(task_config_path, "r") as f: # type: ignore[arg-type]
task_config_dict = json.load(f)

assert isinstance(task_config_dict, dict)
completion_tool_name = get_completion_tool_name(task_config_dict)
tools: list[Callable] = art_config_to_tools(task_config_dict)

# default datasets from config examples if not supplied
if dataset is None or eval_dataset is None:
ds_train, ds_eval = build_dataset_from_art_config(task_config_dict)
dataset = dataset or ds_train
eval_dataset = eval_dataset or ds_eval

parser = ARTParser(completion_tool_name=completion_tool_name)
if use_llm_judge:
rubric = vf.JudgeRubric(
parser=parser, judge_model=judge_model, judge_client=judge_client
)
else:

class ExactMatchRubric(vf.Rubric):
async def correct_answer(
self, parser: vf.Parser, completion: vf.Messages, answer: str, **_: Any
) -> float:
pred = parser.parse_answer(completion) or ""
return 1.0 if str(pred) == str(answer) and pred != "" else 0.0

rubric = ExactMatchRubric(parser=parser)
rubric.add_reward_func(rubric.correct_answer, weight=1.0) # type: ignore

env = vf.ToolEnv(
dataset=dataset,
eval_dataset=eval_dataset,
parser=parser,
rubric=rubric,
tools=tools,
max_turns=max_turns,
env_id="art_framework",
env_args={
"task_config": task_config_dict,
"use_llm_judge": use_llm_judge,
"judge_model": judge_model,
},
**kwargs,
)
return env


__all__ = [
"load_environment",
"ARTParser",
"export_verifiers_env",
]
36 changes: 36 additions & 0 deletions integrations/art_framework/examples/calculator.json
@@ -0,0 +1,36 @@
{
"name": "calculator",
"system_prompt": "Use tools and return via submit_answer.",
"completion_tool_name": "submit_answer",
"tools": [
{
"name": "add",
"description": "Add two integers",
"parameters": {
"type": "object",
"properties": {
"a": {"type": "integer"},
"b": {"type": "integer"}
},
"required": ["a", "b"]
},
"implementation": "lambda a, b: a + b"
},
{
"name": "submit_answer",
"description": "Return final answer",
"parameters": {
"type": "object",
"properties": {"answer": {"type": "string"}},
"required": ["answer"]
},
"implementation": "lambda answer: answer"
}
],
"examples": [
{"question": "Add 2 and 3", "answer": "5"},
{"question": "Add 7 and 3", "answer": "10"},
{"question": "Add 1 and 9", "answer": "10"},
{"question": "Add 4 and 4", "answer": "8"}
]
}
21 changes: 21 additions & 0 deletions integrations/art_framework/pyproject.toml
@@ -0,0 +1,21 @@
[project]
name = "art-framework"
description = "ART <-> verifiers adapter environment"
tags = ["train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.3",
"datasets>=2.19",
"pydantic>=2.0.0",
"openai>=1.0.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["art_framework.py", "utils/*.py", "README.md", "test_env.py"]


81 changes: 81 additions & 0 deletions integrations/art_framework/test_env.py
@@ -0,0 +1,81 @@
import json
from pathlib import Path

import verifiers as vf

from .art_framework import load_environment


def _write_tmp_config(tmp_path: Path) -> str:
cfg = {
"name": "calc",
"system_prompt": "Use tools and return via submit_answer.",
"completion_tool_name": "submit_answer",
"tools": [
{
"name": "add",
"description": "Add two integers",
"parameters": {
"type": "object",
"properties": {"a": {"type": "integer"}, "b": {"type": "integer"}},
"required": ["a", "b"],
},
"implementation": "lambda a, b: a + b",
},
{
"name": "submit_answer",
"description": "Return final answer",
"parameters": {
"type": "object",
"properties": {"answer": {"type": "string"}},
"required": ["answer"],
},
"implementation": "lambda answer: answer",
},
],
"examples": [
{"question": "Add 2 and 3", "answer": "5"},
{"question": "Add 7 and 3", "answer": "10"},
{"question": "Add 1 and 9", "answer": "10"},
{"question": "Add 4 and 4", "answer": "8"},
],
}
p = tmp_path / "art_task.json"
p.write_text(json.dumps(cfg))
return str(p)


def test_art_tool_conversion_and_parser(tmp_path):
cfg_path = _write_tmp_config(tmp_path)
env = load_environment(task_config_path=cfg_path, max_turns=2)
assert isinstance(env, vf.ToolEnv)
# smoke test: ensure tools exist and parser extracts from completion tool
tool_names = [t.__name__ for t in env.tools] # type: ignore
assert "add" in tool_names and "submit_answer" in tool_names

# construct a fake completion that calls submit_answer
completion = [
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "1",
"type": "function",
"function": {
"name": "submit_answer",
"arguments": json.dumps({"answer": "10"}),
},
}
],
}
]
parsed = env.parser.parse_answer(completion)
assert parsed == "10"

# reward should be 1.0 for matching answer
prompt = [{"role": "user", "content": "What is 7+3?"}]
rs = env.rubric.score_rollout_sync(
prompt=prompt, completion=completion, answer="10", state={}
) # type: ignore[attr-defined]
assert rs.reward == 1.0