PolicyEngine
diff --git a/‎scripts/seed.py‎
Lines changed: 6 additions & 2 deletions b/‎scripts/seed.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎src/policyengine_api/agent_sandbox.py‎
Lines changed: 142 additions & 22 deletions b/‎src/policyengine_api/agent_sandbox.py‎
Lines changed: 142 additions & 22 deletions
diff --git a/‎src/policyengine_api/api/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎src/policyengine_api/api/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/policyengine_api/api/agent.py‎
Lines changed: 18 additions & 4 deletions b/‎src/policyengine_api/api/agent.py‎
Lines changed: 18 additions & 4 deletions
@@ -223,7 +223,9 @@ def seed_model(model_version, session, lite: bool = False) -> TaxBenefitModelVer
             seen_names.add(p.name)
 
         filter_msg = f"  Filtered to {len(parameters_to_add)} user-facing parameters"
-        filter_msg += f" (from {len(model_version.parameters)} total, deduplicated by name)"
+        filter_msg += (
+            f" (from {len(model_version.parameters)} total, deduplicated by name)"
+        )
         if lite and skipped_state_params > 0:
             filter_msg += f", skipped {skipped_state_params} state params (lite mode)"
         console.print(filter_msg)
@@ -626,7 +628,9 @@ def main():
 
     with logfire.span("database_seeding"):
         mode_str = " (lite mode)" if args.lite else ""
-        console.print(f"[bold green]PolicyEngine database seeding{mode_str}[/bold green]\n")
+        console.print(
+            f"[bold green]PolicyEngine database seeding{mode_str}[/bold green]\n"
+        )
 
         with next(get_quiet_session()) as session:
             # Seed UK model
 
@@ -76,6 +76,56 @@ def configure_logfire(traceparent: str | None = None):
    - POST /analysis/economic-impact with tax_benefit_model_name, policy_id and dataset_id
    - GET /analysis/economic-impact/{report_id} for results (includes decile_impacts and program_statistics)
 
+4. **Structural reforms** (custom variable formulas):
+   For reforms that can't be expressed as parameter changes (e.g., new benefits, eligibility changes):
+
+   **IMPORTANT: Always test your modifier code first using the execute_python tool!**
+
+   Steps:
+   a. Write your simulation_modifier code
+   b. Test it with execute_python to check for syntax errors and basic logic
+   c. POST /agent/results/policy-with-modifier to create the policy
+   d. Use the policy_id in /analysis/economic-impact as normal
+
+   Example test with execute_python:
+   ```python
+   # Test the modifier code compiles and basic logic works
+   from numpy import where
+
+   def modify(simulation):
+       # Your modifier code here
+       pass
+
+   # Test the function exists and is callable
+   print(f"modify function defined: {callable(modify)}")
+
+   # Test any helper logic
+   income = 15000
+   benefit = 1000 if income < 20000 else 0
+   print(f"Test case: income={income} -> benefit={benefit}")
+   ```
+
+   Example simulation_modifier for a new benefit:
+   ```python
+   def modify(simulation):
+       from policyengine_core.variables import Variable
+       from policyengine_core.periods import YEAR
+       from numpy import where
+
+       Person = simulation.tax_benefit_system.entities_by_name()["person"]
+
+       @simulation.tax_benefit_system.variable("my_new_benefit")
+       class my_new_benefit(Variable):
+           value_type = float
+           entity = Person
+           definition_period = YEAR
+           label = "My new benefit"
+
+           def formula(person, period, parameters):
+               income = person("employment_income", period)
+               return where(income < 20000, 1000, 0)
+   ```
+
 ## Response formatting
 
 Follow PolicyEngine's writing style:
@@ -124,6 +174,66 @@ def configure_logfire(traceparent: str | None = None):
     },
 }
 
+# Tool schema for "execute_python" (handled by execute_python_code below)
+EXECUTE_PYTHON_TOOL = {
+    "name": "execute_python",
+    "description": "Execute Python code and return the output. Use this to test simulation modifier code before submitting it. The code runs in a sandboxed environment with numpy available. Returns stdout/stderr and any exceptions.",
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "code": {
+                "type": "string",
+                "description": "Python code to execute. Should include print statements to show results.",
+            }
+        },
+        "required": ["code"],
+    },
+}
+
+
+def execute_python_code(code: str) -> str:
+    """Run ``code`` via exec() and return its captured output as text.
+
+    NOTE(security): this is NOT a sandbox. The code gets the full builtins
+    and runs in-process, so isolation must come from the hosting environment.
+    stdout/stderr are redirected process-wide while the code runs.
+
+    Returns "Output:"/"Stderr:" sections; if the code raises (including
+    SystemExit) an "Error:" section with the traceback comes first so the
+    5000-character cap cannot cut it off.
+    """
+    import io
+    import traceback
+    from contextlib import redirect_stderr, redirect_stdout
+
+    captured_out = io.StringIO()
+    captured_err = io.StringIO()
+    error = ""
+    # "__main__" lets `if __name__ == "__main__":` guards in the code run.
+    namespace = {"__builtins__": __builtins__, "__name__": "__main__"}
+
+    try:
+        with redirect_stdout(captured_out), redirect_stderr(captured_err):
+            exec(code, namespace)
+    # SystemExit is not an Exception subclass; catch it so exit() in the
+    # executed code cannot terminate the host process.
+    except (Exception, SystemExit) as e:
+        error = (
+            f"Error: {type(e).__name__}: {e}\n\nTraceback:\n{traceback.format_exc()}"
+        )
+
+    result = ""
+    if captured_out.getvalue():
+        result += f"Output:\n{captured_out.getvalue()}"
+    if captured_err.getvalue():
+        result += f"\nStderr:\n{captured_err.getvalue()}"
+    if error:
+        result = f"{error}\n{result}" if result else error
+    elif not result:
+        result = "Code executed successfully (no output)"
+
+    return result[:5000]  # Limit output length
+
 
 def fetch_openapi_spec(api_base_url: str) -> dict:
     """Fetch and cache OpenAPI spec."""
@@ -235,8 +345,7 @@ def openapi_to_claude_tools(spec: dict) -> list[dict]:
 
                 prop = schema_to_json_schema(spec, param_schema)
                 prop["description"] = (
-                    param.get("description", "")
-                    + f" (in: {param_in})"
+                    param.get("description", "") + f" (in: {param_in})"
                 )
                 properties[param_name] = prop
 
@@ -268,16 +377,18 @@ def openapi_to_claude_tools(spec: dict) -> list[dict]:
             if required:
                 input_schema["required"] = list(set(required))
 
-            tools.append({
-                "name": tool_name,
-                "description": full_desc[:1024],  # Claude has limits
-                "input_schema": input_schema,
-                "_meta": {
-                    "path": path,
-                    "method": method,
-                    "parameters": operation.get("parameters", []),
-                },
-            })
+            tools.append(
+                {
+                    "name": tool_name,
+                    "description": full_desc[:1024],  # Claude has limits
+                    "input_schema": input_schema,
+                    "_meta": {
+                        "path": path,
+                        "method": method,
+                        "parameters": operation.get("parameters", []),
+                    },
+                }
+            )
 
     return tools
 
@@ -347,7 +458,9 @@ def execute_api_tool(
                 url, params=query_params, json=body_data, headers=headers, timeout=60
             )
         elif method == "delete":
-            resp = requests.delete(url, params=query_params, headers=headers, timeout=60)
+            resp = requests.delete(
+                url, params=query_params, headers=headers, timeout=60
+            )
         else:
             return f"Unsupported method: {method}"
 
@@ -415,11 +528,10 @@ def log(msg: str) -> None:
     tool_lookup = {t["name"]: t for t in tools}
 
     # Strip _meta from tools before sending to Claude (it doesn't need it)
-    claude_tools = [
-        {k: v for k, v in t.items() if k != "_meta"} for t in tools
-    ]
-    # Add the sleep tool
+    claude_tools = [{k: v for k, v in t.items() if k != "_meta"} for t in tools]
+    # Add built-in tools
     claude_tools.append(SLEEP_TOOL)
+    claude_tools.append(EXECUTE_PYTHON_TOOL)
 
     client = anthropic.Anthropic()
 
@@ -466,6 +578,12 @@ def log(msg: str) -> None:
                     log(f"[SLEEP] Waiting {seconds} seconds...")
                     time.sleep(seconds)
                     result = f"Slept for {seconds} seconds"
+                elif block.name == "execute_python":
+                    # Handle Python execution tool
+                    code = block.input.get("code", "")
+                    log(f"[PYTHON] Executing code ({len(code)} chars)...")
+                    result = execute_python_code(code)
+                    log(f"[PYTHON] Result: {result[:200]}")
                 else:
                     tool = tool_lookup.get(block.name)
                     if tool:
@@ -477,11 +595,13 @@ def log(msg: str) -> None:
 
                 log(f"[TOOL_RESULT] {result[:300]}")
 
-                tool_results.append({
-                    "type": "tool_result",
-                    "tool_use_id": block.id,
-                    "content": result,
-                })
+                tool_results.append(
+                    {
+                        "type": "tool_result",
+                        "tool_use_id": block.id,
+                        "content": result,
+                    }
+                )
 
         messages.append({"role": "assistant", "content": assistant_content})
 
 
@@ -4,6 +4,7 @@
 
 from . import (
     agent,
+    agent_results,
     analysis,
     change_aggregates,
     datasets,
@@ -35,5 +36,6 @@
 api_router.include_router(household.router)
 api_router.include_router(analysis.router)
 api_router.include_router(agent.router)
+api_router.include_router(agent_results.router)
 
 __all__ = ["api_router"]
@@ -24,6 +24,7 @@ def get_traceparent() -> str | None:
     TraceContextTextMapPropagator().inject(carrier)
     return carrier.get("traceparent")
 
+
 router = APIRouter(prefix="/agent", tags=["agent"])
 
 
@@ -93,7 +94,9 @@ def _run_local_agent(
     from policyengine_api.agent_sandbox import _run_agent_impl
 
     try:
-        history_dicts = [{"role": m.role, "content": m.content} for m in (history or [])]
+        history_dicts = [
+            {"role": m.role, "content": m.content} for m in (history or [])
+        ]
         result = _run_agent_impl(question, api_base_url, call_id, history_dicts)
         _calls[call_id]["status"] = result.get("status", "completed")
         _calls[call_id]["result"] = result
@@ -136,9 +139,15 @@ async def run_agent(request: RunRequest) -> RunResponse:
 
         traceparent = get_traceparent()
         run_fn = modal.Function.from_name("policyengine-sandbox", "run_agent")
-        history_dicts = [{"role": m.role, "content": m.content} for m in request.history]
+        history_dicts = [
+            {"role": m.role, "content": m.content} for m in request.history
+        ]
         call = run_fn.spawn(
-            request.question, api_base_url, call_id, history_dicts, traceparent=traceparent
+            request.question,
+            api_base_url,
+            call_id,
+            history_dicts,
+            traceparent=traceparent,
         )
 
         _calls[call_id] = {
@@ -166,7 +175,12 @@ async def run_agent(request: RunRequest) -> RunResponse:
         # Run in background using asyncio
         loop = asyncio.get_event_loop()
         loop.run_in_executor(
-            None, _run_local_agent, call_id, request.question, api_base_url, request.history
+            None,
+            _run_local_agent,
+            call_id,
+            request.question,
+            api_base_url,
+            request.history,
         )
 
     return RunResponse(call_id=call_id, status="running")