From f1c329ace41f75928f969c8fc86810de27826b8e Mon Sep 17 00:00:00 2001
From: Matt Van Horn <mvanhorn@gmail.com>
Date: Mon, 9 Mar 2026 07:40:18 -0700
Subject: [PATCH] write eval results to results.json for structured agent
 consumption

The agent loop currently reads results by grepping stdout from run.log,
which mixes trusted metrics with arbitrary training output. Writing a
structured results.json gives agents a reliable, parseable results
channel. Existing stdout output is unchanged.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 program.md | 4 ++--
 train.py   | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/program.md b/program.md
index dea9bcc01..035aee12f 100644
--- a/program.md
+++ b/program.md
@@ -97,8 +97,8 @@ LOOP FOREVER:
 2. Tune `train.py` with an experimental idea by directly hacking the code.
 3. git commit
 4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context)
-5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log`
-6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up.
+5. Read out the results: `cat results.json` (structured JSON written by train.py). Fallback: `grep "^val_bpb:\|^peak_vram_mb:" run.log`
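+6. If results.json doesn't exist (or is left over from an earlier run — train.py only writes it at the very end of a run, so a file that predates this run is stale and must be ignored) and the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up.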
 7. Record the results in the tsv (NOTE: do not commit the results.tsv file, leave it untracked by git)
 8. If val_bpb improved (lower), you "advance" the branch, keeping the git commit
 9. If val_bpb is equal or worse, you git reset back to where you started
diff --git a/train.py b/train.py
index 6994fb9bb..cb711a032 100644
--- a/train.py
+++ b/train.py
@@ -9,6 +9,7 @@
 os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
 
 import gc
+import json
 import time
 from dataclasses import dataclass, asdict
 
@@ -627,3 +628,10 @@ def get_weight_decay(progress):
 print(f"num_steps:        {step}")
 print(f"num_params_M:     {num_params / 1e6:.1f}")
 print(f"depth:            {DEPTH}")
+
+# Dump the final run metrics to results.json (mode "w" replaces any existing file) so agents can parse them as JSON
+with open("results.json", "w") as f:
+    json.dump({"val_bpb": val_bpb, "training_seconds": round(total_training_time, 1),
+               "total_seconds": round(t_end - t_start, 1), "peak_vram_mb": round(peak_vram_mb, 1),
+               "mfu_percent": round(steady_state_mfu, 2), "num_steps": step,
+               "num_params_M": round(num_params / 1e6, 1), "depth": DEPTH}, f)