From f1c329ace41f75928f969c8fc86810de27826b8e Mon Sep 17 00:00:00 2001 From: Matt Van Horn Date: Mon, 9 Mar 2026 07:40:18 -0700 Subject: [PATCH] write eval results to results.json for structured agent consumption The agent loop currently reads results by grepping stdout from run.log, which mixes trusted metrics with arbitrary training output. Writing a structured results.json gives agents a reliable, parseable results channel. Existing stdout output is unchanged. Co-Authored-By: Claude Opus 4.6 --- program.md | 4 ++-- train.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/program.md b/program.md index dea9bcc01..035aee12f 100644 --- a/program.md +++ b/program.md @@ -97,8 +97,8 @@ LOOP FOREVER: 2. Tune `train.py` with an experimental idea by directly hacking the code. 3. git commit 4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context) -5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log` -6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up. +5. Read out the results: `cat results.json` (structured JSON written by train.py). Fallback: `grep "^val_bpb:\|^peak_vram_mb:" run.log` +6. If results.json doesn't exist (or is stale — train.py only writes it at the very end, so a crashed run leaves the previous run's file in place) and the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up. 7. Record the results in the tsv (NOTE: do not commit the results.tsv file, leave it untracked by git) 8. If val_bpb improved (lower), you "advance" the branch, keeping the git commit 9. 
If val_bpb is equal or worse, you git reset back to where you started diff --git a/train.py b/train.py index 6994fb9bb..cb711a032 100644 --- a/train.py +++ b/train.py @@ -9,6 +9,7 @@ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" import gc +import json import time from dataclasses import dataclass, asdict @@ -627,3 +628,10 @@ def get_weight_decay(progress): print(f"num_steps: {step}") print(f"num_params_M: {num_params / 1e6:.1f}") print(f"depth: {DEPTH}") + +# Write structured results for reliable agent consumption +with open("results.json", "w") as f: + json.dump({"val_bpb": val_bpb, "training_seconds": round(total_training_time, 1), + "total_seconds": round(t_end - t_start, 1), "peak_vram_mb": round(peak_vram_mb, 1), + "mfu_percent": round(steady_state_mfu, 2), "num_steps": step, + "num_params_M": round(num_params / 1e6, 1), "depth": DEPTH}, f)