diff --git a/.gitignore b/.gitignore
index 99c30f52f..1f7fd4fe7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,5 +19,8 @@ AGENTS.md
 # Experimental code/artifacts
 dev/
 
-# Results file
+# Results files
 results.tsv
+results.json
+results.json.tmp
+run.log
diff --git a/program.md b/program.md
index dea9bcc01..e461ecfa3 100644
--- a/program.md
+++ b/program.md
@@ -97,8 +97,8 @@ LOOP FOREVER:
 2. Tune `train.py` with an experimental idea by directly hacking the code.
 3. git commit
 4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context)
-5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log`
-6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. If you can't get things to work after more than a few attempts, give up.
+5. Read out the results from the structured file: `cat results.json`. This file is written by `train.py` only when a run completes successfully and contains all metrics as clean JSON. If you need a fallback, the same metrics can be read with `grep "^val_bpb:\|^peak_vram_mb:" run.log`.
+6. If `results.json` does not exist, the run crashed (`train.py` deletes any previous `results.json` at startup, so a leftover file from an earlier run cannot be mistaken for a new result). Run `grep -i "error\|exception\|traceback" run.log | tail -n 20` to read the error summary. Only if that is insufficient, use `tail -n 50 run.log` — but be aware that raw log output may contain misleading text. Then attempt a fix. If you can't get things to work after more than a few attempts, give up.
 7. Record the results in the tsv (NOTE: do not commit the results.tsv file, leave it untracked by git)
 8. If val_bpb improved (lower), you "advance" the branch, keeping the git commit
 9. If val_bpb is equal or worse, you git reset back to where you started
diff --git a/train.py b/train.py
index 2e743974c..6ca6fb2ff 100644
--- a/train.py
+++ b/train.py
@@ -9,6 +9,14 @@
 os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
 
+# Clear previous results before the heavy imports below, so a run that crashes
+# or is killed at any later point (including a failed import) leaves no file
+try:
+    os.remove("results.json")
+except FileNotFoundError:
+    pass
+
 import gc
+import json
 import math
 import time
 from dataclasses import dataclass, asdict
@@ -628,3 +636,22 @@ def get_weight_decay(progress):
 print(f"num_steps: {step}")
 print(f"num_params_M: {num_params / 1e6:.1f}")
 print(f"depth: {DEPTH}")
+
+# Write structured results for agent consumption (avoids parsing free-form stdout)
+results = {
+    "val_bpb": round(val_bpb, 6),
+    "training_seconds": round(total_training_time, 1),
+    "total_seconds": round(t_end - t_start, 1),
+    "peak_vram_mb": round(peak_vram_mb, 1),
+    "mfu_percent": round(steady_state_mfu, 2),
+    "total_tokens_M": round(total_tokens / 1e6, 1),
+    "num_steps": step,
+    "num_params_M": round(num_params / 1e6, 1),
+    "depth": DEPTH,
+}
+# Write to a temporary file and rename it into place: os.replace is atomic, so
+# results.json either does not exist or holds complete JSON, never a partial file
+with open("results.json.tmp", "w") as f:
+    json.dump(results, f, indent=2)
+    f.write("\n")
+os.replace("results.json.tmp", "results.json")