From 331367ce62beab3261035d647ea754e15bf7d8b3 Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 15:59:54 +0200 Subject: [PATCH 01/10] =?UTF-8?q?ground.json=20[NEW]=20=E2=80=94=20externa?= =?UTF-8?q?lize=20platform=20config=20from=20karpathy/autoresearch=20prepa?= =?UTF-8?q?re.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract hardcoded constants (data paths, tokenizer settings, time budgets, processor overrides) from karpathy/autoresearch prepare.py into a user-owned, read-only JSON config. Enables transparent platform configuration without modifying source code. Fields: mode (test/train), data (HuggingFace cache/URL/shards), tokenizer (vocab_size=8192, BPE split pattern, special tokens), training (max_seq_len=2048, time budgets: test=60s/train=300s), processor (dtype/compile/flash_attention/peak_flops — all 'auto' by default). Upstream ref: karpathy/autoresearch master @ c2450ad Blob SHA: 823225c3138454398593dddcc1953ad20e3f9e5f --- ground.json | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 ground.json diff --git a/ground.json b/ground.json new file mode 100644 index 000000000..823225c31 --- /dev/null +++ b/ground.json @@ -0,0 +1,36 @@ +{ + "codename": "mar15-2-rtx5000", + "mode": "test", + + "data": { + "cache_dir": "~/.cache/autoresearch", + "base_url": "https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle/resolve/main", + "max_shard": 6542, + "val_shard": 6542, + "num_shards": 10, + "download_workers": 8 + }, + + "tokenizer": { + "vocab_size": 8192, + "split_pattern": "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}{1,2}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", + "special_tokens_count": 4, + "bos_token": "<|reserved_0|>" + }, + + "training": { + "max_seq_len": 2048, + "time_budget_test": 60, + "time_budget_train": 300, + "eval_tokens_multiplier": 40, + "eval_tokens_unit": 524288, + 
"max_run_wall_seconds": 30 + }, + + "processor": { + "dtype": "auto", + "compile": "auto", + "flash_attention": "auto", + "peak_flops": "auto" + } +} From 1aa243a154e980a99842c375f55ef7bffd57974a Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 16:00:04 +0200 Subject: [PATCH 02/10] =?UTF-8?q?model.json=20[NEW]=20=E2=80=94=20external?= =?UTF-8?q?ize=20hyperparameters=20from=20karpathy/autoresearch=20train.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract architecture and optimization constants from karpathy/autoresearch train.py into an agent-owned JSON config. The agent modifies this file during experiment iterations; the human reviews via version control. Fields: architecture (depth=8, aspect_ratio=128, head_dim=64, window_pattern=SL), optimization (total_batch_size_power=17, device_batch_size=16, LRs, betas, warmup/warmdown ratios), evaluation (batch_size=16, tokens=3145728). Upstream ref: karpathy/autoresearch master @ c2450ad Blob SHA: b0227afc959959da9a78ed0106aee45a4631de4d --- model.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 model.json diff --git a/model.json b/model.json new file mode 100644 index 000000000..b0227afc9 --- /dev/null +++ b/model.json @@ -0,0 +1,27 @@ +{ + "architecture": { + "depth": 8, + "aspect_ratio": 128, + "head_dim": 64, + "window_pattern": "SL" + }, + + "optimization": { + "total_batch_size_power": 17, + "device_batch_size": 16, + "embedding_lr": 0.1, + "unembedding_lr": 0.002, + "matrix_lr": 0.01, + "scalar_lr": 0.25, + "weight_decay": 0.01, + "adam_betas": [0.8, 0.95], + "warmup_ratio": 0.2, + "warmdown_ratio": 0.75, + "final_lr_frac": 0.1 + }, + + "evaluation": { + "batch_size": 16, + "tokens": 3145728 + } +} From 235bc5cddc1d833e7efd758917d73efe95b004c2 Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 16:00:19 +0200 Subject: [PATCH 03/10] 
=?UTF-8?q?prepare.py=20[MODIFIED=20+125/-17]=20?= =?UTF-8?q?=E2=80=94=20add=20GPU=20platform=20detection,=20read=20config?= =?UTF-8?q?=20from=20ground.json?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace hardcoded constants with ground.json reads at import time. Add _GPU_OPS_PER_CYCLE_PER_SM lookup table for compute capabilities: Volta (7.0), Turing (7.5), Ampere (8.0/8.6/8.7), Ada (8.9), Hopper (9.0), Blackwell (10.0). New functions: - _estimate_peak_flops(): compute peak FP16/BF16 tensor TFLOPS from SM count, clock rate, and ops-per-cycle lookup. - _detect_platform(): auto-select dtype, attention backend (flash/sdpa), torch.compile, GradScaler, and embedding_dtype per GPU generation. Hopper+: bf16/flash/compile. Ampere/Ada: bf16/flash/compile. Turing/older: fp16/sdpa/no-compile/GradScaler. Windows compile guards (sys.platform != 'win32') for triton. Exports: MAX_SEQ_LEN, TIME_BUDGET, PLATFORM dict. ground.json processor overrides applied for non-'auto' values. 
Platform detection ref: jsegov/autoresearch-win-rtx (Windows RTX adaptation) Upstream ref: karpathy/autoresearch master @ c2450ad Master blob: 06bea9165abd3ae94ea82dd733997aec7928f40c Modified blob: ed13834e7f9d7e646da010d9d82bafe96d6a7632 --- prepare.py | 142 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 17 deletions(-) diff --git a/prepare.py b/prepare.py index 06bea9165..ed13834e7 100644 --- a/prepare.py +++ b/prepare.py @@ -21,34 +21,142 @@ import pyarrow.parquet as pq import rustbpe import tiktoken +import json import torch # --------------------------------------------------------------------------- -# Constants (fixed, do not modify) +# Constants — loaded from ground.json when available, else hardcoded defaults # --------------------------------------------------------------------------- -MAX_SEQ_LEN = 2048 # context length -TIME_BUDGET = 300 # training time budget in seconds (5 minutes) -EVAL_TOKENS = 40 * 524288 # number of tokens for val eval +_GROUND_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ground.json") +with open(_GROUND_PATH, "r", encoding="utf-8") as _f: + _ground = json.load(_f) + +_training = _ground["training"] +_data = _ground["data"] +_tok = _ground["tokenizer"] +_mode = _ground["mode"] + +MAX_SEQ_LEN = _training["max_seq_len"] +TIME_BUDGET = _training["time_budget_test"] if _mode == "test" else _training["time_budget_train"] +EVAL_TOKENS = _training["eval_tokens_multiplier"] * _training["eval_tokens_unit"] # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- -CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "autoresearch") +CACHE_DIR = os.path.expanduser(_data["cache_dir"]) DATA_DIR = os.path.join(CACHE_DIR, "data") TOKENIZER_DIR = os.path.join(CACHE_DIR, "tokenizer") -BASE_URL = 
"https://huggingface.co/datasets/karpathy/climbmix-400b-shuffle/resolve/main" -MAX_SHARD = 6542 # the last datashard is shard_06542.parquet -VAL_SHARD = MAX_SHARD # pinned validation shard (shard_06542) +BASE_URL = _data["base_url"] +MAX_SHARD = _data["max_shard"] +VAL_SHARD = _data["val_shard"] VAL_FILENAME = f"shard_{VAL_SHARD:05d}.parquet" -VOCAB_SIZE = 8192 +VOCAB_SIZE = _tok["vocab_size"] -# BPE split pattern (GPT-4 style, with \p{N}{1,2} instead of {1,3}) -SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" +SPLIT_PATTERN = _tok["split_pattern"] -SPECIAL_TOKENS = [f"<|reserved_{i}|>" for i in range(4)] -BOS_TOKEN = "<|reserved_0|>" +_n_special = _tok["special_tokens_count"] +SPECIAL_TOKENS = [f"<|reserved_{i}|>" for i in range(_n_special)] +BOS_TOKEN = _tok["bos_token"] + +# --------------------------------------------------------------------------- +# Platform detection — auto-detect GPU, with ground.json processor overrides +# Exports PLATFORM dict consumed by train.py alongside MAX_SEQ_LEN, TIME_BUDGET +# --------------------------------------------------------------------------- + +_proc = _ground["processor"] + +# FP16/BF16 tensor-core ops per cycle per SM, by (major, minor) compute capability. +# Used with SM count + boost clock to compute peak FLOPS at runtime. 
+_GPU_OPS_PER_CYCLE_PER_SM = { + (7, 0): 128, # Volta (V100) + (7, 5): 128, # Turing (RTX 20xx, Quadro RTX, T4) + (8, 0): 512, # Ampere GA100 (A100) + (8, 6): 256, # Ampere GA10x (RTX 30xx, A40) + (8, 7): 256, # Ampere GA10B (Jetson Orin) + (8, 9): 512, # Ada Lovelace (RTX 40xx, L40) + (9, 0): 1024, # Hopper (H100 SXM) + (10, 0): 1024, # Blackwell (B100/B200) — provisional +} + +def _estimate_peak_flops(device_idx=0): + """Compute peak FP16/BF16 tensor TFLOPS from SM count and clock.""" + import torch as _torch + props = _torch.cuda.get_device_properties(device_idx) + cc = (props.major, props.minor) + sm_count = props.multi_processor_count + clock_ghz = props.clock_rate / 1e6 # clock_rate is in kHz + ops_per_cycle = _GPU_OPS_PER_CYCLE_PER_SM.get(cc) + if ops_per_cycle is None: + major_fallbacks = {7: 128, 8: 256, 9: 1024, 10: 1024} + ops_per_cycle = major_fallbacks.get(cc[0], 128) + print(f"Warning: unknown GPU CC {cc}, using fallback ops/cycle={ops_per_cycle}") + peak = sm_count * ops_per_cycle * clock_ghz * 1e9 * 2 # *2 for FMA + print(f"GPU: {props.name} | CC {cc[0]}.{cc[1]} | {sm_count} SMs | " + f"{clock_ghz:.2f} GHz | peak {peak/1e12:.1f} TFLOPS") + return peak + + +def _detect_platform(): + import torch as _torch + p = {"device": "cpu", "dtype": "fp32", "use_grad_scaler": False, + "attention": "naive", "compile": False, "embedding_dtype": "fp32", + "fa3_repo": None, "peak_flops": 0.0} + if _torch.cuda.is_available(): + p["device"] = "cuda" + cc = _torch.cuda.get_device_capability(0) + p["peak_flops"] = _estimate_peak_flops(0) + if cc[0] >= 9: + # Hopper+ — bf16, FA3, compile + p["dtype"] = "bf16" + p["embedding_dtype"] = "bf16" + p["fa3_repo"] = "varunneal/flash-attention-3" + p["attention"] = "flash" + try: + import triton # noqa: F401 + p["compile"] = sys.platform != "win32" + except ImportError: + print("Warning: triton not found — torch.compile disabled (Hopper)") + elif cc[0] >= 8: + # Ampere/Ada — bf16, FA3, compile + p["dtype"] = "bf16" + 
p["embedding_dtype"] = "bf16" + p["fa3_repo"] = "kernels-community/flash-attn3" + p["attention"] = "flash" + try: + import triton # noqa: F401 + p["compile"] = sys.platform != "win32" + except ImportError: + print("Warning: triton not found — torch.compile disabled (Ampere/Ada)") + else: + # Turing / older — fp16, SDPA, no compile + p["dtype"] = "fp16" + p["use_grad_scaler"] = True + p["attention"] = "sdpa" + p["embedding_dtype"] = "fp32" + + # Apply ground.json processor overrides (non-"auto" values) + if _proc["dtype"] != "auto": + p["dtype"] = _proc["dtype"] + p["use_grad_scaler"] = p["dtype"] == "fp16" + p["embedding_dtype"] = "bf16" if p["dtype"] == "bf16" else "fp32" + if _proc["compile"] != "auto": + p["compile"] = bool(_proc["compile"]) + if _proc["flash_attention"] != "auto": + fa = _proc["flash_attention"] + if fa is False or fa == "sdpa": + p["attention"] = "sdpa" + p["fa3_repo"] = None + elif isinstance(fa, str) and fa not in ("auto", "sdpa"): + p["attention"] = "flash" + p["fa3_repo"] = fa + pf = _proc.get("peak_flops", "auto") + if pf != "auto": + p["peak_flops"] = float(pf) + return p + +PLATFORM = _detect_platform() # --------------------------------------------------------------------------- # Data download @@ -81,8 +189,8 @@ def download_single_shard(index): if os.path.exists(path): try: os.remove(path) - except OSError: - pass + except OSError as cleanup_err: + print(f" Warning: failed to clean up {path}: {cleanup_err}") if attempt < max_attempts: time.sleep(2 ** attempt) return False @@ -370,8 +478,8 @@ def evaluate_bpb(model, tokenizer, batch_size): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Prepare data and tokenizer for autoresearch") - parser.add_argument("--num-shards", type=int, default=10, help="Number of training shards to download (-1 = all). 
Val shard is always pinned.") - parser.add_argument("--download-workers", type=int, default=8, help="Number of parallel download workers") + parser.add_argument("--num-shards", type=int, default=_data["num_shards"], help="Number of training shards to download (-1 = all). Val shard is always pinned.") + parser.add_argument("--download-workers", type=int, default=_data["download_workers"], help="Number of parallel download workers") args = parser.parse_args() num_shards = MAX_SHARD if args.num_shards == -1 else args.num_shards From 1ac6081c6ba128ca5e0cda5d369cfcec0b8157b7 Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 16:00:33 +0200 Subject: [PATCH 04/10] =?UTF-8?q?train.py=20[MODIFIED=20+194/-55]=20?= =?UTF-8?q?=E2=80=94=20load=20from=20model.json,=20import=20PLATFORM,=20GP?= =?UTF-8?q?U-safe=20numerics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace hardcoded hyperparameters with model.json reads at startup. Import PLATFORM dict from prepare.py for dtype, attention, compile, GradScaler configuration. 
Key changes: - fp32 optimizer moments for fp16 parameters (Turing numerical stability) - Gradient upcast to fp32 in AdamW update step - _MUON_ORTHO_DTYPE: float32 for Turing (CC<8), bfloat16 for Ampere+ - Sliding-window attention mask caching (avoid recomputation per step) - torch.amp.GradScaler(enabled=PLATFORM['use_grad_scaler']) - autocast dtype from PLATFORM['dtype'] - update_research_memory(): append experiment outcome to sessions/memory.md (agent-owned, never writes to program.md) - _crash_handler: sys.excepthook that calls update_research_memory on crash - Parseable '---'-delimited key=value summary block at end of training Upstream ref: karpathy/autoresearch master @ c2450ad Master blob: 2e743974c7f06b54311643b314712303fbb26e65 Modified blob: bba5418f47684344e986f6697249770048549606 --- train.py | 249 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 194 insertions(+), 55 deletions(-) diff --git a/train.py b/train.py index 2e743974c..bba5418f4 100644 --- a/train.py +++ b/train.py @@ -17,13 +17,104 @@ import torch.nn as nn import torch.nn.functional as F -from kernels import get_kernel -cap = torch.cuda.get_device_capability() -# varunneal's FA3 is Hopper only, use kernels-community on non-Hopper GPUs -repo = "varunneal/flash-attention-3" if cap == (9, 0) else "kernels-community/flash-attn3" -fa3 = get_kernel(repo).flash_attn_interface +import json +import sys +import re +import csv +import traceback +from datetime import datetime -from prepare import MAX_SEQ_LEN, TIME_BUDGET, Tokenizer, make_dataloader, evaluate_bpb +from prepare import MAX_SEQ_LEN, TIME_BUDGET, PLATFORM, Tokenizer, make_dataloader, evaluate_bpb, get_token_bytes + +# --------------------------------------------------------------------------- +# Persistent experiment memory (hardened) +# --------------------------------------------------------------------------- +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_RESULTS_PATH = os.path.join(_SCRIPT_DIR, 
"results.tsv") +_MEMORY_PATH = os.path.join(_SCRIPT_DIR, "sessions", "memory.md") + + +def update_research_memory(outcome_str: str) -> None: + """Write experiment memory summary to sessions/memory.md. + + Reads results.tsv to find top historical runs and writes a Markdown + summary. Never mutates program.md or any user-owned file. + """ + if not os.path.exists(_RESULTS_PATH): + return + + # 1. Parse active hyperparameters from model.json (already loaded) + try: + hparams = [ + f"DEPTH={_model_cfg['architecture']['depth']}", + f"BS=2**{_model_cfg['optimization']['total_batch_size_power']}", + f"LR={_model_cfg['optimization']['matrix_lr']}", + ] + hparam_str = ", ".join(hparams) + except (KeyError, TypeError) as exc: + print(f"WARNING: could not read hyperparams for memory: {exc}") + hparam_str = "unknown" + + # 2. Parse top historical runs from results.tsv + runs: list[dict] = [] + try: + with open(_RESULTS_PATH, "r", encoding="utf-8") as f: + for row in csv.DictReader(f, delimiter="\t"): + try: + v = float(row.get("val_bpb", "0")) + if v > 0: + runs.append({ + "run_id": row.get("run_id", "unknown")[:8], + "val_bpb": v, + "desc": row.get("description", "")[:60], + }) + except (ValueError, TypeError): + continue # skip rows with unparseable val_bpb + except OSError as exc: + print(f"WARNING: could not read results.tsv for memory: {exc}") + return + + # 3. Build memory payload + ts = datetime.now().strftime("%Y-%m-%d %H:%M") + lines = [f"## Experiment Memory (auto-generated {ts})\n"] + if runs: + runs.sort(key=lambda x: x["val_bpb"]) + lines.append(f"**Best val_bpb ever:** {runs[0]['val_bpb']:.4f}\n") + lines.append("**Top runs:**\n") + for i, r in enumerate(runs[:5], 1): + lines.append(f"{i}. [{r['run_id']}] {r['desc']} -> {r['val_bpb']:.4f}") + lines.append("") + lines.append(f"**Last run context:**") + lines.append(f"- Hyperparams: `{hparam_str}`") + lines.append(f"- Outcome: {outcome_str}\n") + + # 4. 
Write to sessions/memory.md (agent-owned, never program.md) + os.makedirs(os.path.dirname(_MEMORY_PATH), exist_ok=True) + try: + with open(_MEMORY_PATH, "w", encoding="utf-8") as f: + f.write("\n".join(lines) + "\n") + except OSError as exc: + print(f"WARNING: could not write experiment memory: {exc}") + + +def _crash_handler(exc_type, exc_val, exc_tb): + """On unhandled exception, record crash in experiment memory then re-raise.""" + err = "".join(traceback.format_exception_only(exc_type, exc_val)).strip() + try: + update_research_memory(f"CRASHED: `{err}`") + except Exception: + pass # crash handler must never itself crash + sys.__excepthook__(exc_type, exc_val, exc_tb) + + +sys.excepthook = _crash_handler + +# --------------------------------------------------------------------------- +# Load model configuration from model.json +# --------------------------------------------------------------------------- +_model_cfg_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "model.json") +with open(_model_cfg_path, "r") as _f: + _model_cfg = json.load(_f) # --------------------------------------------------------------------------- # GPT Model @@ -31,11 +122,11 @@ @dataclass class GPTConfig: - sequence_len: int = 2048 - vocab_size: int = 32768 - n_layer: int = 12 - n_head: int = 6 - n_kv_head: int = 6 + sequence_len: int = 1024 + vocab_size: int = 16384 + n_layer: int = 6 + n_head: int = 2 + n_kv_head: int = 2 n_embd: int = 768 window_pattern: str = "SSSL" @@ -73,6 +164,9 @@ def __init__(self, config, layer_idx): self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False) self.ve_gate_channels = 32 self.ve_gate = nn.Linear(self.ve_gate_channels, self.n_kv_head, bias=False) if has_ve(layer_idx, config.n_layer) else None + # Sliding-window mask cache (constant for a given T × win; avoids re-creation every forward) + self._cached_mask = None + self._cached_mask_key = (0, 0) def forward(self, x, ve, cos_sin, window_size): B, T, C = x.size() @@ -90,8 +184,26 
@@ def forward(self, x, ve, cos_sin, window_size): q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) q, k = norm(q), norm(k) - y = fa3.flash_attn_func(q, k, v, causal=True, window_size=window_size) - y = y.contiguous().view(B, T, -1) + # Transpose to (B, H, T, D) for SDPA + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + win = window_size[0] + if win >= T: + # Full context — use efficient is_causal path + y = F.scaled_dot_product_attention(q, k, v, is_causal=True) + else: + # Sliding window causal — use cached mask + key = (T, win) + if self._cached_mask_key != key: + rows = torch.arange(T, device=q.device).unsqueeze(1) + cols = torch.arange(T, device=q.device).unsqueeze(0) + self._cached_mask = (cols <= rows) & ((rows - cols) < win) + self._cached_mask_key = key + y = F.scaled_dot_product_attention(q, k, v, attn_mask=self._cached_mask) + + y = y.transpose(1, 2).contiguous().view(B, T, -1) y = self.c_proj(y) return y @@ -175,10 +287,7 @@ def init_weights(self): head_dim = self.config.n_embd // self.config.n_head cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) self.cos, self.sin = cos, sin - # Cast embeddings to bf16 - self.transformer.wte.to(dtype=torch.bfloat16) - for ve in self.value_embeds.values(): - ve.to(dtype=torch.bfloat16) + # Embeddings stay in FP32; autocast handles FP16 during forward def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): if device is None: @@ -188,7 +297,7 @@ def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=No t = torch.arange(seq_len, dtype=torch.float32, device=device) freqs = torch.outer(t, inv_freq) cos, sin = freqs.cos(), freqs.sin() - cos, sin = cos.bfloat16(), sin.bfloat16() + cos, sin = cos.float(), sin.float() cos, sin = cos[None, :, None, :], sin[None, :, None, :] return cos, sin @@ -302,26 +411,25 @@ def forward(self, idx, targets=None, reduction='mean'): (2.3465413258596377, 
-1.7097828382687081, 0.42323551169305323), ] -@torch.compile(dynamic=False, fullgraph=True) def adamw_step_fused(p, grad, exp_avg, exp_avg_sq, step_t, lr_t, beta1_t, beta2_t, eps_t, wd_t): p.mul_(1 - lr_t * wd_t) - exp_avg.lerp_(grad, 1 - beta1_t) - exp_avg_sq.lerp_(grad.square(), 1 - beta2_t) + g = grad.to(exp_avg.dtype) # upcast fp16 grads to fp32 moment dtype + exp_avg.lerp_(g, 1 - beta1_t) + exp_avg_sq.lerp_(g.square(), 1 - beta2_t) bias1 = 1 - beta1_t ** step_t bias2 = 1 - beta2_t ** step_t denom = (exp_avg_sq / bias2).sqrt() + eps_t step_size = lr_t / bias1 - p.add_(exp_avg / denom, alpha=-step_size) + p.add_((exp_avg / denom * (-step_size)).to(p.dtype)) -@torch.compile(dynamic=False, fullgraph=True) def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momentum_buffer, momentum_t, lr_t, wd_t, beta2_t, ns_steps, red_dim): # Nesterov momentum momentum = momentum_t.to(stacked_grads.dtype) momentum_buffer.lerp_(stacked_grads, 1 - momentum) g = stacked_grads.lerp_(momentum_buffer, momentum) - # Polar express orthogonalization - X = g.bfloat16() + # Polar express orthogonalization — float32 on Turing (no hw bf16); bf16 on Ampere+ + X = g.to(_MUON_ORTHO_DTYPE) X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6) if g.size(-2) > g.size(-1): for a, b, c in polar_express_coeffs[:ns_steps]: @@ -353,6 +461,14 @@ def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momen stacked_params.sub_(lr * g + lr * wd * stacked_params * mask) +# Muon orthogonalization dtype: bf16 on Ampere+ (hw support), float32 on Turing/older +_MUON_ORTHO_DTYPE = ( + torch.bfloat16 + if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 + else torch.float32 +) + + class MuonAdamW(torch.optim.Optimizer): """Combined optimizer: Muon for 2D matrix params, AdamW for others.""" @@ -378,8 +494,10 @@ def _step_adamw(self, group): state = self.state[p] if not state: state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p) - 
state['exp_avg_sq'] = torch.zeros_like(p) + # float32 moments for fp16 params — prevents grad² underflow + moment_dtype = torch.float32 if p.dtype == torch.float16 else p.dtype + state['exp_avg'] = torch.zeros(p.shape, dtype=moment_dtype, device=p.device) + state['exp_avg_sq'] = torch.zeros(p.shape, dtype=moment_dtype, device=p.device) state['step'] += 1 self._adamw_step_t.fill_(state['step']) self._adamw_lr_t.fill_(group['lr']) @@ -426,29 +544,27 @@ def step(self): self._step_muon(group) # --------------------------------------------------------------------------- -# Hyperparameters (edit these directly, no CLI flags needed) +# Hyperparameters — loaded from model.json (single source of truth) # --------------------------------------------------------------------------- # Model architecture -ASPECT_RATIO = 64 # model_dim = depth * ASPECT_RATIO -HEAD_DIM = 128 # target head dimension for attention -WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context +DEPTH = _model_cfg["architecture"]["depth"] +ASPECT_RATIO = _model_cfg["architecture"]["aspect_ratio"] +HEAD_DIM = _model_cfg["architecture"]["head_dim"] +WINDOW_PATTERN = _model_cfg["architecture"]["window_pattern"] # Optimization -TOTAL_BATCH_SIZE = 2**19 # ~524K tokens per optimizer step -EMBEDDING_LR = 0.6 # learning rate for token embeddings (Adam) -UNEMBEDDING_LR = 0.004 # learning rate for lm_head (Adam) -MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon) -SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam) -WEIGHT_DECAY = 0.2 # cautious weight decay for Muon -ADAM_BETAS = (0.8, 0.95) # Adam beta1, beta2 -WARMUP_RATIO = 0.0 # fraction of time budget for LR warmup -WARMDOWN_RATIO = 0.5 # fraction of time budget for LR warmdown -FINAL_LR_FRAC = 0.0 # final LR as fraction of initial - -# Model size -DEPTH = 8 # number of transformer layers -DEVICE_BATCH_SIZE = 128 # per-device batch size (reduce if OOM) +TOTAL_BATCH_SIZE = 2 ** 
_model_cfg["optimization"]["total_batch_size_power"] +DEVICE_BATCH_SIZE = _model_cfg["optimization"]["device_batch_size"] +EMBEDDING_LR = _model_cfg["optimization"]["embedding_lr"] +UNEMBEDDING_LR = _model_cfg["optimization"]["unembedding_lr"] +MATRIX_LR = _model_cfg["optimization"]["matrix_lr"] +SCALAR_LR = _model_cfg["optimization"]["scalar_lr"] +WEIGHT_DECAY = _model_cfg["optimization"]["weight_decay"] +ADAM_BETAS = tuple(_model_cfg["optimization"]["adam_betas"]) +WARMUP_RATIO = _model_cfg["optimization"]["warmup_ratio"] +WARMDOWN_RATIO = _model_cfg["optimization"]["warmdown_ratio"] +FINAL_LR_FRAC = _model_cfg["optimization"]["final_lr_frac"] # --------------------------------------------------------------------------- # Setup: tokenizer, model, optimizer, dataloader @@ -459,8 +575,9 @@ def step(self): torch.cuda.manual_seed(42) torch.set_float32_matmul_precision("high") device = torch.device("cuda") -autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16) -H100_BF16_PEAK_FLOPS = 989.5e12 +_dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} +autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=_dtype_map[PLATFORM["dtype"]]) +PEAK_FLOPS = PLATFORM["peak_flops"] tokenizer = Tokenizer.from_directory() vocab_size = tokenizer.get_vocab_size() @@ -505,7 +622,9 @@ def build_model_config(depth): weight_decay=WEIGHT_DECAY, ) -model = torch.compile(model, dynamic=False) +# torch.compile disabled — requires Triton (unavailable on Windows) + +scaler = torch.amp.GradScaler(enabled=PLATFORM["use_grad_scaler"]) train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, MAX_SEQ_LEN, "train") x, y, epoch = next(train_loader) # prefetch first batch @@ -548,7 +667,7 @@ def get_weight_decay(progress): loss = model(x, y) train_loss = loss.detach() loss = loss / grad_accum_steps - loss.backward() + scaler.scale(loss).backward() x, y, epoch = next(train_loader) # Progress and schedules @@ -561,7 +680,9 @@ def 
get_weight_decay(progress): if group['kind'] == 'muon': group["momentum"] = muon_momentum group["weight_decay"] = muon_weight_decay - optimizer.step() + scaler.unscale_(optimizer) + scaler.step(optimizer) + scaler.update() model.zero_grad(set_to_none=True) train_loss_f = train_loss.item() @@ -584,7 +705,7 @@ def get_weight_decay(progress): debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) pct_done = 100 * progress tok_per_sec = int(TOTAL_BATCH_SIZE / dt) - mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE / dt / H100_BF16_PEAK_FLOPS + mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE / dt / PEAK_FLOPS remaining = max(0, TIME_BUDGET - total_training_time) print(f"\rstep {step:05d} ({pct_done:.1f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt*1000:.0f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.1f}% | epoch: {epoch} | remaining: {remaining:.0f}s ", end="", flush=True) @@ -607,15 +728,31 @@ def get_weight_decay(progress): total_tokens = step * TOTAL_BATCH_SIZE -# Final eval +# Final eval — fast version: ~3M tokens instead of 20M to stay within time budget +# (seq=2048 makes full 20M eval take ~420s; 3M ≈ 60s; same val shard so comparisons are valid) +EVAL_BATCH_SIZE = _model_cfg["evaluation"]["batch_size"] +FAST_EVAL_TOKENS = _model_cfg["evaluation"]["tokens"] +eval_steps = FAST_EVAL_TOKENS // (EVAL_BATCH_SIZE * MAX_SEQ_LEN) model.eval() -with autocast_ctx: - val_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE) +_token_bytes = get_token_bytes(device="cuda") +_val_loader = make_dataloader(tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LEN, "val") +_total_nats = 0.0 +_total_bytes = 0 +with torch.no_grad(), autocast_ctx: + for _ in range(eval_steps): + _x, _y, _ = next(_val_loader) + _loss_flat = model(_x, _y, reduction='none').view(-1) + _y_flat = _y.view(-1) + _nbytes = _token_bytes[_y_flat] + _mask = _nbytes > 0 + _total_nats += (_loss_flat * _mask).sum().item() + _total_bytes += _nbytes.sum().item() +val_bpb = _total_nats / 
(math.log(2) * _total_bytes) # Final summary t_end = time.time() startup_time = t_start_training - t_start -steady_state_mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 10) / total_training_time / H100_BF16_PEAK_FLOPS if total_training_time > 0 else 0 +steady_state_mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 10) / total_training_time / PEAK_FLOPS if total_training_time > 0 else 0 peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024 print("---") @@ -628,3 +765,5 @@ def get_weight_decay(progress): print(f"num_steps: {step}") print(f"num_params_M: {num_params / 1e6:.1f}") print(f"depth: {DEPTH}") + +update_research_memory(f"val_bpb={val_bpb:.4f}, steps={step}, tokens={total_tokens/1e6:.1f}M") \ No newline at end of file From 059b937415e632d3b5c08508429526fea41f3b15 Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 16:00:45 +0200 Subject: [PATCH 05/10] =?UTF-8?q?program.md=20[MODIFIED=20+96/-114]=20?= =?UTF-8?q?=E2=80=94=20structured=20agent=20protocol=20replacing=20free-fo?= =?UTF-8?q?rm=20narrative?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite agent instructions as a structured protocol with numbered sections: 1. Orientation — mandatory file reads (ground.json, model.json, prepare.py, train.py) 2. Decision metrics — table: val_bpb, peak_vram_mb, mfu_percent, training_seconds, total_tokens_M, num_params_M 3. File ownership — governance table: user-owned read-only (ground.json, prepare.py, program.md) vs agent-owned editable (model.json, train.py, results.tsv) 4. Execution sequence — first run (setup + baseline) and subsequent runs (hypothesis-driven experiment loop with keep/discard/crash status) 5. Logging rules — per-run log files in sessions/, append-only results.tsv 6. Constraints — time budget enforcement, no new packages, edit restrictions 7. 
Autonomy — continue iterating until manually stopped Upstream ref: karpathy/autoresearch master @ c2450ad Master blob: dea9bcc0174f1502d0ba64000b94b81ba605855b Modified blob: 46ca3df525219136b42d0147e7522cef0cd29e99 --- program.md | 210 ++++++++++++++++++++++++----------------------------- 1 file changed, 96 insertions(+), 114 deletions(-) diff --git a/program.md b/program.md index dea9bcc01..46ca3df52 100644 --- a/program.md +++ b/program.md @@ -1,114 +1,96 @@ -# autoresearch - -This is an experiment to have the LLM do its own research. - -## Setup - -To set up a new experiment, work with the user to: - -1. **Agree on a run tag**: propose a tag based on today's date (e.g. `mar5`). The branch `autoresearch/` must not already exist — this is a fresh run. -2. **Create the branch**: `git checkout -b autoresearch/` from current master. -3. **Read the in-scope files**: The repo is small. Read these files for full context: - - `README.md` — repository context. - - `prepare.py` — fixed constants, data prep, tokenizer, dataloader, evaluation. Do not modify. - - `train.py` — the file you modify. Model architecture, optimizer, training loop. -4. **Verify data exists**: Check that `~/.cache/autoresearch/` contains data shards and a tokenizer. If not, tell the human to run `uv run prepare.py`. -5. **Initialize results.tsv**: Create `results.tsv` with just the header row. The baseline will be recorded after the first run. -6. **Confirm and go**: Confirm setup looks good. - -Once you get confirmation, kick off the experimentation. - -## Experimentation - -Each experiment runs on a single GPU. The training script runs for a **fixed time budget of 5 minutes** (wall clock training time, excluding startup/compilation). You launch it simply as: `uv run train.py`. - -**What you CAN do:** -- Modify `train.py` — this is the only file you edit. Everything is fair game: model architecture, optimizer, hyperparameters, training loop, batch size, model size, etc. 
- -**What you CANNOT do:** -- Modify `prepare.py`. It is read-only. It contains the fixed evaluation, data loading, tokenizer, and training constants (time budget, sequence length, etc). -- Install new packages or add dependencies. You can only use what's already in `pyproject.toml`. -- Modify the evaluation harness. The `evaluate_bpb` function in `prepare.py` is the ground truth metric. - -**The goal is simple: get the lowest val_bpb.** Since the time budget is fixed, you don't need to worry about training time — it's always 5 minutes. Everything is fair game: change the architecture, the optimizer, the hyperparameters, the batch size, the model size. The only constraint is that the code runs without crashing and finishes within the time budget. - -**VRAM** is a soft constraint. Some increase is acceptable for meaningful val_bpb gains, but it should not blow up dramatically. - -**Simplicity criterion**: All else being equal, simpler is better. A small improvement that adds ugly complexity is not worth it. Conversely, removing something and getting equal or better results is a great outcome — that's a simplification win. When evaluating whether to keep a change, weigh the complexity cost against the improvement magnitude. A 0.001 val_bpb improvement that adds 20 lines of hacky code? Probably not worth it. A 0.001 val_bpb improvement from deleting code? Definitely keep. An improvement of ~0 but much simpler code? Keep. - -**The first run**: Your very first run should always be to establish the baseline, so you will run the training script as is. 
- -## Output format - -Once the script finishes it prints a summary like this: - -``` ---- -val_bpb: 0.997900 -training_seconds: 300.1 -total_seconds: 325.9 -peak_vram_mb: 45060.2 -mfu_percent: 39.80 -total_tokens_M: 499.6 -num_steps: 953 -num_params_M: 50.3 -depth: 8 -``` - -Note that the script is configured to always stop after 5 minutes, so depending on the computing platform of this computer the numbers might look different. You can extract the key metric from the log file: - -``` -grep "^val_bpb:" run.log -``` - -## Logging results - -When an experiment is done, log it to `results.tsv` (tab-separated, NOT comma-separated — commas break in descriptions). - -The TSV has a header row and 5 columns: - -``` -commit val_bpb memory_gb status description -``` - -1. git commit hash (short, 7 chars) -2. val_bpb achieved (e.g. 1.234567) — use 0.000000 for crashes -3. peak memory in GB, round to .1f (e.g. 12.3 — divide peak_vram_mb by 1024) — use 0.0 for crashes -4. status: `keep`, `discard`, or `crash` -5. short text description of what this experiment tried - -Example: - -``` -commit val_bpb memory_gb status description -a1b2c3d 0.997900 44.0 keep baseline -b2c3d4e 0.993200 44.2 keep increase LR to 0.04 -c3d4e5f 1.005000 44.0 discard switch to GeLU activation -d4e5f6g 0.000000 0.0 crash double model width (OOM) -``` - -## The experiment loop - -The experiment runs on a dedicated branch (e.g. `autoresearch/mar5` or `autoresearch/mar5-gpu0`). - -LOOP FOREVER: - -1. Look at the git state: the current branch/commit we're on -2. Tune `train.py` with an experimental idea by directly hacking the code. -3. git commit -4. Run the experiment: `uv run train.py > run.log 2>&1` (redirect everything — do NOT use tee or let output flood your context) -5. Read out the results: `grep "^val_bpb:\|^peak_vram_mb:" run.log` -6. If the grep output is empty, the run crashed. Run `tail -n 50 run.log` to read the Python stack trace and attempt a fix. 
If you can't get things to work after more than a few attempts, give up. -7. Record the results in the tsv (NOTE: do not commit the results.tsv file, leave it untracked by git) -8. If val_bpb improved (lower), you "advance" the branch, keeping the git commit -9. If val_bpb is equal or worse, you git reset back to where you started - -The idea is that you are a completely autonomous researcher trying things out. If they work, keep. If they don't, discard. And you're advancing the branch so that you can iterate. If you feel like you're getting stuck in some way, you can rewind but you should probably do this very very sparingly (if ever). - -**Timeout**: Each experiment should take ~5 minutes total (+ a few seconds for startup and eval overhead). If a run exceeds 10 minutes, kill it and treat it as a failure (discard and revert). - -**Crashes**: If a run crashes (OOM, or a bug, or etc.), use your judgment: If it's something dumb and easy to fix (e.g. a typo, a missing import), fix it and re-run. If the idea itself is fundamentally broken, just skip it, log "crash" as the status in the tsv, and move on. - -**NEVER STOP**: Once the experiment loop has begun (after the initial setup), do NOT pause to ask the human if you should continue. Do NOT ask "should I keep going?" or "is this a good stopping point?". The human might be asleep, or gone from a computer and expects you to continue working *indefinitely* until you are manually stopped. You are autonomous. If you run out of ideas, think harder — read papers referenced in the code, re-read the in-scope files for new angles, try combining previous near-misses, try more radical architectural changes. The loop runs until the human interrupts you, period. - -As an example use case, a user might leave you running while they sleep. If each experiment takes you ~5 minutes then you can run approx 12/hour, for a total of about 100 over the duration of the average human sleep. 
The user then wakes up to experimental results, all completed by you while they slept! +# autoresearch — agent instructions + +## 1. Orientation (do this first, every run) + +Before making any changes, read and understand the codebase: + +1. Read `ground.json` — the user-owned, read-only configuration: + - `mode`: `"test"` or `"train"` — determines which time budget applies. + - `training.time_budget_test` / `training.time_budget_train` — the wall-clock seconds the training loop is allowed to run. **Respect this strictly.** + - `training.max_seq_len` — sequence length, fixed. + - `processor` — dtype, compile, flash_attention, peak_flops overrides (all `"auto"` by default). +2. Read `model.json` — your hyperparameter file (you own this): + - `architecture`: depth, aspect_ratio, head_dim, window_pattern. + - `optimization`: batch sizes, learning rates, weight decay, adam betas, warmup/warmdown ratios, final_lr_frac. + - `evaluation`: batch_size, tokens (for the fast eval after training). +3. Read `prepare.py` — understand but **never edit**: + - Exports: `MAX_SEQ_LEN`, `TIME_BUDGET`, `PLATFORM`, `Tokenizer`, `make_dataloader`, `evaluate_bpb`, `get_token_bytes`. + - `PLATFORM` dict: device, dtype, use_grad_scaler, attention, compile, peak_flops (auto-detected from GPU hardware specs). +4. Read `train.py` — the model and training loop (you own this): + - Loads all hyperparameters from `model.json` at startup. + - Imports platform config from `prepare.py`. + - Prints a `---` separator followed by key=value summary lines at the end of training. +5. Note the key metric: **`val_bpb`** (bits per byte) — lower is better. This is printed by `train.py` after the training loop completes. + +## 2. 
Decision metrics + +Use these to guide your experiment choices: + +| Metric | Source | Meaning | +|---|---|---| +| `val_bpb` | train.py stdout | Primary objective — minimize this | +| `peak_vram_mb` | train.py stdout | Must not OOM — watch this when increasing batch/model size | +| `mfu_percent` | train.py stdout | Hardware utilization — indicates if compute is bottlenecked | +| `training_seconds` | train.py stdout | Must stay within `TIME_BUDGET` | +| `total_tokens_M` | train.py stdout | Throughput — more tokens = more learning within budget | +| `num_params_M` | train.py stdout | Model capacity — larger is not always better under time constraint | + +## 3. File ownership + +| File | Owner | Editable | Purpose | +|---|---|---|---| +| `ground.json` | User | **NO** | Platform config, data paths, time budgets | +| `prepare.py` | User | **NO** | Data prep, tokenizer, dataloader, eval, platform detection | +| `model.json` | Agent | **YES** | Architecture + optimization hyperparameters | +| `train.py` | Agent | **YES** | Model definition, optimizer, training loop | +| `results.tsv` | Agent | **YES** | Experiment log — append only | +| `program.md` | User | **NO** | This document | + +## 4. Execution sequence + +### First run (setup) + +1. Run `uv run prepare.py` to ensure data and tokenizer are cached. +2. Initialize `results.tsv` with this exact header (tab-separated): + + ``` + run_id val_bpb peak_vram_mb mfu_percent training_seconds total_tokens_M num_params_M status description + ``` + +3. Run `uv run train.py`, capturing stdout to `sessions/.log`. + - `run_id` = short git commit hash or a timestamp tag — unique per run. +4. Parse the `---` block from the log to extract metrics. +5. Append one row to `results.tsv` with the extracted values and `status=baseline`. + +### Subsequent runs (experiment loop) + +1. Form one hypothesis from the current code and most recent run metrics. +2. Edit `model.json` and/or `train.py`. +3. 
Commit with a message describing the hypothesis.
+4. Run `uv run train.py`, capturing stdout to `sessions/<run_id>.log` (use the new commit hash as `run_id`).
+5. Parse the `---` block. Extract `val_bpb`, `peak_vram_mb`, `mfu_percent`, `training_seconds`, `total_tokens_M`, `num_params_M`.
+6. Append one row to `results.tsv`:
+   - `status=keep` if val_bpb improved.
+   - `status=discard` if val_bpb did not improve.
+   - `status=crash` if the run failed.
+7. If `discard` or `crash`: revert with `git reset --hard HEAD~1`.
+8. Continue to next hypothesis.
+
+## 5. Logging rules
+
+- Every run MUST have its own log file: `sessions/<run_id>.log`.
+- Every run MUST have exactly one row appended to `results.tsv`.
+- The `run_id` in `results.tsv` must match the log filename (without `.log`).
+- Never overwrite or delete previous log files or results rows.
+
+## 6. Constraints
+
+1. **Time budget**: `train.py` self-enforces via `TIME_BUDGET` from `ground.json`. Do not circumvent this.
+2. **No new packages**: use only what is already installed in the environment.
+3. **Do not edit** `ground.json`, `prepare.py`, or `program.md`.
+4. **Prefer simpler changes** when two options yield similar `val_bpb`.
+5. **VRAM**: if a run OOMs, reduce `device_batch_size` in `model.json` or model size before retrying.
+
+## 7. Autonomy
+
+Continue iterating experiments until manually stopped. Do not pause for permission between runs.

From 849b0fd0e95c7e4389fe30dcf3214d00026a5d66 Mon Sep 17 00:00:00 2001
From: Shehab Anwer | The Adimension
Date: Mon, 16 Mar 2026 16:00:55 +0200
Subject: =?UTF-8?q?.gitignore=20[MODIFIED=20+5/-2]=20?=
 =?UTF-8?q?=E2=80=94=20update=20ignore=20rules=20for=20multi-GPU=20workflo?=
 =?UTF-8?q?w?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove results.tsv from ignore list (now tracked as append-only experiment log).
Add *.pkl (serialized model checkpoints) and run.log (runtime log).
Master blob: 99c30f52f1cb7b022668ec7215a604aa6b96f77a Modified blob: 986b512a60410506dd6f553fb2fc0ec503061305 --- .gitignore | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 99c30f52f..986b512a6 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,8 @@ AGENTS.md # Experimental code/artifacts dev/ -# Results file -results.tsv +# Cached data files +*.pkl + +# Training logs +run.log From 64850c1edb7587939c2f74475edba6c0fedf2b9f Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 16:01:05 +0200 Subject: [PATCH 07/10] =?UTF-8?q?analysis.ipynb=20[MODIFIED=20metadata]=20?= =?UTF-8?q?=E2=80=94=20update=20kernel=20to=20Python=203.12.10?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel display name changed from '.venv' to 'Python 3'. Python version updated from 3.10.12 to 3.12.10. No code cell changes; 11 cells (none executed). Master blob: 8455ea4e34deb974a54df85072639f79673862b4 Modified blob: af828561e74b97754a9945e7904c2696c39aa100 --- analysis.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis.ipynb b/analysis.ipynb index 8455ea4e3..af828561e 100644 --- a/analysis.ipynb +++ b/analysis.ipynb @@ -224,7 +224,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -238,7 +238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.10" } }, "nbformat": 4, From e382d9095286f1dc05a5a869b6dd3320dc64d0c6 Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Mon, 16 Mar 2026 16:01:20 +0200 Subject: [PATCH 08/10] =?UTF-8?q?README.md=20[MODIFIED]=20=E2=80=94=20add?= =?UTF-8?q?=20Memory-in-the-Loop=20fork=20introduction=20with=20DEITY=20Pr?= =?UTF-8?q?inciples=20mapping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Add new section between karpathy's introduction and 'How it works': - Title: Karpathy's AutoResearch with Memory-in-the-Loop States - Byline: Shehab Anwer, MD (habanwer, The Adimension) - DEITY Principles mapping: Data (JSON configs), Ethics (file ownership governance), Informatics (structured protocol), Technology (GPU platform detection Volta-Blackwell), You (human-machine loop via update_research_memory) - Blockquote linking to published framework paper: doi.org/10.1093/ehjimp/qyaf038 (Eur Heart J Imaging Methods Pract, 2025) - Upstream and related fork attribution Master blob: 2bc305168116f7e6a6822941baee5a6eb99343b9 Modified blob: 44296aeab492a12d8672cc21b7466126cba1846a --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 2bc305168..44296aeab 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,30 @@ The idea: give an AI agent a small but real LLM training setup and let it experiment autonomously overnight. It modifies the code, trains for 5 minutes, checks if the result improved, keeps or discards, and repeats. You wake up in the morning to a log of experiments and (hopefully) a better model. The training code here is a simplified single-GPU implementation of [nanochat](https://github.com/karpathy/nanochat). The core idea is that you're not touching any of the Python files like you normally would as a researcher. Instead, you are programming the `program.md` Markdown files that provide context to the AI agents and set up your autonomous research org. The default `program.md` in this repo is intentionally kept as a bare bones baseline, though it's obvious how one would iterate on it over time to find the "research org code" that achieves the fastest research progress, how you'd add more agents to the mix, etc. A bit more context on this project is here in this [tweet](https://x.com/karpathy/status/2029701092347630069). 
+--- + +## [Karpathy](https://github.com/karpathy)'s [AutoResearch](https://github.com/karpathy/autoresearch) with Memory-in-the-Loop States + +*by [Shehab Anwer, MD](https://doi.org/10.1093/ehjimp/qyaf038) (GitHub: [habanwer](https://github.com/habanwer) · [The Adimension](https://github.com/the-adimension))* + +> *This fork applies the **DEITY Principles Framework** — **D**ata, **E**thics, **I**nformatics, **T**echnology, and **Y**ou — to implement human-machine interoperability & transparency in research protocols, especially when automation and autonomy is the scope. The framework is described in [The Adimension: bridging human ingenuity and machine intelligence through the DEITY principles framework](https://doi.org/10.1093/ehjimp/qyaf038) (European Heart Journal — Imaging Methods and Practice, 2025).* + +### What changed (relative to [karpathy/autoresearch](https://github.com/karpathy/autoresearch) in alignment with the [Adimension](https://theadimension.ch/Introduction.html)'s [DEITY Principles]) + +**Data.** Hardcoded constants extracted into machine-readable JSON configs. Training platform, data paths, and time budgets live in `ground.json`; architecture and optimization hyperparameters live in `model.json`. Experiment results are logged in three formats: JSON (structured config), TSV (metrics), and Markdown (human/LLM-readable memory). + +**Ethics.** Explicit file ownership governance: `ground.json` and `program.md` are user-owned and read-only; `model.json` and `train.py` are agent-owned and editable through the automation experiment. The agent writes experiment memory to `sessions/memory.md` — pinned to the Git branches id. Results are append-only, and a crash handler persists state even on failure. timestamped and SHA-verified outputs are logged. + +**Informatics.** `program.md` introduces a structured agent protocol with an orientation checklist, decision metrics table, execution sequence, and logging rules. 
Train output uses a parseable `---`-delimited key=value block so metrics flow directly into the experiment log. All data paths are configurable in `ground.json` for transparency and reproducibility to enable human review and auditing of the research process, and to allow for future integration with external data sources or experiment tracking tools and agents. + +**Technology.** Runtime GPU platform detection spanning Volta (SM 7.0) through Blackwell (SM 10.0). `prepare.py` auto-selects dtype, attention backend, `torch.compile`, and `GradScaler` per GPU generation, and computes peak TFLOPS from SM count and clock. Turing GPUs get fp16 with fp32 optimizer moments; Ampere+ get bf16. `ground.json` processor overrides allow manual tuning. Windows compile guards included. + +**You.** The human governs constraints (`ground.json`, `program.md`); the agent experiments autonomously within them (`model.json`, `train.py`). `update_research_memory()` closes the loop — experiment outcomes persist to `sessions/memory.md` so the agent's next hypothesis is informed by all prior runs without modifying user-owned files. 
+ +**Upstream**: [karpathy/autoresearch](https://github.com/karpathy/autoresearch) — **Related fork**: [jsegov/autoresearch-win-rtx](https://github.com/jsegov/autoresearch-win-rtx) (Windows RTX adaptation referenced for platform support) + +--- + ## How it works The repo is deliberately kept small and only really has three files that matter: From 61813694e1480eb12dcfa7f4c5eb617eae7c142d Mon Sep 17 00:00:00 2001 From: Shehab Anwer | The Adimension Date: Tue, 17 Mar 2026 15:30:43 +0200 Subject: [PATCH 09/10] Address review feedback: fix eval_steps guard, ground.json validation, PLATFORM[compile] honoring, rotary dtype cast, unused imports, TSV header format, README grammar, notebook version alignment - train.py: remove unused 'import re'; add cos/sin dtype cast in apply_rotary_emb to avoid fp32 upcast under autocast; clarify SDPA boolean mask convention (True = allowed, verified against PyTorch 2.6); guard eval_steps with max(1,...) to prevent div-by-zero; honor PLATFORM['compile'] instead of unconditional disable - prepare.py: replace bare open() with try/except for FileNotFoundError and JSONDecodeError, validate required top-level keys, update comment to reflect ground.json is required (not optional) - program.md: show results.tsv header with explicit \t separators to match csv.DictReader(delimiter='\t') usage in train.py - README.md: fix subject-verb agreement ('are' not 'is'), possessive 'branch's ID', capitalize 'Timestamped' - analysis.ipynb: align language_info.version with .python-version (3.10.0) --- README.md | 4 ++-- analysis.ipynb | 2 +- prepare.py | 17 ++++++++++++++--- program.md | 4 +++- train.py | 9 +++++---- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 44296aeab..d341a79a0 100644 --- a/README.md +++ b/README.md @@ -12,13 +12,13 @@ The idea: give an AI agent a small but real LLM training setup and let it experi *by [Shehab Anwer, MD](https://doi.org/10.1093/ehjimp/qyaf038) (GitHub: 
[habanwer](https://github.com/habanwer) · [The Adimension](https://github.com/the-adimension))* -> *This fork applies the **DEITY Principles Framework** — **D**ata, **E**thics, **I**nformatics, **T**echnology, and **Y**ou — to implement human-machine interoperability & transparency in research protocols, especially when automation and autonomy is the scope. The framework is described in [The Adimension: bridging human ingenuity and machine intelligence through the DEITY principles framework](https://doi.org/10.1093/ehjimp/qyaf038) (European Heart Journal — Imaging Methods and Practice, 2025).* +> *This fork applies the **DEITY Principles Framework** — **D**ata, **E**thics, **I**nformatics, **T**echnology, and **Y**ou — to implement human-machine interoperability & transparency in research protocols, especially when automation and autonomy are the scope. The framework is described in [The Adimension: bridging human ingenuity and machine intelligence through the DEITY principles framework](https://doi.org/10.1093/ehjimp/qyaf038) (European Heart Journal — Imaging Methods and Practice, 2025).* ### What changed (relative to [karpathy/autoresearch](https://github.com/karpathy/autoresearch) in alignment with the [Adimension](https://theadimension.ch/Introduction.html)'s [DEITY Principles]) **Data.** Hardcoded constants extracted into machine-readable JSON configs. Training platform, data paths, and time budgets live in `ground.json`; architecture and optimization hyperparameters live in `model.json`. Experiment results are logged in three formats: JSON (structured config), TSV (metrics), and Markdown (human/LLM-readable memory). -**Ethics.** Explicit file ownership governance: `ground.json` and `program.md` are user-owned and read-only; `model.json` and `train.py` are agent-owned and editable through the automation experiment. The agent writes experiment memory to `sessions/memory.md` — pinned to the Git branches id. 
Results are append-only, and a crash handler persists state even on failure. timestamped and SHA-verified outputs are logged. +**Ethics.** Explicit file ownership governance: `ground.json` and `program.md` are user-owned and read-only; `model.json` and `train.py` are agent-owned and editable through the automation experiment. The agent writes experiment memory to `sessions/memory.md` — pinned to the Git branch's ID. Results are append-only, and a crash handler persists state even on failure. Timestamped and SHA-verified outputs are logged. **Informatics.** `program.md` introduces a structured agent protocol with an orientation checklist, decision metrics table, execution sequence, and logging rules. Train output uses a parseable `---`-delimited key=value block so metrics flow directly into the experiment log. All data paths are configurable in `ground.json` for transparency and reproducibility to enable human review and auditing of the research process, and to allow for future integration with external data sources or experiment tracking tools and agents. 
diff --git a/analysis.ipynb b/analysis.ipynb index af828561e..2670704fb 100644 --- a/analysis.ipynb +++ b/analysis.ipynb @@ -238,7 +238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.10.0" } }, "nbformat": 4, diff --git a/prepare.py b/prepare.py index ed13834e7..019cfe244 100644 --- a/prepare.py +++ b/prepare.py @@ -25,12 +25,23 @@ import torch # --------------------------------------------------------------------------- -# Constants — loaded from ground.json when available, else hardcoded defaults +# Constants — loaded from ground.json (required); fail with clear message # --------------------------------------------------------------------------- _GROUND_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ground.json") -with open(_GROUND_PATH, "r", encoding="utf-8") as _f: - _ground = json.load(_f) +try: + with open(_GROUND_PATH, "r", encoding="utf-8") as _f: + _ground = json.load(_f) +except FileNotFoundError: + sys.exit(f"ERROR: ground.json not found at {_GROUND_PATH}. " + "This file is required — see README.md for setup instructions.") +except json.JSONDecodeError as _e: + sys.exit(f"ERROR: ground.json is malformed: {_e}") + +_required_keys = ["training", "data", "tokenizer", "mode", "processor"] +_missing = [k for k in _required_keys if k not in _ground] +if _missing: + sys.exit(f"ERROR: ground.json missing required keys: {_missing}") _training = _ground["training"] _data = _ground["data"] diff --git a/program.md b/program.md index 46ca3df52..d2fe022cd 100644 --- a/program.md +++ b/program.md @@ -54,9 +54,11 @@ Use these to guide your experiment choices: 2. 
Initialize `results.tsv` with this exact header (tab-separated): ``` - run_id val_bpb peak_vram_mb mfu_percent training_seconds total_tokens_M num_params_M status description + run_id\tval_bpb\tpeak_vram_mb\tmfu_percent\ttraining_seconds\ttotal_tokens_M\tnum_params_M\tstatus\tdescription ``` + (Each `\t` above represents a literal tab character.) + 3. Run `uv run train.py`, capturing stdout to `sessions/.log`. - `run_id` = short git commit hash or a timestamp tag — unique per run. 4. Parse the `---` block from the log to extract metrics. diff --git a/train.py b/train.py index bba5418f4..2c0f4e9a8 100644 --- a/train.py +++ b/train.py @@ -19,7 +19,6 @@ import json import sys -import re import csv import traceback from datetime import datetime @@ -142,6 +141,7 @@ def has_ve(layer_idx, n_layer): def apply_rotary_emb(x, cos, sin): assert x.ndim == 4 + cos, sin = cos.to(x.dtype), sin.to(x.dtype) d = x.shape[3] // 2 x1, x2 = x[..., :d], x[..., d:] y1 = x1 * cos + x2 * sin @@ -199,6 +199,7 @@ def forward(self, x, ve, cos_sin, window_size): if self._cached_mask_key != key: rows = torch.arange(T, device=q.device).unsqueeze(1) cols = torch.arange(T, device=q.device).unsqueeze(0) + # Causal + sliding window: True = allowed to attend (PyTorch SDPA convention) self._cached_mask = (cols <= rows) & ((rows - cols) < win) self._cached_mask_key = key y = F.scaled_dot_product_attention(q, k, v, attn_mask=self._cached_mask) @@ -297,7 +298,6 @@ def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=No t = torch.arange(seq_len, dtype=torch.float32, device=device) freqs = torch.outer(t, inv_freq) cos, sin = freqs.cos(), freqs.sin() - cos, sin = cos.float(), sin.float() cos, sin = cos[None, :, None, :], sin[None, :, None, :] return cos, sin @@ -622,7 +622,8 @@ def build_model_config(depth): weight_decay=WEIGHT_DECAY, ) -# torch.compile disabled — requires Triton (unavailable on Windows) +if PLATFORM["compile"]: + model = torch.compile(model) scaler = 
torch.amp.GradScaler(enabled=PLATFORM["use_grad_scaler"]) @@ -732,7 +733,7 @@ def get_weight_decay(progress): # (seq=2048 makes full 20M eval take ~420s; 3M ≈ 60s; same val shard so comparisons are valid) EVAL_BATCH_SIZE = _model_cfg["evaluation"]["batch_size"] FAST_EVAL_TOKENS = _model_cfg["evaluation"]["tokens"] -eval_steps = FAST_EVAL_TOKENS // (EVAL_BATCH_SIZE * MAX_SEQ_LEN) +eval_steps = max(1, FAST_EVAL_TOKENS // (EVAL_BATCH_SIZE * MAX_SEQ_LEN)) model.eval() _token_bytes = get_token_bytes(device="cuda") _val_loader = make_dataloader(tokenizer, EVAL_BATCH_SIZE, MAX_SEQ_LEN, "val") From ecab7a79989ec7da19cb32c24b1a22d852a7c98d Mon Sep 17 00:00:00 2001 From: Shehab Anwer <168052213+habanwer@users.noreply.github.com> Date: Wed, 18 Mar 2026 10:27:18 +0200 Subject: [PATCH 10/10] Agent Instructions - Experiment AutoResearch with Memory in the Loop Agent Instructions - Experiment AutoResearch with Memory in the Loop --- .github/agents/my-agent.agent.md | 98 ++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 .github/agents/my-agent.agent.md diff --git a/.github/agents/my-agent.agent.md b/.github/agents/my-agent.agent.md new file mode 100644 index 000000000..f553ae1dc --- /dev/null +++ b/.github/agents/my-agent.agent.md @@ -0,0 +1,98 @@ +# autoresearch — agent instructions for https://github.com/habanwer/autoresearch/tree/autoresearch/memory-in-the-loop + +## 1. Orientation (do this first, every run) + +Before making any changes, read and understand the codebase: + +1. Read `ground.json` — the user-owned, read-only configuration: + - `mode`: `"test"` or `"train"` — determines which time budget applies. + - `training.time_budget_test` / `training.time_budget_train` — the wall-clock seconds the training loop is allowed to run. **Respect this strictly.** + - `training.max_seq_len` — sequence length, fixed. + - `processor` — dtype, compile, flash_attention, peak_flops overrides (all `"auto"` by default). +2. 
Read `model.json` — your hyperparameter file (you own this): + - `architecture`: depth, aspect_ratio, head_dim, window_pattern. + - `optimization`: batch sizes, learning rates, weight decay, adam betas, warmup/warmdown ratios, final_lr_frac. + - `evaluation`: batch_size, tokens (for the fast eval after training). +3. Read `prepare.py` — understand but **never edit**: + - Exports: `MAX_SEQ_LEN`, `TIME_BUDGET`, `PLATFORM`, `Tokenizer`, `make_dataloader`, `evaluate_bpb`, `get_token_bytes`. + - `PLATFORM` dict: device, dtype, use_grad_scaler, attention, compile, peak_flops (auto-detected from GPU hardware specs). +4. Read `train.py` — the model and training loop (you own this): + - Loads all hyperparameters from `model.json` at startup. + - Imports platform config from `prepare.py`. + - Prints a `---` separator followed by key=value summary lines at the end of training. +5. Note the key metric: **`val_bpb`** (bits per byte) — lower is better. This is printed by `train.py` after the training loop completes. + +## 2. Decision metrics + +Use these to guide your experiment choices: + +| Metric | Source | Meaning | +|---|---|---| +| `val_bpb` | train.py stdout | Primary objective — minimize this | +| `peak_vram_mb` | train.py stdout | Must not OOM — watch this when increasing batch/model size | +| `mfu_percent` | train.py stdout | Hardware utilization — indicates if compute is bottlenecked | +| `training_seconds` | train.py stdout | Must stay within `TIME_BUDGET` | +| `total_tokens_M` | train.py stdout | Throughput — more tokens = more learning within budget | +| `num_params_M` | train.py stdout | Model capacity — larger is not always better under time constraint | + +## 3. 
File ownership
+
+| File | Owner | Editable | Purpose |
+|---|---|---|---|
+| `ground.json` | User | **NO** | Platform config, data paths, time budgets |
+| `prepare.py` | User | **NO** | Data prep, tokenizer, dataloader, eval, platform detection |
+| `model.json` | Agent | **YES** | Architecture + optimization hyperparameters |
+| `train.py` | Agent | **YES** | Model definition, optimizer, training loop |
+| `results.tsv` | Agent | **YES** | Experiment log — append only |
+| `program.md` | User | **NO** | This document |
+
+## 4. Execution sequence
+
+### First run (setup)
+
+1. Run `uv run prepare.py` to ensure data and tokenizer are cached.
+2. Initialize `results.tsv` with this exact header (tab-separated):
+
+   ```
+   run_id\tval_bpb\tpeak_vram_mb\tmfu_percent\ttraining_seconds\ttotal_tokens_M\tnum_params_M\tstatus\tdescription
+   ```
+
+   (Each `\t` above represents a literal tab character.)
+
+3. Run `uv run train.py`, capturing stdout to `sessions/<run_id>.log`.
+   - `run_id` = short git commit hash or a timestamp tag — unique per run.
+4. Parse the `---` block from the log to extract metrics.
+5. Append one row to `results.tsv` with the extracted values and `status=baseline`.
+
+### Subsequent runs (experiment loop)
+
+1. Form one hypothesis from the current code and most recent run metrics.
+2. Edit `model.json` and/or `train.py`.
+3. Commit with a message describing the hypothesis.
+4. Run `uv run train.py`, capturing stdout to `sessions/<run_id>.log` (use the new commit hash as `run_id`).
+5. Parse the `---` block. Extract `val_bpb`, `peak_vram_mb`, `mfu_percent`, `training_seconds`, `total_tokens_M`, `num_params_M`.
+6. Append one row to `results.tsv`:
+   - `status=keep` if val_bpb improved.
+   - `status=discard` if val_bpb did not improve.
+   - `status=crash` if the run failed.
+7. If `discard` or `crash`: revert with `git reset --hard HEAD~1`.
+8. Continue to next hypothesis.
+
+## 5. Logging rules
+
+- Every run MUST have its own log file: `sessions/<run_id>.log`.
+- Every run MUST have exactly one row appended to `results.tsv`. +- The `run_id` in `results.tsv` must match the log filename (without `.log`). +- Never overwrite or delete previous log files or results rows. + +## 6. Constraints + +1. **Time budget**: `train.py` self-enforces via `TIME_BUDGET` from `ground.json`. Do not circumvent this. +2. **No new packages**: use only what is already installed in the environment. +3. **Do not edit** `ground.json`, `prepare.py`, or `program.md`. +4. **Prefer simpler changes** when two options yield similar `val_bpb`. +5. **VRAM**: if a run OOMs, reduce `device_batch_size` in `model.json` or model size before retrying. + +## 7. Autonomy + +Continue iterating experiments until manually stopped. Do not pause for permission between runs.