diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..a36577748 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +.venv +karpathy +__pycache__ +*.pyc +run.log +results.tsv +.git +progress.png +analysis.ipynb diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..6a1608fa6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM nvidia/cuda:12.8.0-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 + +# System deps +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.10 python3.10-venv python3-pip curl git ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install uv +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +WORKDIR /app + +# Copy project files +COPY pyproject.toml uv.lock ./ +RUN uv sync --frozen + +# Copy source +COPY prepare.py train.py agent.py program.md ./ + +EXPOSE 9090 + +# Default: run training +CMD ["uv", "run", "train.py"] diff --git a/README.md b/README.md index f9e9a8e27..b64a86ae3 100644 --- a/README.md +++ b/README.md @@ -1,91 +1,202 @@ # autoresearch -![teaser](progress.png) +> Fork of [karpathy/autoresearch](https://github.com/karpathy/autoresearch) by [DeepBlueDynamics](https://github.com/DeepBlueDynamics) -*One day, frontier AI research used to be done by meat computers in between eating, sleeping, having other fun, and synchronizing once in a while using sound wave interconnect in the ritual of "group meeting". That era is long gone. Research is now entirely the domain of autonomous swarms of AI agents running across compute cluster megastructures in the skies. The agents claim that we are now in the 10,205th generation of the code base, in any case no one could tell if that's right or wrong as the "code" is now a self-modifying binary that has grown beyond human comprehension. This repo is the story of how it all began. -@karpathy, March 2026*. 
+Give an AI agent a real LLM training setup and let it experiment autonomously overnight. It modifies the code, trains for 5 minutes, checks if the result improved, keeps or discards, and repeats. You wake up to a log of experiments and a better model. -The idea: give an AI agent a small but real LLM training setup and let it experiment autonomously overnight. It modifies the code, trains for 5 minutes, checks if the result improved, keeps or discards, and repeats. You wake up in the morning to a log of experiments and (hopefully) a better model. The training code here is a simplified single-GPU implementation of [nanochat](https://github.com/karpathy/nanochat). The core idea is that you're not touching any of the Python files like you normally would as a researcher. Instead, you are programming the `program.md` Markdown files that provide context to the AI agents and set up your autonomous research org. The default `program.md` in this repo is intentionally kept as a bare bones baseline, though it's obvious how one would iterate on it over time to find the "research org code" that achieves the fastest research progress, how you'd add more agents to the mix, etc. A bit more context on this project is here in this [tweet](https://x.com/karpathy/status/2029701092347630069). +## What's different in this fork -## How it works - -The repo is deliberately kept small and only really has three files that matter: - -- **`prepare.py`** — fixed constants, one-time data prep (downloads training data, trains a BPE tokenizer), and runtime utilities (dataloader, evaluation). Not modified. -- **`train.py`** — the single file the agent edits. Contains the full GPT model, optimizer (Muon + AdamW), and training loop. Everything is fair game: architecture, hyperparameters, optimizer, batch size, etc. **This file is edited and iterated on by the agent**. -- **`program.md`** — baseline instructions for one agent. Point your agent here and let it go. 
**This file is edited and iterated on by the human**. - -By design, training runs for a **fixed 5-minute time budget** (wall clock, excluding startup/compilation), regardless of the details of your compute. The metric is **val_bpb** (validation bits per byte) — lower is better, and vocab-size-independent so architectural changes are fairly compared. - -If you are new to neural networks, this ["Dummy's Guide"](https://x.com/hooeem/status/2030720614752039185) looks pretty good for a lot more context. +- **Agent harness** (`agent.py`) — structured tool-calling agent that works with Claude, GPT, or Gemini. 10 tools for autonomous experimentation including persistent thermodynamic memory via [ferricula](https://github.com/DeepBlueDynamics/ferricula). +- **Weber electrodynamic optimizer** — applies Weber's force law bracket `W = 1 - v²/(2c²) + v·a/c²` to learning rate, modifying effective step size based on parameter velocity and acceleration. Physics-inspired adaptive optimization. +- **SDR entropy seeding** — replaces the fixed `torch.manual_seed(42)` with true hardware randomness from an RTL-SDR radio receiver via [sdr-random](https://github.com/DeepBlueDynamics/sdr-random). Falls back to `os.urandom` if unavailable. +- **Multi-GPU support** — auto-detects Flash Attention 3 (H100/Hopper) or falls back to PyTorch SDPA (consumer GPUs). Windows support with automatic `torch.compile` bypass. +- **Optimized defaults** — hyperparameters from 215 experiments across Karpathy's sessions ([Discussion #32](https://github.com/karpathy/autoresearch/discussions/32), [#43](https://github.com/karpathy/autoresearch/discussions/43)). +- **Docker** — container with NVIDIA GPU passthrough, compose stack with ferricula memory service. ## Quick start -**Requirements:** A single NVIDIA GPU (tested on H100), Python 3.10+, [uv](https://docs.astral.sh/uv/). +**Requirements:** Single NVIDIA GPU, Python 3.10+, [uv](https://docs.astral.sh/uv/). ```bash - -# 1. 
Install uv project manager (if you don't already have it) +# 1. Install uv curl -LsSf https://astral.sh/uv/install.sh | sh # 2. Install dependencies uv sync -# 3. Download data and train tokenizer (one-time, ~2 min) +# 3. Download data + train tokenizer (one-time, ~2 min) uv run prepare.py -# 4. Manually run a single training experiment (~5 min) +# 4. Run a single training experiment (~5 min) uv run train.py ``` -If the above commands all work ok, your setup is working and you can go into autonomous research mode. +## Platform support + +| Platform | Flash Attn | torch.compile | Notes | +|----------|-----------|---------------|-------| +| **H100 / Hopper** | FA3 (native) | Triton | Full speed, no changes needed | +| **RTX 3060/4090 / Ampere+** | PyTorch SDPA (auto-fallback) | Triton (Linux) | Tune DEPTH, BATCH_SIZE for VRAM | +| **Windows (any GPU)** | PyTorch SDPA (auto-fallback) | Eager mode (auto) | Triton unavailable, runs slower | + +The script auto-detects everything. No manual flags needed — just tune hyperparameters for your VRAM. + +### Tuning for smaller GPUs + +The defaults are optimized for H100 80GB. For consumer GPUs, edit the hyperparameters block in `train.py`: + +```python +# RTX 3060 12GB +DEPTH = 4 +DEVICE_BATCH_SIZE = 16 +TOTAL_BATCH_SIZE = 2**16 +WINDOW_PATTERN = "SL" + +# RTX 4090 24GB +DEPTH = 6 +DEVICE_BATCH_SIZE = 32 +TOTAL_BATCH_SIZE = 2**17 +WINDOW_PATTERN = "SSL" +``` ## Running the agent -Simply spin up your Claude/Codex or whatever you want in this repo (and disable all permissions), then you can prompt something like: +```bash +# Install your provider's SDK +uv pip install anthropic # or: openai, google-genai + +# Set your API key +export ANTHROPIC_API_KEY=sk-ant-... # Linux/Mac +set ANTHROPIC_API_KEY=sk-ant-... # Windows cmd +$env:ANTHROPIC_API_KEY="sk-ant-..." 
# PowerShell + +# Run with Claude +uv run python agent.py --provider anthropic --model claude-sonnet-4-20250514 + +# Run with GPT +uv run python agent.py --provider openai --model gpt-4o + +# Run with Gemini +uv run python agent.py --provider gemini --model gemini-2.0-flash + +# Limit experiments, use a named branch +uv run python agent.py --provider anthropic --model claude-sonnet-4-20250514 --tag mar18 --max-experiments 20 + +# With ferricula memory (persistent experiment memory across runs) +uv run python agent.py --provider anthropic --model claude-sonnet-4-20250514 --memory http://localhost:8765 +``` + +### Agent tools + +| Tool | What it does | +|------|-------------| +| `get_config` | Read current hyperparameters from train.py | +| `set_hyperparams` | Modify hyperparameters (batch size, LR, depth, etc.) | +| `edit_code` | Replace entire sections of train.py (model, optimizer, training loop) | +| `run_experiment` | Execute 5-min training run, return val_bpb + metrics | +| `get_history` | Read results.tsv — full experiment log | +| `keep` | Git commit + log improvement to results.tsv | +| `discard` | Revert changes + log failure to results.tsv | +| `read_code` | Inspect specific lines of train.py | +| `remember` | Store insight in persistent thermodynamic memory (ferricula) | +| `recall` | Search memory for similar past experiments | + +The agent loops autonomously: check config, propose a change, run it, evaluate, keep or discard, repeat. Context auto-compresses so it can run indefinitely. + +### Manual mode + +You can also run experiments the original way — point Claude Code, Codex, or any coding agent at `program.md`: ``` -Hi have a look at program.md and let's kick off a new experiment! let's do the setup first. +Hi have a look at program.md and let's kick off a new experiment! ``` -The `program.md` file is essentially a super lightweight "skill". 
+## Weber electrodynamic optimizer -## Project structure +Applies Weber's force law bracket to the optimizer step, modifying effective learning rate based on parameter velocity (momentum) and acceleration (change in momentum): ``` -prepare.py — constants, data prep + runtime utilities (do not modify) -train.py — model, optimizer, training loop (agent modifies this) -program.md — agent instructions -pyproject.toml — dependencies +W = 1 - v²/(2c²) + v·a/c² ``` -## Design choices +- **Stable momentum** (v small): W ~ 1, normal update +- **Accelerating params** (v·a > 0): W > 1, larger step — leans into acceleration +- **Decelerating params** (v·a < 0): W < 1, smaller step — eases off +- **Fast params** (v² large): -v²/2c² damps — natural speed limit -- **Single file to modify.** The agent only touches `train.py`. This keeps the scope manageable and diffs reviewable. -- **Fixed time budget.** Training always runs for exactly 5 minutes, regardless of your specific platform. This means you can expect approx 12 experiments/hour and approx 100 experiments while you sleep. There are two upsides of this design decision. First, this makes experiments directly comparable regardless of what the agent changes (model size, batch size, architecture, etc). Second, this means that autoresearch will find the most optimal model for your platform in that time budget. The downside is that your runs (and results) become not comparable to other people running on other compute platforms. -- **Self-contained.** No external dependencies beyond PyTorch and a few small packages. No distributed training, no complex configs. One GPU, one file, one metric. +Applied to both AdamW (per-element) and Muon (per-matrix). Controlled by `WEBER_C_SQ` hyperparameter (default 1.0). Larger = subtler correction. -## Platform support +## SDR entropy seeding + +Seeds PyTorch's RNG with true hardware randomness from an RTL-SDR radio receiver. 
Entropy comes from ADC quantization noise — physically random, not pseudorandom. + +Requires [sdr-random](https://github.com/DeepBlueDynamics/sdr-random) running on a machine with an RTL-SDR dongle: + +```bash +# On the SDR host +sdr-rand local --port 9090 + +# train.py auto-fetches from http://:9090/api/entropy +# Falls back to os.urandom if unavailable +uv run train.py +``` + +Configure the SDR host IP in `train.py` (search for `192.168.86.24`). -This code currently requires that you have a single NVIDIA GPU. In principle it is quite possible to support CPU, MPS and other platforms but this would also bloat the code. I'm not 100% sure that I want to take this on personally right now. People can reference (or have their agents reference) the full/parent nanochat repository that has wider platform support and shows the various solutions (e.g. a Flash Attention 3 kernels fallback implementation, generic device support, autodetection, etc.), feel free to create forks or discussions for other platforms and I'm happy to link to them here in the README in some new notable forks section or etc. +## Docker -Seeing as there seems to be a lot of interest in tinkering with autoresearch on much smaller compute platforms than an H100, a few extra words. If you're going to try running autoresearch on smaller computers (Macbooks etc.), I'd recommend one of the forks below. On top of this, here are some recommendations for how to tune the defaults for much smaller models for aspiring forks: +```bash +# One-time: download data +docker compose --profile setup run prepare + +# Run training +docker compose run train -1. To get half-decent results I'd use a dataset with a lot less entropy, e.g. this [TinyStories dataset](https://huggingface.co/datasets/karpathy/tinystories-gpt4-clean). These are GPT-4 generated short stories. Because the data is a lot narrower in scope, you will see reasonable results with a lot smaller models (if you try to sample from them after training). -2. 
You might experiment with decreasing `vocab_size`, e.g. from 8192 down to 4096, 2048, 1024, or even - simply byte-level tokenizer with 256 possibly bytes after utf-8 encoding. -3. In `prepare.py`, you'll want to lower `MAX_SEQ_LEN` a lot, depending on the computer even down to 256 etc. As you lower `MAX_SEQ_LEN`, you may want to experiment with increasing `DEVICE_BATCH_SIZE` in `train.py` slightly to compensate. The number of tokens per fwd/bwd pass is the product of these two. -4. Also in `prepare.py`, you'll want to decrease `EVAL_TOKENS` so that your validation loss is evaluated on a lot less data. -5. In `train.py`, the primary single knob that controls model complexity is the `DEPTH` (default 8, here). A lot of variables are just functions of this, so e.g. lower it down to e.g. 4. -6. You'll want to most likely use `WINDOW_PATTERN` of just "L", because "SSSL" uses alternating banded attention pattern that may be very inefficient for you. Try it. -7. You'll want to lower `TOTAL_BATCH_SIZE` a lot, but keep it powers of 2, e.g. down to `2**14` (~16K) or so even, hard to tell. +# Run the autonomous agent +ANTHROPIC_API_KEY=sk-ant-... docker compose run agent -I think these would be the reasonable hyperparameters to play with. Ask your favorite coding agent for help and copy paste them this guide, as well as the full source code. +# Full stack with ferricula memory +docker compose up ferricula -d +ANTHROPIC_API_KEY=sk-ant-... docker compose run agent +``` -## Notable forks +Requires `nvidia-container-toolkit` for GPU passthrough. 
+ +## Project structure + +``` +train.py model, optimizer, training loop (agent modifies this) +prepare.py constants, data prep, evaluation (do not modify) +agent.py autonomous experiment agent (Claude / GPT / Gemini) +program.md manual-mode agent instructions +pyproject.toml dependencies +Dockerfile CUDA runtime + uv + PyTorch +docker-compose.yml train, agent, ferricula, prepare services +results.tsv experiment log (auto-generated) +``` -- [miolini/autoresearch-macos](https://github.com/miolini/autoresearch-macos) (MacOS) -- [trevin-creator/autoresearch-mlx](https://github.com/trevin-creator/autoresearch-mlx) (MacOS) -- [jsegov/autoresearch-win-rtx](https://github.com/jsegov/autoresearch-win-rtx) (Windows) -- [andyluo7/autoresearch](https://github.com/andyluo7/autoresearch) (AMD) +## Optimized defaults + +Hyperparameters validated across 215 experiments on H100: + +| Setting | Upstream | This fork | Impact | +|---------|----------|-----------|--------| +| Depth | 8 | 9 | -0.004 val_bpb | +| Aspect ratio | 64 | 57 | depth-over-width | +| Batch size | 524K | 262K | -0.012 (more steps in 5 min) | +| Window pattern | SSSL | SSSSL | -0.004 cumulative | +| Short window | seq_len/2 | seq_len/8 | narrower local attention | +| RoPE base | 10K | 200K | -0.001 | +| Embedding LR | 0.6 | 0.9 | -0.005 | +| Warmdown ratio | 0.5 | 0.75 | -0.001 to -0.027 | +| Final LR frac | 0.0 | 0.05 | -0.006 | +| Init scale | 1.0x | 0.68x | -0.016 cumulative | +| x0_lambda init | 0.1 | 0.05 | -0.001 | +| Embedding WD | 0.0 | 0.001 | regularization | +| VE WD | 0.0 | 0.003 | -0.003 cumulative | +| LM head WD | 0.0 | 0.01 | -0.009 | +| Softcap | float32 before tanh | bf16 tanh, then float32 | saves ~4GB VRAM | +| **Weber c²** | N/A | 1.0 | velocity-dependent LR bracket | ## License diff --git a/agent.py b/agent.py new file mode 100644 index 000000000..f09a6fec2 --- /dev/null +++ b/agent.py @@ -0,0 +1,760 @@ +""" +Autonomous ML experiment agent for autoresearch. 
+Wraps the experiment loop with LLM tool-calling: any provider, any model. + +Usage: + python agent.py --provider anthropic --model claude-sonnet-4-20250514 + python agent.py --provider openai --model gpt-4o + python agent.py --provider gemini --model gemini-2.0-flash + python agent.py --provider anthropic --model claude-sonnet-4-20250514 --tag mar18 +""" + +import argparse +import ast +import json +import os +import re +import subprocess +import sys +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +PROJECT_DIR = Path(__file__).parent +TRAIN_PY = PROJECT_DIR / "train.py" +RESULTS_TSV = PROJECT_DIR / "results.tsv" +RUN_LOG = PROJECT_DIR / "run.log" +RESULTS_HEADER = "commit\tval_bpb\tmemory_gb\tstatus\tdescription" +RUN_TIMEOUT = 600 # 10 min hard kill +SECTION_DELIM = re.compile(r"^# -{10,}") +FERRICULA_URL = os.environ.get("FERRICULA_URL", "") # set via --memory flag + +# --------------------------------------------------------------------------- +# Tool implementations +# --------------------------------------------------------------------------- + +def _find_section(lines, section_name): + """Find start/end line indices for a named section in train.py.""" + sections = { + "model": "GPT Model", + "optimizer": "Optimizer", + "hyperparameters": "Hyperparameters", + "setup": "Setup", + "training_loop": "Training loop", + } + target = sections.get(section_name, section_name) + start = end = None + for i, line in enumerate(lines): + if target.lower() in line.lower() and (i > 0 and SECTION_DELIM.match(lines[i - 1])): + start = i - 1 + elif start is not None and SECTION_DELIM.match(line) and i > start + 2: + # Next section delimiter pair: the line before it is the end + # Actually, look for the next section header pattern + if i + 
1 < len(lines) and not SECTION_DELIM.match(lines[i + 1]): + continue + end = i + break + if start is not None and end is None: + end = len(lines) + return start, end + + +def _parse_hyperparams(): + """Parse hyperparameters from train.py. Returns {name: (value_str, comment)}.""" + text = TRAIN_PY.read_text(encoding="utf-8") + lines = text.splitlines() + start, end = _find_section(lines, "hyperparameters") + if start is None: + return {} + params = {} + pattern = re.compile(r"^([A-Z_]+)\s*=\s*(.+?)(\s*#.*)?$") + for line in lines[start:end]: + m = pattern.match(line) + if m: + name, val_str, comment = m.group(1), m.group(2).strip(), (m.group(3) or "").strip() + params[name] = (val_str, comment) + return params + + +def tool_get_config(**kwargs): + """Read current hyperparameters from train.py.""" + params = _parse_hyperparams() + result = {} + for name, (val_str, comment) in params.items(): + try: + val = ast.literal_eval(val_str) + except (ValueError, SyntaxError): + try: + val = eval(val_str, {"__builtins__": {}}) + except Exception: + val = val_str + result[name] = {"value": val, "raw": val_str, "comment": comment} + return json.dumps(result, indent=2) + + +def tool_set_hyperparams(changes: dict, **kwargs): + """Modify hyperparameters in train.py. 
changes is {name: new_value_str}.""" + text = TRAIN_PY.read_text(encoding="utf-8") + lines = text.splitlines() + modified = [] + for name, new_val in changes.items(): + pattern = re.compile(rf"^({re.escape(name)}\s*=\s*)(.+?)(\s*#.*)$") + found = False + for i, line in enumerate(lines): + m = pattern.match(line) + if m: + lines[i] = f"{m.group(1)}{new_val}{m.group(3)}" + modified.append(f" {name}: {m.group(2).strip()} -> {new_val}") + found = True + break + if not found: + return json.dumps({"error": f"Hyperparameter '{name}' not found in train.py"}) + TRAIN_PY.write_text("\n".join(lines) + "\n", encoding="utf-8") + return json.dumps({"modified": modified}) + + +def tool_edit_code(section: str, new_code: str, **kwargs): + """Replace an entire named section of train.py.""" + text = TRAIN_PY.read_text(encoding="utf-8") + lines = text.splitlines() + start, end = _find_section(lines, section) + if start is None: + return json.dumps({"error": f"Section '{section}' not found. Valid: model, optimizer, hyperparameters, setup, training_loop"}) + + # Syntax check the new code + try: + ast.parse(new_code) + except SyntaxError as e: + return json.dumps({"error": f"Syntax error in new code: {e}"}) + + # Replace section content (keep the delimiter header) + header_end = start + 3 # delimiter + title + delimiter + new_lines = lines[:header_end] + [""] + new_code.splitlines() + [""] + lines[end:] + new_text = "\n".join(new_lines) + "\n" + + # Full file syntax check + try: + ast.parse(new_text) + except SyntaxError as e: + return json.dumps({"error": f"Resulting file has syntax error: {e}"}) + + TRAIN_PY.write_text(new_text, encoding="utf-8") + return json.dumps({"ok": True, "section": section, "lines_replaced": end - header_end, "lines_new": len(new_code.splitlines())}) + + +def tool_run_experiment(**kwargs): + """Run uv run train.py and return metrics.""" + try: + with open(RUN_LOG, "w") as log: + result = subprocess.run( + ["uv", "run", "train.py"], + stdout=log, 
stderr=subprocess.STDOUT, + timeout=RUN_TIMEOUT, cwd=PROJECT_DIR, + ) + except subprocess.TimeoutExpired: + return json.dumps({"status": "timeout", "error": "Exceeded 10 minute timeout"}) + + log_text = RUN_LOG.read_text() + + if result.returncode != 0: + last_lines = "\n".join(log_text.splitlines()[-50:]) + if "CUDA out of memory" in log_text or "OutOfMemoryError" in log_text: + return json.dumps({"status": "oom", "error": last_lines}) + return json.dumps({"status": "crash", "exit_code": result.returncode, "error": last_lines}) + + # Parse metrics after the --- separator + metrics = {} + in_summary = False + for line in log_text.splitlines(): + if line.strip() == "---": + in_summary = True + continue + if in_summary: + m = re.match(r"^(\w+):\s+(.+)$", line.strip()) + if m: + key, val = m.group(1), m.group(2).strip() + try: + metrics[key] = float(val) + except ValueError: + metrics[key] = val + + if "val_bpb" not in metrics: + last_lines = "\n".join(log_text.splitlines()[-30:]) + return json.dumps({"status": "no_metrics", "error": last_lines}) + + metrics["status"] = "ok" + return json.dumps(metrics) + + +def tool_get_history(**kwargs): + """Read results.tsv.""" + if RESULTS_TSV.exists(): + return RESULTS_TSV.read_text() + return "No experiments yet. results.tsv does not exist." 
+ + +def tool_keep(description: str, **kwargs): + """Git commit current train.py and log success to results.tsv.""" + # Git commit + subprocess.run(["git", "add", "train.py"], cwd=PROJECT_DIR, capture_output=True) + subprocess.run(["git", "commit", "-m", description], cwd=PROJECT_DIR, capture_output=True) + commit = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + cwd=PROJECT_DIR, capture_output=True, text=True + ).stdout.strip() + + # Parse last run metrics + val_bpb = "0.000000" + mem_gb = "0.0" + if RUN_LOG.exists(): + log_text = RUN_LOG.read_text() + for line in log_text.splitlines(): + m = re.match(r"^val_bpb:\s+(.+)$", line.strip()) + if m: + val_bpb = m.group(1).strip() + m = re.match(r"^peak_vram_mb:\s+(.+)$", line.strip()) + if m: + try: + mem_gb = f"{float(m.group(1).strip()) / 1024:.1f}" + except ValueError: + pass + + # Log to results.tsv + if not RESULTS_TSV.exists(): + RESULTS_TSV.write_text(RESULTS_HEADER + "\n") + with open(RESULTS_TSV, "a") as f: + f.write(f"{commit}\t{val_bpb}\t{mem_gb}\tkeep\t{description}\n") + + return json.dumps({"commit": commit, "val_bpb": val_bpb, "memory_gb": mem_gb, "status": "kept"}) + + +def tool_discard(reason: str = "", **kwargs): + """Discard uncommitted changes to train.py and log to results.tsv.""" + # Parse metrics before discarding + val_bpb = "0.000000" + mem_gb = "0.0" + status = "discard" + if RUN_LOG.exists(): + log_text = RUN_LOG.read_text() + for line in log_text.splitlines(): + m = re.match(r"^val_bpb:\s+(.+)$", line.strip()) + if m: + val_bpb = m.group(1).strip() + m = re.match(r"^peak_vram_mb:\s+(.+)$", line.strip()) + if m: + try: + mem_gb = f"{float(m.group(1).strip()) / 1024:.1f}" + except ValueError: + pass + if "CUDA out of memory" in log_text or "FAIL" in log_text: + status = "crash" + + # Revert train.py + subprocess.run(["git", "checkout", "--", "train.py"], cwd=PROJECT_DIR, capture_output=True) + + # Log + if not RESULTS_TSV.exists(): + RESULTS_TSV.write_text(RESULTS_HEADER + "\n") + 
commit = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + cwd=PROJECT_DIR, capture_output=True, text=True + ).stdout.strip() + desc = reason or "discarded" + with open(RESULTS_TSV, "a") as f: + f.write(f"{commit}\t{val_bpb}\t{mem_gb}\t{status}\t{desc}\n") + + return json.dumps({"status": "discarded", "reverted_to": commit}) + + +def tool_read_code(start_line: int = 1, end_line: int = 50, **kwargs): + """Read a range of lines from train.py (1-indexed).""" + lines = TRAIN_PY.read_text(encoding="utf-8").splitlines() + start_line = max(1, start_line) + end_line = min(len(lines), end_line) + result = [] + for i in range(start_line - 1, end_line): + result.append(f"{i + 1:4d} {lines[i]}") + return "\n".join(result) + + +# --------------------------------------------------------------------------- +# Ferricula memory tools (thermodynamic memory engine) +# --------------------------------------------------------------------------- + +def _ferricula_post(endpoint, payload): + """POST to ferricula, returns response text or error.""" + if not FERRICULA_URL: + return json.dumps({"error": "No ferricula URL configured. Use --memory flag."}) + import urllib.request + url = f"{FERRICULA_URL.rstrip('/')}/{endpoint}" + data = json.dumps(payload).encode() + req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"}) + try: + with urllib.request.urlopen(req, timeout=5) as resp: + return resp.read().decode() + except Exception as e: + return json.dumps({"error": str(e)}) + + +def tool_remember(text: str, **kwargs): + """Store an experiment result or insight in ferricula's thermodynamic memory. + Memories decay over time, strengthen on recall, and consolidate during dream cycles.""" + return _ferricula_post("remember", {"text": text}) + + +def tool_recall(query: str, k: int = 5, **kwargs): + """Search ferricula memory for similar past experiments, insights, or patterns. 
+ Returns the k most relevant memories ranked by similarity and recency.""" + return _ferricula_post("recall", {"text": query, "k": k}) + + +# --------------------------------------------------------------------------- +# Tool registry +# --------------------------------------------------------------------------- + +TOOLS = { + "get_config": { + "fn": tool_get_config, + "description": "Read current hyperparameters from train.py. Returns name, value, and comment for each.", + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + "set_hyperparams": { + "fn": tool_set_hyperparams, + "description": "Modify hyperparameters in train.py. Pass a dict of {name: new_value} where values are Python expressions as strings (e.g. '2**17', '0.04', '\"SSSSL\"').", + "parameters": { + "type": "object", + "properties": { + "changes": { + "type": "object", + "description": "Dict of hyperparameter names to new values (as strings)", + "additionalProperties": {"type": "string"}, + } + }, + "required": ["changes"], + }, + }, + "edit_code": { + "fn": tool_edit_code, + "description": "Replace an entire named section of train.py with new code. Sections: model, optimizer, hyperparameters, setup, training_loop. Use for architectural changes.", + "parameters": { + "type": "object", + "properties": { + "section": {"type": "string", "description": "Section name: model, optimizer, hyperparameters, setup, training_loop"}, + "new_code": {"type": "string", "description": "Complete Python code to replace the section content"}, + }, + "required": ["section", "new_code"], + }, + }, + "run_experiment": { + "fn": tool_run_experiment, + "description": "Run 'uv run train.py' (5-min training budget). 
Returns val_bpb and other metrics, or crash/OOM/timeout info.", + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + "get_history": { + "fn": tool_get_history, + "description": "Read results.tsv — the log of all past experiments (commit, val_bpb, memory, status, description).", + "parameters": {"type": "object", "properties": {}, "required": []}, + }, + "keep": { + "fn": tool_keep, + "description": "Keep the current changes: git commit train.py and log success to results.tsv. Call after a successful experiment that improved val_bpb.", + "parameters": { + "type": "object", + "properties": { + "description": {"type": "string", "description": "Short description of what this experiment tried"}, + }, + "required": ["description"], + }, + }, + "discard": { + "fn": tool_discard, + "description": "Discard current changes: revert train.py to last commit and log to results.tsv. Call after a failed or regressed experiment.", + "parameters": { + "type": "object", + "properties": { + "reason": {"type": "string", "description": "Why this experiment was discarded"}, + }, + "required": [], + }, + }, + "read_code": { + "fn": tool_read_code, + "description": "Read a range of lines from train.py (1-indexed). Use to inspect model architecture, optimizer, or training loop code.", + "parameters": { + "type": "object", + "properties": { + "start_line": {"type": "integer", "description": "First line to read (1-indexed)"}, + "end_line": {"type": "integer", "description": "Last line to read (1-indexed)"}, + }, + "required": ["start_line", "end_line"], + }, + }, + "remember": { + "fn": tool_remember, + "description": "Store an experiment result or insight in persistent thermodynamic memory (ferricula). Memories decay, strengthen on recall, and consolidate during dream cycles. 
Use after each experiment to build long-term knowledge.", + "parameters": { + "type": "object", + "properties": { + "text": {"type": "string", "description": "What to remember — experiment description, result, insight, or pattern"}, + }, + "required": ["text"], + }, + }, + "recall": { + "fn": tool_recall, + "description": "Search persistent memory for similar past experiments, insights, or patterns. Returns the k most relevant memories ranked by similarity. Use before proposing a new experiment to check if something similar was tried.", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "What to search for — a description of the experiment you're considering"}, + "k": {"type": "integer", "description": "Number of results to return (default: 5)"}, + }, + "required": ["query"], + }, + }, +} + + +def execute_tool(name, arguments): + """Execute a tool by name with given arguments.""" + if name not in TOOLS: + return json.dumps({"error": f"Unknown tool: {name}"}) + try: + return TOOLS[name]["fn"](**arguments) + except Exception as e: + return json.dumps({"error": f"{type(e).__name__}: {e}"}) + + +# --------------------------------------------------------------------------- +# Provider abstraction +# --------------------------------------------------------------------------- + +@dataclass +class ToolCall: + id: str + name: str + arguments: dict + +@dataclass +class AgentResponse: + text: str | None + tool_calls: list[ToolCall] = field(default_factory=list) + stop_reason: str = "end_turn" + + +class Provider(ABC): + @abstractmethod + def chat(self, system: str, messages: list[dict], tools: list[dict]) -> AgentResponse: + pass + + +class AnthropicProvider(Provider): + def __init__(self, model: str): + import anthropic + self.client = anthropic.Anthropic() + self.model = model + + def chat(self, system, messages, tools): + tool_defs = [ + {"name": t["name"], "description": t["description"], "input_schema": t["parameters"]} + 
for t in tools + ] + # Convert messages to Anthropic format + msgs = [] + for m in messages: + if m["role"] == "tool": + msgs.append({ + "role": "user", + "content": [{"type": "tool_result", "tool_use_id": m["tool_call_id"], "content": m["content"]}], + }) + elif m["role"] == "assistant" and m.get("tool_calls"): + content = [] + if m.get("text"): + content.append({"type": "text", "text": m["text"]}) + for tc in m["tool_calls"]: + content.append({"type": "tool_use", "id": tc["id"], "name": tc["name"], "input": tc["arguments"]}) + msgs.append({"role": "assistant", "content": content}) + else: + msgs.append({"role": m["role"], "content": m["content"]}) + + resp = self.client.messages.create( + model=self.model, max_tokens=4096, system=system, + messages=msgs, tools=tool_defs, + ) + text_parts = [b.text for b in resp.content if b.type == "text"] + tool_calls = [ + ToolCall(id=b.id, name=b.name, arguments=b.input) + for b in resp.content if b.type == "tool_use" + ] + return AgentResponse( + text="\n".join(text_parts) if text_parts else None, + tool_calls=tool_calls, + stop_reason=resp.stop_reason, + ) + + +class OpenAIProvider(Provider): + def __init__(self, model: str): + import openai + self.client = openai.OpenAI() + self.model = model + + def chat(self, system, messages, tools): + tool_defs = [ + {"type": "function", "function": {"name": t["name"], "description": t["description"], "parameters": t["parameters"]}} + for t in tools + ] + msgs = [{"role": "system", "content": system}] + for m in messages: + if m["role"] == "tool": + msgs.append({"role": "tool", "tool_call_id": m["tool_call_id"], "content": m["content"]}) + elif m["role"] == "assistant" and m.get("tool_calls"): + tc_list = [ + {"id": tc["id"], "type": "function", "function": {"name": tc["name"], "arguments": json.dumps(tc["arguments"])}} + for tc in m["tool_calls"] + ] + msgs.append({"role": "assistant", "content": m.get("text") or "", "tool_calls": tc_list}) + else: + msgs.append({"role": m["role"], 
"content": m["content"]}) + + resp = self.client.chat.completions.create( + model=self.model, messages=msgs, tools=tool_defs, max_tokens=4096, + ) + choice = resp.choices[0] + text = choice.message.content + tool_calls = [] + if choice.message.tool_calls: + for tc in choice.message.tool_calls: + tool_calls.append(ToolCall( + id=tc.id, name=tc.function.name, + arguments=json.loads(tc.function.arguments), + )) + return AgentResponse(text=text, tool_calls=tool_calls, stop_reason=choice.finish_reason) + + +class GeminiProvider(Provider): + def __init__(self, model: str): + from google import genai + self.client = genai.Client() + self.model = model + + def chat(self, system, messages, tools): + from google.genai import types + + tool_defs = types.Tool(function_declarations=[ + types.FunctionDeclaration(name=t["name"], description=t["description"], parameters=t["parameters"]) + for t in tools + ]) + + contents = [] + for m in messages: + if m["role"] == "tool": + contents.append(types.Content( + role="user", + parts=[types.Part(function_response=types.FunctionResponse( + name=m["name"], response=json.loads(m["content"]), + ))], + )) + elif m["role"] == "assistant" and m.get("tool_calls"): + parts = [] + if m.get("text"): + parts.append(types.Part(text=m["text"])) + for tc in m["tool_calls"]: + parts.append(types.Part(function_call=types.FunctionCall( + name=tc["name"], args=tc["arguments"], + ))) + contents.append(types.Content(role="model", parts=parts)) + elif m["role"] == "assistant": + contents.append(types.Content(role="model", parts=[types.Part(text=m["content"])])) + else: + contents.append(types.Content(role="user", parts=[types.Part(text=m["content"])])) + + config = types.GenerateContentConfig( + system_instruction=system, tools=[tool_defs], max_output_tokens=4096, + ) + resp = self.client.models.generate_content( + model=self.model, contents=contents, config=config, + ) + text_parts = [p.text for p in resp.candidates[0].content.parts if hasattr(p, "text") and 
p.text] + tool_calls = [] + for p in resp.candidates[0].content.parts: + if hasattr(p, "function_call") and p.function_call: + fc = p.function_call + tool_calls.append(ToolCall( + id=f"gemini_{int(time.time()*1000)}", name=fc.name, + arguments=dict(fc.args) if fc.args else {}, + )) + return AgentResponse( + text="\n".join(text_parts) if text_parts else None, + tool_calls=tool_calls, + stop_reason="tool_use" if tool_calls else "end_turn", + ) + + +# --------------------------------------------------------------------------- +# System prompt +# --------------------------------------------------------------------------- + +SYSTEM_PROMPT = """You are an autonomous ML researcher optimizing a neural network training script. + +## Goal +Minimize val_bpb (validation bits per byte) — lower is better. Each training run has a fixed 5-minute time budget. You modify train.py: architecture, optimizer, hyperparameters, batch size, model size — everything is fair game. + +## Workflow +1. Check current config and experiment history +2. Decide what to try next (one change at a time for attribution) +3. Make the change via set_hyperparams or edit_code +4. Run the experiment (takes ~5-7 min including startup) +5. Compare val_bpb to the current best +6. If improved: keep. If equal or worse: discard. +7. Repeat forever. 
+ +## Tools +- get_config: see current hyperparameters +- set_hyperparams: tweak values (pass Python expressions as strings) +- edit_code: replace entire sections for architectural changes +- run_experiment: execute training, get metrics +- get_history: see all past experiments +- keep: commit improvement + log to results.tsv +- discard: revert failed experiment + log to results.tsv +- read_code: inspect specific lines of train.py +- remember: store an experiment result or insight in persistent memory (survives context resets) +- recall: search memory for similar past experiments before trying something new + +## Memory (ferricula) +You have a persistent thermodynamic memory. After each experiment, remember what you tried and what happened. +Before proposing a new experiment, recall similar past attempts to avoid repeating failures. +Memories decay when ignored, strengthen when recalled, and consolidate during dream cycles. + +## Strategy +- Time-constrained optimization: more gradient steps in 5 min often beats larger models/batches +- Regularization is often under-explored (weight decay on embeddings, value embeddings, lm_head) +- Initialization scale matters — narrow optima are real +- Try one thing at a time so you know what worked +- If 5+ experiments fail to improve, try something more radical (architectural change, not just hyperparameter tuning) +- Simpler is better: a tiny improvement that adds complexity is not worth it + +## Constraints +- Only modify train.py — prepare.py is read-only +- No new packages or dependencies +- Peak VRAM should stay reasonable (some increase is OK for meaningful val_bpb gains) +- NEVER stop experimenting. Loop until interrupted. + +## Important +After each experiment, you MUST call either keep() or discard(). Do not leave changes uncommitted. +When you first start, check get_history to see what's been tried, and get_config for current state. 
+If no experiments exist yet, run the baseline first (no changes) to establish a reference point. +""" + + +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- + +def make_tool_list(): + """Build tool definitions in provider-agnostic format.""" + return [ + {"name": name, "description": spec["description"], "parameters": spec["parameters"]} + for name, spec in TOOLS.items() + ] + + +def main(): + parser = argparse.ArgumentParser(description="Autonomous ML experiment agent") + parser.add_argument("--provider", choices=["anthropic", "openai", "gemini"], required=True) + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--max-experiments", type=int, default=0, help="Stop after N experiments (0 = unlimited)") + parser.add_argument("--tag", type=str, default=None, help="Git branch tag (creates autoresearch/)") + parser.add_argument("--memory", type=str, default=None, help="Ferricula memory URL (e.g. http://localhost:8765)") + args = parser.parse_args() + + # Ferricula memory setup + global FERRICULA_URL + if args.memory: + FERRICULA_URL = args.memory + print(f"Memory: {FERRICULA_URL}") + + # Branch setup + if args.tag: + branch = f"autoresearch/{args.tag}" + subprocess.run(["git", "checkout", "-b", branch], cwd=PROJECT_DIR, capture_output=True) + print(f"Branch: {branch}") + + # Provider setup + if args.provider == "anthropic": + provider = AnthropicProvider(args.model) + elif args.provider == "openai": + provider = OpenAIProvider(args.model) + elif args.provider == "gemini": + provider = GeminiProvider(args.model) + print(f"Provider: {args.provider} / {args.model}") + + tools = make_tool_list() + messages = [{"role": "user", "content": "Begin experimenting. 
Start by checking the current config and history."}] + experiment_count = 0 + + print("\n--- Agent loop started (Ctrl+C to stop) ---\n") + + while True: + try: + response = provider.chat(SYSTEM_PROMPT, messages, tools) + except KeyboardInterrupt: + print("\n--- Interrupted ---") + break + except Exception as e: + print(f"API error: {e}. Retrying in 10s...") + time.sleep(10) + continue + + # Print any text from the model + if response.text: + print(f"\n[agent] {response.text}\n") + + # Build assistant message for history + assistant_msg = {"role": "assistant", "text": response.text, "content": response.text or ""} + if response.tool_calls: + assistant_msg["tool_calls"] = [ + {"id": tc.id, "name": tc.name, "arguments": tc.arguments} + for tc in response.tool_calls + ] + messages.append(assistant_msg) + + # Execute tool calls + if response.tool_calls: + for tc in response.tool_calls: + print(f" -> {tc.name}({json.dumps(tc.arguments, indent=None)[:200]})") + result = execute_tool(tc.name, tc.arguments) + print(f" <- {result[:200]}{'...' if len(result) > 200 else ''}") + messages.append({ + "role": "tool", "tool_call_id": tc.id, + "name": tc.name, "content": result, + }) + + # Track experiment count + if tc.name in ("keep", "discard"): + experiment_count += 1 + print(f"\n === Experiment #{experiment_count} complete ===\n") + if args.max_experiments and experiment_count >= args.max_experiments: + print(f"Reached --max-experiments {args.max_experiments}. Stopping.") + return + else: + # Model didn't call a tool — nudge it + messages.append({"role": "user", "content": "Continue experimenting. Use the tools to make changes and run experiments."}) + + # Context window management: compress after 60 messages + if len(messages) > 60: + print(" [compressing context]") + history = tool_get_history() + config = tool_get_config() + messages = [ + {"role": "user", "content": f"Context reset. 
Here is the current state:\n\nExperiment history:\n{history}\n\nCurrent config:\n{config}\n\nContinue experimenting. Build on what worked, avoid what didn't."},
+            ]
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 000000000..1219676e6
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,71 @@
+services:
+  ferricula:
+    build:
+      context: ../ferricula
+    ports:
+      - "8765:8765"
+    volumes:
+      - ferricula-data:/data
+    environment:
+      - PORT=8765
+    restart: unless-stopped
+
+  train:
+    build: .
+    runtime: nvidia
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    volumes:
+      - data-cache:/root/.cache/autoresearch
+      - ./results.tsv:/app/results.tsv
+      - ./run.log:/app/run.log
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    command: uv run train.py
+
+  agent:
+    build: .
+    runtime: nvidia
+    depends_on:
+      - ferricula
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - GOOGLE_API_KEY=${GOOGLE_API_KEY}
+      - FERRICULA_URL=http://ferricula:8765
+    volumes:
+      - data-cache:/root/.cache/autoresearch
+      - ./results.tsv:/app/results.tsv
+      - ./run.log:/app/run.log
+      - ./.git:/app/.git
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    command: >
+      uv run agent.py
+      --provider ${AGENT_PROVIDER:-anthropic}
+      --model ${AGENT_MODEL:-claude-sonnet-4-20250514}
+      --memory ${FERRICULA_URL:-http://ferricula:8765}
+
+  prepare:
+    build: .
+ volumes: + - data-cache:/root/.cache/autoresearch + command: uv run prepare.py --num-shards 10 + profiles: [setup] + +volumes: + data-cache: + ferricula-data: diff --git a/pyproject.toml b/pyproject.toml index 94ae32989..34aa2be88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,13 @@ dependencies = [ "torch==2.9.1", ] +[project.optional-dependencies] +agent = [ + "anthropic>=0.40.0", + "openai>=1.50.0", + "google-genai>=1.0.0", +] + [tool.uv.sources] torch = [ { index = "pytorch-cu128" }, diff --git a/train.py b/train.py index 2e743974c..ab3a63014 100644 --- a/train.py +++ b/train.py @@ -5,8 +5,12 @@ """ import os +import sys os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" +# Disable torch.compile on Windows (no Triton) +if sys.platform == "win32": + os.environ["TORCHDYNAMO_DISABLE"] = "1" import gc import math @@ -17,11 +21,16 @@ import torch.nn as nn import torch.nn.functional as F -from kernels import get_kernel -cap = torch.cuda.get_device_capability() -# varunneal's FA3 is Hopper only, use kernels-community on non-Hopper GPUs -repo = "varunneal/flash-attention-3" if cap == (9, 0) else "kernels-community/flash-attn3" -fa3 = get_kernel(repo).flash_attn_interface +try: + from kernels import get_kernel + cap = torch.cuda.get_device_capability() + repo = "varunneal/flash-attention-3" if cap == (9, 0) else "kernels-community/flash-attn3" + fa3 = get_kernel(repo).flash_attn_interface + USE_FA3 = True +except Exception: + fa3 = None + USE_FA3 = False + print("Flash Attention 3 unavailable, using PyTorch SDPA fallback") from prepare import MAX_SEQ_LEN, TIME_BUDGET, Tokenizer, make_dataloader, evaluate_bpb @@ -90,7 +99,13 @@ def forward(self, x, ve, cos_sin, window_size): q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) q, k = norm(q), norm(k) - y = fa3.flash_attn_func(q, k, v, causal=True, window_size=window_size) + if USE_FA3: + y = fa3.flash_attn_func(q, k, v, 
causal=True, window_size=window_size) + else: + # SDPA fallback: (B, T, H, D) -> (B, H, T, D) + q2, k2, v2 = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) + y = F.scaled_dot_product_attention(q2, k2, v2, is_causal=True) + y = y.transpose(1, 2) y = y.contiguous().view(B, T, -1) y = self.c_proj(y) return y @@ -153,7 +168,7 @@ def init_weights(self): torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001) # Transformer blocks n_embd = self.config.n_embd - s = 3**0.5 * n_embd**-0.5 + s = INIT_SCALE * 3**0.5 * n_embd**-0.5 for block in self.transformer.h: torch.nn.init.uniform_(block.attn.c_q.weight, -s, s) torch.nn.init.uniform_(block.attn.c_k.weight, -s, s) @@ -163,7 +178,7 @@ def init_weights(self): torch.nn.init.zeros_(block.mlp.c_proj.weight) # Per-layer scalars self.resid_lambdas.fill_(1.0) - self.x0_lambdas.fill_(0.1) + self.x0_lambdas.fill_(0.05) # Value embeddings for ve in self.value_embeds.values(): torch.nn.init.uniform_(ve.weight, -s, s) @@ -180,7 +195,9 @@ def init_weights(self): for ve in self.value_embeds.values(): ve.to(dtype=torch.bfloat16) - def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): + def _precompute_rotary_embeddings(self, seq_len, head_dim, base=None, device=None): + if base is None: + base = ROPE_BASE if device is None: device = self.transformer.wte.weight.device channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) @@ -196,7 +213,7 @@ def _compute_window_sizes(self, config): pattern = config.window_pattern.upper() assert all(c in "SL" for c in pattern) long_window = config.sequence_len - short_window = long_window // 2 + short_window = long_window // 8 char_to_window = {"L": (long_window, 0), "S": (short_window, 0)} window_sizes = [] for layer_idx in range(config.n_layer): @@ -248,9 +265,9 @@ def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02 dmodel_lr_scale = (model_dim / 768) ** -0.5 print(f"Scaling AdamW LRs by 
1/sqrt({model_dim}/768) = {dmodel_lr_scale:.6f}") param_groups = [ - dict(kind='adamw', params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=0.0), - dict(kind='adamw', params=embedding_params, lr=embedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=0.0), - dict(kind='adamw', params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=0.0), + dict(kind='adamw', params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=LM_HEAD_WD), + dict(kind='adamw', params=embedding_params, lr=embedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=EMBEDDING_WD), + dict(kind='adamw', params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=VE_WD), dict(kind='adamw', params=resid_params, lr=scalar_lr * 0.01, betas=adam_betas, eps=1e-10, weight_decay=0.0), dict(kind='adamw', params=x0_params, lr=scalar_lr, betas=(0.96, 0.95), eps=1e-10, weight_decay=0.0), ] @@ -281,8 +298,8 @@ def forward(self, idx, targets=None, reduction='mean'): softcap = 15 logits = self.lm_head(x) - logits = logits.float() logits = softcap * torch.tanh(logits / softcap) + logits = logits.float() if targets is not None: loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), @@ -303,19 +320,33 @@ def forward(self, idx, targets=None, reduction='mean'): ] @torch.compile(dynamic=False, fullgraph=True) -def adamw_step_fused(p, grad, exp_avg, exp_avg_sq, step_t, lr_t, beta1_t, beta2_t, eps_t, wd_t): +def adamw_step_fused(p, grad, exp_avg, exp_avg_sq, prev_exp_avg, + step_t, lr_t, beta1_t, beta2_t, eps_t, wd_t, weber_c_sq_t): p.mul_(1 - lr_t * wd_t) + # Save velocity before update for acceleration computation + prev_exp_avg.copy_(exp_avg) exp_avg.lerp_(grad, 1 - beta1_t) exp_avg_sq.lerp_(grad.square(), 1 - beta2_t) bias1 = 1 - beta1_t ** step_t bias2 = 1 - beta2_t ** step_t 
denom = (exp_avg_sq / bias2).sqrt() + eps_t + # Weber bracket: W = 1 - v²/(2c²) + v·a/c² + # v = debiased momentum (parameter velocity) + # a = change in momentum (parameter acceleration) + v = exp_avg / bias1 + a = (exp_avg - prev_exp_avg) / bias1 + v_sq = v.square() + v_dot_a = v * a + bracket = 1.0 - v_sq / (2.0 * weber_c_sq_t) + v_dot_a / weber_c_sq_t step_size = lr_t / bias1 - p.add_(exp_avg / denom, alpha=-step_size) + p.add_(exp_avg / denom * bracket, alpha=-step_size) @torch.compile(dynamic=False, fullgraph=True) def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momentum_buffer, - momentum_t, lr_t, wd_t, beta2_t, ns_steps, red_dim): + prev_momentum_buffer, + momentum_t, lr_t, wd_t, beta2_t, weber_c_sq_t, ns_steps, red_dim): + # Save momentum for Weber acceleration computation + prev_momentum_buffer.copy_(momentum_buffer) # Nesterov momentum momentum = momentum_t.to(stacked_grads.dtype) momentum_buffer.lerp_(stacked_grads, 1 - momentum) @@ -346,6 +377,17 @@ def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momen v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt() final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10)) g = g * final_scale.to(g.dtype) + # Weber bracket on momentum: W = 1 - v²/(2c²) + v·a/c² + # v = momentum buffer (parameter velocity in weight space) + # a = momentum - prev_momentum (parameter acceleration) + v = momentum_buffer + a = momentum_buffer - prev_momentum_buffer + # Per-matrix scalar bracket (reduce over spatial dims, keep per-param) + v_sq = v.float().square().mean(dim=(-2, -1), keepdim=True) + v_dot_a = (v.float() * a.float()).mean(dim=(-2, -1), keepdim=True) + c_sq = weber_c_sq_t.to(v_sq.dtype) + bracket = (1.0 - v_sq / (2.0 * c_sq) + v_dot_a / c_sq).to(g.dtype) + g = g * bracket # Cautious weight decay + parameter update lr = lr_t.to(g.dtype) wd = wd_t.to(g.dtype) @@ -354,7 +396,9 @@ def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, 
second_momen class MuonAdamW(torch.optim.Optimizer): - """Combined optimizer: Muon for 2D matrix params, AdamW for others.""" + """Combined optimizer: Muon for 2D matrix params, AdamW for others. + Weber bracket correction: W = 1 - v²/(2c²) + v·a/c² + modifies effective learning rate based on parameter velocity and acceleration.""" def __init__(self, param_groups): super().__init__(param_groups, defaults={}) @@ -365,10 +409,12 @@ def __init__(self, param_groups): self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_weber_c_sq_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_weber_c_sq_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") def _step_adamw(self, group): for p in group['params']: @@ -380,6 +426,7 @@ def _step_adamw(self, group): state['step'] = 0 state['exp_avg'] = torch.zeros_like(p) state['exp_avg_sq'] = torch.zeros_like(p) + state['prev_exp_avg'] = torch.zeros_like(p) state['step'] += 1 self._adamw_step_t.fill_(state['step']) self._adamw_lr_t.fill_(group['lr']) @@ -387,9 +434,12 @@ def _step_adamw(self, group): self._adamw_beta2_t.fill_(group['betas'][1]) self._adamw_eps_t.fill_(group['eps']) self._adamw_wd_t.fill_(group['weight_decay']) + self._adamw_weber_c_sq_t.fill_(WEBER_C_SQ) adamw_step_fused(p, grad, state['exp_avg'], state['exp_avg_sq'], + state['prev_exp_avg'], self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t, - self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t) + self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t, + 
self._adamw_weber_c_sq_t) def _step_muon(self, group): params = group['params'] @@ -401,6 +451,8 @@ def _step_muon(self, group): shape, device, dtype = p.shape, p.device, p.dtype if "momentum_buffer" not in state: state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device) + if "prev_momentum_buffer" not in state: + state["prev_momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device) if "second_momentum_buffer" not in state: state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1]) state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device) @@ -411,10 +463,13 @@ def _step_muon(self, group): self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) self._muon_wd_t.fill_(group["weight_decay"]) + self._muon_weber_c_sq_t.fill_(WEBER_C_SQ) muon_step_fused(stacked_grads, stacked_params, state["momentum_buffer"], state["second_momentum_buffer"], + state["prev_momentum_buffer"], self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, - self._muon_beta2_t, group["ns_steps"], red_dim) + self._muon_beta2_t, self._muon_weber_c_sq_t, + group["ns_steps"], red_dim) torch._foreach_copy_(params, list(stacked_params.unbind(0))) @torch.no_grad() @@ -430,33 +485,58 @@ def step(self): # --------------------------------------------------------------------------- # Model architecture -ASPECT_RATIO = 64 # model_dim = depth * ASPECT_RATIO +ASPECT_RATIO = 57 # model_dim = depth * ASPECT_RATIO (was 64) HEAD_DIM = 128 # target head dimension for attention -WINDOW_PATTERN = "SSSL" # sliding window pattern: L=full, S=half context +WINDOW_PATTERN = "SL" # sliding window pattern (simpler for small model) +ROPE_BASE = 200_000 # RoPE base frequency (was 10000) # Optimization -TOTAL_BATCH_SIZE = 2**19 # ~524K tokens per optimizer step -EMBEDDING_LR = 0.6 # learning rate for token 
embeddings (Adam) -UNEMBEDDING_LR = 0.004 # learning rate for lm_head (Adam) +TOTAL_BATCH_SIZE = 2**16 # ~65K tokens — tuned for RTX 3060 12GB +EMBEDDING_LR = 0.9 # learning rate for token embeddings (was 0.6) +UNEMBEDDING_LR = 0.005 # learning rate for lm_head (was 0.004) MATRIX_LR = 0.04 # learning rate for matrix parameters (Muon) SCALAR_LR = 0.5 # learning rate for per-layer scalars (Adam) WEIGHT_DECAY = 0.2 # cautious weight decay for Muon ADAM_BETAS = (0.8, 0.95) # Adam beta1, beta2 WARMUP_RATIO = 0.0 # fraction of time budget for LR warmup -WARMDOWN_RATIO = 0.5 # fraction of time budget for LR warmdown -FINAL_LR_FRAC = 0.0 # final LR as fraction of initial +WARMDOWN_RATIO = 0.75 # fraction of time budget for LR warmdown (was 0.5) +FINAL_LR_FRAC = 0.05 # final LR as fraction of initial (was 0.0) +EMBEDDING_WD = 0.001 # weight decay for token embeddings +VE_WD = 0.003 # weight decay for value embeddings +LM_HEAD_WD = 0.01 # weight decay for lm_head +INIT_SCALE = 0.68 # init scale multiplier (was 1.0) + +# Weber electrodynamic bracket: W = 1 - v²/(2c²) + v·a/c² +# Modifies effective learning rate based on parameter velocity and acceleration. +# c² sets the scale — larger = subtler correction. Too small = unstable. 
+WEBER_C_SQ = 1.0 # characteristic velocity² (Weber "speed of light" squared) # Model size -DEPTH = 8 # number of transformer layers -DEVICE_BATCH_SIZE = 128 # per-device batch size (reduce if OOM) +DEPTH = 4 # number of transformer layers (tuned for RTX 3060) +DEVICE_BATCH_SIZE = 16 # per-device batch size (tuned for 12GB VRAM) # --------------------------------------------------------------------------- # Setup: tokenizer, model, optimizer, dataloader # --------------------------------------------------------------------------- t_start = time.time() -torch.manual_seed(42) -torch.cuda.manual_seed(42) + +def sdr_seed(): + """Seed RNG from SDR entropy (ADC quantization noise via sdr-rand on nemesis).""" + import json + from urllib.request import urlopen + try: + resp = json.loads(urlopen("http://192.168.86.24:9090/api/entropy?bytes=8&format=json", timeout=2).read()) + seed = int(resp["entropy_hex"], 16) + print(f"SDR entropy seed: {seed:#018x}") + except Exception as e: + seed = int.from_bytes(os.urandom(8), "big") + print(f"SDR unavailable ({e}), fallback seed: {seed:#018x}") + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + return seed + +sdr_seed() torch.set_float32_matmul_precision("high") device = torch.device("cuda") autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16) @@ -505,7 +585,10 @@ def build_model_config(depth): weight_decay=WEIGHT_DECAY, ) -model = torch.compile(model, dynamic=False) +try: + model = torch.compile(model, dynamic=False) +except Exception: + print("torch.compile unavailable (Triton missing), running eager mode") train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, MAX_SEQ_LEN, "train") x, y, epoch = next(train_loader) # prefetch first batch @@ -525,7 +608,7 @@ def get_lr_multiplier(progress): return cooldown * 1.0 + (1 - cooldown) * FINAL_LR_FRAC def get_muon_momentum(step): - frac = min(step / 300, 1) + frac = min(step / 200, 1) return (1 - frac) * 0.85 + frac * 0.95 def get_weight_decay(progress):