diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9973c599..ec22b073 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -22,6 +22,9 @@ RUN rm -rf /var/lib/apt/lists/* && \ vim \ netcat-openbsd \ socat \ + bubblewrap \ + iptables \ + ipset \ chromium && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 61edaa29..166f25c0 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,6 +8,9 @@ "${localWorkspaceFolderBasename}" ], "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", + "mounts": [ + "source=claude-code-config-${devcontainerId},target=/root/.claude,type=volume" + ], "customizations": { "vscode": { "extensions": [ diff --git a/README.md b/README.md index d81db38a..a991e7cd 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ curl -fsSL https://raw.githubusercontent.com/maxritter/pilot-shell/main/install. **Pilot Shell is different.** Every component solves a real problem: - **`/spec`** — plans, implements, and verifies features end-to-end with TDD +- **`/fix`** — bugfix workflow with TDD; bails out when complexity exceeds the standard fix lane - **`/prd`** — brainstorm ideas into clear requirements through with optional deep research - **Quality hooks** — enforce linting, formatting, type checking, and tests as quality gates - **Context engineering** — preserves decisions and knowledge across sessions @@ -107,6 +108,8 @@ curl -fsSL https://raw.githubusercontent.com/maxritter/pilot-shell/main/uninstal Pilot Shell works inside Dev Containers. Copy the [`.devcontainer`](https://github.com/maxritter/pilot-shell/tree/main/.devcontainer) folder from this repository into your project, adapt it to your needs (base image, extensions, dependencies), and run the installer inside the container. The installer auto-detects the container environment and skips system-level dependencies like Homebrew. 
+For tighter isolation when working with untrusted code, combine the dev container with Claude Code's [`/sandbox`](https://code.claude.com/docs/en/sandboxing) — `bubblewrap`, `socat`, `iptables`, and `ipset` are pre-installed in the Dockerfile so it works out of the box on Linux. See Anthropic's [development containers](https://code.claude.com/docs/en/devcontainer) and [sandboxing](https://code.claude.com/docs/en/sandboxing) docs for hardening patterns (egress allowlist, managed settings, persistent volumes). +
@@ -130,16 +133,14 @@ Pilot Shell works inside Dev Containers. Copy the [`.devcontainer`](https://gith Just chat — no plan, no approval gate. [Quick mode](https://pilot-shell.com/docs/workflows/quick-mode) is the default: quality hooks and TDD enforcement still apply, best for small tasks and exploration. For anything that needs a plan, use `/spec` — not Claude Code's built-in plan mode. -### /spec — Spec-Driven Development - -**[`/spec`](https://pilot-shell.com/docs/workflows/spec) replaces Claude Code's built-in plan mode** (Shift+Tab). It provides a complete planning workflow with TDD, verification, and code review — use `/spec` instead of plan mode for all planned work. +### /spec — Spec-Driven Development (features) -Features, bug fixes, refactoring — describe it and `/spec` handles the rest. Auto-detects whether it's a feature or a bugfix and adapts the workflow. Specs are saved to `docs/plans/` and visible in the Console's **Specification** tab. +**[`/spec`](https://pilot-shell.com/docs/workflows/spec) replaces Claude Code's built-in plan mode** (Shift+Tab) for new features, refactoring, and architectural work. It provides a complete planning workflow with TDD, verification, and code review. ```bash pilot -> /spec "Add user authentication with OAuth and JWT tokens" # → feature mode -> /spec "Fix the crash when deleting nodes with two children" # → bugfix mode (auto-detected) +> /spec "Add user authentication with OAuth and JWT tokens" +> /spec "Migrate the REST API to GraphQL" ``` ``` @@ -163,18 +164,177 @@ Full exploration workflow for new functionality, refactoring, or architectural c
+### /fix — Bugfix Workflow + +**[`/fix`](https://pilot-shell.com/docs/workflows/fix) is the bugfix command.** Investigate the bug, write the failing test, fix at the root cause, verify end-to-end against the running program, done. No plan file, no approval mid-flow, no separate verify phase. + +```bash +pilot +> /fix "annotation persistence drops fields between save and reload" +> /fix "off-by-one in pagination at boundary" +> /fix "wrong default for max_retries" +``` + +```text +Investigate → RED → Fix → Verify End-to-End → Quality Gate → Done +``` + +If investigation reveals the bug is multi-component or architectural, `/fix` stops cleanly and tells you to re-invoke with `/spec`. `/fix` is always quick; `/spec` is the full workflow. + +
+How /fix works + +For local bugs. Single file, obvious-once-traced root cause. No plan file, no approval mid-flow, no separate verify phase. TDD still enforced — bugfixes without a failing test don't ship. + +- **Investigate:** Reproduce the bug → trace to root cause at `file:line` with `codegraph_context` + targeted reads → state confidence (High/Medium required to proceed). For UI / async / race bugs, add temporary `SPEC-DEBUG:`-marked logs at component boundaries before tracing. +- **RED:** Write the failing test via an existing public entry point → run, must fail with the documented symptom. +- **Fix:** Minimal change at the root cause. Symptom patches are forbidden. Reproducing test must pass, then the targeted test module. Diff sanity check (root-cause file in diff, no unplanned files, < 20 lines, symptom-patching grep) catches issues with the fix itself. +- **Verify End-to-End:** The primary correctness signal. Run the actual program with the original input (Claude Code Chrome → Chrome DevTools MCP → playwright-cli → agent-browser for UI; CLI / API / REPL / job trigger for non-UI) and capture concrete evidence. A passing unit test alone is never accepted as proof. +- **Quality Gate:** Lint + types + build + full anti-regression suite, once. +- **Bail-out:** If investigation reveals the bug is multi-component, architectural, needs defense-in-depth at multiple layers, or two fix attempts have failed, `/fix` stops cleanly and tells you to re-invoke with `/spec`. It does not silently switch lanes. + +
+
-Bugfix Mode +How /spec handles bugs + +When you type `/spec "<bug description>"`, the full bugfix workflow runs — for bugs that warrant a written plan, approval, code review, and the full verify ceremony. + +- **Behavior Contract:** every plan pins down `Given / When / Currently / Expected / Anti-regression` — the invariant the fix must produce and the behavior it must not break +- **Three uniform tasks** (always, regardless of bug size): Write Reproducing Test (RED) → Implement Fix at Root Cause → Quality Gate +- **Verify audit:** always-on `cp`+`trap` revert-test (proves the reproducing test would genuinely fail without the fix — rules out retroactive rubber-stamp tests) + root-cause-at-source audit (flags symptom patches and caller-side workarounds) + original-symptom re-check — no sub-agents, tests carry the proof +- **Iteration cap at 3:** after three failed verify cycles, the workflow stops and asks if the bug is architectural rather than letting you loop forever + +
+ +### /prd — Brainstorm Ideas Into Product Requirements Documents + +[`/prd`](https://pilot-shell.com/docs/workflows/prd) is the brainstorming surface for ideas that aren't specs yet — vague problem statements and fuzzy shapes. It pitches directions, pressure-tests them with you, and converges on a PRD you can hand to `/spec`. PRDs are saved to `docs/prd/` and visible in the Console's **Requirements** tab. + +```bash +pilot +> /prd "Add real-time notifications for team updates" +> /prd "We need better onboarding — users drop off after signup" +``` + +
+What /prd Does + +**When to use `/prd` over `/spec`:** `/prd` is for **what** and **why**; `/spec` is for **how**. Reach for `/prd` first when you only have a problem statement, want to riff across multiple directions, or need scope boundaries defined before someone starts building. -Investigation-first workflow for targeted fixes. Finds the root cause before touching any code. +**Flow:** two modes, picked automatically from how fuzzy the idea is: -**Investigate:** Reproduces the bug → traces backward through the call chain to find the **root cause** at a specific `file:line` → compares against working code patterns → states the fix with confidence level. If 3+ hypotheses fail, escalates as an architectural problem. +1. **Ideate** — free-form prose, Claude pitches 3-5 directions, you react (only runs when the idea is vague) +2. **Clarify → Converge → Write** — structured multiple-choice questions once the shape is known, then the PRD is written -**Test-Before-Fix:** Writes a regression test that FAILS on current code → implements the minimal fix at the root cause → verifies all tests pass. Defense-in-depth validation at multiple layers when the bug involves data flowing through shared code paths. +**Research tiers** (picked at the start): -**Verify:** Lightweight verification — regression test confirmation → full test suite → lint + type check → quality checks. No review sub-agents — the regression test proves the fix works, the full suite proves nothing else broke. +| Tier | Behavior | +|------|----------| +| **Quick** | Skip research | +| **Standard** | Web search for competitors, prior art, best practices | +| **Deep** | Parallel research agents for comprehensive findings | -**Why this matters:** Root cause investigation prevents "fix one thing, break another." The regression test locks in the fix. No formal notation overhead — just trace, test, fix, verify. 
+The final PRD covers problem statement, core user flows, scope boundaries, and technical context — then offers to hand off directly to `/spec` for implementation. + +
+ +### /setup-rules — Generate Modular Rules + +[`/setup-rules`](https://pilot-shell.com/docs/workflows/setup-rules) explores your codebase, discovers conventions, generates modular rules and documents MCP servers. Run once initially, then anytime your project changes significantly. + +```bash +pilot +> /setup-rules +``` + +
+What /setup-rules Does + +12 phases that read your codebase and produce comprehensive AI context: + +0. **Reference** — load best practices for rule structure, path-scoping, and quality standards +1. **Read existing rules** — inventory all `.claude/rules/` files, detect structure and path-scoping. Also detects `CLAUDE.md` and `AGENTS.md` (the cross-framework agent context file used by Codex, Cursor, etc.) +2. **Migrate unscoped assets** — prefix with project slug for better sharing +3. **Quality audit** — check rules against best practices (size, specificity, stale references, conflicts) +4. **Explore codebase** — semantic search with Probe CLI, structural analysis with CodeGraph +5. **Compare patterns** — discovered vs documented conventions +6. **Sync project rule** — update `{slug}-project.md` with current tech stack, structure, commands. Migrates `CLAUDE.md` / `AGENTS.md` content into modular rules +7. **Sync MCP docs** — smoke-test user MCP servers, document working tools +8. **Discover new rules** — find undocumented patterns worth capturing +9. **Cross-check** — validate all references, ensure consistency across generated files +10. **Sync AGENTS.md** — if `AGENTS.md` already exists, offer to re-export the updated rules into it so non-Claude agents see the same context. Always asks first, never creates the file if absent, preserves user-authored sections +11. **Summary** — report all changes made + +**For monorepos:** Organizes rules in nested subdirectories by product and team, with `paths` frontmatter to scope rules to specific file types. Generates a `README.md` documenting the structure. + +
+ +### /create-skill — Reusable Skill Creator + +[`/create-skill`](https://pilot-shell.com/docs/workflows/create-skill) builds a reusable skill from any topic — explores the codebase and creates it interactively with you. If no topic is given, evaluates the current session for extractable knowledge. + +```bash +pilot +> /create-skill "Automate the review and triaging of our PR Bot comments" +``` + +
+What /create-skill Does + +6 phases that turn domain knowledge into a reusable skill: + +1. **Reference** — load use case categories, complexity spectrum, file structure template, description formula, security restrictions +2. **Understand** — explore the codebase for relevant patterns, ask clarifying questions, or evaluate the current session for extractable knowledge +3. **Check existing** — search project and global skills to avoid duplicates +4. **Create** — write to `.claude/skills/` (project) or `~/.claude/skills/` (global), apply portability and determinism checklists +5. **Quality gates** — structure checklist (SKILL.md naming, frontmatter fields), content checklist (error handling, examples, exclusions), triggering test (should/shouldn't trigger), iteration signals +6. **Test & iterate** — run test prompts with sub-agents, evaluate results, optimize description triggering + +**Use case categories:** + +| Category | Best For | +| ----------------------------- | -------------------------------------------------------------------------- | +| **Document & Asset Creation** | Consistent reports, designs, code with embedded style guides and templates | +| **Workflow Automation** | Multi-step processes with validation gates and iterative refinement | +| **MCP Enhancement** | Workflow guidance on top of MCP tool access, multi-MCP coordination | + +**Skill structure:** Each skill is a folder with a `SKILL.md` file (case-sensitive), optional `scripts/`, `references/`, and `assets/` directories. The YAML frontmatter description determines when Claude loads the skill — it must include what the skill does, when to use it, and specific trigger phrases. Progressive disclosure keeps context lean: frontmatter loads always (~100 tokens), SKILL.md loads on activation, linked files load on demand. + +
+ +### /benchmark — Measure Rule & Skill Impact + +[`/benchmark`](https://pilot-shell.com/docs/workflows/benchmark) runs your prompts with and without the target, grades outputs against falsifiable assertions, and shows a structured report you can absorb in 30 seconds — labeled verdict, quadrant breakdown, and only the divergent assertions in the drill-down. Finishes with a concrete improvement plan so you know exactly what to change next. + +```bash +pilot +> /benchmark pilot/skills/create-skill +> /benchmark pilot/rules/testing.md +``` + +
+What /benchmark Does + +Six phases turn a rule or skill into a before/after comparison with an actionable plan: + +1. **Intake** — pick up an existing `benchmarks/{target}/evals.json` or author one +2. **Target discovery** — classify as `skill` or `rules` +3. **Author evals** — draft 3 falsifiable assertions; falsifiability gate ensures baseline actually fails +4. **Execute** — run both configs in isolated sandboxes; grader subagent scores every assertion +5. **Present findings** — three layers, scannable top-to-bottom: + + | Layer | Content | + |---|---| + | **Verdict** | One labeled sentence with a recommended next step. Delta bands: 🟢 Strong (≥ +0.50) / 🟢 Moderate (+0.20) / 🟡 Weak (+0.05) / ⚪ Indistinguishable (±0.05) / 🔴 Regression (< −0.05) | + | **Quadrant breakdown** | Counts each assertion as Signal (✓/✗) / Baseline (✓/✓) / Unreachable (✗/✗) / Regression (✗/✓). The dominant quadrant drives the plan | + | **Per-eval drill-down** | Only divergent assertions get a row; matching ones fold into header counts so the report stays under one screen | + +6. **Improvement plan** — ≤ 5 ranked proposals in a uniform format (`[TARGET]` or `[EVALS]` tag, location, current quote, replacement, "Lever" line). You pick: apply target edits, iterate on evals, both, or save the plan and stop. Re-runs land in a fresh `runs/{run-id}/` so iteration deltas stay legible. + +**Isolation:** each run gets its own sandbox directory; a globally-installed copy of the target in `~/.claude/` is auto-hidden for the duration and restored afterward (with on-disk recovery manifest covering SIGKILL / power loss / segfault). Conditional-loading frontmatter (`path:` / `paths:`) is stripped from the copy installed into the `with` sandbox so the target loads unconditionally for every prompt — without that, rules scoped to e.g. `paths: ["**/*.py"]` would stay dormant in both configs and the delta would collapse to 0.00. The source file is never modified. 
+ +**Key flags:** `--runs N` (default 1), `--configs with,without`, `--workers N`, `--model`, `--no-isolate-global`, `--restore-hidden`.
@@ -412,137 +572,6 @@ Pilot Bot defines scheduled jobs, automates recurring tasks, and monitor system -### /prd — Brainstorm Ideas Into Product Requirements Documents - -[`/prd`](https://pilot-shell.com/docs/workflows/prd) is the brainstorming surface for ideas that aren't specs yet — vague problem statements and fuzzy shapes. It pitches directions, pressure-tests them with you, and converges on a PRD you can hand to `/spec`. PRDs are saved to `docs/prd/` and visible in the Console's **Requirements** tab. - -```bash -pilot -> /prd "Add real-time notifications for team updates" -> /prd "We need better onboarding — users drop off after signup" -``` - -
-What /prd Does - -**When to use `/prd` over `/spec`:** `/prd` is for **what** and **why**; `/spec` is for **how**. Reach for `/prd` first when you only have a problem statement, want to riff across multiple directions, or need scope boundaries defined before someone starts building. - -**Flow:** two modes, picked automatically from how fuzzy the idea is: - -1. **Ideate** — free-form prose, Claude pitches 3-5 directions, you react (only runs when the idea is vague) -2. **Clarify → Converge → Write** — structured multiple-choice questions once the shape is known, then the PRD is written - -**Research tiers** (picked at the start): - -| Tier | Behavior | -|------|----------| -| **Quick** | Skip research | -| **Standard** | Web search for competitors, prior art, best practices | -| **Deep** | Parallel research agents for comprehensive findings | - -The final PRD covers problem statement, core user flows, scope boundaries, and technical context — then offers to hand off directly to `/spec` for implementation. - -
- -### /setup-rules — Generate Modular Rules - -[`/setup-rules`](https://pilot-shell.com/docs/workflows/setup-rules) explores your codebase, discovers conventions, generates modular rules and documents MCP servers. Run once initially, then anytime your project changes significantly. - -```bash -pilot -> /setup-rules -``` - -
-What /setup-rules Does - -12 phases that read your codebase and produce comprehensive AI context: - -0. **Reference** — load best practices for rule structure, path-scoping, and quality standards -1. **Read existing rules** — inventory all `.claude/rules/` files, detect structure and path-scoping. Also detects `CLAUDE.md` and `AGENTS.md` (the cross-framework agent context file used by Codex, Cursor, etc.) -2. **Migrate unscoped assets** — prefix with project slug for better sharing -3. **Quality audit** — check rules against best practices (size, specificity, stale references, conflicts) -4. **Explore codebase** — semantic search with Probe CLI, structural analysis with CodeGraph -5. **Compare patterns** — discovered vs documented conventions -6. **Sync project rule** — update `{slug}-project.md` with current tech stack, structure, commands. Migrates `CLAUDE.md` / `AGENTS.md` content into modular rules -7. **Sync MCP docs** — smoke-test user MCP servers, document working tools -8. **Discover new rules** — find undocumented patterns worth capturing -9. **Cross-check** — validate all references, ensure consistency across generated files -10. **Sync AGENTS.md** — if `AGENTS.md` already exists, offer to re-export the updated rules into it so non-Claude agents see the same context. Always asks first, never creates the file if absent, preserves user-authored sections -11. **Summary** — report all changes made - -**For monorepos:** Organizes rules in nested subdirectories by product and team, with `paths` frontmatter to scope rules to specific file types. Generates a `README.md` documenting the structure. - -
- -### /create-skill — Reusable Skill Creator - -[`/create-skill`](https://pilot-shell.com/docs/workflows/create-skill) builds a reusable skill from any topic — explores the codebase and creates it interactively with you. If no topic is given, evaluates the current session for extractable knowledge. - -```bash -pilot -> /create-skill "Automate the review and triaging of our PR Bot comments" -``` - -
-What /create-skill Does - -6 phases that turn domain knowledge into a reusable skill: - -1. **Reference** — load use case categories, complexity spectrum, file structure template, description formula, security restrictions -2. **Understand** — explore the codebase for relevant patterns, ask clarifying questions, or evaluate the current session for extractable knowledge -3. **Check existing** — search project and global skills to avoid duplicates -4. **Create** — write to `.claude/skills/` (project) or `~/.claude/skills/` (global), apply portability and determinism checklists -5. **Quality gates** — structure checklist (SKILL.md naming, frontmatter fields), content checklist (error handling, examples, exclusions), triggering test (should/shouldn't trigger), iteration signals -6. **Test & iterate** — run test prompts with sub-agents, evaluate results, optimize description triggering - -**Use case categories:** - -| Category | Best For | -| ----------------------------- | -------------------------------------------------------------------------- | -| **Document & Asset Creation** | Consistent reports, designs, code with embedded style guides and templates | -| **Workflow Automation** | Multi-step processes with validation gates and iterative refinement | -| **MCP Enhancement** | Workflow guidance on top of MCP tool access, multi-MCP coordination | - -**Skill structure:** Each skill is a folder with a `SKILL.md` file (case-sensitive), optional `scripts/`, `references/`, and `assets/` directories. The YAML frontmatter description determines when Claude loads the skill — it must include what the skill does, when to use it, and specific trigger phrases. Progressive disclosure keeps context lean: frontmatter loads always (~100 tokens), SKILL.md loads on activation, linked files load on demand. - -
- -### /benchmark — Measure Rule & Skill Impact - -[`/benchmark`](https://pilot-shell.com/docs/workflows/benchmark) runs your prompts with and without the target, grades outputs against falsifiable assertions, and shows a structured report you can absorb in 30 seconds — labeled verdict, quadrant breakdown, and only the divergent assertions in the drill-down. Finishes with a concrete improvement plan so you know exactly what to change next. - -```bash -pilot -> /benchmark pilot/skills/create-skill -> /benchmark pilot/rules/testing.md -``` - -
-What /benchmark Does - -Six phases turn a rule or skill into a before/after comparison with an actionable plan: - -1. **Intake** — pick up an existing `benchmarks//evals.json` or author one -2. **Target discovery** — classify as `skill` or `rules` -3. **Author evals** — draft 3 falsifiable assertions; falsifiability gate ensures baseline actually fails -4. **Execute** — run both configs in isolated sandboxes; grader subagent scores every assertion -5. **Present findings** — three layers, scannable top-to-bottom: - - | Layer | Content | - |---|---| - | **Verdict** | One labeled sentence with a recommended next step. Delta bands: 🟢 Strong (≥ +0.50) / 🟢 Moderate (+0.20) / 🟡 Weak (+0.05) / ⚪ Indistinguishable (±0.05) / 🔴 Regression (< −0.05) | - | **Quadrant breakdown** | Counts each assertion as Signal (✓/✗) / Baseline (✓/✓) / Unreachable (✗/✗) / Regression (✗/✓). The dominant quadrant drives the plan | - | **Per-eval drill-down** | Only divergent assertions get a row; matching ones fold into header counts so the report stays under one screen | - -6. **Improvement plan** — ≤ 5 ranked proposals in a uniform format (`[TARGET]` or `[EVALS]` tag, location, current quote, replacement, "Lever" line). You pick: apply target edits, iterate on evals, both, or save the plan and stop. Re-runs land in a fresh `runs//` so iteration deltas stay legible. - -**Isolation:** each run gets its own sandbox directory; a globally-installed copy of the target in `~/.claude/` is auto-hidden for the duration and restored afterward (with on-disk recovery manifest covering SIGKILL / power loss / segfault). Conditional-loading frontmatter (`path:` / `paths:`) is stripped from the copy installed into the `with` sandbox so the target loads unconditionally for every prompt — without that, rules scoped to e.g. `paths: ["**/*.py"]` would stay dormant in both configs and the delta would collapse to 0.00. The source file is never modified. 
- -**Key flags:** `--runs N` (default 1), `--configs with,without`, `--workers N`, `--model`, `--no-isolate-global`, `--restore-hidden`. - -
- ### Claude CLI Flag Passthrough All Claude Code CLI flags work directly with `pilot` — current and future. Pilot forwards any flag it doesn't recognize to the Claude CLI automatically. @@ -719,6 +748,8 @@ On **Team**, every developer runs `pilot customize install ` once and st Yes. Copy the `.devcontainer` folder from this repository into your project, adapt it to your needs (base image, extensions, dependencies), and install Pilot Shell inside the container. Everything works the same — hooks, rules, MCP servers, persistent memory, and the Console dashboard all run inside the container. This is a great option for teams that want a consistent, reproducible development environment. +For tighter isolation when working with untrusted code, layer Claude Code's [`/sandbox`](https://code.claude.com/docs/en/sandboxing) on top — the Dockerfile pre-installs `bubblewrap`, `socat`, `iptables`, and `ipset` so it works out of the box. See Anthropic's [development containers](https://code.claude.com/docs/en/devcontainer) and [sandboxing](https://code.claude.com/docs/en/sandboxing) docs for the hardening patterns. +
diff --git a/console/package.json b/console/package.json index 54e54bc9..d3f78dbf 100644 --- a/console/package.json +++ b/console/package.json @@ -35,8 +35,12 @@ "devDependencies": { "@git-diff-view/file": "^0.1.1", "@git-diff-view/react": "^0.1.1", + "@happy-dom/global-registrator": "^20.9.0", "@iconify/react": "^6.0.2", "@tailwindcss/vite": "^4.1.18", + "@testing-library/dom": "^10.4.1", + "@testing-library/react": "^16.3.2", + "@testing-library/user-event": "^14.6.1", "@types/bun": "^1.3.8", "@types/cookie-parser": "^1.4.10", "@types/cors": "^2.8.19", diff --git a/console/src/ui/viewer/hooks/useSettings.ts b/console/src/ui/viewer/hooks/useSettings.ts index d8889ef0..536bed4c 100644 Binary files a/console/src/ui/viewer/hooks/useSettings.ts and b/console/src/ui/viewer/hooks/useSettings.ts differ diff --git a/console/src/ui/viewer/views/Settings/index.tsx b/console/src/ui/viewer/views/Settings/index.tsx index 7e9ae3fb..9d72bc15 100644 Binary files a/console/src/ui/viewer/views/Settings/index.tsx and b/console/src/ui/viewer/views/Settings/index.tsx differ diff --git a/console/src/ui/viewer/views/Spec/annotation/PlanAnnotator.tsx b/console/src/ui/viewer/views/Spec/annotation/PlanAnnotator.tsx index 38872022..6ee5ac4a 100644 Binary files a/console/src/ui/viewer/views/Spec/annotation/PlanAnnotator.tsx and b/console/src/ui/viewer/views/Spec/annotation/PlanAnnotator.tsx differ diff --git a/console/src/ui/viewer/views/Spec/annotation/useAnnotation.ts b/console/src/ui/viewer/views/Spec/annotation/useAnnotation.ts index d2c4d872..3a929d6a 100644 Binary files a/console/src/ui/viewer/views/Spec/annotation/useAnnotation.ts and b/console/src/ui/viewer/views/Spec/annotation/useAnnotation.ts differ diff --git a/console/src/ui/viewer/views/Spec/index.tsx b/console/src/ui/viewer/views/Spec/index.tsx index 6a8c6071..3bfdf91a 100644 Binary files a/console/src/ui/viewer/views/Spec/index.tsx and b/console/src/ui/viewer/views/Spec/index.tsx differ diff --git 
a/console/src/ui/viewer/views/Spec/parsePlanContent.ts b/console/src/ui/viewer/views/Spec/parsePlanContent.ts new file mode 100644 index 00000000..37c56e5a Binary files /dev/null and b/console/src/ui/viewer/views/Spec/parsePlanContent.ts differ diff --git a/console/tests/annotation/plan-annotator-persistence.test.tsx b/console/tests/annotation/plan-annotator-persistence.test.tsx new file mode 100644 index 00000000..f66f6cfc Binary files /dev/null and b/console/tests/annotation/plan-annotator-persistence.test.tsx differ diff --git a/console/tests/ui/spec-section-rendering.test.ts b/console/tests/ui/spec-section-rendering.test.ts new file mode 100644 index 00000000..180c518f Binary files /dev/null and b/console/tests/ui/spec-section-rendering.test.ts differ diff --git a/console/tsconfig.json b/console/tsconfig.json index a9add684..3a52a1ca 100644 --- a/console/tsconfig.json +++ b/console/tsconfig.json @@ -25,7 +25,8 @@ "include": [ "src/**/*.ts", "src/**/*.tsx", - "tests/**/*.ts" + "tests/**/*.ts", + "tests/**/*.tsx" ], "exclude": [ "node_modules", diff --git a/docs/docusaurus/docs/getting-started/installation.md b/docs/docusaurus/docs/getting-started/installation.md index ff12ea73..a6a994a4 100644 --- a/docs/docusaurus/docs/getting-started/installation.md +++ b/docs/docusaurus/docs/getting-started/installation.md @@ -49,6 +49,13 @@ When enabled, Codex provides an independent adversarial review during `/spec` pl Pilot Shell works inside Dev Containers. Copy the `.devcontainer` folder from the [Pilot Shell repository](https://github.com/maxritter/pilot-shell/tree/main/.devcontainer) into your project, adapt it to your needs (base image, extensions, dependencies), and run the installer inside the container. The installer auto-detects the container environment and skips system-level dependencies like Homebrew. 
+For tighter isolation when working with untrusted code, layer Claude Code's [`/sandbox`](https://code.claude.com/docs/en/sandboxing) on top — `bubblewrap`, `socat`, `iptables`, and `ipset` are pre-installed in the Dockerfile so it works out of the box on Linux. + +### Further reading + +- [Claude Code · Development containers](https://code.claude.com/docs/en/devcontainer) — Anthropic's reference container, persistent volumes, organization policy, network egress, the `--dangerously-skip-permissions` flag. +- [Claude Code · Sandboxing](https://code.claude.com/docs/en/sandboxing) — Seatbelt (macOS) and bubblewrap (Linux/WSL2), `/sandbox` modes, `allowedDomains`, filesystem allow/deny rules, security limitations. + ## Install Specific Version ```bash diff --git a/docs/docusaurus/docs/workflows/fix.md b/docs/docusaurus/docs/workflows/fix.md new file mode 100644 index 00000000..0f802d8c --- /dev/null +++ b/docs/docusaurus/docs/workflows/fix.md @@ -0,0 +1,106 @@ +--- +sidebar_position: 5 +title: /fix +description: Bugfix workflow — investigate, RED test, fix, verify end-to-end, done. +--- + +# /fix + +Bugfix workflow with TDD. Investigates the bug, writes a failing test, fixes at the root cause, **verifies end-to-end against the running program**, finishes. No plan file, no approval mid-flow, no separate verify phase. + +Use `/fix` for bugs. Use [`/spec`](/docs/workflows/spec) for features and architectural changes — including bugfixes that warrant a full plan with approval and code review. + +```bash +$ pilot +> /fix "annotation persistence drops fields between save and reload" +> /fix "off-by-one in pagination at boundary" +> /fix "wrong default for max_retries" +``` + +`/fix` is **always quick**. If investigation reveals the bug is multi-component, architectural, or otherwise larger than a quick fix, `/fix` stops cleanly and tells you to re-invoke with `/spec`. It does not silently switch lanes. 
+ +## Workflow + +```text +Investigate → RED → Fix → Verify End-to-End → Quality Gate → Done +``` + +### Investigate + +Trace the bug to `file:lineN — function() does X but should do Y` with **High** or **Medium** confidence. For UI / async / race / timing bugs that don't surface from a static read, add temporary `SPEC-DEBUG:`-marked logs at component boundaries before tracing. If confidence stays Low, `/fix` bails out and points you to `/spec`. + +### RED — Write the Reproducing Test + +Encode `Currently → Expected` via an existing public entry point. Run it; it must **fail** with an error matching the symptom. A test that passes against buggy code doesn't encode the bug. + +### Fix at the Root Cause + +Minimal change at the root cause. Symptom patches (`try/except` hiding the bug, swallowed returns, silently normalised inputs) are forbidden. Re-run the reproducing test → must pass. Run the targeted test module(s). + +A diff sanity check follows: root-cause file IS in the diff, no unplanned files, < 20 lines typically. A grep over the diff catches symptom-patching and leftover `print` / `console.log` / `SPEC-DEBUG:` markers — every match must be justified or reverted. + +### Verify End-to-End + +The primary correctness signal. Run the actual program with the original input and observe the symptom is gone — a passing unit test alone is never accepted. This step is mandatory. 
+ +| Bug surface | Tool | Evidence | +| --- | --- | --- | +| **UI / web** | 4-tier browser stack: **Claude Code Chrome** → **Chrome DevTools MCP** → **playwright-cli** → **agent-browser** | Page state, element values | +| **CLI** | The exact command the user ran | Stdout, exit code | +| **HTTP API** | `curl` / HTTP client with the user's body | Status code, response field | +| **Library / SDK / function** | `python -c '…'`, `node -e '…'`, REPL, scratch script | Returned value | +| **Background job** | Trigger manually with the failing input | Logs | + +The completion report must include concrete evidence — bare assertions ("looks fixed", "tests pass") are insufficient. If the symptom persists, the unit test is at the wrong layer: move the assertion up to the user's actual entry point and re-run RED → Fix → Verify End-to-End. + +### Quality Gate + +Lint + types + build (when applicable), then the full anti-regression suite, once. If a far-from-the-fix test breaks, the bug has unintended cross-coupling — bail out to `/spec`. + +### Finalise + +Worktree mode: bundle test + fix into one `fix:` commit. Approval gate fires only if **Plan Approval** is enabled. The completion report includes a mandatory **E2E** line documenting what was actually run. + +## When to bail out — use `/spec` instead + +`/fix` stops and tells you to re-invoke with `/spec` when: + +- Bug spans 3+ files or 2+ components. +- Root cause is architectural, not a single line. +- Fix needs defense-in-depth at multiple layers. +- Confidence stays Low — root cause can't be pinned to file:line. +- Two failed fix attempts. +- Fix has non-trivial UI implications that warrant a recorded Verification Scenario. + +The full lane (`/spec`) adds: Behavior Contract, three-task structure, plan file with approval gate, Console annotation cycle, `cp`+`trap` revert-test proof in verify, iteration cap at 3. 
+ +## Common issues + +| Symptom | What it means | What to do | +| --- | --- | --- | +| Can't reproduce | Description too vague or environment-dependent | Ask for exact steps, env, stack trace. Don't write a speculative fix. | +| Test passes without the fix | Test doesn't encode the bug | Tighten the assertion or pick a more specific input. | +| Fix breaks far-away tests | Cross-coupling beyond the quick lane | Bail out. Re-invoke with `/spec`. | +| Reproducing test green but user still hits the bug | Test sits below the user's layer | Move the assertion up and re-run RED → Fix → Verify End-to-End. | +| Two failed fix attempts | Architectural problem, not a fix problem | Bail out. The pattern needs reconsidering, not another patch. | + +## Configurable Toggles + +`/fix` honours two of the Console Settings that `/spec` uses: + +| Toggle | Default | Effect when disabled | +| --- | --- | --- | +| **Ask Questions** | On | Investigation skips clarifying questions and uses defaults. | +| **Plan Approval** | On | The end-of-flow approval gate is skipped. | + +When both are off, `/fix` runs end-to-end with no user interaction. Worktree isolation is not honoured — use `/spec` if you want a worktree. + +## When to use `/spec` vs `/fix` + +| Use `/fix` | Use `/spec` | +| --- | --- | +| Something is broken | Building new functionality | +| You want a fix without ceremony | Architecture or design decision matters | +| You want it done now | Work warrants a written plan + approval | + +`/fix` handles the full range — from typos to multi-step debugging. It bails out and points to `/spec` only when complexity is truly architectural (multiple components, defense-in-depth at multiple layers, repeated failed attempts). 
diff --git a/docs/docusaurus/docs/workflows/spec.md b/docs/docusaurus/docs/workflows/spec.md index 064785b8..3bf9b7db 100644 --- a/docs/docusaurus/docs/workflows/spec.md +++ b/docs/docusaurus/docs/workflows/spec.md @@ -8,15 +8,14 @@ description: Plan, implement, and verify complex features with full automation u Plan, implement, and verify complex features with full automation using Spec-Driven Development. -**Replaces Claude Code's built-in plan mode (Shift+Tab).** Best for complex features, refactoring tasks, or any work where you want to review a plan before implementation begins. The structured workflow prevents scope creep and ensures every task is tested and verified before being marked complete. +**Replaces Claude Code's built-in plan mode (Shift+Tab).** Best for new features, refactoring, architectural changes — work where a plan and a design discussion add value before code. The structured workflow prevents scope creep and ensures every task is tested and verified before being marked complete. -> **Tip:** For vague ideas or unclear requirements, use [`/prd`](/docs/workflows/prd) first to brainstorm back-and-forth and produce a PRD, then hand off to `/spec`. +For bugfixes, use [`/fix`](/docs/workflows/fix). For vague ideas, use [`/prd`](/docs/workflows/prd) first to produce a PRD, then hand off to `/spec`. ```bash $ pilot > /spec "Add user authentication with OAuth and JWT tokens" > /spec "Migrate the REST API to GraphQL" -> /spec "Fix the crash when deleting nodes with two children" # bugfix auto-detected ``` ## Workflow @@ -38,18 +37,9 @@ Full exploration workflow for new functionality, refactoring, or any work where - Full plan with scope, risks, and Definition of Done - Unified verification agent (optional, configurable in Console Settings) -### Bugfix Spec (auto-detected) +### Bugfixes -Investigation-first flow for targeted fixes. 
Finds the root cause before touching any code, then enforces a uniform three-task structure so every bugfix follows the same process — no freewheeling. - -- **Root cause tracing:** backward through the call chain to `file:line`, with CodeGraph caller/callee analysis -- **Pattern analysis:** compare broken vs working code paths -- **Behavior Contract:** every plan pins down `Given / When / Currently / Expected / Anti-regression` — the invariant the fix must produce and the behavior it must not break -- **Three uniform tasks** (always, regardless of bug size): - 1. **Write Reproducing Test (RED)** — must FAIL before any fix code exists - 2. **Implement Fix at Root Cause** — reproducing test passes, full suite passes - 3. **Quality Gate** — lint, type check, build, full suite green after any auto-fixes -- **Verify audit:** authoritative full suite + always-on revert-test (proves the reproducing test would genuinely fail without the fix — rules out retroactive rubber-stamp tests) + root-cause-at-source audit (flags symptom patches and caller-side workarounds) + anti-regression spot-check — no sub-agents, tests carry the proof +For a bugfix workflow without a plan file, use [`/fix`](/docs/workflows/fix). When the user types `/spec` with a bug description, the full bugfix workflow runs — root-cause investigation, three-task structure (RED test → fix → quality gate), Behavior Contract audit, revert-test proof in verify, iteration cap at 3. ## Three Phases @@ -66,7 +56,7 @@ Investigation-first flow for targeted fixes. 
Finds the root cause before touchin - Isolated git worktree, new branch from default, or current branch (your choice) - Strict TDD for each task: RED → GREEN → REFACTOR - Quality hooks auto-lint, format, and type-check every edit -- Full test suite after each task to catch regressions early +- Full test suite runs at the **Quality Gate** task (end), not after every task — running it per-fix-task is the single biggest token sink in bundled bugfix plans, so the targeted test module is used between fixes and the authoritative full-suite run happens once ### Verify Phase @@ -100,6 +90,8 @@ When all three are disabled, `/spec` runs end-to-end without any user interactio Both reviewers run in a separate context window and don't consume the main session's context budget. Optional **Codex adversarial reviewers** (off by default) provide an independent second opinion using OpenAI Codex. +**Codex runs at most once per `/spec` invocation.** Plan iterations (annotation feedback, verify re-runs, fixing prior findings) reuse the result of the first Codex review instead of re-launching — a sentinel file in the session directory enforces this. The bugfix planning phase no longer runs Codex at all; adversarial review is most valuable on real code, not on a plan. + ## Branch Strategy (Optional) When starting a `/spec` task, you're asked how you want to work: @@ -111,3 +103,5 @@ When starting a `/spec` task, you're asked how you want to work: | **New branch from default** | Fetches origin, creates `feat/` (or `fix/` for bugfixes) from `origin/main`, and checks it out. Best when your current branch isn't clean but you don't want full worktree isolation. | Disable the **Worktree Support** toggle in Console Settings to skip this question entirely — `/spec` will always use the current branch. + +For bugfixes, use [`/fix`](/docs/workflows/fix) — the worktree question is asked here in `/spec` because that's where it applies. 
diff --git a/docs/docusaurus/sidebars.ts b/docs/docusaurus/sidebars.ts index 60467827..5429aedf 100644 --- a/docs/docusaurus/sidebars.ts +++ b/docs/docusaurus/sidebars.ts @@ -22,6 +22,7 @@ const sidebars: SidebarsConfig = { "workflows/create-skill", "workflows/prd", "workflows/spec", + "workflows/fix", "workflows/benchmark", "workflows/quick-mode", ], diff --git a/docs/site/src/components/HeroSection.tsx b/docs/site/src/components/HeroSection.tsx index 77bd8e5e..267e5d8a 100644 --- a/docs/site/src/components/HeroSection.tsx +++ b/docs/site/src/components/HeroSection.tsx @@ -1,6 +1,5 @@ import { GithubIcon, BookOpen } from "lucide-react"; import { Button } from "@/components/ui/button"; -import { Badge } from "@/components/ui/badge"; import Logo from "@/components/Logo"; const HeroSection = () => { @@ -26,73 +25,6 @@ const HeroSection = () => {

- {/* Feature highlights */} -
-
-
- Spec-Driven -
-
- Plan → Build → Verify -
-
-
-
-
- TDD -
-
- Test-First -
-
-
-
-
- Memory -
-
- Persistent Context -
-
-
-
-
- Overlays -
-
- Modify Defaults -
-
-
-
-
- Hooks -
-
- Quality Gates -
-
-
- - {/* Feature badges */} -
- - Worktree Support - - - MCP Servers - - - LSP Servers - - - Semantic Search - - - Pilot Bot - -
- {/* CTA Buttons */}