diff --git a/README.md b/README.md index 0cf264a..d6b2726 100644 --- a/README.md +++ b/README.md @@ -101,8 +101,20 @@ bun claude-code-cli.ts --eval 001-server-component --verbose # Debug mode - keep output folders bun claude-code-cli.ts --eval 001-server-component --debug + +# Capture full conversation with tool calls (Claude Code only) +bun claude-code-cli.ts --eval 001-server-component --capture-conversation ``` +**Conversation Capture (Claude Code only):** + +Use the `--capture-conversation` flag to save the full conversation including all tool calls in JSONL format: + +- `claude-conversation.jsonl` - Complete conversation with all tool calls and responses in JSONL format +- `claude-output.txt` - Human-readable summary of the evaluation + +These files are saved in the output directory (`output-claude-code/`). + #### Claude Code with Dev Server and Hooks Run Claude Code with a Next.js dev server and lifecycle hooks (e.g., for MCP server setup): diff --git a/cli.ts b/cli.ts index b2608ef..fe34c13 100755 --- a/cli.ts +++ b/cli.ts @@ -298,6 +298,8 @@ function parseCliArgs(args: string[]) { values["with-hooks"] = args[++i]; } else if (arg === "--with-visual-diff") { values["with-visual-diff"] = true; + } else if (arg === "--capture-conversation") { + values["capture-conversation"] = true; } else if (!arg.startsWith("-")) { positionals.push(arg); } @@ -336,6 +338,7 @@ Options: --dev-server-port Port for dev server (default: 4000, auto-increments for concurrent evals) --with-hooks Use eval hooks from scripts/eval-hooks/-pre.sh and -post.sh --with-visual-diff Enable visual regression testing with screenshot comparison + --capture-conversation [Claude Code only] Capture full conversation with tool calls to JSONL Examples: # Run all evals with LLMs @@ -1506,6 +1509,7 @@ async function main() { : undefined, hooks, visualDiff: values["with-visual-diff"] || false, + captureConversation: values["capture-conversation"] || false, }; if (values.all) { diff --git a/lib/claude-code-runner.ts b/lib/claude-code-runner.ts index 1181e80..003547e 100644 --- a/lib/claude-code-runner.ts +++ b/lib/claude-code-runner.ts @@ -47,6 +47,7 @@ export interface ClaudeCodeEvalOptions { visualDiff?: boolean; outputFormat?: string; outputFile?: string; + captureConversation?: boolean; } export class ClaudeCodeRunner { @@ -58,6 +59,7 @@ export class ClaudeCodeRunner { private devServer?: { enabled: boolean; command?: string; port?: number }; private hooks?: { preEval?: string; postEval?: string }; private visualDiff: boolean; + private captureConversation: boolean; constructor(options: ClaudeCodeEvalOptions = {}) { this.verbose = options.verbose || false; @@ -66,6 +68,7 @@ export class ClaudeCodeRunner { this.devServer = options.devServer; this.hooks = options.hooks; this.visualDiff = options.visualDiff || false; + this.captureConversation = options.captureConversation || false; } async runClaudeCodeEval( @@ -239,11 +242,14 @@ IMPORTANT: Do not run npm, pnpm, yarn, or any package manager commands. Dependen // Additional flags to ensure it works well in automation: // --dangerously-skip-permissions: bypass file/execution permission prompts // --print: non-interactive mode that prints response and exits - const args = [ - '--print', - '--dangerously-skip-permissions', - enhancedPrompt - ]; + const args = ['--print', '--dangerously-skip-permissions']; + + // Add conversation capture flags if enabled + if (this.captureConversation) { + args.push('--verbose', '--output-format', 'stream-json'); + } + + args.push(enhancedPrompt); if (this.verbose) { console.log('🚀 Spawning claude process with:'); @@ -297,7 +303,7 @@ IMPORTANT: Do not run npm, pnpm, yarn, or any package manager commands. Dependen }); }, timeout); - claudeProcess.on('exit', (code, signal) => { + claudeProcess.on('exit', async (code, signal) => { clearTimeout(timeoutId); this.processes.delete(processId); @@ -306,21 +312,66 @@ IMPORTANT: Do not run npm, pnpm, yarn, or any package manager commands. Dependen console.log(`Claude Code finished with code: ${code}, signal: ${signal}`); } + // Parse JSONL to extract human-readable summary and save files (if conversation capture is enabled) + let finalResult = ''; + if (this.captureConversation) { + try { + // Save full JSONL conversation (with tool calls) + const jsonlFile = path.join(projectDir, 'claude-conversation.jsonl'); + await fs.writeFile(jsonlFile, stdout); + + // Parse JSONL to extract human-readable summary + const lines = stdout.trim().split('\n'); + const messages: any[] = []; + + for (const line of lines) { + try { + const msg = JSON.parse(line); + messages.push(msg); + if (msg.type === 'result') { + finalResult = msg.result || ''; + } + } catch (e) { + // Skip invalid JSON lines + } + } + + // Save human-readable summary + const summaryFile = path.join(projectDir, 'claude-output.txt'); + const summary = `=== Claude Code Output ===\n` + + `Exit Code: ${code}\n` + + `Signal: ${signal}\n\n` + + `=== FINAL RESULT ===\n${finalResult}\n\n` + + `=== STDERR ===\n${stderr}\n\n` + + `Full conversation with tool calls saved to: claude-conversation.jsonl\n`; + await fs.writeFile(summaryFile, summary); + + if (this.verbose) { + console.log(`📝 Conversation saved to ${jsonlFile}`); + console.log(`📝 Summary saved to ${summaryFile}`); + } + } catch (error) { + if (this.verbose) { + console.error(`Failed to save output files: ${error}`); + } + } + } + if (signal) { resolve({ success: false, - output: stdout, + output: this.captureConversation && finalResult ? finalResult : stdout, error: `Claude Code process killed by signal ${signal}` }); } else if (code === 0) { resolve({ success: true, - output: stdout + output: this.captureConversation && finalResult ? finalResult : stdout }); } else { resolve({ success: false, - output: stdout, + output: this.captureConversation && finalResult ? finalResult : stdout, error: stderr || `Claude Code process exited with code ${code}` }); }