Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,20 @@ bun claude-code-cli.ts --eval 001-server-component --verbose

# Debug mode - keep output folders
bun claude-code-cli.ts --eval 001-server-component --debug

# Capture full conversation with tool calls (Claude Code only)
bun claude-code-cli.ts --eval 001-server-component --capture-conversation
```

**Conversation Capture (Claude Code only):**

Use the `--capture-conversation` flag to save the full conversation, including all tool calls. Two files are written:

- `claude-conversation.jsonl` - Complete conversation with all tool calls and responses in JSONL format
- `claude-output.txt` - Human-readable summary of the run (exit code, signal, final result, and stderr)

These files are saved in the output directory (`output-claude-code/`).

#### Claude Code with Dev Server and Hooks

Run Claude Code with a Next.js dev server and lifecycle hooks (e.g., for MCP server setup):
Expand Down
4 changes: 4 additions & 0 deletions cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ function parseCliArgs(args: string[]) {
values["with-hooks"] = args[++i];
} else if (arg === "--with-visual-diff") {
values["with-visual-diff"] = true;
} else if (arg === "--capture-conversation") {
values["capture-conversation"] = true;
} else if (!arg.startsWith("-")) {
positionals.push(arg);
}
Expand Down Expand Up @@ -336,6 +338,7 @@ Options:
--dev-server-port Port for dev server (default: 4000, auto-increments for concurrent evals)
--with-hooks <name> Use eval hooks from scripts/eval-hooks/<name>-pre.sh and <name>-post.sh
--with-visual-diff Enable visual regression testing with screenshot comparison
--capture-conversation [Claude Code only] Capture full conversation with tool calls to JSONL

Examples:
# Run all evals with LLMs
Expand Down Expand Up @@ -1506,6 +1509,7 @@ async function main() {
: undefined,
hooks,
visualDiff: values["with-visual-diff"] || false,
captureConversation: values["capture-conversation"] || false,
};

if (values.all) {
Expand Down
69 changes: 60 additions & 9 deletions lib/claude-code-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ export interface ClaudeCodeEvalOptions {
visualDiff?: boolean;
outputFormat?: string;
outputFile?: string;
captureConversation?: boolean;
}

export class ClaudeCodeRunner {
Expand All @@ -58,6 +59,7 @@ export class ClaudeCodeRunner {
private devServer?: { enabled: boolean; command?: string; port?: number };
private hooks?: { preEval?: string; postEval?: string };
private visualDiff: boolean;
private captureConversation: boolean;

constructor(options: ClaudeCodeEvalOptions = {}) {
this.verbose = options.verbose || false;
Expand All @@ -66,6 +68,7 @@ export class ClaudeCodeRunner {
this.devServer = options.devServer;
this.hooks = options.hooks;
this.visualDiff = options.visualDiff || false;
this.captureConversation = options.captureConversation || false;
}

async runClaudeCodeEval(
Expand Down Expand Up @@ -239,11 +242,14 @@ IMPORTANT: Do not run npm, pnpm, yarn, or any package manager commands. Dependen
// Additional flags to ensure it works well in automation:
// --dangerously-skip-permissions: bypass file/execution permission prompts
// --print: non-interactive mode that prints response and exits
const args = [
'--print',
'--dangerously-skip-permissions',
enhancedPrompt
];
const args = ['--print', '--dangerously-skip-permissions'];

// Add conversation capture flags if enabled
if (this.captureConversation) {
args.push('--verbose', '--output-format', 'stream-json');
}

args.push(enhancedPrompt);

if (this.verbose) {
console.log('🚀 Spawning claude process with:');
Expand Down Expand Up @@ -297,7 +303,7 @@ IMPORTANT: Do not run npm, pnpm, yarn, or any package manager commands. Dependen
});
}, timeout);

claudeProcess.on('exit', (code, signal) => {
claudeProcess.on('exit', async (code, signal) => {
clearTimeout(timeoutId);
this.processes.delete(processId);

Expand All @@ -306,21 +312,66 @@ IMPORTANT: Do not run npm, pnpm, yarn, or any package manager commands. Dependen
console.log(`Claude Code finished with code: ${code}, signal: ${signal}`);
}

// Parse JSONL to extract human-readable summary and save files (if conversation capture is enabled)
let finalResult = '';
if (this.captureConversation) {
try {
// Save full JSONL conversation (with tool calls)
const jsonlFile = path.join(projectDir, 'claude-conversation.jsonl');
await fs.writeFile(jsonlFile, stdout);

// Parse JSONL to extract human-readable summary
const lines = stdout.trim().split('\n');
const messages: any[] = [];

for (const line of lines) {
try {
const msg = JSON.parse(line);
messages.push(msg);
if (msg.type === 'result') {
finalResult = msg.result || '';
}
} catch (e) {
// Skip invalid JSON lines
}
}

// Save human-readable summary
const summaryFile = path.join(projectDir, 'claude-output.txt');
const summary = `=== Claude Code Output ===\n` +
`Exit Code: ${code}\n` +
`Signal: ${signal}\n\n` +
`=== FINAL RESULT ===\n${finalResult}\n\n` +
`=== STDERR ===\n${stderr}\n\n` +
`Full conversation with tool calls saved to: claude-conversation.jsonl\n`;
await fs.writeFile(summaryFile, summary);

if (this.verbose) {
console.log(`📝 Conversation saved to ${jsonlFile}`);
console.log(`📝 Summary saved to ${summaryFile}`);
}
} catch (error) {
if (this.verbose) {
console.error(`Failed to save output files: ${error}`);
}
}
}

if (signal) {
resolve({
success: false,
output: stdout,
output: this.captureConversation && finalResult ? finalResult : stdout,
error: `Claude Code process killed by signal ${signal}`
});
} else if (code === 0) {
resolve({
success: true,
output: stdout
output: this.captureConversation && finalResult ? finalResult : stdout
});
} else {
resolve({
success: false,
output: stdout,
output: this.captureConversation && finalResult ? finalResult : stdout,
error: stderr || `Claude Code process exited with code ${code}`
});
}
Expand Down