diff --git a/blackbox-cli.ts b/blackbox-cli.ts
new file mode 100644
index 0000000..74250f0
--- /dev/null
+++ b/blackbox-cli.ts
@@ -0,0 +1,391 @@
+#!/usr/bin/env bun
+
+/**
+ * CLI for running evals through the Blackbox agent.
+ *
+ * Runs one eval (--eval or a positional argument) or every eval under
+ * ./evals (--all), prints a result table, and writes the results as JSON.
+ */
+
+import fs from "fs/promises";
+import path from "path";
+import { parseArgs } from "util";
+import { runBlackboxEval, type BlackboxResult } from "./lib/blackbox-runner";
+
+/** Model used when neither --model nor OPENAI_MODEL is provided. */
+const DEFAULT_MODEL = "anthropic/claude-sonnet-4.5";
+
+/** Per-eval agent timeout used when --timeout is not provided (10 minutes). */
+const DEFAULT_TIMEOUT_MS = 600_000;
+
+const { values, positionals } = parseArgs({
+  args: process.argv.slice(2),
+  options: {
+    help: { type: "boolean", short: "h" },
+    eval: { type: "string", short: "e" },
+    all: { type: "boolean", short: "a" },
+    verbose: { type: "boolean", short: "v" },
+    debug: { type: "boolean" },
+    dry: { type: "boolean", short: "d" },
+    timeout: { type: "string", short: "t" },
+    "api-key": { type: "string" },
+    model: { type: "string", short: "m" },
+    "output-file": { type: "string" },
+  },
+  allowPositionals: true,
+});
+
+/** Prints usage information to stdout. */
+function showHelp(): void {
+  console.log(`
+Blackbox Evals CLI
+
+Usage:
+  blackbox-cli.ts [options] [eval-path]
+
+  Note:
+  - Make sure to set OPENAI_API_KEY or --api-key option.
+  - Make sure to set OPENAI_BASE_URL to the base url of the API you want to use. Defaults to openrouter Base URL.
+  - Make sure to set OPENAI_MODEL to the model you want to use. Defaults to ${DEFAULT_MODEL}.
+
+Options:
+  -h, --help                Show this help message
+  -e, --eval <path>         Run a specific eval by path
+  -a, --all                 Run all evals with Blackbox
+  -v, --verbose             Show detailed logs during eval execution
+      --debug               Persist output folders for debugging (don't clean up)
+  -d, --dry                 Alias for --debug (preserve output folder)
+  -t, --timeout <ms>        Timeout in milliseconds (default: 600000 = 10 minutes)
+      --api-key <key>       API key (or use OPENAI_API_KEY env var)
+  -m, --model <model>       Model to use (passed to blackbox as -m <model>)
+      --output-file <path>  Custom path for results file (default: results/blackbox-*.json)
+
+Results are automatically written to the results/ directory.
+
+Examples:
+  # Run a specific eval (results auto-saved to results/blackbox-*.json)
+  bun blackbox-cli.ts --eval 001-server-component
+
+  # Run eval by positional argument
+  bun blackbox-cli.ts 001-server-component
+
+  # Run with verbose output and custom timeout
+  bun blackbox-cli.ts --eval 001-server-component --verbose --timeout 600000
+
+  # Run with specific model
+  bun blackbox-cli.ts --eval 001-server-component --model blackbox-xyz
+
+  # Run all evals (results auto-saved to results/blackbox-all-*.json)
+  bun blackbox-cli.ts --all
+
+  # Debug mode - keep output folders for inspection
+  bun blackbox-cli.ts --eval 001-server-component --debug
+
+  # Write results to custom location
+  bun blackbox-cli.ts --eval 001-server-component --output-file my-results.json
+`);
+}
+
+/**
+ * Lists runnable evals: numerically-prefixed directories under ./evals that
+ * contain both an input/ directory and a prompt.md file.
+ *
+ * @returns Eval directory names in ascending lexical order.
+ */
+async function getAllEvals(): Promise<string[]> {
+  const evalsDir = path.join(process.cwd(), "evals");
+  const entries = await fs.readdir(evalsDir, { withFileTypes: true });
+
+  const evals: string[] = [];
+
+  for (const entry of entries) {
+    if (!entry.isDirectory() || !/^\d+/.test(entry.name)) continue;
+
+    const evalPath = path.join(evalsDir, entry.name);
+    const [hasInput, hasPrompt] = await Promise.all([
+      fs
+        .stat(path.join(evalPath, "input"))
+        .then((s) => s.isDirectory())
+        .catch(() => false),
+      fs
+        .stat(path.join(evalPath, "prompt.md"))
+        .then((s) => s.isFile())
+        .catch(() => false),
+    ]);
+
+    if (hasInput && hasPrompt) {
+      evals.push(entry.name);
+    }
+  }
+
+  return evals.sort();
+}
+
+/** Formats a millisecond duration as "123ms" below one second, else "1.2s". */
+function formatDuration(ms: number): string {
+  if (ms < 1000) return `${Math.round(ms)}ms`;
+  return `${(ms / 1000).toFixed(1)}s`;
+}
+
+/** An eval passes only when the agent run, build, lint and tests all succeeded. */
+function isPassing(result: BlackboxResult): boolean {
+  return Boolean(result.success && result.buildSuccess && result.lintSuccess && result.testSuccess);
+}
+
+/** Builds the header and separator lines of a results table. */
+function formatTableHeader(evalColWidth: number): [string, string] {
+  return [
+    `| ${"Eval".padEnd(evalColWidth)} | Result     | Build | Lint  | Tests | Duration |`,
+    `|${"-".repeat(evalColWidth + 2)}|------------|-------|-------|-------|----------|`,
+  ];
+}
+
+/** Renders one table row; the column widths match formatTableHeader. */
+function formatTableRow(evalPath: string, result: BlackboxResult, evalColWidth: number): string {
+  const mark = (ok: boolean | undefined): string => (ok ? "✅" : "❌");
+  const cells = [
+    evalPath.padEnd(evalColWidth),
+    (isPassing(result) ? "✅ PASS" : "❌ FAIL").padEnd(10),
+    `${mark(result.buildSuccess)}   `,
+    `${mark(result.lintSuccess)}   `,
+    `${mark(result.testSuccess)}   `,
+    formatDuration(result.duration).padEnd(8),
+  ];
+  return `| ${cells.join(" | ")} |`;
+}
+
+/**
+ * Prints the result table for a single eval, followed by the tail of any
+ * failing step's output.
+ *
+ * @param evalPath - Eval name shown in the first column.
+ * @param result - Result returned by runBlackboxEval.
+ */
+function displayResult(evalPath: string, result: BlackboxResult): void {
+  console.log("\n📊 Blackbox Results:");
+  console.log("═".repeat(80));
+
+  const evalColWidth = Math.max(25, evalPath.length);
+  const [header, separator] = formatTableHeader(evalColWidth);
+  console.log(header);
+  console.log(separator);
+  console.log(formatTableRow(evalPath, result, evalColWidth));
+
+  console.log("═".repeat(80));
+
+  // isPassing includes result.success, so an agent failure is never shown as PASS.
+  if (isPassing(result)) return;
+
+  console.log("\n❌ Error Details:");
+  console.log("─".repeat(80));
+
+  if (result.error) {
+    console.log(`Blackbox Error: ${result.error}`);
+  }
+
+  if (!result.buildSuccess && result.buildOutput) {
+    console.log(`Build Error:\n${result.buildOutput.slice(-1000)}`);
+  }
+
+  if (!result.lintSuccess && result.lintOutput) {
+    console.log(`Lint Error:\n${result.lintOutput.slice(-1000)}`);
+  }
+
+  if (!result.testSuccess && result.testOutput) {
+    console.log(`Test Error:\n${result.testOutput.slice(-1000)}`);
+  }
+
+  console.log("═".repeat(80));
+}
+
+/**
+ * Prints the summary table for an --all run, the pass count, and the tail of
+ * each failing eval's error output.
+ *
+ * @param results - One entry per eval, in execution order.
+ */
+function displayResultsTable(results: { evalPath: string; result: BlackboxResult }[]): void {
+  const totalTests = results.length;
+  console.log(`\n📊 Blackbox Results Summary (${totalTests} Tests):`);
+  console.log("═".repeat(120));
+
+  // Widen the first column to the longest eval name so rows stay aligned.
+  const evalColWidth = Math.max(25, ...results.map(({ evalPath }) => evalPath.length));
+  const [header, separator] = formatTableHeader(evalColWidth);
+  console.log(header);
+  console.log(separator);
+
+  for (const { evalPath, result } of results) {
+    console.log(formatTableRow(evalPath, result, evalColWidth));
+  }
+
+  console.log("═".repeat(120));
+
+  const failed = results.filter(({ result }) => !isPassing(result));
+  console.log(`\n📈 Summary: ${totalTests - failed.length}/${totalTests} evals passed`);
+
+  if (failed.length === 0) return;
+
+  console.log("\n❌ Error Summaries:");
+  console.log("─".repeat(120));
+
+  for (const { evalPath, result } of failed) {
+    console.log(`\n${evalPath}:`);
+
+    if (result.error) {
+      console.log(`  Blackbox: ${result.error}`);
+    }
+
+    if (!result.buildSuccess && result.buildOutput) {
+      console.log(`  Build: ${result.buildOutput.slice(-500)}`);
+    }
+
+    if (!result.lintSuccess && result.lintOutput) {
+      console.log(`  Lint: ${result.lintOutput.slice(-500)}`);
+    }
+
+    if (!result.testSuccess && result.testOutput) {
+      console.log(`  Tests: ${result.testOutput.slice(-500)}`);
+    }
+  }
+}
+
+/**
+ * Parses the --timeout option.
+ *
+ * @param raw - Raw option value, or undefined when the flag was not given.
+ * @returns Timeout in milliseconds (DEFAULT_TIMEOUT_MS when not given).
+ * @throws Error when the value is not a positive integer; a NaN timeout would
+ *   otherwise make the runner's timer fire immediately.
+ */
+function parseTimeout(raw: string | undefined): number {
+  if (raw === undefined) return DEFAULT_TIMEOUT_MS;
+
+  const parsed = Number(raw);
+  if (!Number.isInteger(parsed) || parsed <= 0) {
+    throw new Error(`Invalid --timeout "${raw}": expected a positive integer number of milliseconds`);
+  }
+  return parsed;
+}
+
+/** Parses options, runs the requested eval(s), and reports the results. */
+async function main(): Promise<void> {
+  if (values.help) {
+    showHelp();
+    return;
+  }
+
+  const apiKey = values["api-key"] || process.env.OPENAI_API_KEY;
+  if (!apiKey) {
+    console.error("❌ Error: API key is required.");
+    console.error(
+      "Set OPENAI_API_KEY environment variable or use --api-key option. By default the base url is set to openrouter Base URL. Set OPENAI_BASE_URL to change.",
+    );
+    process.exit(1);
+  }
+
+  let timeout: number;
+  try {
+    timeout = parseTimeout(values.timeout);
+  } catch (error) {
+    console.error(`❌ Error: ${error instanceof Error ? error.message : String(error)}`);
+    process.exit(1);
+  }
+
+  // Resolve the model once so the logs, the runner options and the child
+  // process environment all agree on it.
+  let model = values.model || process.env.OPENAI_MODEL;
+  if (!model) {
+    model = DEFAULT_MODEL;
+    console.log(
+      "⚠️ OPENAI_MODEL environment variable or --model argument is not set, defaulting the model to `" +
+        DEFAULT_MODEL +
+        "`",
+    );
+  }
+  process.env.OPENAI_MODEL = model;
+
+  const evalOptions = {
+    verbose: values.verbose || false,
+    debug: values.debug || values.dry || false,
+    dry: values.dry || false,
+    timeout,
+    apiKey,
+    model,
+    outputFile: values["output-file"],
+  };
+
+  if (values.all) {
+    const allEvals = await getAllEvals();
+    console.log(`Running ${allEvals.length} evals with Blackbox... (model: ${model})\n`);
+
+    const results: { evalPath: string; result: BlackboxResult }[] = [];
+    const individualEvalOptions = { ...evalOptions, skipFileWrite: true };
+
+    for (const evalPath of allEvals) {
+      try {
+        console.log(`🚀 Running ${evalPath}...`);
+        const result = await runBlackboxEval(evalPath, individualEvalOptions);
+        results.push({ evalPath, result });
+
+        const status = isPassing(result) ? "✅ PASS" : "❌ FAIL";
+        console.log(`${status} ${evalPath} (${formatDuration(result.duration)})`);
+      } catch (error) {
+        const errorResult: BlackboxResult = {
+          success: false,
+          output: "",
+          error: error instanceof Error ? error.message : String(error),
+          duration: 0,
+        };
+        results.push({ evalPath, result: errorResult });
+        console.log(`❌ FAIL ${evalPath} - ${errorResult.error}`);
+      }
+    }
+
+    displayResultsTable(results);
+
+    const allResultsFile =
+      evalOptions.outputFile || path.join(process.cwd(), "results", `blackbox-all-${Date.now()}.json`);
+
+    try {
+      // Create the parent directory for custom --output-file paths as well.
+      await fs.mkdir(path.dirname(allResultsFile), { recursive: true });
+      await fs.writeFile(allResultsFile, JSON.stringify(results, null, 2), "utf-8");
+      console.log(`\n📝 All results written to: ${allResultsFile}`);
+    } catch (error) {
+      console.error(
+        `⚠️ Failed to write results to file: ${error instanceof Error ? error.message : String(error)}`,
+      );
+    }
+
+    return;
+  }
+
+  const evalPath = values.eval || positionals[0];
+  if (!evalPath) {
+    console.error("❌ Error: No eval specified. Use --eval <path>, provide a positional argument, or use --all");
+    console.log("\nAvailable evals:");
+    const allEvals = await getAllEvals();
+    allEvals.forEach((evalName) => console.log(`  ${evalName}`));
+    process.exit(1);
+  }
+
+  console.log(`🚀 Running Blackbox eval: ${evalPath} (model: ${model})`);
+
+  try {
+    const result = await runBlackboxEval(evalPath, evalOptions);
+    displayResult(evalPath, result);
+    process.exit(isPassing(result) ? 0 : 1);
+  } catch (error) {
+    console.error(`❌ Error: ${error instanceof Error ? error.message : String(error)}`);
+    process.exit(1);
+  }
+}
+
+// @ts-ignore -- import.meta.main is a Bun extension missing from the standard ImportMeta type
+if (import.meta.main) {
+  main().catch((error) => {
+    console.error("Unexpected error:", error);
+    process.exit(1);
+  });
+}
diff --git a/lib/blackbox-runner.ts b/lib/blackbox-runner.ts
new file mode 100644
index 0000000..26fc354
--- /dev/null
+++ b/lib/blackbox-runner.ts
@@ -0,0 +1,582 @@
+import fs from "fs/promises";
+import path from "path";
+import { exec, spawn, ChildProcess } from "child_process";
+import { performance } from "perf_hooks";
+import { copyFolder, ensureSharedDependencies } from "./eval-runner";
+
+/** Default overall limit for one agent run (10 minutes). */
+const DEFAULT_TIMEOUT_MS = 600_000;
+/** The agent is terminated after this long without any stdout/stderr output. */
+const IDLE_TIMEOUT_MS = 90_000;
+/** Interval between "still running" progress messages. */
+const HEARTBEAT_INTERVAL_MS = 5_000;
+/** Grace period between SIGTERM and SIGKILL. */
+const KILL_GRACE_MS = 5_000;
+/** Base URL used when OPENAI_BASE_URL is not set. */
+const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1";
+
+/** Names matched by copyTestFilesBack when restoring files before evaluation. */
+const TEST_FILE_PATTERN = /\.(test|spec)\.(tsx?|jsx?)$/;
+const TEST_DIR_NAMES = new Set(["__tests__", "test", "tests"]);
+const ESLINT_CONFIG_NAMES = new Set([
+  ".eslintrc.json",
+  ".eslintrc.js",
+  ".eslintrc.cjs",
+  ".eslintrc.yml",
+  ".eslintrc.yaml",
+  "eslint.config.js",
+  "eslint.config.mjs",
+  "eslint.config.cjs",
+]);
+
+/** Outcome of one Blackbox eval run. */
+export interface BlackboxResult {
+  /** True when the Blackbox process exited with code 0 and no signal. */
+  success: boolean;
+  /** Captured stdout of the Blackbox process. */
+  output: string;
+  error?: string;
+  /** Wall-clock duration of the run in milliseconds. */
+  duration: number;
+  buildSuccess?: boolean;
+  lintSuccess?: boolean;
+  testSuccess?: boolean;
+  buildOutput?: string;
+  lintOutput?: string;
+  testOutput?: string;
+  evalPath?: string;
+  /** ISO-8601 time at which the result was recorded. */
+  timestamp?: string;
+}
+
+/** Options accepted by BlackboxRunner and runBlackboxEval. */
+export interface BlackboxEvalOptions {
+  /** Overall limit for the agent run, in milliseconds. */
+  timeout?: number;
+  verbose?: boolean;
+  /** Keep the output directory instead of deleting it after the run. */
+  debug?: boolean;
+  apiKey?: string;
+  model?: string;
+  /** Path of the JSON results file; a timestamped file under results/ by default. */
+  outputFile?: string;
+  /** Skip writing the per-eval results file. */
+  skipFileWrite?: boolean;
+  dry?: boolean;
+}
+
+type AgentOutcome = { success: boolean; output: string; error?: string };
+type StepOutcome = { success: boolean; output: string };
+
+/** Flattens a failed exec() into the text shown to the user. */
+function describeExecError(error: unknown): string {
+  if (error && typeof error === "object" && "stdout" in error) {
+    const { stdout, stderr, killed } = error as { stdout?: string; stderr?: string; killed?: boolean };
+    let text = stdout || "";
+    if (stderr) text += `\n${stderr}`;
+    // exec() kills the command when its timeout elapses; say so, since the
+    // captured output alone does not explain the failure.
+    if (killed) text += "\nCommand was killed (timeout or output limit exceeded)";
+    return text;
+  }
+  return error instanceof Error ? error.message : String(error);
+}
+
+/** Runs the Blackbox CLI against a copy of an eval project, then builds, lints and tests the result. */
+export class BlackboxRunner {
+  /** Agent processes that are still running, keyed by a random id. */
+  private processes = new Map<string, ChildProcess>();
+  private verbose: boolean;
+  private debug: boolean;
+  private apiKey?: string;
+  private model?: string;
+
+  constructor(options: BlackboxEvalOptions = {}) {
+    this.verbose = options.verbose || false;
+    this.debug = options.debug || false;
+    this.apiKey = options.apiKey || process.env.OPENAI_API_KEY;
+    this.model = options.model || process.env.OPENAI_MODEL;
+  }
+
+  /**
+   * Copies inputDir (without test files) to outputDir, lets Blackbox apply the
+   * prompt there, restores the test files, and runs build, lint and tests.
+   *
+   * Failures are reported through the returned result rather than thrown.
+   * outputDir is removed afterwards unless the runner is in debug mode.
+   *
+   * @param inputDir - Pristine eval project.
+   * @param outputDir - Working copy that the agent modifies.
+   * @param prompt - Task description sent to the agent on stdin.
+   * @param timeout - Overall limit for the agent run, in milliseconds.
+   */
+  async runBlackboxEval(
+    inputDir: string,
+    outputDir: string,
+    prompt: string,
+    timeout: number = DEFAULT_TIMEOUT_MS,
+  ): Promise<BlackboxResult> {
+    const startTime = performance.now();
+
+    try {
+      await fs.mkdir(outputDir, { recursive: true });
+      await copyFolder(inputDir, outputDir, true); // Exclude test files so blackbox doesn't see them
+
+      await ensureSharedDependencies(this.verbose);
+
+      if (this.verbose) {
+        console.log(`🤖 Running Blackbox on ${outputDir}...`);
+        console.log(`📝 Prompt: ${prompt}`);
+        console.log("─".repeat(80));
+      }
+
+      const blackboxResult = await this.executeBlackbox(outputDir, prompt, timeout);
+
+      if (!blackboxResult.success) {
+        return {
+          success: false,
+          output: blackboxResult.output,
+          error: blackboxResult.error,
+          duration: performance.now() - startTime,
+        };
+      }
+
+      if (this.verbose) {
+        console.log("📋 Copying test files and eslint config back for evaluation...");
+      }
+      await this.copyTestFilesBack(inputDir, outputDir);
+
+      const evalResults = await this.runEvaluation(outputDir);
+
+      return {
+        success: true,
+        output: blackboxResult.output,
+        duration: performance.now() - startTime,
+        ...evalResults,
+      };
+    } catch (error) {
+      return {
+        success: false,
+        output: "",
+        error: error instanceof Error ? error.message : String(error),
+        duration: performance.now() - startTime,
+      };
+    } finally {
+      if (!this.debug) {
+        // Best-effort cleanup; force: true already tolerates a missing directory.
+        await fs.rm(outputDir, { recursive: true, force: true }).catch(() => undefined);
+      }
+    }
+  }
+
+  /**
+   * Spawns the blackbox CLI in projectDir, sends it the prompt on stdin and
+   * waits for it to exit.
+   *
+   * The process is terminated when it produces no output for IDLE_TIMEOUT_MS
+   * or when the overall timeout elapses. Never rejects: every failure is
+   * reported through the resolved value.
+   */
+  private executeBlackbox(projectDir: string, prompt: string, timeout: number): Promise<AgentOutcome> {
+    return new Promise((resolve) => {
+      const processId = Math.random().toString(36).slice(2, 11);
+      const startTime = Date.now();
+
+      const enhancedPrompt = `${prompt}
+
+IMPORTANT: Do not run any pnpm, npm, or yarn commands (like pnpm dev, npm run dev, pnpm install, etc.). Do not start any development servers. Just make the necessary code changes to the files and exit when done. Do not ask any followup questions either.`;
+
+      const env = { ...process.env };
+      if (this.apiKey) {
+        env.OPENAI_API_KEY = this.apiKey;
+      }
+
+      const args = ["-y", "--include-directories", projectDir];
+      if (this.model) {
+        args.push("-m", this.model);
+      }
+
+      if (this.verbose) {
+        console.log("🚀 Spawning blackbox process with:");
+        console.log("  Command: blackbox");
+        console.log("  Args:", args);
+        console.log("  Working Directory:", projectDir);
+        console.log("  API Key present:", !!this.apiKey);
+        if (this.model) {
+          console.log("  Model:", this.model);
+        }
+        console.log("  Prompt length:", enhancedPrompt.length, "chars");
+      }
+
+      const child = spawn("blackbox", args, {
+        cwd: projectDir,
+        env,
+        stdio: ["pipe", "pipe", "pipe"],
+      });
+      this.processes.set(processId, child);
+
+      // A failed spawn or an early exit can surface as EPIPE on stdin; without
+      // a listener that would crash the run as an unhandled 'error' event.
+      child.stdin?.on("error", () => undefined);
+      child.stdin?.end(enhancedPrompt);
+
+      let stdout = "";
+      let stderr = "";
+      let lastOutputTime = startTime;
+      let resolved = false;
+      let idleTimedOut = false;
+      let idleTimer: NodeJS.Timeout | undefined;
+      let killTimer: NodeJS.Timeout | undefined;
+
+      const hasExited = () => child.exitCode !== null || child.signalCode !== null;
+
+      const resolveOnce = (result: AgentOutcome) => {
+        if (resolved) return;
+        resolved = true;
+        clearTimeout(absoluteTimer);
+        clearTimeout(idleTimer);
+        clearInterval(heartbeat);
+        resolve(result);
+      };
+
+      // SIGTERM first; escalate to SIGKILL only if the process is still alive
+      // after the grace period. The timer is cleared by the 'exit' handler.
+      const terminate = () => {
+        child.kill("SIGTERM");
+        if (killTimer) return;
+        killTimer = setTimeout(() => {
+          if (!hasExited()) {
+            console.log("🛑 Process didn't respond to SIGTERM, using SIGKILL...");
+            child.kill("SIGKILL");
+          }
+        }, KILL_GRACE_MS);
+      };
+
+      const resetIdleTimer = () => {
+        clearTimeout(idleTimer);
+        if (resolved) return;
+        idleTimer = setTimeout(() => {
+          const sinceLastOutput = Date.now() - lastOutputTime;
+          console.log(`⏱️ Idle timeout reached (${(sinceLastOutput / 1000).toFixed(1)}s since last output)`);
+          console.log(`🛑 Forcefully terminating blackbox process ${child.pid}...`);
+          idleTimedOut = true;
+          terminate();
+        }, IDLE_TIMEOUT_MS);
+      };
+
+      const heartbeat = setInterval(() => {
+        const elapsed = Date.now() - startTime;
+        const sinceLastOutput = Date.now() - lastOutputTime;
+        console.log(
+          `⏳ Blackbox still running... (${(elapsed / 1000).toFixed(1)}s elapsed, ${(sinceLastOutput / 1000).toFixed(1)}s since last output)`,
+        );
+      }, HEARTBEAT_INTERVAL_MS);
+
+      const absoluteTimer = setTimeout(() => {
+        console.log(`⏱️ Absolute timeout reached (${timeout}ms)`);
+        terminate();
+        resolveOnce({
+          success: false,
+          output: stdout,
+          error: `Blackbox process timed out after ${timeout}ms`,
+        });
+      }, timeout);
+
+      resetIdleTimer();
+
+      const onOutput = (stream: "stdout" | "stderr") => (data: Buffer) => {
+        const output = data.toString();
+        lastOutputTime = Date.now();
+        resetIdleTimer();
+        process[stream].write(`[blackbox ${stream}] ${output}`);
+        if (this.verbose) {
+          console.log(`[DEBUG] ${stream} bytes: ${JSON.stringify(output)}`);
+        }
+        if (stream === "stdout") {
+          stdout += output;
+        } else {
+          stderr += output;
+        }
+      };
+      child.stdout?.on("data", onOutput("stdout"));
+      child.stderr?.on("data", onOutput("stderr"));
+
+      child.on("exit", (code, signal) => {
+        // The process is gone: stop the SIGKILL escalation and untrack it so
+        // cleanup() does not wait on an already-exited process.
+        clearTimeout(killTimer);
+        this.processes.delete(processId);
+
+        if (this.verbose) {
+          const elapsed = Date.now() - startTime;
+          console.log(
+            `✓ Blackbox process exited with code: ${code}, signal: ${signal} after ${(elapsed / 1000).toFixed(1)}s`,
+          );
+        }
+
+        let error: string | undefined;
+        if (signal) {
+          error = idleTimedOut
+            ? `Blackbox process killed by signal ${signal} after ${IDLE_TIMEOUT_MS}ms without output`
+            : `Blackbox process killed by signal ${signal}`;
+        } else if (code !== 0) {
+          error = stderr || `Blackbox process exited with code ${code}`;
+        }
+
+        resolveOnce({ success: code === 0 && !signal, output: stdout, error });
+      });
+
+      child.on("error", (error) => {
+        // A process that failed to spawn may never emit 'exit'; untrack it here.
+        if (child.pid === undefined) {
+          this.processes.delete(processId);
+        }
+        resolveOnce({ success: false, output: stdout, error: error.message });
+      });
+    });
+  }
+
+  /**
+   * Restores test files, test directories and ESLint configs from inputDir
+   * into outputDir, recursing into every other directory except node_modules.
+   */
+  private async copyTestFilesBack(inputDir: string, outputDir: string): Promise<void> {
+    const entries = await fs.readdir(inputDir, { withFileTypes: true });
+
+    for (const entry of entries) {
+      if (entry.name === "node_modules") {
+        continue;
+      }
+
+      const srcPath = path.join(inputDir, entry.name);
+      const destPath = path.join(outputDir, entry.name);
+
+      try {
+        if (TEST_FILE_PATTERN.test(entry.name) || ESLINT_CONFIG_NAMES.has(entry.name)) {
+          // Recreate the directory if it is missing from the output so the
+          // file is restored instead of being silently skipped.
+          await fs.mkdir(outputDir, { recursive: true });
+          await fs.copyFile(srcPath, destPath);
+        } else if (entry.isDirectory() && TEST_DIR_NAMES.has(entry.name)) {
+          await fs.cp(srcPath, destPath, { recursive: true, force: true });
+        } else if (entry.isDirectory()) {
+          await this.copyTestFilesBack(srcPath, destPath);
+        }
+      } catch (error) {
+        // Best effort: one failing entry must not abort the evaluation.
+        if (this.verbose) {
+          console.log(
+            `⚠️ Could not restore ${srcPath}: ${error instanceof Error ? error.message : String(error)}`,
+          );
+        }
+      }
+    }
+  }
+
+  /**
+   * Runs next build, next lint and vitest in projectDir. Every step runs even
+   * when an earlier one failed.
+   */
+  private async runEvaluation(projectDir: string): Promise<{
+    buildSuccess: boolean;
+    lintSuccess: boolean;
+    testSuccess: boolean;
+    buildOutput: string;
+    lintOutput: string;
+    testOutput: string;
+  }> {
+    const build = await this.runStep("Build", projectDir, "../../node_modules/.bin/next build", 60000);
+    const lint = await this.runStep("Lint", projectDir, "../../node_modules/.bin/next lint", 30000, () =>
+      this.ensureEslintConfig(projectDir),
+    );
+    const tests = await this.runStep("Tests", projectDir, "../../node_modules/.bin/vitest run", 30000);
+
+    return {
+      buildSuccess: build.success,
+      buildOutput: build.output,
+      lintSuccess: lint.success,
+      lintOutput: lint.output,
+      testSuccess: tests.success,
+      testOutput: tests.output,
+    };
+  }
+
+  /**
+   * Runs one evaluation command and captures its outcome instead of throwing.
+   *
+   * @param label - Step name used in verbose logs ("Build", "Lint", "Tests").
+   * @param prepare - Optional setup; a failure in it fails the step.
+   */
+  private async runStep(
+    label: string,
+    projectDir: string,
+    command: string,
+    timeout: number,
+    prepare?: () => Promise<void>,
+  ): Promise<StepOutcome> {
+    if (this.verbose) {
+      console.log(`Running ${label.toLowerCase()}...`);
+    }
+
+    try {
+      await prepare?.();
+      const output = await this.execCommand(command, projectDir, timeout);
+      if (this.verbose) {
+        console.log(`✅ ${label} completed`);
+      }
+      return { success: true, output };
+    } catch (error) {
+      if (this.verbose) {
+        console.log(`❌ ${label} failed`);
+      }
+      return { success: false, output: describeExecError(error) };
+    }
+  }
+
+  /** Writes a minimal .eslintrc.json into projectDir when none exists. */
+  private async ensureEslintConfig(projectDir: string): Promise<void> {
+    const eslintConfigPath = path.join(projectDir, ".eslintrc.json");
+    const exists = await fs
+      .stat(eslintConfigPath)
+      .then(() => true)
+      .catch(() => false);
+
+    if (!exists) {
+      await fs.writeFile(eslintConfigPath, JSON.stringify({ extends: "next/core-web-vitals" }, null, 2));
+    }
+  }
+
+  /**
+   * Runs a shell command in cwd and resolves with its stdout.
+   *
+   * On failure the rejection is exec()'s error with stdout and stderr attached.
+   * Using the cwd option instead of a "cd ... &&" prefix keeps directory names
+   * containing quotes or shell metacharacters out of the command line.
+   */
+  private execCommand(command: string, cwd: string, timeout: number): Promise<string> {
+    return new Promise((resolve, reject) => {
+      exec(command, { cwd, maxBuffer: 10 * 1024 * 1024, timeout }, (error, stdout, stderr) => {
+        if (error) {
+          reject(Object.assign(error, { stdout, stderr }));
+        } else {
+          resolve(stdout);
+        }
+      });
+    });
+  }
+
+  /**
+   * Terminates every agent process that is still running: SIGTERM first, then
+   * SIGKILL after KILL_GRACE_MS. Resolves once all of them are gone.
+   */
+  async cleanup(): Promise<void> {
+    const pending = Array.from(this.processes.entries()).map(
+      ([processId, child]) =>
+        new Promise<void>((resolve) => {
+          const finish = () => {
+            this.processes.delete(processId);
+            resolve();
+          };
+
+          // An already-exited process never emits 'exit' again; waiting for it
+          // would stall every run for the full grace period.
+          if (child.exitCode !== null || child.signalCode !== null) {
+            finish();
+            return;
+          }
+
+          const killTimer = setTimeout(() => {
+            child.kill("SIGKILL");
+            finish();
+          }, KILL_GRACE_MS);
+
+          child.once("exit", () => {
+            clearTimeout(killTimer);
+            finish();
+          });
+          child.kill("SIGTERM");
+        }),
+    );
+    await Promise.all(pending);
+  }
+}
+
+/**
+ * Runs a single eval from ./evals/<evalPath> through Blackbox and, unless
+ * options.skipFileWrite is set, writes the result as JSON.
+ *
+ * @param evalPath - Eval directory name relative to ./evals.
+ * @param options - Runner options; see BlackboxEvalOptions.
+ * @returns The run result enriched with evalPath and an ISO timestamp.
+ * @throws Error when the eval directory, its input/ directory or prompt.md is missing.
+ */
+export async function runBlackboxEval(
+  evalPath: string,
+  options: BlackboxEvalOptions = {},
+): Promise<BlackboxResult> {
+  const evalsDir = path.join(process.cwd(), "evals");
+  const fullEvalPath = path.join(evalsDir, evalPath);
+
+  const evalStat = await fs.stat(fullEvalPath).catch(() => null);
+  if (!evalStat || !evalStat.isDirectory()) {
+    throw new Error(`Eval directory not found: ${evalPath}`);
+  }
+
+  const inputDir = path.join(fullEvalPath, "input");
+  const inputExists = await fs
+    .stat(inputDir)
+    .then((s) => s.isDirectory())
+    .catch(() => false);
+  if (!inputExists) {
+    throw new Error(`No input directory found in ${evalPath}`);
+  }
+
+  const promptFile = path.join(fullEvalPath, "prompt.md");
+  const promptExists = await fs
+    .stat(promptFile)
+    .then((s) => s.isFile())
+    .catch(() => false);
+  if (!promptExists) {
+    throw new Error(`No prompt.md file found in ${evalPath}`);
+  }
+
+  const prompt = await fs.readFile(promptFile, "utf8");
+  const outputDir = path.join(fullEvalPath, "output-blackbox");
+
+  if (!process.env.OPENAI_BASE_URL) {
+    process.env.OPENAI_BASE_URL = DEFAULT_BASE_URL;
+    console.warn(
+      "⚠️ OPENAI_BASE_URL environment variable is not set. By default the base url is set to openrouter Base URL. Set OPENAI_BASE_URL to change.",
+    );
+  }
+
+  const runner = new BlackboxRunner(options);
+
+  try {
+    const result = await runner.runBlackboxEval(inputDir, outputDir, prompt, options.timeout);
+    const timestamp = new Date().toISOString();
+    const enrichedResult: BlackboxResult = { ...result, evalPath, timestamp };
+
+    if (!options.skipFileWrite) {
+      // Both path separators are replaced so nested eval paths map to one file name.
+      const sanitizedEvalPath = evalPath.replace(/[\\/]/g, "-");
+      const outputFile =
+        options.outputFile ||
+        path.join(process.cwd(), "results", `blackbox-${sanitizedEvalPath}-${Date.now()}.json`);
+
+      try {
+        await fs.mkdir(path.dirname(outputFile), { recursive: true });
+        await fs.writeFile(outputFile, JSON.stringify(enrichedResult, null, 2), "utf-8");
+        console.log(`📝 Results written to: ${outputFile}`);
+      } catch (error) {
+        console.error(
+          `⚠️ Failed to write results to file: ${error instanceof Error ? error.message : String(error)}`,
+        );
+      }
+    }
+
+    return enrichedResult;
+  } finally {
+    await runner.cleanup();
+  }
+}