From 3b3bf15d7f4845ffe42bb8c715a5265c587f6a46 Mon Sep 17 00:00:00 2001
From: Shibo
Date: Thu, 6 Feb 2025 18:16:53 +0200
Subject: [PATCH] updated readme

---
 README.md                | 85 ++++++++++++++++++++++----
 scripts/run-benchmark.ts | 127 ---------------------------------------
 2 files changed, 72 insertions(+), 140 deletions(-)
 delete mode 100644 scripts/run-benchmark.ts

diff --git a/README.md b/README.md
index 614375b..bb80b46 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,79 @@
-# Prompt Benchmarks
+# tscircuit Prompt Benchmarks
 
-[Docs](https://docs.tscircuit.com) · [Website](https://tscircuit.com) · [Twitter](https://x.com/tscircuit) · [discord](https://tscircuit.com/community/join-redirect) · [Quickstart](https://docs.tscircuit.com/quickstart) · [Online Playground](https://tscircuit.com/playground)
+[Docs](https://docs.tscircuit.com) · [Website](https://tscircuit.com) · [Twitter](https://x.com/tscircuit) · [Discord](https://tscircuit.com/community/join-redirect) · [Quickstart](https://docs.tscircuit.com/quickstart) · [Online Playground](https://tscircuit.com/playground)
+
+This repository contains benchmarks for evaluating and improving the quality of system prompts used to generate tscircuit code. It includes components for:
+
+- **Code Runner** (in `lib/code-runner`): Safely transpiles, evaluates, and renders TSX code for circuit generation.
+- **AI Integration** (in `lib/ai`): Interfaces with Anthropic’s Claude models for prompt completions and error correction.
+- **Utility Modules** (in `lib/utils`): Provide logging, snapshot management, and type-checking of generated circuits.
+- **Prompt Templates** (in `lib/prompt-templates`): Define various prompt structures for generating different circuit types.
+- **Benchmarking & Scoring** (using evalite and custom scorers in `benchmarks/scorers`): Run multiple tests to ensure circuit validity and quality.
 
-This repo contains benchmarks for tscircuit system prompts used for
-automatically generating tscircuit code.
 
 ## Running Benchmarks
 
-You can use `bun run benchmark` to select and run a benchmark. A single prompt takes about 10s-15s to
-run when run with `sonnet`. We have a set of samples (see the [tests/samples](./tests/samples) directory)
-that the benchmarks run against. When you change a prompt, you must run the benchmark
-for that prompt to update the benchmark snapshot. This is how we record degradation
-or improvement in the response quality. Each sample is run 5 times and two tests
-are run:
+To run the benchmarks using evalite, use:
+```bash
+bun start
+```
+Each prompt is processed multiple times to test:
+1. Whether the output compiles without errors.
+2. Whether the output meets the expected circuit specifications.
+
+After modifying prompts or system components, evalite reruns the benchmarks automatically; skip any benchmarks you don't want to run.
+
+### Problem Sets
+
+This project uses TOML files to define problem sets for circuit generation. Each problem is defined using a TOML array of tables with the following format:
+
+```toml
+[[problems]]
+prompt = """
+Your circuit prompt description goes here.
+"""
+title = "Sample Problem Title"
+questions = [
+  { text = "Question text", answer = true },
+  { text = "Another question text", answer = false }
+]
+```
+
+In each problem:
+- The `prompt` field must contain the circuit description that instructs the AI.
+- The `title` gives a short title for the problem.
+- The `questions` array contains objects with a `text` property (the question) and an `answer` property (a boolean) used to validate the generated circuit.
+
+To add a new problem set, create a new TOML file in the `problem-sets` directory following this format. Each new file can contain one or more problems defined with the `[[problems]]` header.
+
+## Build, Test, and Start
+
+- **Build**: `bun run build`
+- **Test**: `bun run test`
+- **Start**: `bun start`
+
+## Benchmarks Directory
+
+The `benchmarks` directory contains the files used to evaluate and score circuit-generating prompts:
+
+- `benchmarks/prompt-logs/`
+  Text files (e.g., `prompt-2025-02-05T14-07-18-242Z.txt`, `prompt-2025-02-05T14-10-53-144Z.txt`) that log each prompt attempt and its output. They serve as a history of interactions.
+
+- `benchmarks/benchmark-local-circuit-error-correction.eval.ts`
+  Runs local circuit evaluation with an error-correction workflow. It repeatedly calls the AI (up to a set maximum number of attempts) until the circuit output meets expectations, logging each attempt (a simplified sketch of this loop appears below).
+
+- `benchmarks/benchmark-local-circuit.eval.ts`
+  Evaluates a local circuit by running a specific user prompt and checking that the generated circuit compiles and meets expected behaviors.
+
+- `benchmarks/benchmark-local-circuit-random.eval.ts`
+  Generates random prompts using an AI-powered prompt generator and evaluates the resulting circuits. Useful for stress-testing and assessing the robustness of circuit generation.
+
+- `benchmarks/scorers/ai-circuit-scorer.ts`
+  Uses an AI model to assign a score (from 0 to 1) based on correctness, appropriate use of components, circuit complexity, and code quality.
+
+- `benchmarks/scorers/circuit-scorer.ts`
+  A basic scorer that checks each generated circuit against the predefined questions and answers from the problem sets (see the scorer sketch below).
 
-1. Does the output from the prompt compile?
-2. Does the output produce the expected circuit?
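+For illustration, the question-based scoring can be thought of as the following minimal sketch (the type and helper names here are illustrative, not the repo's exact exports): a yes/no helper, such as an `askAboutOutput`-style function, is asked each question about the generated code, and the score is the fraction of answers that match the expectations from the problem set.
+
+```ts
+// Illustrative sketch only; the types and helper signature are assumptions,
+// not the exact exports of this repository.
+interface Question {
+  text: string
+  answer: boolean
+}
+
+interface Problem {
+  prompt: string
+  title: string
+  questions: Question[]
+}
+
+// A yes/no oracle about the generated circuit code (e.g. an LLM-backed helper).
+type AskAboutOutput = (code: string, question: string) => Promise<boolean>
+
+// Returns a score between 0 and 1: the fraction of questions whose expected
+// answer matches the oracle's answer for the generated code.
+const scoreCircuit = async (
+  code: string,
+  problem: Problem,
+  askAboutOutput: AskAboutOutput,
+): Promise<number> => {
+  const matches = await Promise.all(
+    problem.questions.map(
+      async (q) => (await askAboutOutput(code, q.text)) === q.answer,
+    ),
+  )
+  return matches.filter(Boolean).length / problem.questions.length
+}
+```
+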
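+The error-correction benchmark follows a retry pattern along these lines (again a simplified sketch with injected helpers rather than the actual implementation, which uses the repo's AI integration and code runner):
+
+```ts
+// Illustrative sketch only; the helper signatures below are assumptions.
+interface EvaluationResult {
+  success: boolean
+  error?: string
+}
+
+type GenerateCode = (prompt: string) => Promise<string>
+type EvaluateCode = (code: string) => EvaluationResult
+
+// Regenerate circuit code until it evaluates successfully or the retry
+// budget is exhausted, feeding each failure back into the next prompt.
+const generateWithErrorCorrection = async (
+  prompt: string,
+  generateCode: GenerateCode,
+  evaluateCode: EvaluateCode,
+  maxAttempts = 3,
+): Promise<{ code: string; attempts: number }> => {
+  let currentPrompt = prompt
+  let lastError = "unknown error"
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    const code = await generateCode(currentPrompt)
+    const evaluation = evaluateCode(code)
+    if (evaluation.success) return { code, attempts: attempt }
+    lastError = evaluation.error ?? "unknown error"
+    // Feed the evaluation error back so the model can correct the circuit.
+    currentPrompt = `${prompt}\n\nThe previous attempt failed with:\n${lastError}\n\nPlease fix the code.`
+  }
+  throw new Error(`No valid circuit after ${maxAttempts} attempts: ${lastError}`)
+}
+```
+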
+## License
 
-The benchmark shows the percentage of samples that pass (1) and (2)
+MIT License
diff --git a/scripts/run-benchmark.ts b/scripts/run-benchmark.ts
deleted file mode 100644
index 4f55a6c..0000000
--- a/scripts/run-benchmark.ts
+++ /dev/null
@@ -1,127 +0,0 @@
-import fs from "node:fs"
-import path from "node:path"
-import toml from "toml"
-import { anthropic } from "../lib/ai/anthropic"
-import { safeEvaluateCode } from "../lib/code-runner/safe-evaluate-code"
-import { askAboutOutput } from "../tests/fixtures/ask-about-output"
-import { createCircuitBoard1Template } from "../lib/prompt-templates/create-circuit-board1"
-
-interface Problem {
-  prompt: string
-  questions: { text: string; answer: boolean }[]
-}
-
-interface Result {
-  prompt: string
-  questions: { text: string; expected: boolean; actual: boolean }[]
-  score: number
-  evaluationError?: string
-}
-
-const loadProblems = (filePath: string): Problem[] => {
-  const tomlContent = fs.readFileSync(filePath, "utf-8")
-  const parsedToml = toml.parse(tomlContent)
-
-  return parsedToml.problems.map((problem: any) => ({
-    prompt: problem.prompt,
-    questions: problem.questions.map((q: any) => ({
-      text: q.text,
-      answer: q.answer,
-    })),
-  }))
-}
-
-const runAI = async (prompt: string): Promise<string> => {
-  const fullPrompt = `${createCircuitBoard1Template({
-    currentCode: "",
-    availableImports: {},
-  })}\n\n${prompt}`
-  const completion = await anthropic.messages.create({
-    model: "claude-3-5-haiku-20241022",
-    max_tokens: 1024,
-    system: "You are an expert in electronic circuit design and tscircuit.",
-    messages: [
-      {
-        role: "user",
-        content: fullPrompt,
-      },
-    ],
-  })
-
-  return (completion as any).content[0]?.text || ""
-}
-
-const gaugeAccuracy = async (problem: Problem): Promise<Result> => {
-  const aiResponse = await runAI(problem.prompt)
-  const codeMatch = aiResponse.match(/```tsx\s*([\s\S]*?)\s*```/)
-  const code = codeMatch ? codeMatch[1].trim() : ""
-
-  const evaluation = safeEvaluateCode(code, {
-    outputType: "board",
-    preSuppliedImports: {},
-  })
-
-  if (!evaluation.success) {
-    return {
-      prompt: problem.prompt,
-      questions: problem.questions.map((q) => ({
-        text: q.text,
-        expected: q.answer,
-        actual: false,
-      })),
-      score: 0,
-      evaluationError: evaluation.error,
-    }
-  }
-
-  const questionsResults = await Promise.all(
-    problem.questions.map(async (question) => {
-      const actual = await askAboutOutput(code, question.text)
-      return { text: question.text, expected: question.answer, actual }
-    }),
-  )
-
-  const score = questionsResults.reduce(
-    (acc, result) => acc + (result.expected === result.actual ? 1 : 0),
-    0,
-  )
-
-  return {
-    prompt: problem.prompt,
-    questions: questionsResults,
-    score,
-  }
-}
-
-const outputResults = (results: Result[], filePath: string) => {
-  const markdown = results
-    .map(
-      (result) => `
-## Problem
-${result.prompt}
-### Questions and Responses
-${result.questions
-  .map(
-    (question) => `- ${question.text}
-  - Expected: ${question.expected}
-  - Actual: ${question.actual}`,
-  )
-  .join("\n")}
-### Score
-${result.score}
-`,
-    )
-    .join("\n")
-
-  fs.writeFileSync(filePath, markdown)
-}
-
-const main = async () => {
-  const problems = loadProblems(
-    path.join(__dirname, "../problem-sets/problems-1.toml"),
-  )
-  const results = await Promise.all(problems.map(gaugeAccuracy))
-  outputResults(results, path.join(__dirname, "../benchmarks/results.md"))
-}
-
-main().catch(console.error)