From 3b3bf15d7f4845ffe42bb8c715a5265c587f6a46 Mon Sep 17 00:00:00 2001
From: Shibo
Date: Thu, 6 Feb 2025 18:16:53 +0200
Subject: [PATCH] updated readme

---
 README.md                | 85 ++++++++++++++++++++++----
 scripts/run-benchmark.ts | 127 ---------------------------------------
 2 files changed, 72 insertions(+), 140 deletions(-)
 delete mode 100644 scripts/run-benchmark.ts

diff --git a/README.md b/README.md
index 614375b..bb80b46 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,79 @@
-# Prompt Benchmarks
+# tscircuit Prompt Benchmarks
 
-[Docs](https://docs.tscircuit.com) · [Website](https://tscircuit.com) · [Twitter](https://x.com/tscircuit) · [discord](https://tscircuit.com/community/join-redirect) · [Quickstart](https://docs.tscircuit.com/quickstart) · [Online Playground](https://tscircuit.com/playground)
+[Docs](https://docs.tscircuit.com) · [Website](https://tscircuit.com) · [Twitter](https://x.com/tscircuit) · [Discord](https://tscircuit.com/community/join-redirect) · [Quickstart](https://docs.tscircuit.com/quickstart) · [Online Playground](https://tscircuit.com/playground)
+
+This repository contains benchmarks for evaluating and improving the quality of system prompts used to generate tscircuit code. It includes components for:
+
+- **Code Runner** (in `lib/code-runner`): Safely transpiles, evaluates, and renders TSX code for circuit generation.
+- **AI Integration** (in `lib/ai`): Interfaces with Anthropic’s Claude models for prompt completions and error correction.
+- **Utility Modules** (in `lib/utils`): Provide logging, snapshot management, and type-checking of generated circuits.
+- **Prompt Templates** (in `lib/prompt-templates`): Define various prompt structures for generating different circuit types.
+- **Benchmarking & Scoring** (using evalite and custom scorers in `benchmarks/scorers`): Run multiple tests to ensure circuit validity and quality.
 
-This repo contains benchmarks for tscircuit system prompts used for
-automatically generating tscircuit code.
 
 ## Running Benchmarks
 
-You can use `bun run benchmark` to select and run a benchmark. A single prompt takes about 10s-15s to
-run when run with `sonnet`. We have a set of samples (see the [tests/samples](./tests/samples) directory)
-that the benchmarks run against. When you change a prompt, you must run the benchmark
-for that prompt to update the benchmark snapshot. This is how we record degradation
-or improvement in the response quality. Each sample is run 5 times and two tests
-are run:
+To run the benchmarks using evalite, use:
+```bash
+bun start
+```
+Each prompt is processed multiple times to test:
+1. Whether the output compiles without errors.
+2. Whether the output meets the expected circuit specifications.
+
+After modifying prompts or system components, evalite reruns the benchmarks automatically; skip any benchmarks you don't want to run.
+
+### Problem Sets
+
+This project uses TOML files to define problem sets for circuit generation. Each problem is defined using a TOML array of tables with the following format:
+
+```toml
+[[problems]]
+prompt = """
+Your circuit prompt description goes here.
+"""
+title = "Sample Problem Title"
+questions = [
+  { text = "Question text", answer = true },
+  { text = "Another question text", answer = false }
+]
+```
+
+In each problem:
+- The `prompt` field must contain the circuit description that instructs the AI.
+- The `title` gives a short title for the problem.
+- The `questions` array contains objects with a `text` property (the question) and an `answer` property (a boolean) used to validate the generated circuit.
+
+To add a new problem set, create a new TOML file in the `problem-sets` directory following this format. Each new file can contain one or more problems defined with the `[[problems]]` header.
+
+## Build, Test, and Start
+
+- **Build**: `bun run build`
+- **Test**: `bun run test`
+- **Start**: `bun start`
+
+## Benchmarks Directory
+
+The `benchmarks` directory contains the files used to evaluate and score circuit-generating prompts:
+
+- `benchmarks/prompt-logs/`
+  Text files (e.g., `prompt-2025-02-05T14-07-18-242Z.txt`, `prompt-2025-02-05T14-10-53-144Z.txt`) that log each prompt attempt and its output. They serve as a history of interactions.
+
+- `benchmarks/benchmark-local-circuit-error-correction.eval.ts`
+  Runs local circuit evaluation with an error-correction workflow. It repeatedly calls the AI (up to a set maximum number of attempts) until the circuit output meets expectations, logging each attempt (a simplified sketch of this loop appears below).
+
+- `benchmarks/benchmark-local-circuit.eval.ts`
+  Evaluates a local circuit by running a specific user prompt and checking that the generated circuit compiles and meets expected behaviors.
+
+- `benchmarks/benchmark-local-circuit-random.eval.ts`
+  Generates random prompts using an AI-powered prompt generator and evaluates the resulting circuits. Useful for stress-testing and assessing the robustness of circuit generation.
+
+- `benchmarks/scorers/ai-circuit-scorer.ts`
+  Uses an AI model to assign a score (from 0 to 1) based on correctness, appropriate use of components, circuit complexity, and code quality.
+
+- `benchmarks/scorers/circuit-scorer.ts`
+  A basic scorer that checks each generated circuit against the predefined questions and answers from the problem sets (see the scorer sketch below).
 
-1. Does the output from the prompt compile?
-2. Does the output produce the expected circuit?
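+For illustration, the question-based scoring can be thought of as the following minimal sketch (the type and helper names here are illustrative, not the repo's exact exports): a yes/no helper, such as an `askAboutOutput`-style function, is asked each question about the generated code, and the score is the fraction of answers that match the expectations from the problem set.
+
+```ts
+// Illustrative sketch only; the types and helper signature are assumptions,
+// not the exact exports of this repository.
+interface Question {
+  text: string
+  answer: boolean
+}
+
+interface Problem {
+  prompt: string
+  title: string
+  questions: Question[]
+}
+
+// A yes/no oracle about the generated circuit code (e.g. an LLM-backed helper).
+type AskAboutOutput = (code: string, question: string) => Promise<boolean>
+
+// Returns a score between 0 and 1: the fraction of questions whose expected
+// answer matches the oracle's answer for the generated code.
+const scoreCircuit = async (
+  code: string,
+  problem: Problem,
+  askAboutOutput: AskAboutOutput,
+): Promise<number> => {
+  const matches = await Promise.all(
+    problem.questions.map(
+      async (q) => (await askAboutOutput(code, q.text)) === q.answer,
+    ),
+  )
+  return matches.filter(Boolean).length / problem.questions.length
+}
+```
+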
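+The error-correction benchmark follows a retry pattern along these lines (again a simplified sketch with injected helpers rather than the actual implementation, which uses the repo's AI integration and code runner):
+
+```ts
+// Illustrative sketch only; the helper signatures below are assumptions.
+interface EvaluationResult {
+  success: boolean
+  error?: string
+}
+
+type GenerateCode = (prompt: string) => Promise<string>
+type EvaluateCode = (code: string) => EvaluationResult
+
+// Regenerate circuit code until it evaluates successfully or the retry
+// budget is exhausted, feeding each failure back into the next prompt.
+const generateWithErrorCorrection = async (
+  prompt: string,
+  generateCode: GenerateCode,
+  evaluateCode: EvaluateCode,
+  maxAttempts = 3,
+): Promise<{ code: string; attempts: number }> => {
+  let currentPrompt = prompt
+  let lastError = "unknown error"
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    const code = await generateCode(currentPrompt)
+    const evaluation = evaluateCode(code)
+    if (evaluation.success) return { code, attempts: attempt }
+    lastError = evaluation.error ?? "unknown error"
+    // Feed the evaluation error back so the model can correct the circuit.
+    currentPrompt = `${prompt}\n\nThe previous attempt failed with:\n${lastError}\n\nPlease fix the code.`
+  }
+  throw new Error(`No valid circuit after ${maxAttempts} attempts: ${lastError}`)
+}
+```
+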
+## License
 
-The benchmark shows the percentage of samples that pass (1) and (2)
+MIT License
diff --git a/scripts/run-benchmark.ts b/scripts/run-benchmark.ts
deleted file mode 100644
index 4f55a6c..0000000
--- a/scripts/run-benchmark.ts
+++ /dev/null
@@ -1,127 +0,0 @@
-import fs from "node:fs"
-import path from "node:path"
-import toml from "toml"
-import { anthropic } from "../lib/ai/anthropic"
-import { safeEvaluateCode } from "../lib/code-runner/safe-evaluate-code"
-import { askAboutOutput } from "../tests/fixtures/ask-about-output"
-import { createCircuitBoard1Template } from "../lib/prompt-templates/create-circuit-board1"
-
-interface Problem {
-  prompt: string
-  questions: { text: string; answer: boolean }[]
-}
-
-interface Result {
-  prompt: string
-  questions: { text: string; expected: boolean; actual: boolean }[]
-  score: number
-  evaluationError?: string
-}
-
-const loadProblems = (filePath: string): Problem[] => {
-  const tomlContent = fs.readFileSync(filePath, "utf-8")
-  const parsedToml = toml.parse(tomlContent)
-
-  return parsedToml.problems.map((problem: any) => ({
-    prompt: problem.prompt,
-    questions: problem.questions.map((q: any) => ({
-      text: q.text,
-      answer: q.answer,
-    })),
-  }))
-}
-
-const runAI = async (prompt: string): Promise<string> => {
-  const fullPrompt = `${createCircuitBoard1Template({
-    currentCode: "",
-    availableImports: {},
-  })}\n\n${prompt}`
-  const completion = await anthropic.messages.create({
-    model: "claude-3-5-haiku-20241022",
-    max_tokens: 1024,
-    system: "You are an expert in electronic circuit design and tscircuit.",
-    messages: [
-      {
-        role: "user",
-        content: fullPrompt,
-      },
-    ],
-  })
-
-  return (completion as any).content[0]?.text || ""
-}
-
-const gaugeAccuracy = async (problem: Problem): Promise<Result> => {
-  const aiResponse = await runAI(problem.prompt)
-  const codeMatch = aiResponse.match(/```tsx\s*([\s\S]*?)\s*```/)
-  const code = codeMatch ? codeMatch[1].trim() : ""
-
-  const evaluation = safeEvaluateCode(code, {
-    outputType: "board",
-    preSuppliedImports: {},
-  })
-
-  if (!evaluation.success) {
-    return {
-      prompt: problem.prompt,
-      questions: problem.questions.map((q) => ({
-        text: q.text,
-        expected: q.answer,
-        actual: false,
-      })),
-      score: 0,
-      evaluationError: evaluation.error,
-    }
-  }
-
-  const questionsResults = await Promise.all(
-    problem.questions.map(async (question) => {
-      const actual = await askAboutOutput(code, question.text)
-      return { text: question.text, expected: question.answer, actual }
-    }),
-  )
-
-  const score = questionsResults.reduce(
-    (acc, result) => acc + (result.expected === result.actual ? 1 : 0),
-    0,
-  )
-
-  return {
-    prompt: problem.prompt,
-    questions: questionsResults,
-    score,
-  }
-}
-
-const outputResults = (results: Result[], filePath: string) => {
-  const markdown = results
-    .map(
-      (result) => `
-## Problem
-${result.prompt}
-### Questions and Responses
-${result.questions
-  .map(
-    (question) => `- ${question.text}
-  - Expected: ${question.expected}
-  - Actual: ${question.actual}`,
-  )
-  .join("\n")}
-### Score
-${result.score}
-`,
-    )
-    .join("\n")
-
-  fs.writeFileSync(filePath, markdown)
-}
-
-const main = async () => {
-  const problems = loadProblems(
-    path.join(__dirname, "../problem-sets/problems-1.toml"),
-  )
-  const results = await Promise.all(problems.map(gaugeAccuracy))
-  outputResults(results, path.join(__dirname, "../benchmarks/results.md"))
-}
-
-main().catch(console.error)