Skip to content

Commit 0651596

Browse files
Adds cost reporting
1 parent 0c769f0 commit 0651596

File tree

1 file changed

+38
-14
lines changed

1 file changed

+38
-14
lines changed

tests/eval/eval_test.go

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,22 @@ or
5757
)
5858

5959
var (
60-
tempDir string
61-
localRepoRoot string
62-
testCases []string
63-
goldenModel string
64-
integrationModel string
65-
judgeModel string
60+
tempDir string
61+
localRepoRoot string
62+
testCases []string
63+
goldenModel string
64+
integrationModel string
65+
judgeModel string
66+
totalReviewerCost float64
67+
totalJudgeCost float64
6668
)
6769

70+
type claudeOutput struct {
71+
Type string `json:"type"`
72+
Result string `json:"result"`
73+
TotalCostUSD float64 `json:"total_cost_usd"`
74+
}
75+
6876
func TestEval(t *testing.T) {
6977
RegisterFailHandler(Fail)
7078
RunSpecs(t, "API Review Eval Suite")
@@ -116,6 +124,7 @@ var _ = AfterSuite(func() {
116124
By("cleaning up temp directory")
117125
os.RemoveAll(tempDir)
118126
}
127+
fmt.Printf("\nTotal Cost: $%.4f (Reviewer: $%.4f, Judge: $%.4f)\n", totalReviewerCost+totalJudgeCost, totalReviewerCost, totalJudgeCost)
119128
})
120129

121130
func copyLocalFiles() {
@@ -244,7 +253,7 @@ func readAndApplyPatch(patchPath string) {
244253
}
245254

246255
// runAPIReview and runJudge can probably share some common code.
247-
func runAPIReview(model string) string {
256+
func runAPIReview(model string) (string, float64) {
248257
By(fmt.Sprintf("running API review via Claude (%s)", model))
249258
ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout)
250259
defer cancel()
@@ -255,15 +264,22 @@ func runAPIReview(model string) string {
255264
"--model", model,
256265
"-p", "/api-review",
257266
"--allowedTools", "Bash,Read,Grep,Glob,Task",
267+
"--output-format", "json",
258268
)
259269
cmd.Dir = tempDir
260270

261271
output, err := cmd.CombinedOutput()
262272
Expect(err).NotTo(HaveOccurred(), "claude command failed: %s", string(output))
263-
return string(output)
273+
274+
var parsed claudeOutput
275+
err = json.Unmarshal(output, &parsed)
276+
Expect(err).NotTo(HaveOccurred(), "failed to parse claude output: %s", string(output))
277+
278+
totalReviewerCost += parsed.TotalCostUSD
279+
return parsed.Result, parsed.TotalCostUSD
264280
}
265281

266-
func runJudge(model, reviewOutput, expectedIssues string) evalResult {
282+
func runJudge(model, reviewOutput, expectedIssues string) (evalResult, float64) {
267283
By(fmt.Sprintf("comparing results with Claude judge (%s)", model))
268284
ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout)
269285
defer cancel()
@@ -274,17 +290,24 @@ func runJudge(model, reviewOutput, expectedIssues string) evalResult {
274290
"--dangerously-skip-permissions",
275291
"--model", model,
276292
"-p", prompt,
293+
"--output-format", "json",
277294
)
278295
cmd.Dir = tempDir
279296

280297
output, err := cmd.CombinedOutput()
281298
Expect(err).NotTo(HaveOccurred(), "claude judge command failed: %s", string(output))
282299

300+
var parsed claudeOutput
301+
err = json.Unmarshal(output, &parsed)
302+
Expect(err).NotTo(HaveOccurred(), "failed to parse judge output: %s", string(output))
303+
304+
totalJudgeCost += parsed.TotalCostUSD
305+
283306
var result evalResult
284-
jsonStr := stripMarkdownCodeBlock(string(output))
307+
jsonStr := stripMarkdownCodeBlock(parsed.Result)
285308
err = json.Unmarshal([]byte(jsonStr), &result)
286-
Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", string(output))
287-
return result
309+
Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", parsed.Result)
310+
return result, parsed.TotalCostUSD
288311
}
289312

290313
func runTestCase(tier, tc, reviewModel, judgeModelName string) {
@@ -297,9 +320,10 @@ func runTestCase(tier, tc, reviewModel, judgeModelName string) {
297320
Expect(err).NotTo(HaveOccurred())
298321
expectedIssues := strings.TrimSpace(string(expectedContent))
299322

300-
reviewOutput := runAPIReview(reviewModel)
301-
result := runJudge(judgeModelName, reviewOutput, expectedIssues)
323+
reviewOutput, reviewCost := runAPIReview(reviewModel)
324+
result, judgeCost := runJudge(judgeModelName, reviewOutput, expectedIssues)
302325

326+
GinkgoWriter.Printf("Cost: Reviewer=$%.4f, Judge=$%.4f, Total=$%.4f\n", reviewCost, judgeCost, reviewCost+judgeCost)
303327
GinkgoWriter.Printf("Judge result: pass=%v, reason=%s\n", result.Pass, result.Reason)
304328
Expect(result.Pass).To(BeTrue(), "API review did not match expected issues.\nJudge reason: %s\nReview output:\n%s\nExpected issues:\n%s", result.Reason, reviewOutput, expectedIssues)
305329
}

0 commit comments

Comments
 (0)