5757)
5858
// Suite-wide state shared by the setup/teardown hooks and the test-case
// helpers in this file.
var (
	// tempDir is the working directory the claude commands are run in
	// (cmd.Dir); the AfterSuite hook removes it.
	tempDir string

	// NOTE(review): the variables in this group are assigned and read outside
	// the code shown here; presumably the repository root, the test case
	// names, and the Claude model name used for each role. Confirm against
	// the suite setup.
	localRepoRoot    string
	testCases        []string
	goldenModel      string
	integrationModel string
	judgeModel       string

	// Running cost totals in USD. runAPIReview adds each call's reported
	// total_cost_usd to totalReviewerCost and runJudge does the same for
	// totalJudgeCost; the AfterSuite hook prints both and their sum. They are
	// updated with a plain += and carry no synchronization of their own.
	totalReviewerCost float64
	totalJudgeCost    float64
)
6769
// claudeOutput holds the fields this suite decodes from the JSON document the
// claude CLI prints when it is invoked with "--output-format json". Any other
// keys in that document are ignored by json.Unmarshal.
type claudeOutput struct {
	// Type is decoded from the "type" key; the helpers in this file do not
	// inspect it.
	Type string `json:"type"`
	// Result is the response text: runAPIReview returns it as the review
	// output, and runJudge parses it as the judge's JSON verdict.
	Result string `json:"result"`
	// TotalCostUSD is the cost reported for the invocation under the
	// "total_cost_usd" key; callers add it to the suite's running totals.
	TotalCostUSD float64 `json:"total_cost_usd"`
}
75+
6876func TestEval (t * testing.T ) {
6977 RegisterFailHandler (Fail )
7078 RunSpecs (t , "API Review Eval Suite" )
@@ -116,6 +124,7 @@ var _ = AfterSuite(func() {
116124 By ("cleaning up temp directory" )
117125 os .RemoveAll (tempDir )
118126 }
127+ fmt .Printf ("\n Total Cost: $%.4f (Reviewer: $%.4f, Judge: $%.4f)\n " , totalReviewerCost + totalJudgeCost , totalReviewerCost , totalJudgeCost )
119128})
120129
121130func copyLocalFiles () {
@@ -244,7 +253,7 @@ func readAndApplyPatch(patchPath string) {
244253}
245254
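// TODO: runAPIReview and runJudge both run the claude CLI with "--output-format json", decode the result into claudeOutput, and add TotalCostUSD to a running total; factor those steps into a shared helper.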
247- func runAPIReview (model string ) string {
256+ func runAPIReview (model string ) ( string , float64 ) {
248257 By (fmt .Sprintf ("running API review via Claude (%s)" , model ))
249258 ctx , cancel := context .WithTimeout (context .Background (), claudeTimeout )
250259 defer cancel ()
@@ -255,15 +264,22 @@ func runAPIReview(model string) string {
255264 "--model" , model ,
256265 "-p" , "/api-review" ,
257266 "--allowedTools" , "Bash,Read,Grep,Glob,Task" ,
267+ "--output-format" , "json" ,
258268 )
259269 cmd .Dir = tempDir
260270
261271 output , err := cmd .CombinedOutput ()
262272 Expect (err ).NotTo (HaveOccurred (), "claude command failed: %s" , string (output ))
263- return string (output )
273+
274+ var parsed claudeOutput
275+ err = json .Unmarshal (output , & parsed )
276+ Expect (err ).NotTo (HaveOccurred (), "failed to parse claude output: %s" , string (output ))
277+
278+ totalReviewerCost += parsed .TotalCostUSD
279+ return parsed .Result , parsed .TotalCostUSD
264280}
265281
266- func runJudge (model , reviewOutput , expectedIssues string ) evalResult {
282+ func runJudge (model , reviewOutput , expectedIssues string ) ( evalResult , float64 ) {
267283 By (fmt .Sprintf ("comparing results with Claude judge (%s)" , model ))
268284 ctx , cancel := context .WithTimeout (context .Background (), claudeTimeout )
269285 defer cancel ()
@@ -274,17 +290,24 @@ func runJudge(model, reviewOutput, expectedIssues string) evalResult {
274290 "--dangerously-skip-permissions" ,
275291 "--model" , model ,
276292 "-p" , prompt ,
293+ "--output-format" , "json" ,
277294 )
278295 cmd .Dir = tempDir
279296
280297 output , err := cmd .CombinedOutput ()
281298 Expect (err ).NotTo (HaveOccurred (), "claude judge command failed: %s" , string (output ))
282299
300+ var parsed claudeOutput
301+ err = json .Unmarshal (output , & parsed )
302+ Expect (err ).NotTo (HaveOccurred (), "failed to parse judge output: %s" , string (output ))
303+
304+ totalJudgeCost += parsed .TotalCostUSD
305+
283306 var result evalResult
284- jsonStr := stripMarkdownCodeBlock (string ( output ) )
307+ jsonStr := stripMarkdownCodeBlock (parsed . Result )
285308 err = json .Unmarshal ([]byte (jsonStr ), & result )
286- Expect (err ).NotTo (HaveOccurred (), "failed to parse judge response as JSON: %s" , string ( output ) )
287- return result
309+ Expect (err ).NotTo (HaveOccurred (), "failed to parse judge response as JSON: %s" , parsed . Result )
310+ return result , parsed . TotalCostUSD
288311}
289312
290313func runTestCase (tier , tc , reviewModel , judgeModelName string ) {
@@ -297,9 +320,10 @@ func runTestCase(tier, tc, reviewModel, judgeModelName string) {
297320 Expect (err ).NotTo (HaveOccurred ())
298321 expectedIssues := strings .TrimSpace (string (expectedContent ))
299322
300- reviewOutput := runAPIReview (reviewModel )
301- result := runJudge (judgeModelName , reviewOutput , expectedIssues )
323+ reviewOutput , reviewCost := runAPIReview (reviewModel )
324+ result , judgeCost := runJudge (judgeModelName , reviewOutput , expectedIssues )
302325
326+ GinkgoWriter .Printf ("Cost: Reviewer=$%.4f, Judge=$%.4f, Total=$%.4f\n " , reviewCost , judgeCost , reviewCost + judgeCost )
303327 GinkgoWriter .Printf ("Judge result: pass=%v, reason=%s\n " , result .Pass , result .Reason )
304328 Expect (result .Pass ).To (BeTrue (), "API review did not match expected issues.\n Judge reason: %s\n Review output:\n %s\n Expected issues:\n %s" , result .Reason , reviewOutput , expectedIssues )
305329}
0 commit comments