diff --git a/benchmarks/utils/report_costs.py b/benchmarks/utils/report_costs.py index 564f2448..67076a82 100755 --- a/benchmarks/utils/report_costs.py +++ b/benchmarks/utils/report_costs.py @@ -160,8 +160,14 @@ def calculate_costs(directory_path: str) -> None: # Find output files output_file, critic_files = find_output_files(directory) - if not output_file and not critic_files: - print(f"No output.jsonl or critic attempt files found in {directory_path}") + if not output_file: + print(f"Missing output.jsonl in {directory_path}") + sys.exit(1) + + if not critic_files: + print( + f"Missing critic attempt files (output.critic_attempt_*.jsonl) in {directory_path}" + ) sys.exit(1) print(f"Cost Report for: {directory_path}") @@ -176,19 +182,48 @@ def calculate_costs(directory_path: str) -> None: "summary": {}, } - main_cost: Optional[float] = None - main_average_duration: Optional[float] = None + main_cost: float + main_average_duration: float # Process main output file - if output_file: - print("\nSelected instance in Main output.jsonl only:") - print(f" {output_file.name}") + print("\nSelected instance in Main output.jsonl only:") + print(f" {output_file.name}") + + jsonl_data = read_jsonl_file(output_file) + cost = extract_accumulated_cost(jsonl_data) + time_stats = calculate_time_statistics(jsonl_data) + main_cost = cost + main_average_duration = time_stats["average_duration"] + + print(f" Lines: {len(jsonl_data)}") + print(f" Cost: ${cost:.6f}") + print(" Time Stats:") + print(f" Average Duration: {format_duration(time_stats['average_duration'])}") + print(f" Max Duration: {format_duration(time_stats['max_duration'])}") + print(f" Min Duration: {format_duration(time_stats['min_duration'])}") + print( + f" Lines with Duration: {time_stats['lines_with_duration']}/{time_stats['total_lines']}" + ) + + # Add to report data + report_data["main_output"] = { + "filename": output_file.name, + "lines": len(jsonl_data), + "cost": cost, + "time_statistics": time_stats, + } - jsonl_data = read_jsonl_file(output_file) + # Process critic files individually + critic_total = 0.0 + print("\nCritic Attempt Files:") + + for critic_file in critic_files: + print(f" {critic_file.name}") + + jsonl_data = read_jsonl_file(critic_file) cost = extract_accumulated_cost(jsonl_data) time_stats = calculate_time_statistics(jsonl_data) - main_cost = cost - main_average_duration = time_stats.get("average_duration", 0.0) + critic_total += cost print(f" Lines: {len(jsonl_data)}") print(f" Cost: ${cost:.6f}") @@ -203,74 +238,35 @@ def calculate_costs(directory_path: str) -> None: ) # Add to report data - report_data["main_output"] = { - "filename": output_file.name, - "lines": len(jsonl_data), - "cost": cost, - "time_statistics": time_stats, - } + report_data["critic_files"].append( + { + "filename": critic_file.name, + "lines": len(jsonl_data), + "cost": cost, + "time_statistics": time_stats, + } + ) - # Process critic files individually - critic_total = 0.0 - if critic_files: - print("\nCritic Attempt Files:") - - for critic_file in critic_files: - print(f" {critic_file.name}") - - jsonl_data = read_jsonl_file(critic_file) - cost = extract_accumulated_cost(jsonl_data) - time_stats = calculate_time_statistics(jsonl_data) - critic_total += cost - - print(f" Lines: {len(jsonl_data)}") - print(f" Cost: ${cost:.6f}") - print(" Time Stats:") - print( - f" Average Duration: {format_duration(time_stats['average_duration'])}" - ) - print(f" Max Duration: {format_duration(time_stats['max_duration'])}") - print(f" Min Duration: {format_duration(time_stats['min_duration'])}") - print( - f" Lines with Duration: {time_stats['lines_with_duration']}/{time_stats['total_lines']}" - ) - - # Add to report data - report_data["critic_files"].append( - { - "filename": critic_file.name, - "lines": len(jsonl_data), - "cost": cost, - "time_statistics": time_stats, - } - ) - - print(f"\n Total Critic Files Cost: ${critic_total:.6f}") + print(f"\n Total Critic Files Cost: ${critic_total:.6f}") # Summary print("\n" + "=" * 80) print("SUMMARY:") # Summary duration is based on the main (aggregated) output only - summary_average_duration = main_average_duration or 0.0 + summary_average_duration = main_average_duration - # Total cost represents actual spend: - # - If critic files exist, they contain all attempts; use their sum. - # - Otherwise, fall back to the main output cost. - total_cost = critic_total if critic_files else (main_cost or 0.0) + # Total cost represents actual spend: critic files contain all attempts. + total_cost = critic_total - if main_cost is not None: - print(f" Main Output Cost (best results): ${main_cost:.6f}") - if critic_files: - print(f" Sum Critic Files (all attempts): ${critic_total:.6f}") + print(f" Main Output Cost (best results): ${main_cost:.6f}") + print(f" Sum Critic Files (all attempts): ${critic_total:.6f}") print(f" Total Cost (no double-count): ${total_cost:.6f}") summary = {"total_cost": total_cost, "average_duration": summary_average_duration} - if main_cost is not None: - summary["only_main_output_cost"] = main_cost - if critic_files: - summary["sum_critic_files"] = critic_total + summary["only_main_output_cost"] = main_cost + summary["sum_critic_files"] = critic_total report_data["summary"] = summary