Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 61 additions & 65 deletions benchmarks/utils/report_costs.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,14 @@ def calculate_costs(directory_path: str) -> None:
# Find output files
output_file, critic_files = find_output_files(directory)

if not output_file and not critic_files:
print(f"No output.jsonl or critic attempt files found in {directory_path}")
if not output_file:
print(f"Missing output.jsonl in {directory_path}")
sys.exit(1)

if not critic_files:
print(
f"Missing critic attempt files (output.critic_attempt_*.jsonl) in {directory_path}"
)
sys.exit(1)

print(f"Cost Report for: {directory_path}")
Expand All @@ -176,19 +182,48 @@ def calculate_costs(directory_path: str) -> None:
"summary": {},
}

main_cost: Optional[float] = None
main_average_duration: Optional[float] = None
main_cost: float
main_average_duration: float

# Process main output file
if output_file:
print("\nSelected instance in Main output.jsonl only:")
print(f" {output_file.name}")
print("\nSelected instance in Main output.jsonl only:")
print(f" {output_file.name}")

jsonl_data = read_jsonl_file(output_file)
cost = extract_accumulated_cost(jsonl_data)
time_stats = calculate_time_statistics(jsonl_data)
main_cost = cost
main_average_duration = time_stats["average_duration"]

print(f" Lines: {len(jsonl_data)}")
print(f" Cost: ${cost:.6f}")
print(" Time Stats:")
print(f" Average Duration: {format_duration(time_stats['average_duration'])}")
print(f" Max Duration: {format_duration(time_stats['max_duration'])}")
print(f" Min Duration: {format_duration(time_stats['min_duration'])}")
print(
f" Lines with Duration: {time_stats['lines_with_duration']}/{time_stats['total_lines']}"
)

# Add to report data
report_data["main_output"] = {
"filename": output_file.name,
"lines": len(jsonl_data),
"cost": cost,
"time_statistics": time_stats,
}

jsonl_data = read_jsonl_file(output_file)
# Process critic files individually
critic_total = 0.0
print("\nCritic Attempt Files:")

for critic_file in critic_files:
print(f" {critic_file.name}")

jsonl_data = read_jsonl_file(critic_file)
cost = extract_accumulated_cost(jsonl_data)
time_stats = calculate_time_statistics(jsonl_data)
main_cost = cost
main_average_duration = time_stats.get("average_duration", 0.0)
critic_total += cost

print(f" Lines: {len(jsonl_data)}")
print(f" Cost: ${cost:.6f}")
Expand All @@ -203,74 +238,35 @@ def calculate_costs(directory_path: str) -> None:
)

# Add to report data
report_data["main_output"] = {
"filename": output_file.name,
"lines": len(jsonl_data),
"cost": cost,
"time_statistics": time_stats,
}
report_data["critic_files"].append(
{
"filename": critic_file.name,
"lines": len(jsonl_data),
"cost": cost,
"time_statistics": time_stats,
}
)

# Process critic files individually
critic_total = 0.0
if critic_files:
print("\nCritic Attempt Files:")

for critic_file in critic_files:
print(f" {critic_file.name}")

jsonl_data = read_jsonl_file(critic_file)
cost = extract_accumulated_cost(jsonl_data)
time_stats = calculate_time_statistics(jsonl_data)
critic_total += cost

print(f" Lines: {len(jsonl_data)}")
print(f" Cost: ${cost:.6f}")
print(" Time Stats:")
print(
f" Average Duration: {format_duration(time_stats['average_duration'])}"
)
print(f" Max Duration: {format_duration(time_stats['max_duration'])}")
print(f" Min Duration: {format_duration(time_stats['min_duration'])}")
print(
f" Lines with Duration: {time_stats['lines_with_duration']}/{time_stats['total_lines']}"
)

# Add to report data
report_data["critic_files"].append(
{
"filename": critic_file.name,
"lines": len(jsonl_data),
"cost": cost,
"time_statistics": time_stats,
}
)

print(f"\n Total Critic Files Cost: ${critic_total:.6f}")
print(f"\n Total Critic Files Cost: ${critic_total:.6f}")

# Summary
print("\n" + "=" * 80)
print("SUMMARY:")

# Summary duration is based on the main (aggregated) output only
summary_average_duration = main_average_duration or 0.0
summary_average_duration = main_average_duration

# Total cost represents actual spend:
# - If critic files exist, they contain all attempts; use their sum.
# - Otherwise, fall back to the main output cost.
total_cost = critic_total if critic_files else (main_cost or 0.0)
# Total cost represents actual spend: critic files contain all attempts.
total_cost = critic_total

if main_cost is not None:
print(f" Main Output Cost (best results): ${main_cost:.6f}")
if critic_files:
print(f" Sum Critic Files (all attempts): ${critic_total:.6f}")
print(f" Main Output Cost (best results): ${main_cost:.6f}")
print(f" Sum Critic Files (all attempts): ${critic_total:.6f}")
print(f" Total Cost (no double-count): ${total_cost:.6f}")

summary = {"total_cost": total_cost, "average_duration": summary_average_duration}

if main_cost is not None:
summary["only_main_output_cost"] = main_cost
if critic_files:
summary["sum_critic_files"] = critic_total
summary["only_main_output_cost"] = main_cost
summary["sum_critic_files"] = critic_total

report_data["summary"] = summary

Expand Down