94 changes: 94 additions & 0 deletions src/agentlab/experiments/custom_benchmark.py
@@ -0,0 +1,94 @@
from abc import abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import pandas as pd
from bgym import Benchmark, EnvArgs, HighLevelActionSetArgs
from browsergym.experiments.benchmark.base import BenchmarkBackend
from dataclasses_json import DataClassJsonMixin, config

from agentlab.analyze.inspect_results import load_result_df
from agentlab.experiments.study import Study


@dataclass
class ResampleBenchmark(Benchmark):
    exp_dir: Path = None
    name: str = None
    high_level_action_set_args: HighLevelActionSetArgs = None
    is_multi_tab: bool = None
    supports_parallel_seeds: bool = None
    env_args_list: list[EnvArgs] = None
    backends: list[BenchmarkBackend] = None
    task_metadata: Optional[pd.DataFrame] = field(
        default_factory=lambda: None,
        metadata=config(
            encoder=lambda df: df.to_dict(orient="records") if df is not None else None,
            decoder=lambda items: pd.DataFrame(items) if items is not None else None,
        ),
    )

    def __post_init__(self):
        assert self.exp_dir is not None
        study = Study.load(self.exp_dir)
        benchmark = study.benchmark

        self.name = f"resample-{benchmark.name}"
        self.high_level_action_set_args = benchmark.high_level_action_set_args
        self.is_multi_tab = benchmark.is_multi_tab
        self.supports_parallel_seeds = benchmark.supports_parallel_seeds
        self.backends = benchmark.backends
        # task_metadata is intentionally not copied; super().__post_init__() rebuilds it

        values = self.evaluate(study, benchmark.env_args_list)
        selected_env_args = self.select(values, benchmark.env_args_list)

        if len(selected_env_args) == 0:
            raise ValueError("No env_args selected; relax the selection criteria")

        self.env_args_list = selected_env_args

        super().__post_init__()

    @abstractmethod
    def evaluate(self, study, env_args_list):
Review comment (Collaborator Author):
simplify this

        pass

    @abstractmethod
    def select(self, values, env_args_list):
        pass


@dataclass
class AllTasksBenchmark(ResampleBenchmark):
    def evaluate(self, study, env_args_list):
        return [0] * len(env_args_list)

    def select(self, values, env_args_list):
        return env_args_list


@dataclass
class HighVarianceBenchmark(ResampleBenchmark):
    threshold: float = 0.2
Korbit review: Threshold Value Validation Missing (category: Readability)

What is the issue?

The hardcoded threshold value of 0.2 for variance selection lacks validation to ensure it's a reasonable value.

Why this matters

Invalid threshold values (negative or extremely high) could lead to unintended task filtering behavior.

Suggested change

Add threshold validation in __post_init__:

def __post_init__(self):
    if not 0 <= self.threshold <= 1:
        raise ValueError(f"Threshold must be between 0 and 1, got {self.threshold}")
    super().__post_init__()


    def evaluate(self, study: Study, env_args_list):
        result_df = load_result_df(study.dir)
        return dict(result_df.groupby("env.task_name")["cum_reward"].std())
Comment on lines +80 to +82
Korbit review: Missing Task Name Error Handling (category: Functionality)

What is the issue?

The evaluate method in HighVarianceBenchmark may fail if a task_name in env_args_list doesn't exist in the result_df from the study.

Why this matters

This will cause a KeyError when trying to access non-existent task names in the select method, potentially crashing the benchmark creation.

Suggested change

Add error handling to safely handle missing task names:

def evaluate(self, study: Study, env_args_list):
    result_df = load_result_df(study.dir)
    std_dict = dict(result_df.groupby("env.task_name")["cum_reward"].std())
    # Return 0 variance for missing tasks to exclude them
    return {task.task_name: std_dict.get(task.task_name, 0) for task in env_args_list}

Comment on lines +81 to +82
Korbit review: Inefficient DataFrame Loading and Processing (category: Performance)

What is the issue?

Loading and processing the entire result DataFrame for each evaluation is inefficient, especially when dealing with large datasets.

Why this matters

This approach requires loading the complete dataset into memory and performing groupby operations each time evaluate() is called, which can be memory-intensive and slow for large experiment results.

Suggested change

Cache the processed results or pass pre-computed statistics to avoid reloading and recomputing. Consider implementing as:

def __init__(self, exp_dir: Path, threshold: float = 0.2):
    self._cached_stats = None
    super().__init__(exp_dir=exp_dir, threshold=threshold)

def _compute_stats(self, study: Study):
    if self._cached_stats is None:
        result_df = load_result_df(study.dir)
        self._cached_stats = dict(result_df.groupby("env.task_name")["cum_reward"].std())
    return self._cached_stats

def evaluate(self, study: Study, env_args_list):
    return self._compute_stats(study)

    def select(self, values, env_args_list):
        selected_env_args = []
        for env_args in env_args_list:
            if values[env_args.task_name] > self.threshold:
                selected_env_args.append(env_args)
        return selected_env_args


if __name__ == "__main__":
    exp_dir = Path("/home/t/agentlab_results/2025-02-26_10-15-04_genericagent-gpt-4o-mini-2024-07-18-on-miniwob-tiny-test")
Korbit review: Hardcoded Path Exposure (category: Security)

What is the issue?

Hardcoded file path in the main script block that may expose sensitive directory structure information.

Why this matters

Exposing internal directory structures can help attackers map the system layout and potentially identify access points for exploitation.

Suggested change

Move the path to a configuration file or environment variable:

from os import getenv
from dotenv import load_dotenv

load_dotenv()
exp_dir = Path(getenv('EXPERIMENT_DIR'))

    benchmark = HighVarianceBenchmark(exp_dir=exp_dir)
    print(benchmark.env_args_list)
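
For readers skimming the review, here is a minimal, self-contained sketch of the selection logic HighVarianceBenchmark implements: group per-seed results by task, take the standard deviation of cum_reward, and keep only tasks above the threshold. The task names and reward values below are invented for illustration and are not taken from the study referenced above.

# Toy illustration of the variance-based selection (values are made up).
import pandas as pd

threshold = 0.2
result_df = pd.DataFrame(
    {
        "env.task_name": ["task_a", "task_a", "task_b", "task_b", "task_c", "task_c"],
        "cum_reward": [1.0, 0.0, 1.0, 1.0, 0.5, 0.4],
    }
)

# Same computation as HighVarianceBenchmark.evaluate: std of cum_reward per task.
values = dict(result_df.groupby("env.task_name")["cum_reward"].std())

# Same filter as HighVarianceBenchmark.select: keep tasks whose std exceeds the threshold.
selected = [name for name, std in values.items() if std > threshold]
print(values)    # {'task_a': 0.707..., 'task_b': 0.0, 'task_c': 0.0707...}
print(selected)  # ['task_a']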
