diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index eb5c954b..3ebe9ab9 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -5,6 +5,7 @@ from openhands.sdk import LLM, Event, get_logger from openhands.sdk.critic import CriticBase from openhands.sdk.llm import Metrics +from openhands.sdk.utils.models import OpenHandsModel logger = get_logger(__name__) @@ -68,7 +69,22 @@ class EvalInstance(BaseModel): ) -class EvalOutput(BaseModel): +class EvalOutput(OpenHandsModel): + """ + Evaluation output model. + + Uses OpenHandsModel instead of BaseModel to ensure pydantic schemas are properly + rebuilt when new discriminated union types (like Browser actions/observations) + are dynamically registered at runtime. + + Without this, pydantic caches the Event discriminated union schema at import time, + before Browser tools are registered. When deserializing output.jsonl files that + contain Browser events, pydantic fails with "Unexpected kind BrowserGetContentAction" + because the schema doesn't include the newly registered types. + + OpenHandsModel automatically calls model_rebuild() before validation, regenerating + the schema to include all registered event types. + """ # NOTE: User-specified instance_id: str # output of the evaluation