fix: addressing edge cases when resuming (continued)

mjschock · mjschock · commit 9ca00a84fa03 · 2025-11-17T15:40:58.000-08:00
diff --git a/src/agents/run.py b/src/agents/run.py
@@ -161,7 +161,13 @@ def prepare_input(
 
         # On first call (when there are no generated items yet), include the original input
         if not generated_items:
-            input_items.extend(ItemHelpers.input_to_new_input_list(original_input))
+            # Convert original_input to a list first; it is filtered and normalized below so
+            # field names end up in snake_case (RunState deserialization may leave camelCase)
+            raw_input_list = ItemHelpers.input_to_new_input_list(original_input)
+            # Filter out function_call items that don't have corresponding function_call_output
+            # (API requires every function_call to have a function_call_output)
+            filtered_input_list = AgentRunner._filter_incomplete_function_calls(raw_input_list)
+            input_items.extend(AgentRunner._normalize_input_items(filtered_input_list))
 
         # First, collect call_ids from tool_call_output_item items
         # (completed tool calls with outputs) and build a map of
@@ -737,8 +743,8 @@ async def run(
             original_user_input = run_state._original_input
             # Normalize items to remove top-level providerData (API doesn't accept it there)
             if isinstance(original_user_input, list):
-                prepared_input: str | list[TResponseInputItem] = (
-                    AgentRunner._normalize_input_items(original_user_input)
+                prepared_input: str | list[TResponseInputItem] = AgentRunner._normalize_input_items(
+                    original_user_input
                 )
             else:
                 prepared_input = original_user_input
@@ -833,8 +839,7 @@ async def run(
                     if session is not None and generated_items:
                         # Save tool_call_output_item items (the outputs)
                         tool_output_items: list[RunItem] = [
-                            item for item in generated_items
-                            if item.type == "tool_call_output_item"
+                            item for item in generated_items if item.type == "tool_call_output_item"
                         ]
                         # Also find and save the corresponding function_call items
                         # (they might not be in session if the run was interrupted before saving)
@@ -995,7 +1000,7 @@ async def run(
                                 )
                                 if call_id in output_call_ids and item not in items_to_save:
                                     items_to_save.append(item)
-                        
+
                         # Don't save original_user_input again - it was already saved at the start
                         await self._save_result_to_session(session, [], items_to_save)
 
@@ -1369,9 +1374,12 @@ async def _start_streaming(
             # state's input, causing duplicate items.
             if run_state is not None:
                 # Resuming from state - normalize items to remove top-level providerData
+                # and filter incomplete function_call pairs
                 if isinstance(starting_input, list):
+                    # Filter incomplete function_call pairs before normalizing
+                    filtered = AgentRunner._filter_incomplete_function_calls(starting_input)
                     prepared_input: str | list[TResponseInputItem] = (
-                        AgentRunner._normalize_input_items(starting_input)
+                        AgentRunner._normalize_input_items(filtered)
                     )
                 else:
                     prepared_input = starting_input
@@ -2345,20 +2353,82 @@ def _get_model(cls, agent: Agent[Any], run_config: RunConfig) -> Model:
 
         return run_config.model_provider.get_model(agent.model)
 
+    @staticmethod
+    def _filter_incomplete_function_calls(
+        items: list[TResponseInputItem],
+    ) -> list[TResponseInputItem]:
+        """Filter out function_call items that don't have corresponding function_call_output.
+
+        The OpenAI API requires every function_call in an assistant message to have a
+        corresponding function_call_output (tool message). This function ensures only
+        complete pairs are included to prevent API errors.
+
+        IMPORTANT: This only filters incomplete function_call items. All other items
+        (messages, complete function_call pairs, etc.) are preserved to maintain
+        conversation history integrity.
+
+        Args:
+            items: List of input items to filter
+
+        Returns:
+            Filtered list with only complete function_call pairs. All non-function_call
+            items and complete function_call pairs are preserved.
+        """
+        # First pass: collect call_ids from function_call_output/function_call_result items
+        completed_call_ids: set[str] = set()
+        for item in items:
+            if isinstance(item, dict):
+                item_type = item.get("type")
+                # Handle both API format (function_call_output) and
+                # protocol format (function_call_result)
+                if item_type in ("function_call_output", "function_call_result"):
+                    call_id = item.get("call_id") or item.get("callId")
+                    if call_id and isinstance(call_id, str):
+                        completed_call_ids.add(call_id)
+
+        # Second pass: only include function_call items that have corresponding outputs
+        filtered: list[TResponseInputItem] = []
+        for item in items:
+            if isinstance(item, dict):
+                item_type = item.get("type")
+                if item_type == "function_call":
+                    call_id = item.get("call_id") or item.get("callId")
+                    # Only include if there's a corresponding
+                    # function_call_output/function_call_result
+                    if call_id and call_id in completed_call_ids:
+                        filtered.append(item)
+                else:
+                    # Include all non-function_call items
+                    filtered.append(item)
+            else:
+                # Include non-dict items as-is
+                filtered.append(item)
+
+        return filtered
+
     @staticmethod
     def _normalize_input_items(items: list[TResponseInputItem]) -> list[TResponseInputItem]:
-        """Normalize input items by removing top-level providerData/provider_data.
-        
+        """Normalize input items by removing top-level providerData/provider_data
+        and normalizing field names (callId -> call_id).
+
         The OpenAI API doesn't accept providerData at the top level of input items.
         providerData should only be in content where it belongs. This function removes
         top-level providerData while preserving it in content.
-        
+
+        Also normalizes field names from camelCase (callId) to snake_case (call_id)
+        to match API expectations.
+
+        Normalizes item types: converts 'function_call_result' to 'function_call_output'
+        to match API expectations.
+
         Args:
             items: List of input items to normalize
-            
+
         Returns:
             Normalized list of input items
         """
+        from .run_state import _normalize_field_names
+
         normalized: list[TResponseInputItem] = []
         for item in items:
             if isinstance(item, dict):
@@ -2368,6 +2438,18 @@ def _normalize_input_items(items: list[TResponseInputItem]) -> list[TResponseInp
                 # The API doesn't accept providerData at the top level of input items
                 normalized_item.pop("providerData", None)
                 normalized_item.pop("provider_data", None)
+                # Normalize item type: API expects 'function_call_output',
+                # not 'function_call_result'
+                item_type = normalized_item.get("type")
+                if item_type == "function_call_result":
+                    normalized_item["type"] = "function_call_output"
+                    item_type = "function_call_output"
+                # Remove invalid fields based on item type
+                # function_call_output items should not have 'name' field
+                if item_type == "function_call_output":
+                    normalized_item.pop("name", None)
+                # Normalize field names (callId -> call_id, responseId -> response_id)
+                normalized_item = _normalize_field_names(normalized_item)
                 normalized.append(cast(TResponseInputItem, normalized_item))
             else:
                 # For non-dict items, keep as-is (they should already be in correct format)
@@ -2414,10 +2496,14 @@ async def _prepare_input_with_session(
                 f"Invalid `session_input_callback` value: {session_input_callback}. "
                 "Choose between `None` or a custom callable function."
             )
-        
+
+        # Filter incomplete function_call pairs before normalizing
+        # (API requires every function_call to have a function_call_output)
+        filtered = cls._filter_incomplete_function_calls(merged)
+
         # Normalize items to remove top-level providerData and deduplicate by ID
-        normalized = cls._normalize_input_items(merged)
-        
+        normalized = cls._normalize_input_items(filtered)
+
         # Deduplicate items by ID to prevent sending duplicate items to the API
         # This can happen when resuming from state and items are already in the session
         seen_ids: set[str] = set()
@@ -2429,13 +2515,13 @@ async def _prepare_input_with_session(
                 item_id = cast(str | None, item.get("id"))
             elif hasattr(item, "id"):
                 item_id = cast(str | None, getattr(item, "id", None))
-            
+
             # Only add items we haven't seen before (or items without IDs)
             if item_id is None or item_id not in seen_ids:
                 deduplicated.append(item)
                 if item_id:
                     seen_ids.add(item_id)
-        
+
         return deduplicated
 
     @classmethod
diff --git a/src/agents/run_state.py b/src/agents/run_state.py
@@ -48,14 +48,6 @@ class RunState(Generic[TContext, TAgent]):
     _current_turn: int = 0
     """Current turn number in the conversation."""
 
-    _current_turn_persisted_item_count: int = 0
-    """Tracks how many generated run items from this turn were already persisted to session.
-
-    When saving to session, we slice off only new entries. When a turn is interrupted
-    (e.g., awaiting tool approval) and later resumed, we rewind this counter before
-    continuing so pending tool outputs still get stored.
-    """
-
     _current_agent: TAgent | None = None
     """The agent currently handling the conversation."""
 
@@ -250,13 +242,63 @@ def to_json(self) -> dict[str, Any]:
             }
             model_responses.append(response_dict)
 
+        # Normalize and camelize originalInput if it's a list of items
+        # Convert API format to protocol format to match TypeScript schema
+        # Protocol expects function_call_result (not function_call_output)
+        original_input_serialized = self._original_input
+        if isinstance(original_input_serialized, list):
+            # First pass: build a map of call_id -> function_call name
+            # to help convert function_call_output to function_call_result
+            call_id_to_name: dict[str, str] = {}
+            for item in original_input_serialized:
+                if isinstance(item, dict):
+                    item_type = item.get("type")
+                    call_id = item.get("call_id") or item.get("callId")
+                    name = item.get("name")
+                    if item_type == "function_call" and call_id and name:
+                        call_id_to_name[call_id] = name
+
+            normalized_items = []
+            for item in original_input_serialized:
+                if isinstance(item, dict):
+                    # Create a copy to avoid modifying the original
+                    normalized_item = dict(item)
+                    # Remove session/conversation metadata fields that shouldn't be in originalInput
+                    # These are not part of the input protocol schema
+                    normalized_item.pop("id", None)
+                    normalized_item.pop("created_at", None)
+                    # Remove top-level providerData/provider_data (protocol allows it but
+                    # we remove it for cleaner serialization)
+                    normalized_item.pop("providerData", None)
+                    normalized_item.pop("provider_data", None)
+                    # Convert API format to protocol format
+                    # API uses function_call_output, protocol uses function_call_result
+                    item_type = normalized_item.get("type")
+                    call_id = normalized_item.get("call_id") or normalized_item.get("callId")
+                    if item_type == "function_call_output":
+                        # Convert to protocol format: function_call_result
+                        normalized_item["type"] = "function_call_result"
+                        # Protocol format requires status field (default to 'completed')
+                        if "status" not in normalized_item:
+                            normalized_item["status"] = "completed"
+                        # Protocol format requires name field
+                        # Look it up from the corresponding function_call if missing
+                        if "name" not in normalized_item and call_id:
+                            normalized_item["name"] = call_id_to_name.get(call_id, "")
+                    # Normalize field names to camelCase for JSON (call_id -> callId)
+                    normalized_item = self._camelize_field_names(normalized_item)
+                    normalized_items.append(normalized_item)
+                else:
+                    normalized_items.append(item)
+            original_input_serialized = normalized_items
+
         result = {
             "$schemaVersion": CURRENT_SCHEMA_VERSION,
             "currentTurn": self._current_turn,
             "currentAgent": {
                 "name": self._current_agent.name,
             },
-            "originalInput": self._original_input,
+            "originalInput": original_input_serialized,
             "modelResponses": model_responses,
             "context": {
                 "usage": {
@@ -345,7 +387,6 @@ def to_json(self) -> dict[str, Any]:
             if self._last_processed_response
             else None
         )
-        result["currentTurnPersistedItemCount"] = self._current_turn_persisted_item_count
         result["trace"] = None
 
         return result
@@ -571,18 +612,29 @@ async def from_string(
         context.usage = usage
         context._rebuild_approvals(context_data.get("approvals", {}))
 
+        # Normalize originalInput: to_json camelizes field names (call_id -> callId), so
+        # convert them back to snake_case here. This pass is also meant to drop providerData
+        # fields that TypeScript serialization may include; they must not be sent to the API.
+        original_input_raw = state_json["originalInput"]
+        if isinstance(original_input_raw, list):
+            # Normalize field names on each dict item; non-dict items are kept as-is
+            normalized_original_input = [
+                _normalize_field_names(item) if isinstance(item, dict) else item
+                for item in original_input_raw
+            ]
+        else:
+            # If it's a string, use it as-is
+            normalized_original_input = original_input_raw
+
         # Create the RunState instance
         state = RunState(
             context=context,
-            original_input=state_json["originalInput"],
+            original_input=normalized_original_input,
             starting_agent=current_agent,
             max_turns=state_json["maxTurns"],
         )
 
         state._current_turn = state_json["currentTurn"]
-        state._current_turn_persisted_item_count = state_json.get(
-            "currentTurnPersistedItemCount", 0
-        )
 
         # Reconstruct model responses
         state._model_responses = _deserialize_model_responses(state_json.get("modelResponses", []))
@@ -679,18 +731,29 @@ async def from_json(
         context.usage = usage
         context._rebuild_approvals(context_data.get("approvals", {}))
 
+        # Normalize originalInput: to_json camelizes field names (call_id -> callId), so
+        # convert them back to snake_case here. This pass is also meant to drop providerData
+        # fields that TypeScript serialization may include; they must not be sent to the API.
+        original_input_raw = state_json["originalInput"]
+        if isinstance(original_input_raw, list):
+            # Normalize field names on each dict item; non-dict items are kept as-is
+            normalized_original_input = [
+                _normalize_field_names(item) if isinstance(item, dict) else item
+                for item in original_input_raw
+            ]
+        else:
+            # If it's a string, use it as-is
+            normalized_original_input = original_input_raw
+
         # Create the RunState instance
         state = RunState(
             context=context,
-            original_input=state_json["originalInput"],
+            original_input=normalized_original_input,
             starting_agent=current_agent,
             max_turns=state_json["maxTurns"],
         )
 
         state._current_turn = state_json["currentTurn"]
-        state._current_turn_persisted_item_count = state_json.get(
-            "currentTurnPersistedItemCount", 0
-        )
 
         # Reconstruct model responses
         state._model_responses = _deserialize_model_responses(state_json.get("modelResponses", []))
diff --git a/tests/test_run_state.py b/tests/test_run_state.py