Commit 7ec090c

add/document gemini 3.0 support and ReflectAndRetryToolPlugin

1 parent b0d1987 · commit 7ec090c

3 files changed (+135, -46 lines)

python/agents/tau2-benchmark-agent/README.md

Lines changed: 53 additions & 6 deletions
````diff
@@ -48,6 +48,18 @@ tenacity dependecy version may conflict with that of tau2 repo. Upgrading it bac
 pip install --upgrade tenacity
 ```
 
+**IMPORTANT:** The Gemini 3 Pro model makes sending thought signatures mandatory. Tau2-bench relies on litellm for user simulation and non-ADK agent simulation. Until https://github.com/BerriAI/litellm/pull/16812 is merged into the litellm repository, the PR needs to be applied as shown below:
+
+```bash
+git clone --filter=blob:none --quiet https://github.com/BerriAI/litellm.git /tmp/litellm-pr-16812
+cd /tmp/litellm-pr-16812
+git fetch origin pull/16812/head:pr-16812
+git checkout pr-16812
+pip install .
+cd -
+```
+
 ## 3. Add env params
 
 Create `.env` file at root with the following content.
````
````diff
@@ -130,11 +142,17 @@ def _create_agent(name: str, model: Union[str, BaseLlm], instruction: str, tools
 Here is an example command to run the agent on an airline domain task:
 
 ```bash
-tau2 run --domain airline --agent adk_agent --agent-llm vertex_ai/gemini-2.5-pro --user-llm vertex_ai/gemini-2.5-pro --num-trials 1 --num-tasks 1
+tau2 run --domain airline --agent adk_agent --agent-llm vertex_ai/gemini-3-pro-preview --user-llm vertex_ai/gemini-3-pro-preview --num-trials 1 --num-tasks 1 --user-llm-args '{"temperature": 1, "reasoning_effort": "high"}' --agent-llm-args '{"temperature": 1, "reasoning_effort": "high"}'
 ```
 
 Optionally, you can run specific example by using `--task-ids` instead of `--num-tasks`.
 
+**temperature:** Defaults to 1 when adk_agent is used. The commands in this document set it explicitly using llm_args for both the user and agent models.
+
+**reasoning_effort:** Only applies to the Gemini 3 Pro model. It defaults to high for adk_agent when this model is used; otherwise the model falls back to dynamic thinking. Again, this document demonstrates setting it explicitly using llm_args.
+
+**NOTE:** It is normal to see `This model isn't mapped yet` error logs. They come from the litellm cost-calculation workflow used by `--user-llm`. You can suppress them temporarily by swapping `--user-llm vertex_ai/gemini-3-pro-preview` with `--user-llm vertex_ai/gemini-2.5-pro`.
+
 ### Viewing trajectories
 
 You can use the following command to view trajectories after following the default options:
````
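
For reference, these two llm_args keys are consumed by the `_create_agent` changes later in this commit roughly as follows. This is a minimal sketch mirroring that code, assuming google-genai is installed; the `build_configs` helper and the sample call at the bottom are illustrative, not part of the benchmark code.

```python
# Minimal sketch of how temperature / reasoning_effort flow into the ADK agent.
# build_configs is an illustrative helper name, not benchmark code.
from typing import Any, Dict, Optional, Tuple

from google.genai import types


def build_configs(
    model: str, llm_args: Dict[str, Any]
) -> Tuple[types.GenerateContentConfig, types.ThinkingConfig]:
    generate_content_config = types.GenerateContentConfig()
    # Defaults to 1, the recommended temperature for Gemini models.
    generate_content_config.temperature = llm_args.get("temperature", 1)

    # reasoning_effort is only honored for Gemini 3 models; otherwise the model
    # falls back to dynamic thinking (thinking_level stays None).
    thinking_level: Optional[str] = None
    if model.startswith("gemini-3") and "reasoning_effort" in llm_args:
        thinking_level = llm_args["reasoning_effort"]

    thinking_config = types.ThinkingConfig(
        include_thoughts=True, thinking_level=thinking_level, thinking_budget=None
    )
    return generate_content_config, thinking_config


# The values passed via --agent-llm-args '{"temperature": 1, "reasoning_effort": "high"}'
gen_cfg, think_cfg = build_configs(
    "gemini-3-pro-preview", {"temperature": 1, "reasoning_effort": "high"}
)
print(gen_cfg.temperature, think_cfg.thinking_level)
```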
````diff
@@ -149,18 +167,47 @@ Full run requires dropping the arg `--task-ids`.
 
 ```bash
 # Example: Run complete evaluation for all domains
-tau2 run --domain retail --agent adk_agent --agent-llm vertex_ai/gemini-2.5-pro --user-llm vertex_ai/gemini-2.5-pro --num-trials 4 --save-to my_model_retail
-tau2 run --domain airline --agent adk_agent --agent-llm vertex_ai/gemini-2.5-pro --user-llm vertex_ai/gemini-2.5-pro --num-trials 4 --save-to my_model_airline
-tau2 run --domain telecom --agent adk_agent --agent-llm vertex_ai/gemini-2.5-pro --user-llm vertex_ai/gemini-2.5-pro --num-trials 4 --save-to my_model_telecom
+tau2 run \
+  --domain retail \
+  --agent adk_agent \
+  --agent-llm vertex_ai/gemini-3-pro-preview \
+  --user-llm vertex_ai/gemini-3-pro-preview \
+  --num-trials 4 \
+  --save-to gemini_3_pro_retail \
+  --user-llm-args '{"temperature": 1, "reasoning_effort": "high"}' \
+  --agent-llm-args '{"temperature": 1, "reasoning_effort": "high"}'
+
+
+tau2 run \
+  --domain airline \
+  --agent adk_agent \
+  --agent-llm vertex_ai/gemini-3-pro-preview \
+  --user-llm vertex_ai/gemini-3-pro-preview \
+  --num-trials 4 \
+  --save-to gemini_3_pro_airline \
+  --user-llm-args '{"temperature": 1, "reasoning_effort": "high"}' \
+  --agent-llm-args '{"temperature": 1, "reasoning_effort": "high"}'
+
+
+tau2 run \
+  --domain telecom \
+  --agent adk_agent \
+  --agent-llm vertex_ai/gemini-3-pro-preview \
+  --user-llm vertex_ai/gemini-3-pro-preview \
+  --num-trials 4 \
+  --save-to gemini_3_pro_telecom \
+  --user-llm-args '{"temperature": 1, "reasoning_effort": "high"}' \
+  --agent-llm-args '{"temperature": 1, "reasoning_effort": "high"}'
 ```
 
 ### Prepare Submission Package
 
 ```bash
-tau2 submit prepare data/tau2/simulations/my_model_*.json --output ./my_submission
+tau2 submit prepare data/tau2/simulations/gemini_3_pro_*.json --output ./gemini_3_pro_submission
 ```
 
 This command will:
+
 - Verify all trajectory files are valid
 - Check that submission requirements are met
 - Compute performance metrics (Pass^k rates)
````
````diff
@@ -185,4 +232,4 @@ pip install pytest-cov
 
 ```bash
 pytest --cov=tau2.agent.adk_agent --cov-report=html tests/test_adk_agent.py
-````
+````
````

python/agents/tau2-benchmark-agent/tau2_agent/adk_agent.py

Lines changed: 74 additions & 33 deletions
````diff
@@ -14,12 +14,13 @@
 
 
 import asyncio
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from google.adk import Agent as AdkLlmAgent
 from google.adk.agents import BaseAgent
 from google.adk.models.base_llm import BaseLlm
 from google.adk.planners import built_in_planner
+from google.adk.plugins import ReflectAndRetryToolPlugin
 from google.adk.runners import InMemoryRunner
 from google.adk.tools import base_tool
 from google.genai import types
````
````diff
@@ -42,7 +43,7 @@ def __init__(self, function_declaration: types.FunctionDeclaration):
         """Initialize the AdkTool with a function declaration.
 
         Args:
-            function_declaration: The function declaration for the tool.
+            function_declaration: The function declaration for the tool.
         """
         super().__init__(
             name=function_declaration.name,
````
````diff
@@ -64,20 +65,24 @@ async def run_async(self, *, args, tool_context) -> Any:
 
 
 def _create_agent(
-    name: str, model: Union[str, BaseLlm], instruction: str, tools: List[Tool]
+    name: str,
+    model: Union[str, BaseLlm],
+    instruction: str,
+    tools: List[Tool],
+    llm_args: Dict[str, Any],
 ) -> BaseAgent:
     """Create an ADK LLM Agent with the given parameters.
 
     Args:
-        name: The name of the agent.
-        model: The LLM model to use.
-        instruction: The system prompt/instruction for the agent.
-        tools: The list of tools available to the agent.
+        name: The name of the agent.
+        model: The LLM model to use.
+        instruction: The system prompt/instruction for the agent.
+        tools: The list of tools available to the agent.
+        llm_args: Additional arguments for the LLM.
 
     Returns:
-        An instance of BaseAgent (which also allows workflow agents).
+        An instance of BaseAgent (which also allows workflow agents).
     """
-
     adk_tools = [
         AdkTool(
             types.FunctionDeclaration(
````
````diff
@@ -88,14 +93,33 @@ def _create_agent(
         )
         for tool in tools
     ]
+
+    generate_content_config = types.GenerateContentConfig()
+    generate_content_config.temperature = llm_args.get(
+        "temperature", 1
+    )  # default to recommended temperature for gemini models
+
+    thinking_level = None
+    if (
+        isinstance(model, str)
+        and model.startswith("gemini-3")
+        and "reasoning_effort" in llm_args
+    ):
+        thinking_level = llm_args["reasoning_effort"]
+
+    thinking_config = types.ThinkingConfig(
+        include_thoughts=True, thinking_level=thinking_level, thinking_budget=None
+    )
+
     return AdkLlmAgent(
         model=model,
         name=name,
         instruction=instruction,
         tools=adk_tools,
         planner=built_in_planner.BuiltInPlanner(
-            thinking_config=types.ThinkingConfig(include_thoughts=True),
+            thinking_config=thinking_config,
         ),
+        generate_content_config=generate_content_config,
     )
 
 
````
````diff
@@ -112,12 +136,11 @@ def __init__(
         """Initialize the AdkAgent with the given parameters.
 
         Args:
-            tools: The list of tools available to the agent.
-            domain_policy: The domain policy for the agent.
-            llm: The LLM model to use.
-            llm_args: Additional arguments for the LLM.
+            tools: The list of tools available to the agent.
+            domain_policy: The domain policy for the agent.
+            llm: The LLM model to use.
+            llm_args: Additional arguments for the LLM.
         """
-
         super().__init__(
             tools=tools, domain_policy=domain_policy, llm=llm, llm_args=llm_args
         )
````
````diff
@@ -127,15 +150,24 @@ def __init__(
         ), "AdkAgent only supports gemini models for this benchmark."
         if model_name.startswith("vertex_ai/"):
             model_name = model_name.replace("vertex_ai/", "")
+        if model_name.startswith("gemini/"):
+            model_name = model_name.replace("gemini/", "")
         self._adk_root_agent = _create_agent(
             name="customer_service_agent",
             model=self.llm_args.get("model_obj", model_name),
             instruction=self.system_prompt,
             tools=tools,
+            llm_args=llm_args,
+        )
+
+        error_handling_plugin = ReflectAndRetryToolPlugin(
+            max_retries=3, throw_exception_if_retry_exceeded=False
         )
-        self.long_running_call_infos = []
+
         self._runner = InMemoryRunner(
-            agent=self._adk_root_agent, app_name="tau2_adk_app"
+            agent=self._adk_root_agent,
+            app_name="tau2_adk_app",
+            plugins=[error_handling_plugin],
         )
         self._app_name = "tau2_adk_app"
         self._user_id = "tau2_user"
````
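
As a standalone illustration of the plugin wiring added above: a minimal sketch in which the `flaky_lookup` tool, the demo agent, and the app name are made-up examples, while the `ReflectAndRetryToolPlugin` and `InMemoryRunner` arguments match the commit. No model call is made here; the assumed behavior (reflect on the tool error and retry up to 3 times without re-raising) follows from the plugin's name and the settings used.

```python
# Wiring sketch only; the tool and names below are illustrative assumptions.
from google.adk import Agent as AdkLlmAgent
from google.adk.plugins import ReflectAndRetryToolPlugin
from google.adk.runners import InMemoryRunner


def flaky_lookup(order_id: str) -> dict:
    """Example tool; an exception raised here is what the plugin reacts to."""
    raise RuntimeError(f"backend unavailable for {order_id}")


demo_agent = AdkLlmAgent(
    name="demo_agent",
    model="gemini-3-pro-preview",
    instruction="Answer order questions using the available tools.",
    tools=[flaky_lookup],
)

# On a tool error, the plugin feeds the failure back to the model so it can
# reflect and retry, up to 3 attempts; with throw_exception_if_retry_exceeded=False
# the exception is not re-raised to the caller after the final attempt.
error_handling_plugin = ReflectAndRetryToolPlugin(
    max_retries=3, throw_exception_if_retry_exceeded=False
)

runner = InMemoryRunner(
    agent=demo_agent, app_name="demo_app", plugins=[error_handling_plugin]
)
```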
````diff
@@ -165,10 +197,11 @@ async def _run_prompt_async(
         """Run the prompt asynchronously and return the assistant message.
 
         Args:
-            new_message: The new message from the user.
-            function_responses: The list of function responses from tools.
+            new_message: The new message from the user.
+            function_responses: The list of function responses from tools.
+
         Returns:
-            An AssistantMessage containing the response from the agent.
+            An AssistantMessage containing the response from the agent.
         """
         if new_message is not None:
             content = types.Content(
````
````diff
@@ -186,6 +219,9 @@ async def _run_prompt_async(
         async for event in self._runner.run_async(
             user_id=self._user_id, session_id=self.session.id, new_message=content
         ):
+            if event is None or event.content is None:
+                continue
+
             logger.info(f"** Event received: {event.content.parts}")
             for part in event.content.parts:
                 if part.function_call:
````
````diff
@@ -206,7 +242,6 @@ async def _run_prompt_async(
                     )
                 elif part.text:
                     if not part.thought:
-                        text_content += "\n" if text_content else ""
                         text_content += part.text
                 else:
                     logger.info(f"** Other part type received: {part}")
````
````diff
@@ -223,13 +258,12 @@ def generate_next_message(
         """Generate the next message from the agent based on the input message.
 
         Args:
-            message: The input message from the user or tool.
-            state: The current state of the agent.
+            message: The input message from the user or tool.
+            state: The current state of the agent.
 
         Returns:
-            A tuple containing the assistant message and the updated agent state.
+            A tuple containing the assistant message and the updated agent state.
         """
-
         if isinstance(message, MultiToolMessage):
             state.messages.extend(message.tool_messages)
         else:
````
````diff
@@ -292,17 +326,20 @@ def add_long_running_call_info(self, call_info: tuple[str, str]):
         """Add information about a long-running call.
 
         Args:
-            call_info: A tuple containing the call ID and call name.
+            call_info: A tuple containing the call ID and call name.
         """
+        if not hasattr(self, "long_running_call_infos"):
+            self.long_running_call_infos = []
         self.long_running_call_infos.append(call_info)
 
     def pop_long_running_call_info(self):
         """Pop the oldest long-running call information.
 
         Returns:
-            A tuple containing the call ID and call name, or None if no information is available.
+            A tuple containing the call ID and call name, or None if no information
+            is available.
         """
-        if self.long_running_call_infos:
+        if hasattr(self, "long_running_call_infos") and self.long_running_call_infos:
            return self.long_running_call_infos.pop(0)
         return None
 
````
````diff
@@ -312,12 +349,16 @@ def pop_long_running_call_info_with_id(
         """Pop long-running call information by call ID.
 
         Args:
-            call_id: The ID of the long-running call to pop.
+            call_id: The ID of the long-running call to pop.
 
         Returns:
-            A tuple containing the call ID and call name, or None if no information is available.
+            A tuple containing the call ID and call name, or None if no information
+            is available.
         """
-        for i, (stored_call_id, call_name) in enumerate(self.long_running_call_infos):
-            if stored_call_id == call_id:
-                return self.long_running_call_infos.pop(i)
+        if hasattr(self, "long_running_call_infos") and self.long_running_call_infos:
+            for i, (stored_call_id, call_name) in enumerate(
+                self.long_running_call_infos
+            ):
+                if stored_call_id == call_id:
+                    return self.long_running_call_infos.pop(i)
         return None
````
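
The two hunks above drop the eager `self.long_running_call_infos = []` from `__init__` (removed in an earlier hunk) and guard every access with `hasattr` instead. A toy sketch of that lazy-initialization pattern; `CallInfoStore` and its method names are made up for illustration, only the guarded-access idea matches the commit.

```python
# Toy illustration of lazy, hasattr-guarded bookkeeping; not benchmark code.
from typing import List, Optional, Tuple


class CallInfoStore:
    def add(self, call_info: Tuple[str, str]) -> None:
        # Create the list on first use instead of in __init__.
        if not hasattr(self, "_infos"):
            self._infos: List[Tuple[str, str]] = []
        self._infos.append(call_info)

    def pop_by_id(self, call_id: str) -> Optional[Tuple[str, str]]:
        # Every reader must guard, since the attribute may not exist yet.
        if hasattr(self, "_infos") and self._infos:
            for i, (stored_id, _name) in enumerate(self._infos):
                if stored_id == call_id:
                    return self._infos.pop(i)
        return None


store = CallInfoStore()
store.add(("call-1", "transfer_to_human"))
assert store.pop_by_id("call-1") == ("call-1", "transfer_to_human")
assert store.pop_by_id("missing") is None
```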

python/agents/tau2-benchmark-agent/tests/conftest.py

Lines changed: 8 additions & 7 deletions
````diff
@@ -1,19 +1,20 @@
 import sys
+
 import pytest
 import tau2.agent
 
 try:
     from tau2_agent import adk_agent
 except ImportError:
     # Fallback: try to import from relative path if installed as editable but path issues
-    import os
     import importlib.util
-
+    import os
+
     # Assuming this conftest is in tests/ and tau2_agent is in ../tau2_agent/
     project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
     if project_root not in sys.path:
         sys.path.insert(0, project_root)
-
+
     from tau2_agent import adk_agent
 
 # Inject the local adk_agent module into the tau2.agent namespace
````
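
The fallback branch above imports `importlib.util`, but its use is not visible in this hunk. A generic sketch of the technique that import is commonly used for, loading a module directly from a file path; the path and module name below are assumptions for illustration, not necessarily what this conftest actually does.

```python
# Generic importlib.util-based loading; the path below is an assumed example.
import importlib.util
import os
import sys

module_path = os.path.join(
    os.path.dirname(__file__), "..", "tau2_agent", "adk_agent.py"
)

spec = importlib.util.spec_from_file_location("tau2_agent.adk_agent", module_path)
adk_agent = importlib.util.module_from_spec(spec)
sys.modules["tau2_agent.adk_agent"] = adk_agent
spec.loader.exec_module(adk_agent)  # executes the module body, populating adk_agent
```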
````diff
@@ -22,9 +23,11 @@
 tau2.agent.adk_agent = adk_agent
 sys.modules["tau2.agent.adk_agent"] = adk_agent
 
+
 @pytest.fixture
 def get_environment():
     """Fixture to provide a mock environment with tools and policy."""
+
     class MockTool:
         def __init__(self, name="mock_tool"):
             self.openai_schema = {
````
````diff
@@ -33,10 +36,8 @@ def __init__(self, name="mock_tool"):
                     "description": f"Description for {name}",
                     "parameters": {
                         "type": "object",
-                        "properties": {
-                            "arg1": {"type": "string"}
-                        }
-                    }
+                        "properties": {"arg1": {"type": "string"}},
+                    },
                 }
             }
 
````