Commit 8bcb389

anuragsharanjuspay authored and sharananurag998 committed
feat: Context handling in realtime
1 parent b205f83 commit 8bcb389

4 files changed: +246 −26 lines changed

docs/voice/pipeline.md (+124 −1)

@@ -103,6 +103,14 @@ from agents.voice import (
     VoicePipelineConfig
 )
 from agents.voice.models.sdk_realtime import SDKRealtimeLLM
+from dataclasses import dataclass
+
+# Define a simple context class for state management (optional)
+@dataclass
+class MyAppContext:
+    """Context for the voice assistant."""
+    user_name: str = "User"
+    interaction_count: int = 0
 
 # Create the input, config, and model
 input_stream = StreamedAudioInput()
@@ -114,11 +122,15 @@ config = VoicePipelineConfig(
 )
 model = SDKRealtimeLLM(model_name="gpt-4o-realtime-preview")
 
-# Create the pipeline with tools
+# Create an app context instance (optional)
+app_context = MyAppContext()
+
+# Create the pipeline with tools and shared context
 pipeline = RealtimeVoicePipeline(
     model=model,
     tools=[get_weather, get_time],
     config=config,
+    shared_context=app_context,  # Optional: shared state for context-aware tools
 )
 
 # Start the pipeline
@@ -147,6 +159,117 @@ while True:
         break
 ```
 
+### Using Shared Context with Tools
+
+The `RealtimeVoicePipeline` supports passing a shared context object to tools, allowing them to access and modify shared state across multiple interactions. This is useful for building more complex voice applications that need to maintain state, such as:
+
+- Tracking user preferences
+- Maintaining conversation history
+- Counting interactions
+- Storing user information
+
+#### Setting up a shared context
+
+To use shared context with tools:
+
+1. Define a context class (typically a dataclass) to hold your application state
+2. Create an instance of this class
+3. Pass it to the `RealtimeVoicePipeline` using the `shared_context` parameter
+4. Create tools that accept a `RunContextWrapper[YourContextType]` as their first parameter
+
+```python
+from dataclasses import dataclass
+from agents.run_context import RunContextWrapper
+from agents.tool import function_tool
+
+# Define your context class
+@dataclass
+class MyAppContext:
+    """Context for the voice assistant."""
+    user_name: str
+    interaction_count: int = 0
+
+# Create a context-aware tool
+@function_tool
+def greet_user_and_count(context: RunContextWrapper[MyAppContext]) -> str:
+    """Greets the user by name and counts interactions."""
+    # Access and modify the context
+    context.context.interaction_count += 1
+
+    return f"Hello {context.context.user_name}! This is interaction number {context.context.interaction_count}."
+
+# Create another context-aware tool
+@function_tool
+def get_user_details(context: RunContextWrapper[MyAppContext]) -> dict:
+    """Gets user details from the context."""
+    return {
+        "user_name": context.context.user_name,
+        "interaction_count": context.context.interaction_count
+    }
+
+# Create your application context
+app_context = MyAppContext(user_name="Alice", interaction_count=0)
+
+# Create the pipeline with shared context
+pipeline = RealtimeVoicePipeline(
+    model=model,
+    tools=[get_weather, get_time, greet_user_and_count, get_user_details],
+    config=config,
+    shared_context=app_context,  # Pass the context here
+)
+```
+
+#### How it works
+
+1. The `RealtimeVoicePipeline` passes the shared context to its internal `ToolExecutor`
+2. When the LLM calls a tool, the `ToolExecutor` checks if the tool's first parameter is named `context`
+3. If it is, the executor wraps your context object in a `RunContextWrapper` and passes it to the tool
+4. The tool can then access and modify your context object via `context.context`
+5. Since all tools share the same context object, changes made by one tool are visible to other tools in future calls
+
+This mechanism allows your tools to maintain shared state across turns and interactions in your voice application, without needing to set up a separate state management system.
+
+#### Context-Aware vs. Standard Tools
+
+You can mix both context-aware and standard tools in the same `RealtimeVoicePipeline`:
+
+```python
+# A standard tool (no context parameter)
+@function_tool
+def get_weather(city: str) -> dict:
+    """Gets the weather for the specified city."""
+    return {"temperature": 72, "condition": "sunny"}
+
+# A context-aware tool (has context parameter)
+@function_tool
+def update_user_preference(context: RunContextWrapper[MyAppContext], preference: str, value: str) -> str:
+    """Updates a user preference in the context."""
+    if not hasattr(context.context, "preferences"):
+        context.context.preferences = {}
+    context.context.preferences[preference] = value
+    return f"Updated {preference} to {value}"
+```
+
+**When to use standard tools:**
+
+- For stateless operations that don't need to remember information between calls
+- For simple lookups or calculations based solely on the input parameters
+- When integration with external APIs or services doesn't require user-specific state
+
+**When to use context-aware tools:**
+
+- When tools need to access or modify shared state
+- For personalization features that adapt to the user
+- To implement features that track usage or interactions
+- When information gathered in one tool call needs to be available to another tool
+
+**Important notes:**
+
+- The first parameter of a context-aware tool must be named `context` and should have a type annotation of `RunContextWrapper[YourContextType]`
+- Type hints are recommended but not required; the parameter name `context` is sufficient for the tool to be detected as context-aware
+- The actual object inside `context.context` will be the instance you passed to `shared_context` when creating the pipeline
+- All context-aware tools see the same context instance, so changes are immediately visible to all tools
+
 ### Turn Detection Modes
 
 The realtime models can operate in different turn detection modes, controlled via the `turn_detection` setting:
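A note on the "How it works" steps documented above: the `ToolExecutor` source is not part of this commit, so the snippet below is only a minimal sketch of the described dispatch rule (a tool whose first parameter is named `context` receives the wrapped shared object). It uses a local stand-in for `RunContextWrapper` and hypothetical helper names, not the SDK's actual implementation.

```python
# Illustrative sketch only -- not the real ToolExecutor. RunContextWrapperStub and
# the helper functions are hypothetical names introduced for this example.
import inspect
from dataclasses import dataclass
from typing import Any, Callable


@dataclass
class RunContextWrapperStub:
    """Stand-in for agents.run_context.RunContextWrapper: just holds the shared object."""
    context: Any


def is_context_aware(tool_fn: Callable[..., Any]) -> bool:
    """Per the docs above, a tool is context-aware when its first parameter is named 'context'."""
    params = list(inspect.signature(tool_fn).parameters)
    return bool(params) and params[0] == "context"


def call_tool(tool_fn: Callable[..., Any], shared_context: Any, **llm_args: Any) -> Any:
    """Inject the wrapped shared context for context-aware tools; call standard tools as-is."""
    if is_context_aware(tool_fn):
        return tool_fn(RunContextWrapperStub(shared_context), **llm_args)
    return tool_fn(**llm_args)


# Two toy tools sharing one mutable context instance (plain functions, no @function_tool).
@dataclass
class MyAppContext:
    user_name: str
    interaction_count: int = 0


def greet_user_and_count(context) -> str:  # context-aware: first param is named 'context'
    context.context.interaction_count += 1
    return f"Hello {context.context.user_name}!"


def get_weather(city: str) -> dict:  # standard tool: no context parameter, nothing injected
    return {"city": city, "temperature": 72}


app_context = MyAppContext(user_name="Alice")
call_tool(greet_user_and_count, app_context)
call_tool(greet_user_and_count, app_context)
call_tool(get_weather, app_context, city="Paris")
assert app_context.interaction_count == 2  # both greetings mutated the same instance
```

Running this confirms the documented behaviour: both greetings mutate the same `MyAppContext` instance via `context.context`, while the standard tool is called unchanged.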

examples/voice/realtime_assistant.py (+53 −4)

@@ -18,14 +18,15 @@
 on applying for access to the realtime API.
 
 Usage:
-    python continuous_realtime_assistant.py
+    python realtime_assistant.py
 """
 
 import asyncio
 import logging
 import os
 import time
 from typing import Dict, Any
+from dataclasses import dataclass
 
 import numpy as np
 import sounddevice as sd  # For microphone and speaker I/O
@@ -42,6 +43,7 @@
 )
 from agents.tool import function_tool, Tool
 from agents.voice.models.sdk_realtime import SDKRealtimeLLM
+from agents.run_context import RunContextWrapper
 
 # Import the new event types from our SDK
 from agents.voice.realtime.model import (
@@ -60,6 +62,15 @@
 logger = logging.getLogger("realtime_assistant")
 
 
+# Define a dataclass for our application context
+@dataclass
+class MyAppContext:
+    """A simple context for the realtime voice assistant example."""
+
+    user_name: str
+    interaction_count: int = 0
+
+
 # Define some sample tools
 @function_tool
 def get_weather(city: str) -> Dict[str, Any]:
@@ -75,6 +86,37 @@ def get_time(timezone: str = "UTC") -> Dict[str, Any]:
     return {"time": time.strftime("%H:%M:%S", time.gmtime()), "timezone": timezone}
 
 
+# Define a context-aware tool
+@function_tool
+def greet_user_and_count(context: RunContextWrapper[MyAppContext]) -> str:
+    """Greets the user by name and counts interactions."""
+    logger.info(f"greet_user_and_count called with context: {context}")
+    # Increment the interaction count
+    context.context.interaction_count += 1
+
+    logger.info(
+        f"Greeting user: {context.context.user_name}, "
+        f"Interaction count: {context.context.interaction_count}"
+    )
+
+    return f"Hello {context.context.user_name}! This is interaction number {context.context.interaction_count}."
+
+
+# Another context-aware tool that reads but doesn't modify the context
+@function_tool
+def get_user_details(context: RunContextWrapper[MyAppContext]) -> Dict[str, Any]:
+    """Gets the user's details from the context."""
+    logger.info(f"get_user_details called with context: {context}")
+
+    logger.info(
+        f"Returning user details: name={context.context.user_name}, count={context.context.interaction_count}"
+    )
+    return {
+        "user_name": context.context.user_name,
+        "interaction_count": context.context.interaction_count,
+    }
+
+
 # Get the OpenAI API key from environment variables
 api_key = os.environ.get("OPENAI_API_KEY")
 if not api_key:
@@ -117,18 +159,22 @@ async def main():
         realtime_settings={
             "turn_detection": "server_vad",  # Use server-side VAD
             "assistant_voice": "alloy",
-            "system_message": "You are a helpful assistant that responds concisely.",
+            "system_message": "You are a helpful assistant that responds concisely. You can use the greet_user_and_count tool to greet the user by name and the get_user_details tool to retrieve information about the user.",
             # Enable server-side noise / echo reduction
             "input_audio_noise_reduction": {},
         }
     )
     input_stream = StreamedAudioInput()
 
-    # Create the realtime pipeline
+    # Create our application context
+    app_context = MyAppContext(user_name="Anurag", interaction_count=0)
+
+    # Create the realtime pipeline with shared context
    pipeline = RealtimeVoicePipeline(
        model=model,
-        tools=[get_weather, get_time],
+        tools=[get_weather, get_time, greet_user_and_count, get_user_details],
        config=config,
+        shared_context=app_context,  # Pass the context to the pipeline
    )
 
     # Track events and errors
@@ -321,6 +367,9 @@ async def toggle_push_to_talk_simulation():
 
     logger.info(f"Total events processed: {event_count}")
 
+    # Print the final interaction count from the context
+    logger.info(f"Final interaction count: {app_context.interaction_count}")
+
     # Provide troubleshooting information if needed
     if error_occurred or event_count <= 1:  # <=1 because turn_started is an event
         logger.error(f"Error occurred: {error_occurred}")

src/agents/voice/pipeline_realtime.py (+4 −1)

@@ -37,6 +37,7 @@ def __init__(
         model: RealtimeLLMModel | str | None = None,
         tools: Sequence[Tool] = (),
         config: VoicePipelineConfig | None = None,
+        shared_context: Any | None = None,
     ):
         """Create a new real-time voice pipeline.
 
@@ -45,6 +46,7 @@ def __init__(
                 or a string identifier for a model from the provider.
             tools: A sequence of tools available to the LLM.
             config: The pipeline configuration. If not provided, a default will be used.
+            shared_context: An optional context object that will be passed to tools when they are executed.
         """
         if isinstance(model, str) or model is None:
             self._model_name_to_load: str | None = model
@@ -59,7 +61,8 @@ def __init__(
 
         self._tools = tools
         self._config = config or VoicePipelineConfig()
-        self._tool_executor = ToolExecutor(tools)
+        self._shared_context = shared_context
+        self._tool_executor = ToolExecutor(tools, shared_context=shared_context)
 
     def _get_model(self) -> RealtimeLLMModel:
         """Get the real-time LLM model to use."""

0 commit comments
