Skip to content

Commit 5eab348

Browse files
committed
Realtime: send audio item/content index
1 parent 46101cb commit 5eab348

File tree

5 files changed

+77
-9
lines changed

5 files changed

+77
-9
lines changed

src/agents/realtime/events.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,12 @@ class RealtimeAudioEnd:
115115
info: RealtimeEventInfo
116116
"""Common info for all events, such as the context."""
117117

118+
item_id: str
119+
"""The ID of the item containing audio."""
120+
121+
content_index: int
122+
"""The index of the audio content in `item.content`"""
123+
118124
type: Literal["audio_end"] = "audio_end"
119125

120126

@@ -125,6 +131,12 @@ class RealtimeAudio:
125131
audio: RealtimeModelAudioEvent
126132
"""The audio event from the model layer."""
127133

134+
item_id: str
135+
"""The ID of the item containing audio."""
136+
137+
content_index: int
138+
"""The index of the audio content in `item.content`"""
139+
128140
info: RealtimeEventInfo
129141
"""Common info for all events, such as the context."""
130142

@@ -140,6 +152,12 @@ class RealtimeAudioInterrupted:
140152
info: RealtimeEventInfo
141153
"""Common info for all events, such as the context."""
142154

155+
item_id: str
156+
"""The ID of the item containing audio."""
157+
158+
content_index: int
159+
"""The index of the audio content in `item.content`"""
160+
143161
type: Literal["audio_interrupted"] = "audio_interrupted"
144162

145163

src/agents/realtime/model_events.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,20 +40,38 @@ class RealtimeModelAudioEvent:
4040
data: bytes
4141
response_id: str
4242

43+
item_id: str
44+
"""The ID of the item containing audio."""
45+
46+
content_index: int
47+
"""The index of the audio content in `item.content`"""
48+
4349
type: Literal["audio"] = "audio"
4450

4551

4652
@dataclass
4753
class RealtimeModelAudioInterruptedEvent:
4854
"""Audio interrupted."""
4955

56+
item_id: str
57+
"""The ID of the item containing audio."""
58+
59+
content_index: int
60+
"""The index of the audio content in `item.content`"""
61+
5062
type: Literal["audio_interrupted"] = "audio_interrupted"
5163

5264

5365
@dataclass
5466
class RealtimeModelAudioDoneEvent:
5567
"""Audio done."""
5668

69+
item_id: str
70+
"""The ID of the item containing audio."""
71+
72+
content_index: int
73+
"""The index of the audio content in `item.content`"""
74+
5775
type: Literal["audio_done"] = "audio_done"
5876

5977

src/agents/realtime/openai_realtime.py

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -302,7 +302,12 @@ async def _send_interrupt(self, event: RealtimeModelSendInterrupt) -> None:
302302

303303
elapsed_time_ms = (datetime.now() - self._audio_start_time).total_seconds() * 1000
304304
if elapsed_time_ms > 0 and elapsed_time_ms < self._audio_length_ms:
305-
await self._emit_event(RealtimeModelAudioInterruptedEvent())
305+
await self._emit_event(
306+
RealtimeModelAudioInterruptedEvent(
307+
item_id=self._current_item_id,
308+
content_index=self._current_audio_content_index or 0,
309+
)
310+
)
306311
converted = _ConversionHelper.convert_interrupt(
307312
self._current_item_id,
308313
self._current_audio_content_index or 0,
@@ -331,7 +336,12 @@ async def _handle_audio_delta(self, parsed: ResponseAudioDeltaEvent) -> None:
331336
# Calculate audio length in ms using 24KHz pcm16le
332337
self._audio_length_ms += self._calculate_audio_length_ms(audio_bytes)
333338
await self._emit_event(
334-
RealtimeModelAudioEvent(data=audio_bytes, response_id=parsed.response_id)
339+
RealtimeModelAudioEvent(
340+
data=audio_bytes,
341+
response_id=parsed.response_id,
342+
item_id=parsed.item_id,
343+
content_index=parsed.content_index,
344+
)
335345
)
336346

337347
def _calculate_audio_length_ms(self, audio_bytes: bytes) -> float:
@@ -429,7 +439,12 @@ async def _handle_ws_event(self, event: dict[str, Any]):
429439
if parsed.type == "response.audio.delta":
430440
await self._handle_audio_delta(parsed)
431441
elif parsed.type == "response.audio.done":
432-
await self._emit_event(RealtimeModelAudioDoneEvent())
442+
await self._emit_event(
443+
RealtimeModelAudioDoneEvent(
444+
item_id=parsed.item_id,
445+
content_index=parsed.content_index,
446+
)
447+
)
433448
elif parsed.type == "input_audio_buffer.speech_started":
434449
await self._send_interrupt(RealtimeModelSendInterrupt())
435450
elif parsed.type == "response.created":

src/agents/realtime/session.py

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -188,11 +188,26 @@ async def on_event(self, event: RealtimeModelEvent) -> None:
188188
elif event.type == "function_call":
189189
await self._handle_tool_call(event)
190190
elif event.type == "audio":
191-
await self._put_event(RealtimeAudio(info=self._event_info, audio=event))
191+
await self._put_event(
192+
RealtimeAudio(
193+
info=self._event_info,
194+
audio=event,
195+
item_id=event.item_id,
196+
content_index=event.content_index,
197+
)
198+
)
192199
elif event.type == "audio_interrupted":
193-
await self._put_event(RealtimeAudioInterrupted(info=self._event_info))
200+
await self._put_event(
201+
RealtimeAudioInterrupted(
202+
info=self._event_info, item_id=event.item_id, content_index=event.content_index
203+
)
204+
)
194205
elif event.type == "audio_done":
195-
await self._put_event(RealtimeAudioEnd(info=self._event_info))
206+
await self._put_event(
207+
RealtimeAudioEnd(
208+
info=self._event_info, item_id=event.item_id, content_index=event.content_index
209+
)
210+
)
196211
elif event.type == "input_audio_transcription_completed":
197212
self._history = RealtimeSession._get_new_history(self._history, event)
198213
await self._put_event(

tests/realtime/test_session.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -157,15 +157,17 @@ async def test_audio_events_transformation(self, mock_model, mock_agent):
157157
session = RealtimeSession(mock_model, mock_agent, None)
158158

159159
# Test audio event
160-
audio_event = RealtimeModelAudioEvent(data=b"audio_data", response_id="resp_1")
160+
audio_event = RealtimeModelAudioEvent(
161+
data=b"audio_data", response_id="resp_1", item_id="item_1", content_index=0
162+
)
161163
await session.on_event(audio_event)
162164

163165
# Test audio interrupted event
164-
interrupted_event = RealtimeModelAudioInterruptedEvent()
166+
interrupted_event = RealtimeModelAudioInterruptedEvent(item_id="item_1", content_index=0)
165167
await session.on_event(interrupted_event)
166168

167169
# Test audio done event
168-
done_event = RealtimeModelAudioDoneEvent()
170+
done_event = RealtimeModelAudioDoneEvent(item_id="item_1", content_index=0)
169171
await session.on_event(done_event)
170172

171173
# Should have 6 events total (2 per event: raw + transformed)

0 commit comments

Comments
 (0)