diff --git a/README.md b/README.md
index c1d35f1..3531488 100644
--- a/README.md
+++ b/README.md
@@ -4,3 +4,5 @@
 [![Swift Version](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fm1guelpf%2Fswift-realtime-openai%2Fbadge%3Ftype%3Dswift-versions&color=brightgreen)](https://swiftpackageindex.com/m1guelpf/swift-realtime-openai)
 [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/m1guelpf/swift-realtime-openai/main/LICENSE)
+**[Update Feb 21, 2025: Added WebRTC support. See the complete demo [here](https://github.com/jeffxtang/ios_realtime_api) on how to use both WebRTC and WebSocket.]**
+
 This library provides a simple interface for implementing multi-modal conversations using OpenAI's new Realtime API. It can handle automatically recording the user's microphone and playing back the assistant's response, and also gives you a transparent layer over the API for advanced use cases.
@@ -121,6 +123,6 @@ struct ContentView: View {
 - [x] Optionally handle recording the user's mic and sending it to the API
 - [x] Optionally handle playing model responses as they stream in
 - [x] Allow interrupting the model
-- [ ] WebRTC support
+- [x] WebRTC support
 
 ## Architecture
diff --git a/src/Connectors/WebRTCConnector.swift b/src/Connectors/WebRTCConnector.swift
index f16f181..201d582 100644
--- a/src/Connectors/WebRTCConnector.swift
+++ b/src/Connectors/WebRTCConnector.swift
@@ -25,6 +25,11 @@ public final class WebRTCConnector: NSObject, Connector, Sendable {
 		return RTCPeerConnectionFactory()
 	}()
 
+	/// The underlying peer connection, exposed so callers can inspect or control it.
+	public func getConnection() -> RTCPeerConnection {
+		connection
+	}
+
 	private let encoder: JSONEncoder = {
 		let encoder = JSONEncoder()
 		encoder.keyEncodingStrategy = .convertToSnakeCase
@@ -102,8 +107,17 @@ extension WebRTCConnector: RTCPeerConnectionDelegate {
 		print("Connection state changed to \(connection.signalingState)")
 	}
 
-	public func peerConnection(_: RTCPeerConnection, didAdd _: RTCMediaStream) {
+	public func peerConnection(_: RTCPeerConnection, didAdd stream: RTCMediaStream) {
 		print("Media stream added.")
+		// Route playback to the loudspeaker once the remote audio track arrives.
+		if stream.audioTracks.first != nil {
+			print("Audio track received")
+			do {
+				try AVAudioSession.sharedInstance().overrideOutputAudioPort(.speaker)
+			} catch {
+				print("Failed to route audio to speaker: \(error)")
+			}
+		}
 	}
 
 	public func peerConnection(_: RTCPeerConnection, didRemove _: RTCMediaStream) {
diff --git a/src/Conversation.swift b/src/Conversation.swift
index 4de855f..c1cb793 100644
--- a/src/Conversation.swift
+++ b/src/Conversation.swift
@@ -1,3 +1,4 @@
+@preconcurrency import WebRTC
 import Foundation
 @preconcurrency import AVFoundation
 
@@ -20,6 +21,9 @@ public final class Conversation: @unchecked Sendable {
 	private let userConverter = UnsafeInteriorMutable<AVAudioConverter>()
 	private let desiredFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 24000, channels: 1, interleaved: false)!
 
+	/// The underlying API client (e.g. to reach the WebRTC connector directly).
+	public func getClient() -> RealtimeAPI { return client }
+
 	/// A stream of errors that occur during the conversation.
 	public let errors: AsyncStream<ServerError>
 
@@ -97,10 +101,23 @@ public final class Conversation: @unchecked Sendable {
 	}
 
 	/// Create a new conversation providing an API token and, optionally, a model.
-	public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview") {
+	public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview-2024-12-17") {
+		// Default transport: WebSocket.
+		print("webSocket initialization...")
 		self.init(client: RealtimeAPI.webSocket(authToken: token, model: model))
 	}
 
+	/// Create a new conversation over WebRTC, falling back to WebSocket when
+	/// the WebRTC handshake fails. The `webRTC` flag only selects this overload.
+	public convenience init(authToken token: String, webRTC _: Bool, model: String = "gpt-4o-realtime-preview-2024-12-17") async {
+		do {
+			try await self.init(client: RealtimeAPI.webRTC(authToken: token, model: model))
+		} catch {
+			print("webRTC init failed: \(error)\nFalling back to webSocket...")
+			self.init(client: RealtimeAPI.webSocket(authToken: token, model: model))
+		}
+	}
+
 	/// Create a new conversation that connects using a custom `URLRequest`.
 	public convenience init(connectingTo request: URLRequest) {
 		self.init(client: RealtimeAPI.webSocket(connectingTo: request))
@@ -212,10 +229,17 @@ public extension Conversation {
 		userConverter.set(converter)
 
 		audioEngine.attach(playerNode)
-		audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: converter.inputFormat)
+
+		// Use a hardware-friendly 44.1 kHz mono format between player and mixer.
+		let compatibleFormat = AVAudioFormat(standardFormatWithSampleRate: 44100, channels: 1)
+		audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: compatibleFormat)
 
 		#if os(iOS)
-			try audioEngine.inputNode.setVoiceProcessingEnabled(true)
+			do {
+				try audioEngine.inputNode.setVoiceProcessingEnabled(true)
+			} catch {
+				print("Failed to setVoiceProcessingEnabled: \(error.localizedDescription)")
+			}
 		#endif
 
 		audioEngine.prepare()
@@ -225,7 +249,9 @@ public extension Conversation {
 		#if os(iOS)
 			let audioSession = AVAudioSession.sharedInstance()
 			try audioSession.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth])
+			// Match the session rate to the 44.1 kHz engine format above.
+			try audioSession.setPreferredSampleRate(44100)
 			try audioSession.setActive(true)
 		#endif
 
 		handlingVoice = true
diff --git a/src/OpenAIRealtime.swift b/src/OpenAIRealtime.swift
index 0254e85..1c5ddce 100644
--- a/src/OpenAIRealtime.swift
+++ b/src/OpenAIRealtime.swift
@@ -29,6 +29,11 @@ public final class RealtimeAPI: NSObject, Sendable {
 	public func send(event: ClientEvent) async throws {
 		try await connector.send(event: event)
 	}
+
+	/// The WebRTC connector backing this API, or `nil` when connected over WebSockets.
+	public func getConnector() -> WebRTCConnector? {
+		connector as? WebRTCConnector
+	}
 }
 
 /// Helper methods for connecting to the OpenAI Realtime API.
@@ -55,13 +60,38 @@ extension RealtimeAPI {
 	}
 
 	/// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model.
-	static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview") async throws -> RealtimeAPI {
-		var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [
-			URLQueryItem(name: "model", value: model),
-		]))
-
-		request.addValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta")
-		request.addValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization")
+	static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview-2024-12-17") async throws -> RealtimeAPI {
+		// Step 1: mint an ephemeral client key from the sessions endpoint.
+		//   https://platform.openai.com/docs/guides/realtime-webrtc
+		guard let sessionURL = URL(string: "https://api.openai.com/v1/realtime/sessions") else {
+			throw URLError(.badURL)
+		}
+		var request = URLRequest(url: sessionURL, cachePolicy: .useProtocolCachePolicy)
+		request.httpMethod = "POST"
+		request.httpBody = try JSONSerialization.data(withJSONObject: [
+			"model": model,
+			"voice": "echo",
+		])
+		request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+		request.setValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization")
+
+		let (data, response) = try await URLSession.shared.data(for: request)
+		guard let httpResponse = response as? HTTPURLResponse, (200 ... 299).contains(httpResponse.statusCode) else {
+			throw URLError(.badServerResponse)
+		}
+		guard let session = try JSONSerialization.jsonObject(with: data) as? [String: Any],
+		      let clientSecret = session["client_secret"] as? [String: Any],
+		      let ephemeralKey = clientSecret["value"] as? String
+		else { throw URLError(.cannotParseResponse) }
+
+		// Step 2: exchange SDP with the realtime endpoint, authorized by the
+		// ephemeral key. Never log this key — it is a bearer credential.
+		request = URLRequest(url: URL(string: "https://api.openai.com/v1/realtime")!.appending(queryItems: [
+			URLQueryItem(name: "model", value: model),
+		]))
+		request.httpMethod = "POST"
+		request.addValue("application/sdp", forHTTPHeaderField: "Content-Type")
+		request.addValue("Bearer \(ephemeralKey)", forHTTPHeaderField: "Authorization")
 
 		return try await webRTC(connectingTo: request)
 	}
 }