m1guelpf · jeffxtang · Feb 20, 2025 · Feb 20, 2025 · Feb 21, 2025 · Feb 21, 2025
diff --git a/README.md b/README.md
@@ -4,6 +4,8 @@
 [![Swift Version](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fm1guelpf%2Fswift-realtime-openai%2Fbadge%3Ftype%3Dswift-versions&color=brightgreen)](https://swiftpackageindex.com/m1guelpf/swift-realtime-openai)
 [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/m1guelpf/swift-realtime-openai/main/LICENSE)
 
+**[Update Feb 21, 2025: Added WebRTC support. See the complete demo [here](https://github.com/jeffxtang/ios_realtime_api) for how to use both WebRTC and WebSocket.]**
+
 This library provides a simple interface for implementing multi-modal conversations using OpenAI's new Realtime API.
 
 It can handle automatically recording the user's microphone and playing back the assistant's response, and also gives you a transparent layer over the API for advanced use cases.
@@ -121,7 +123,7 @@ struct ContentView: View {
 -   [x] Optionally handle recording the user's mic and sending it to the API
 -   [x] Optionally handle playing model responses as they stream in
 -   [x] Allow interrupting the model
--   [ ] WebRTC support
+-   [x] WebRTC support
 
 ## Architecture
 

diff --git a/src/Connectors/WebRTCConnector.swift b/src/Connectors/WebRTCConnector.swift
@@ -25,6 +25,10 @@ public final class WebRTCConnector: NSObject, Connector, Sendable {
 		return RTCPeerConnectionFactory()
 	}()
 
+	/// Public accessor for the connector's `connection` property.
+	/// - Returns: The `RTCPeerConnection` stored in `connection`.
+	public func getConnection() -> RTCPeerConnection { connection }
+
 	private let encoder: JSONEncoder = {
 		let encoder = JSONEncoder()
 		encoder.keyEncodingStrategy = .convertToSnakeCase
@@ -102,8 +106,15 @@ extension WebRTCConnector: RTCPeerConnectionDelegate {
 		print("Connection state changed to \(connection.signalingState)")
 	}
 
-	public func peerConnection(_: RTCPeerConnection, didAdd _: RTCMediaStream) {
+	/// Overrides audio output to the speaker port when an added media stream carries an audio track.
+	public func peerConnection(_: RTCPeerConnection, didAdd stream: RTCMediaStream) {
 		print("Media stream added.")
+		guard !stream.audioTracks.isEmpty else { return }
+		print("Audio track received")
+		// The override can throw; report the failure instead of silently swallowing it.
+		do {
+			try AVAudioSession.sharedInstance().overrideOutputAudioPort(.speaker)
+		} catch { print("Failed to route audio to speaker: \(error.localizedDescription)") }
 	}
 
 	public func peerConnection(_: RTCPeerConnection, didRemove _: RTCMediaStream) {

diff --git a/src/Conversation.swift b/src/Conversation.swift
@@ -1,3 +1,4 @@
+@preconcurrency import WebRTC
 import Foundation
 @preconcurrency import AVFoundation
 
@@ -20,6 +21,8 @@ public final class Conversation: @unchecked Sendable {
 	private let userConverter = UnsafeInteriorMutable<AVAudioConverter>()
 	private let desiredFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 24000, channels: 1, interleaved: false)!
 
+	public func getClient() -> RealtimeAPI { client } // accessor for the `client` property
+
 	/// A stream of errors that occur during the conversation.
 	public let errors: AsyncStream<ServerError>
 
@@ -97,10 +100,26 @@ public final class Conversation: @unchecked Sendable {
 	}
 
 	/// Create a new conversation providing an API token and, optionally, a model.
-	public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview") {
+	public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview-2024-12-17") {
+		// `model` remains a defaulted parameter so existing `init(authToken:model:)`
+		// call sites keep compiling; only the default moved to the dated snapshot.
+		// This initializer always connects over WebSocket.
+		print("webSocket initialization...")
 		self.init(client: RealtimeAPI.webSocket(authToken: token, model: model))
 	}
 
+	/// Create a new conversation over WebRTC; uses WebSocket when `webRTC` is false or WebRTC setup fails.
+	public convenience init(authToken token: String, webRTC useWebRTC: Bool) async {
+		let model = "gpt-4o-realtime-preview-2024-12-17"
+		var client: RealtimeAPI? // resolved first so `self.init` is delegated to exactly once
+		do {
+			if useWebRTC { client = try await RealtimeAPI.webRTC(authToken: token, model: model) }
+		} catch {
+			print("Exception - webRTC init failed: \(error). \nFalling back to webSocket...")
+		}
+		self.init(client: client ?? RealtimeAPI.webSocket(authToken: token, model: model))
+	}
+
 	/// Create a new conversation that connects using a custom `URLRequest`.
 	public convenience init(connectingTo request: URLRequest) {
 		self.init(client: RealtimeAPI.webSocket(connectingTo: request))
@@ -212,10 +231,16 @@ public extension Conversation {
 		userConverter.set(converter)
 
 		audioEngine.attach(playerNode)
-		audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: converter.inputFormat)
+
+		let compatibleFormat = AVAudioFormat(standardFormatWithSampleRate: 44100, channels: 1)
+		audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: compatibleFormat)
 
 		#if os(iOS)
-		try audioEngine.inputNode.setVoiceProcessingEnabled(true)
+		do {
+			try audioEngine.inputNode.setVoiceProcessingEnabled(true)
+		} catch {
+			print("Failed to setVoiceProcessingEnabled: \(error.localizedDescription)")
+		}
 		#endif
 
 		audioEngine.prepare()
@@ -225,7 +250,9 @@ public extension Conversation {
 			#if os(iOS)
 			let audioSession = AVAudioSession.sharedInstance()
 			try audioSession.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth])
+			try audioSession.setPreferredSampleRate(44100)
 			try audioSession.setActive(true)
+
 			#endif
 
 			handlingVoice = true

diff --git a/src/OpenAIRealtime.swift b/src/OpenAIRealtime.swift
@@ -29,6 +29,15 @@ public final class RealtimeAPI: NSObject, Sendable {
 	public func send(event: ClientEvent) async throws {
 		try await connector.send(event: event)
 	}
+
+  /// Returns the underlying connector when this API is backed by WebRTC.
+  ///
+  /// A conditional cast is used so that any other connector type yields `nil`
+  /// instead of trapping on a forced cast.
+  /// - Returns: The `WebRTCConnector`, or `nil` for any other transport.
+  public func getConnector() -> WebRTCConnector? {
+    connector as? WebRTCConnector
+  }
 }
 
 /// Helper methods for connecting to the OpenAI Realtime API.
@@ -55,13 +64,56 @@ extension RealtimeAPI {
 	}
 
 	/// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model.
-	static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview") async throws -> RealtimeAPI {
-		var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [
-			URLQueryItem(name: "model", value: model),
-		]))
+	///
+	/// The API key is first exchanged for a short-lived client secret through
+	/// `POST /v1/realtime/sessions`; that secret, not `authToken`, authorizes the
+	/// SDP request handed to `webRTC(connectingTo:)`.
+	/// See https://platform.openai.com/docs/guides/realtime-webrtc
+	///
+	/// - Parameters:
+	///   - authToken: OpenAI API key, used only to mint the ephemeral secret.
+	///   - model: Realtime model identifier to request.
+	/// - Throws: `URLError(.badServerResponse)` on a non-2xx status,
+	///   `URLError(.cannotParseResponse)` when the reply lacks `client_secret.value`,
+	///   or any error from the network, JSON handling or `webRTC(connectingTo:)`.
+	static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview-2024-12-17") async throws -> RealtimeAPI {
+		guard let sessionsURL = URL(string: "https://api.openai.com/v1/realtime/sessions") else {
+			throw URLError(.badURL)
+		}
+		var sessionRequest = URLRequest(url: sessionsURL, cachePolicy: .useProtocolCachePolicy)
+		sessionRequest.httpMethod = "POST"
+		sessionRequest.httpBody = try JSONSerialization.data(withJSONObject: [
+			"model": model,
+			"voice": "echo",
+		])
+		sessionRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
+		sessionRequest.setValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization")
+
+		let (data, response) = try await URLSession.shared.data(for: sessionRequest)
+		guard let http = response as? HTTPURLResponse, (200...299).contains(http.statusCode) else {
+			throw URLError(.badServerResponse)
+		}
+
+		// Keyed lookups with conditional casts: a malformed reply throws instead of
+		// trapping on a forced cast or carrying on with an empty bearer token.
+		guard let session = try JSONSerialization.jsonObject(with: data) as? [String: Any],
+		      let clientSecret = session["client_secret"] as? [String: Any],
+		      let ephemeralKey = clientSecret["value"] as? String, !ephemeralKey.isEmpty
+		else {
+			throw URLError(.cannotParseResponse)
+		}
+
+		var request = URLRequest(url: URL(string: "https://api.openai.com/v1/realtime")!.appending(queryItems: [
+			URLQueryItem(name: "model", value: model),
+		]))
 
-		request.addValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta")
-		request.addValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization")
+		request.httpMethod = "POST"
+		// NOTE(review): the model is sent as a query item and as a JSON body, yet the Content-Type is
+		// SDP; presumably `webRTC(connectingTo:)` replaces the body with the SDP offer. Confirm.
+		request.httpBody = try? JSONSerialization.data(withJSONObject: ["model": model])
+		request.addValue("application/sdp", forHTTPHeaderField: "Content-Type")
+		// The ephemeral key is a credential: it must never be printed or logged.
+		request.addValue("Bearer \(ephemeralKey)", forHTTPHeaderField: "Authorization")
 		return try await webRTC(connectingTo: request)
 	}
 }