Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
[![Swift Version](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fm1guelpf%2Fswift-realtime-openai%2Fbadge%3Ftype%3Dswift-versions&color=brightgreen)](https://swiftpackageindex.com/m1guelpf/swift-realtime-openai)
[![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/m1guelpf/swift-realtime-openai/main/LICENSE)

**[Update Feb 21, 2025: Added WebRTC support. See the complete demo [here](https://github.com/jeffxtang/ios_realtime_api) for how to use both WebRTC and WebSocket.]**

This library provides a simple interface for implementing multi-modal conversations using OpenAI's new Realtime API.

It can handle automatically recording the user's microphone and playing back the assistant's response, and also gives you a transparent layer over the API for advanced use cases.
Expand Down Expand Up @@ -121,7 +123,7 @@ struct ContentView: View {
- [x] Optionally handle recording the user's mic and sending it to the API
- [x] Optionally handle playing model responses as they stream in
- [x] Allow interrupting the model
- [ ] WebRTC support
- [x] WebRTC support

## Architecture

Expand Down
13 changes: 12 additions & 1 deletion src/Connectors/WebRTCConnector.swift
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ public final class WebRTCConnector: NSObject, Connector, Sendable {
return RTCPeerConnectionFactory()
}()

/// Exposes the peer connection held by this connector.
///
/// - Returns: The connector's `connection` property.
public func getConnection() -> RTCPeerConnection {
    return connection
}

private let encoder: JSONEncoder = {
let encoder = JSONEncoder()
encoder.keyEncodingStrategy = .convertToSnakeCase
Expand Down Expand Up @@ -102,8 +106,15 @@ extension WebRTCConnector: RTCPeerConnectionDelegate {
print("Connection state changed to \(connection.signalingState)")
}

public func peerConnection(_: RTCPeerConnection, didAdd _: RTCMediaStream) {
/// Delegate callback invoked when a media stream is added to the peer connection.
///
/// Logs the event and, when the stream carries at least one audio track, asks the shared
/// audio session to override its output port to the speaker (iOS only).
///
/// - Parameter stream: The media stream that was added.
public func peerConnection(_: RTCPeerConnection, didAdd stream: RTCMediaStream) {
    print("Media stream added.")

    // Only the presence of an audio track matters here; the track itself is not used.
    guard !stream.audioTracks.isEmpty else { return }
    print("Audio track received")

    // The audio-session routing API is only used on iOS, matching the `#if os(iOS)`
    // guards around the other AVAudioSession calls in this package.
    #if os(iOS)
    do {
        try AVAudioSession.sharedInstance().overrideOutputAudioPort(.speaker)
    } catch {
        // Best effort: routing failure must not break the connection, but it should be visible.
        print("Failed to override the audio output port to speaker: \(error.localizedDescription)")
    }
    #endif
}

public func peerConnection(_: RTCPeerConnection, didRemove _: RTCMediaStream) {
Expand Down
33 changes: 30 additions & 3 deletions src/Conversation.swift
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
@preconcurrency import WebRTC
import Foundation
@preconcurrency import AVFoundation

Expand All @@ -20,6 +21,8 @@ public final class Conversation: @unchecked Sendable {
private let userConverter = UnsafeInteriorMutable<AVAudioConverter>()
private let desiredFormat = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 24000, channels: 1, interleaved: false)!

/// Exposes the `RealtimeAPI` client backing this conversation.
///
/// - Returns: The conversation's `client` property.
public func getClient() -> RealtimeAPI {
    client
}

/// A stream of errors that occur during the conversation.
public let errors: AsyncStream<ServerError>

Expand Down Expand Up @@ -97,10 +100,26 @@ public final class Conversation: @unchecked Sendable {
}

/// Create a new conversation providing an API token and, optionally, a model.
public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview") {
/// Create a new conversation over WebSocket, providing an API token and, optionally, a model.
///
/// - Parameters:
///   - token: The API token passed to `RealtimeAPI.webSocket(authToken:model:)`.
///   - model: The Realtime model identifier. Defaults to `gpt-4o-realtime-preview-2024-12-17`.
public convenience init(authToken token: String, model: String = "gpt-4o-realtime-preview-2024-12-17") {
    // use websocket
    print("webSocket initalization...")
    self.init(client: RealtimeAPI.webSocket(authToken: token, model: model))
}

/// Create a new conversation that prefers WebRTC and falls back to WebSocket.
///
/// Tries `RealtimeAPI.webRTC(authToken:model:)` first; if that throws, the error is logged
/// and the conversation is created with `RealtimeAPI.webSocket(authToken:model:)` instead.
///
/// - Parameters:
///   - token: The API token used for either transport.
///   - webRTC: Ignored; its presence only selects this initializer.
///   - model: The Realtime model identifier. Defaults to `gpt-4o-realtime-preview-2024-12-17`.
public convenience init(
    authToken token: String,
    webRTC _: Bool,
    model: String = "gpt-4o-realtime-preview-2024-12-17"
) async {
    // Resolve the client first so that `self.init` is delegated to exactly once.
    let client: RealtimeAPI
    do {
        // use webrtc
        client = try await RealtimeAPI.webRTC(authToken: token, model: model)
    } catch {
        print("Exception - webRTC init failed: \(error). \nFalling back to webSocket...")
        client = RealtimeAPI.webSocket(authToken: token, model: model)
    }

    self.init(client: client)
}

/// Create a new conversation that connects using a custom `URLRequest`.
public convenience init(connectingTo request: URLRequest) {
self.init(client: RealtimeAPI.webSocket(connectingTo: request))
Expand Down Expand Up @@ -212,10 +231,16 @@ public extension Conversation {
userConverter.set(converter)

audioEngine.attach(playerNode)
audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: converter.inputFormat)

let compatibleFormat = AVAudioFormat(standardFormatWithSampleRate: 44100, channels: 1)
audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: compatibleFormat)

#if os(iOS)
try audioEngine.inputNode.setVoiceProcessingEnabled(true)
do {
try audioEngine.inputNode.setVoiceProcessingEnabled(true)
} catch {
print("Failed to setVoiceProcessingEnabled: \(error.localizedDescription)")
}
#endif

audioEngine.prepare()
Expand All @@ -225,7 +250,9 @@ public extension Conversation {
#if os(iOS)
let audioSession = AVAudioSession.sharedInstance()
try audioSession.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth])
try audioSession.setPreferredSampleRate(44100)
try audioSession.setActive(true)

#endif

handlingVoice = true
Expand Down
64 changes: 58 additions & 6 deletions src/OpenAIRealtime.swift
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ public final class RealtimeAPI: NSObject, Sendable {
/// Forwards a client event to the underlying connector.
///
/// - Parameter event: The event to send.
/// - Throws: Whatever error the connector's `send(event:)` throws.
public func send(event: ClientEvent) async throws {
try await connector.send(event: event)
}

/// Returns the underlying connector when this client is backed by WebRTC.
///
/// - Returns: The `WebRTCConnector`, or `nil` when the client uses any other connector type.
public func getConnector() -> WebRTCConnector? {
    // Conditional cast: a connector of any other type yields `nil` instead of trapping.
    connector as? WebRTCConnector
}
}

/// Helper methods for connecting to the OpenAI Realtime API.
Expand All @@ -55,13 +64,56 @@ extension RealtimeAPI {
}

/// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model.
static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview") async throws -> RealtimeAPI {
var request = URLRequest(url: URL(string: "wss://api.openai.com/v1/realtime")!.appending(queryItems: [
URLQueryItem(name: "model", value: model),
]))
/// Connect to the OpenAI WebRTC Realtime API with the given authentication token and model.
///
/// Works in two steps:
/// 1. POSTs to `/v1/realtime/sessions` with `authToken` to obtain an ephemeral client secret.
/// 2. Builds the `application/sdp` request for `/v1/realtime`, authorized with that ephemeral
///    secret, and hands it to `webRTC(connectingTo:)`.
///
/// - Parameters:
///   - authToken: The API token used to create the session.
///   - model: The Realtime model identifier.
/// - Returns: The client produced by `webRTC(connectingTo:)`.
/// - Throws: `URLError(.badURL)` if an endpoint URL cannot be built,
///   `URLError(.badServerResponse)` if session creation does not return a 2xx status,
///   `URLError(.cannotParseResponse)` if the response has no non-empty `client_secret.value`,
///   plus any error from `URLSession`, `JSONSerialization` or `webRTC(connectingTo:)`.
static func webRTC(authToken: String, model: String = "gpt-4o-realtime-preview-2024-12-17") async throws -> RealtimeAPI {
    // https://platform.openai.com/docs/guides/realtime-webrtc

    // Step 1: create a session to obtain an ephemeral key.
    guard let sessionURL = URL(string: "https://api.openai.com/v1/realtime/sessions") else {
        throw URLError(.badURL)
    }
    var sessionRequest = URLRequest(url: sessionURL, cachePolicy: .useProtocolCachePolicy)
    sessionRequest.httpMethod = "POST"
    sessionRequest.httpBody = try JSONSerialization.data(withJSONObject: [
        "model": model,
        "voice": "echo",
    ])
    sessionRequest.setValue("application/json", forHTTPHeaderField: "Content-Type")
    sessionRequest.setValue("Bearer \(authToken)", forHTTPHeaderField: "Authorization")

    let (data, response) = try await URLSession.shared.data(for: sessionRequest)
    guard let httpResponse = response as? HTTPURLResponse,
          (200...299).contains(httpResponse.statusCode)
    else {
        throw URLError(.badServerResponse)
    }

    // Read `client_secret.value` with conditional casts so a malformed response throws
    // instead of trapping, and an absent key is never used as an empty bearer token.
    guard let payload = try JSONSerialization.jsonObject(with: data) as? [String: Any],
          let clientSecret = payload["client_secret"] as? [String: Any],
          let ephemeralKey = clientSecret["value"] as? String,
          !ephemeralKey.isEmpty
    else {
        throw URLError(.cannotParseResponse)
    }

    // Step 2: build the SDP exchange request, authorized with the ephemeral key only.
    guard let realtimeURL = URL(string: "https://api.openai.com/v1/realtime") else {
        throw URLError(.badURL)
    }
    var request = URLRequest(url: realtimeURL.appending(queryItems: [
        URLQueryItem(name: "model", value: model),
    ]))
    request.httpMethod = "POST"
    // NOTE(review): the body is JSON while Content-Type is application/sdp; presumably
    // `webRTC(connectingTo:)` replaces the body with the SDP offer. Confirm against the connector.
    request.httpBody = try JSONSerialization.data(withJSONObject: ["model": model])
    request.setValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta")
    request.setValue("application/sdp", forHTTPHeaderField: "Content-Type")
    // `setValue` (not `addValue`) so the header holds exactly one credential; `addValue`
    // appends to an existing value with a comma. The key is deliberately not logged.
    request.setValue("Bearer \(ephemeralKey)", forHTTPHeaderField: "Authorization")

    return try await webRTC(connectingTo: request)
}
}