diff --git a/lib/api.js b/lib/api.js index cc8f904..534477d 100644 --- a/lib/api.js +++ b/lib/api.js @@ -1,44 +1,62 @@ import { RealtimeEventHandler } from './event_handler.js'; import { RealtimeUtils } from './utils.js'; +import { RealtimeTransportType } from './transport.js'; +import { RealtimeTransportWebRTC } from './transport_webrtc.js'; +import { RealtimeTransportWebSocket } from './transport_websocket.js'; export class RealtimeAPI extends RealtimeEventHandler { /** * Create a new RealtimeAPI instance - * @param {{url?: string, apiKey?: string, dangerouslyAllowAPIKeyInBrowser?: boolean, debug?: boolean}} [settings] + * @param {{transportType?: string, url?: string, apiKey?: string, dangerouslyAllowAPIKeyInBrowser?: boolean, debug?: boolean}} [settings] * @returns {RealtimeAPI} */ - constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { + constructor({ transportType, url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { super(); - this.defaultUrl = 'wss://api.openai.com/v1/realtime'; - this.url = url || this.defaultUrl; - this.apiKey = apiKey || null; this.debug = !!debug; - this.ws = null; - if (globalThis.document && this.apiKey) { - if (!dangerouslyAllowAPIKeyInBrowser) { - throw new Error( - `Can not provide API key in the browser without "dangerouslyAllowAPIKeyInBrowser" set to true`, - ); + transportType = transportType?.toUpperCase() || RealtimeTransportType.WEBRTC; + switch (transportType) { + case RealtimeTransportType.WEBRTC: { + this.transport = new RealtimeTransportWebRTC({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug }); + break; + } + case RealtimeTransportType.WEBSOCKET: { + this.transport = new RealtimeTransportWebSocket({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug }); + break; + } + default: { + throw new Error(`Invalid transportType: "${transportType}"`); } } + this.transport.on('close', (data) => { + this.disconnect(); + this.dispatch('close', data); + }); + this.transport.on('message', (event) => { + const message = JSON.parse(event.data); + this._receive(message.type, message) + }); + } + + get transportType() { + return this.transport.transportType; } /** - * Tells us whether or not the WebSocket is connected + * Tells us whether or not the Realtime API server is connected * @returns {boolean} */ - isConnected() { - return !!this.ws; + get isConnected() { + return this.transport.isConnected; } /** - * Writes WebSocket logs to console + * Writes log to console * @param {...any} args * @returns {true} */ log(...args) { const date = new Date().toISOString(); - const logs = [`[Websocket/${date}]`].concat(args).map((arg) => { + const logs = [`[RealtimeAPI/${date}]`].concat(args).map((arg) => { if (typeof arg === 'object' && arg !== null) { return JSON.stringify(arg, null, 2); } else { @@ -52,142 +70,51 @@ export class RealtimeAPI extends RealtimeEventHandler { } /** - * Connects to Realtime API Websocket Server - * @param {{model?: string}} [settings] + * Connects to Realtime API Server + * @param {{sessionConfig?: SessionConfig, setAudioOutputCallback?: Function, getMicrophoneCallback?: Function}} [settings] * @returns {Promise} */ - async connect({ model } = { model: 'gpt-4o-realtime-preview-2024-10-01' }) { - if (!this.apiKey && this.url === this.defaultUrl) { - console.warn(`No apiKey provided for connection to "${this.url}"`); - } - if (this.isConnected()) { - throw new Error(`Already connected`); - } - if (globalThis.WebSocket) { - /** - * Web browser - */ - if (globalThis.document && this.apiKey) { - console.warn( - 'Warning: Connecting using API key in the browser, this is not recommended', - ); - } - const WebSocket = globalThis.WebSocket; - const ws = new WebSocket(`${this.url}${model ? `?model=${model}` : ''}`, [ - 'realtime', - `openai-insecure-api-key.${this.apiKey}`, - 'openai-beta.realtime-v1', - ]); - ws.addEventListener('message', (event) => { - const message = JSON.parse(event.data); - this.receive(message.type, message); - }); - return new Promise((resolve, reject) => { - const connectionErrorHandler = () => { - this.disconnect(ws); - reject(new Error(`Could not connect to "${this.url}"`)); - }; - ws.addEventListener('error', connectionErrorHandler); - ws.addEventListener('open', () => { - this.log(`Connected to "${this.url}"`); - ws.removeEventListener('error', connectionErrorHandler); - ws.addEventListener('error', () => { - this.disconnect(ws); - this.log(`Error, disconnected from "${this.url}"`); - this.dispatch('close', { error: true }); - }); - ws.addEventListener('close', () => { - this.disconnect(ws); - this.log(`Disconnected from "${this.url}"`); - this.dispatch('close', { error: false }); - }); - this.ws = ws; - resolve(true); - }); - }); - } else { - /** - * Node.js - */ - const moduleName = 'ws'; - const wsModule = await import(/* webpackIgnore: true */ moduleName); - const WebSocket = wsModule.default; - const ws = new WebSocket( - 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01', - [], - { - finishRequest: (request) => { - // Auth - request.setHeader('Authorization', `Bearer ${this.apiKey}`); - request.setHeader('OpenAI-Beta', 'realtime=v1'); - request.end(); - }, - }, - ); - ws.on('message', (data) => { - const message = JSON.parse(data.toString()); - this.receive(message.type, message); - }); - return new Promise((resolve, reject) => { - const connectionErrorHandler = () => { - this.disconnect(ws); - reject(new Error(`Could not connect to "${this.url}"`)); - }; - ws.on('error', connectionErrorHandler); - ws.on('open', () => { - this.log(`Connected to "${this.url}"`); - ws.removeListener('error', connectionErrorHandler); - ws.on('error', () => { - this.disconnect(ws); - this.log(`Error, disconnected from "${this.url}"`); - this.dispatch('close', { error: true }); - }); - ws.on('close', () => { - this.disconnect(ws); - this.log(`Disconnected from "${this.url}"`); - this.dispatch('close', { error: false }); - }); - this.ws = ws; - resolve(true); - }); - }); - } + async connect({ sessionConfig, setAudioOutputCallback, getMicrophoneCallback }) { + return this.transport.connect({ sessionConfig, setAudioOutputCallback, getMicrophoneCallback }); } /** * Disconnects from Realtime API server - * @param {WebSocket} [ws] * @returns {true} */ - disconnect(ws) { - if (!ws || this.ws === ws) { - this.ws && this.ws.close(); - this.ws = null; - return true; - } + async disconnect() { + await this.transport.disconnect(); + return true; } /** - * Receives an event from WebSocket and dispatches as "server.{eventName}" and "server.*" events + * Receives an event from Realtime API server and dispatches as "server.{eventName}" and "server.*" events * @param {string} eventName * @param {{[key: string]: any}} event * @returns {true} */ - receive(eventName, event) { - this.log(`received:`, eventName, event); + _receive(eventName, event) { + if (this.debug) { + if (eventName === 'response.audio.delta') { + const delta = event.delta; + this.log(`received:`, eventName, { ...event, delta: delta.slice(0, 10) + '...' + delta.slice(-10) }); + } else { + this.log(`received:`, eventName, event); + } + } this.dispatch(`server.${eventName}`, event); this.dispatch('server.*', event); return true; } /** - * Sends an event to WebSocket and dispatches as "client.{eventName}" and "client.*" events + * Sends an event to Realtime API server and dispatches as "client.{eventName}" and "client.*" events * @param {string} eventName * @param {{[key: string]: any}} event * @returns {true} */ - send(eventName, data) { - if (!this.isConnected()) { + async send(eventName, data) { + if (!this.isConnected) { throw new Error(`RealtimeAPI is not connected`); } data = data || {}; @@ -201,8 +128,15 @@ export class RealtimeAPI extends RealtimeEventHandler { }; this.dispatch(`client.${eventName}`, event); this.dispatch('client.*', event); - this.log(`sent:`, eventName, event); - this.ws.send(JSON.stringify(event)); + if (this.debug) { + if (eventName === 'input_audio_buffer.append') { + const audio = event.audio; + this.log(`sending:`, eventName, { ...event, audio: audio.slice(0, 10) + '...' + audio.slice(-10) }); + } else { + this.log(`sending:`, eventName, event); + } + } + await this.transport.send(event); return true; } } diff --git a/lib/client.js b/lib/client.js index 2c48d7f..b5406a3 100644 --- a/lib/client.js +++ b/lib/client.js @@ -1,6 +1,7 @@ -import { RealtimeEventHandler } from './event_handler.js'; import { RealtimeAPI } from './api.js'; import { RealtimeConversation } from './conversation.js'; +import { RealtimeEventHandler } from './event_handler.js'; +import { RealtimeTransportType } from './transport.js'; import { RealtimeUtils } from './utils.js'; /** @@ -189,9 +190,9 @@ import { RealtimeUtils } from './utils.js'; export class RealtimeClient extends RealtimeEventHandler { /** * Create a new RealtimeClient instance - * @param {{url?: string, apiKey?: string, dangerouslyAllowAPIKeyInBrowser?: boolean, debug?: boolean}} [settings] + * @param {{transportType?: string, url?: string, apiKey?: string, dangerouslyAllowAPIKeyInBrowser?: boolean, debug?: boolean}} [settings] */ - constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { + constructor({ transportType, url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { super(); this.defaultSessionConfig = { modalities: ['text', 'audio'], @@ -219,6 +220,7 @@ export class RealtimeClient extends RealtimeEventHandler { silence_duration_ms: 200, // How long to wait to mark the speech as stopped. }; this.realtime = new RealtimeAPI({ + transportType, url, apiKey, dangerouslyAllowAPIKeyInBrowser, @@ -238,7 +240,12 @@ export class RealtimeClient extends RealtimeEventHandler { this.sessionCreated = false; this.tools = {}; this.sessionConfig = JSON.parse(JSON.stringify(this.defaultSessionConfig)); - this.inputAudioBuffer = new Int16Array(0); + switch (this.realtime.transportType) { + case RealtimeTransportType.WEBSOCKET: { + this.inputAudioBuffer = new Int16Array(0); + break; + } + } return true; } @@ -367,8 +374,8 @@ export class RealtimeClient extends RealtimeEventHandler { * Tells us whether the realtime socket is connected and the session has started * @returns {boolean} */ - isConnected() { - return this.realtime.isConnected(); + get isConnected() { + return this.realtime.isConnected; } /** @@ -385,15 +392,16 @@ export class RealtimeClient extends RealtimeEventHandler { } /** - * Connects to the Realtime WebSocket API + * Connects to the Realtime API server * Updates session config and conversation config + * @param {{sessionConfig?: SessionConfig, setAudioOutputCallback?: Function, getMicrophoneCallback?: Function}} [settings] * @returns {Promise} */ - async connect() { - if (this.isConnected()) { + async connect({ sessionConfig, setAudioOutputCallback, getMicrophoneCallback }) { + if (this.isConnected) { throw new Error(`Already connected, use .disconnect() first`); } - await this.realtime.connect(); + await this.realtime.connect({ sessionConfig, setAudioOutputCallback, getMicrophoneCallback }); this.updateSession(); return true; } @@ -403,7 +411,7 @@ export class RealtimeClient extends RealtimeEventHandler { * @returns {Promise} */ async waitForSessionCreated() { - if (!this.isConnected()) { + if (!this.isConnected) { throw new Error(`Not connected, use .connect() first`); } while (!this.sessionCreated) { @@ -413,11 +421,11 @@ export class RealtimeClient extends RealtimeEventHandler { } /** - * Disconnects from the Realtime API and clears the conversation history + * Disconnects from the Realtime API server and clears the conversation history */ disconnect() { this.sessionCreated = false; - this.realtime.isConnected() && this.realtime.disconnect(); + this.realtime.isConnected && this.realtime.disconnect(); this.conversation.clear(); } @@ -483,6 +491,7 @@ export class RealtimeClient extends RealtimeEventHandler { */ updateSession({ modalities = void 0, + model = void 0, instructions = void 0, voice = void 0, input_audio_format = void 0, @@ -495,6 +504,7 @@ export class RealtimeClient extends RealtimeEventHandler { max_response_output_tokens = void 0, } = {}) { modalities !== void 0 && (this.sessionConfig.modalities = modalities); + model !== void 0 && (this.sessionConfig.model = model); instructions !== void 0 && (this.sessionConfig.instructions = instructions); voice !== void 0 && (this.sessionConfig.voice = voice); input_audio_format !== void 0 && @@ -535,7 +545,7 @@ export class RealtimeClient extends RealtimeEventHandler { ); const session = { ...this.sessionConfig }; session.tools = useTools; - if (this.realtime.isConnected()) { + if (this.realtime.isConnected) { this.realtime.send('session.update', { session }); } return true; @@ -568,11 +578,15 @@ export class RealtimeClient extends RealtimeEventHandler { } /** - * Appends user audio to the existing audio buffer + * Appends user audio to the existing audio buffer. + * Only used by WebSocket. * @param {Int16Array|ArrayBuffer} arrayBuffer * @returns {true} */ appendInputAudio(arrayBuffer) { + if (this.realtime.transportType !== RealtimeTransportType.WEBSOCKET) { + throw new Error(`appendInputAudio is only supported for WebSocket transport`); + } if (arrayBuffer.byteLength > 0) { this.realtime.send('input_audio_buffer.append', { audio: RealtimeUtils.arrayBufferToBase64(arrayBuffer), @@ -592,7 +606,7 @@ export class RealtimeClient extends RealtimeEventHandler { createResponse() { if ( this.getTurnDetectionType() === null && - this.inputAudioBuffer.byteLength > 0 + this.inputAudioBuffer?.byteLength > 0 ) { this.realtime.send('input_audio_buffer.commit'); this.conversation.queueInputAudio(this.inputAudioBuffer); diff --git a/lib/transport.js b/lib/transport.js new file mode 100644 index 0000000..05014f0 --- /dev/null +++ b/lib/transport.js @@ -0,0 +1,84 @@ +import { RealtimeEventHandler } from './event_handler.js'; + +/** + * Enum representing the transport types. + * @readonly + * @enum {string} + */ +export const RealtimeTransportType = { + WEBRTC: "WEBRTC", + WEBSOCKET: "WEBSOCKET", +}; + +/** + * An abstract base class representing a RealtimeTransport. + * Subclasses must implement all of these methods. + * + * @interface + */ +export class RealtimeTransport extends RealtimeEventHandler { + get transportType() { + throw new Error("Not implemented: transportType getter"); + } + + get defaultUrl() { + throw new Error("Not implemented: defaultUrl getter"); + } + + log(...args) { + if (this.debug) { + const date = new Date().toISOString(); + const logs = [`[${this.transportType}/${date}]`].concat(args).map((arg) => { + if (typeof arg === 'object' && arg !== null) { + return JSON.stringify(arg, null, 2); + } else { + return arg; + } + }); + console.log(...logs); + } + return true; + } + + constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { + super(); + this.url = url || this.defaultUrl; + this.apiKey = apiKey || null; + this.debug = !!debug; + if (globalThis.document && this.apiKey) { + if (!dangerouslyAllowAPIKeyInBrowser) { + throw new Error( + `Can not provide API key in the browser without "dangerouslyAllowAPIKeyInBrowser" set to true`, + ); + } + } + } + + get isConnected() { + throw new Error("Not implemented: isConnected getter"); + } + + async connect(options = {}) { + if (!this.apiKey && this.url === this.defaultUrl) { + console.warn(`No apiKey provided for connection to "${this.url}"`); + } + if (this.isConnected) { + throw new Error(`Already connected`); + } + if (globalThis.document && this.apiKey) { + console.warn( + 'Warning: Connecting using API key in the browser, this is not recommended', + ); + } + } + + async disconnect(options = {}) { + throw new Error("Not implemented: disconnect"); + } + + async send(data) { + if (!this.isConnected) { + throw new Error(`RealtimeAPI is not connected`); + } + } +} diff --git a/lib/transport_webrtc.js b/lib/transport_webrtc.js new file mode 100644 index 0000000..48d2203 --- /dev/null +++ b/lib/transport_webrtc.js @@ -0,0 +1,121 @@ +import { RealtimeTransportType, RealtimeTransport } from './transport.js'; + +export class RealtimeTransportWebRTC extends RealtimeTransport { + get transportType() { return RealtimeTransportType.WEBRTC; } + + get defaultUrl() { + return 'https://api.openai.com/v1/realtime'; + } + + constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { + super({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug }); + this.peerConnection = null; + this.dataChannel = null; + } + + get isConnected() { + return !!this.peerConnection; + } + + async connect({ sessionConfig, setAudioOutputCallback, getMicrophoneCallback }) { + super.connect(); + sessionConfig = { + model: 'gpt-4o-mini-realtime-preview', + voice: 'ash', + ...sessionConfig, + }; + this.log(`connect(sessionConfig=${JSON.stringify(sessionConfig)}, ...)`); + const emphemeralApiToken = await this._requestEphemeralApiToken(this.apiKey, sessionConfig); + await this._init(emphemeralApiToken, sessionConfig.model, setAudioOutputCallback, getMicrophoneCallback); + } + + /** + * Initially from: + * https://platform.openai.com/docs/guides/realtime-webrtc#creating-an-ephemeral-token + */ + async _requestEphemeralApiToken(dangerousApiKey, sessionConfig) { + const r = await fetch(`${this.url}/sessions`, { + method: 'POST', + headers: { + 'Authorization': `Bearer ${dangerousApiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(sessionConfig), + }); + const data = await r.json(); + return data.client_secret.value; + } + + /** + * Initially from: + * https://platform.openai.com/docs/guides/realtime-webrtc#connection-details + */ + async _init(ephemeralApiToken, model, setAudioOutputCallback, getMicrophoneCallback) { + this.log(`init(...)`); + this.peerConnection = new RTCPeerConnection(); + + this.peerConnection.ontrack = (e) => setAudioOutputCallback(e.streams[0]); + this.peerConnection.addTrack(await getMicrophoneCallback()); + + return new Promise(async (resolve, reject) => { + const dataChannel = this.peerConnection?.createDataChannel('oai-events'); + if (!dataChannel) { + reject(new Error('dataChannel == null')); + return; + } + dataChannel.addEventListener('open', () => { + this.log('Data channel is open'); + this.dataChannel = dataChannel; + resolve(true); + }); + dataChannel.addEventListener('closing', () => { + this.log('Data channel is closing'); + }); + dataChannel.addEventListener('close', () => { + this.disconnect(); + this.log('Data channel is closed'); + this.dispatch('close', { error: false }); + }); + dataChannel.addEventListener('message', (event) => { + this.dispatch('message', event); + }); + + // Start the session using the Session Description Protocol (SDP) + const offer = await this.peerConnection?.createOffer(); + if (!offer) { + reject(new Error('offer == null')); + return; + } + await this.peerConnection?.setLocalDescription(offer); + const sdpResponse = await fetch(`${this.url}?model=${model}`, { + method: 'POST', + body: offer.sdp, + headers: { + Authorization: `Bearer ${ephemeralApiToken}`, + 'Content-Type': 'application/sdp' + }, + }); + await this.peerConnection?.setRemoteDescription({ + type: 'answer', + sdp: await sdpResponse.text(), + }); + }); + } + + async disconnect() { + this.log('disconnect()'); + if (this.dataChannel) { + this.dataChannel.close(); + this.dataChannel = null; + } + if (this.peerConnection) { + this.peerConnection.close(); + this.peerConnection = null; + } + } + + async send(data) { + super.send(data); + this.dataChannel.send(JSON.stringify(data)); + } +} diff --git a/lib/transport_websocket.js b/lib/transport_websocket.js new file mode 100644 index 0000000..c7520a1 --- /dev/null +++ b/lib/transport_websocket.js @@ -0,0 +1,123 @@ +import { RealtimeTransportType, RealtimeTransport } from './transport.js'; + +export class RealtimeTransportWebSocket extends RealtimeTransport { + get transportType() { return RealtimeTransportType.WEBSOCKET; } + + get defaultUrl() { + return 'wss://api.openai.com/v1/realtime'; + } + + constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) { + super({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug }); + this.ws = null; + } + + get isConnected() { + return !!this.ws; + } + + async connect({ sessionConfig }) { + super.connect(); + sessionConfig = { + model: 'gpt-4o-mini-realtime-preview', + ...sessionConfig, + }; + this.log(`connect(sessionConfig=${JSON.stringify(sessionConfig)}, ...)`); + const { model } = sessionConfig; + if (globalThis.WebSocket) { + const WebSocket = globalThis.WebSocket; + const ws = new WebSocket(`${this.url}${model ? `?model=${model}` : ''}`, [ + 'realtime', + `openai-insecure-api-key.${this.apiKey}`, + 'openai-beta.realtime-v1', + ]); + ws.addEventListener('message', (event) => { + this.dispatch('message', event); + }); + return new Promise((resolve, reject) => { + const connectionErrorHandler = () => { + this.disconnect(ws); + reject(new Error(`Could not connect to "${this.url}"`)); + }; + ws.addEventListener('error', connectionErrorHandler); + ws.addEventListener('open', () => { + this.log(`Connected to "${this.url}"`); + ws.removeEventListener('error', connectionErrorHandler); + ws.addEventListener('error', () => { + this.disconnect(ws); + this.log(`Error, disconnected from "${this.url}"`); + this.dispatch('close', { error: true }); + }); + ws.addEventListener('close', () => { + this.disconnect(ws); + this.log(`Disconnected from "${this.url}"`); + this.dispatch('close', { error: false }); + }); + this.ws = ws; + resolve(true); + }); + }); + } else { + /** + * Node.js + */ + const moduleName = 'ws'; + const wsModule = await import(/* webpackIgnore: true */ moduleName); + const WebSocket = wsModule.default; + const ws = new WebSocket( + 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01', + [], + { + finishRequest: (request) => { + // Auth + request.setHeader('Authorization', `Bearer ${this.apiKey}`); + request.setHeader('OpenAI-Beta', 'realtime=v1'); + request.end(); + }, + }, + ); + ws.on('message', (data) => { + const message = JSON.parse(data.toString()); + this.receive(message.type, message); + }); + return new Promise((resolve, reject) => { + const connectionErrorHandler = () => { + this.disconnect(ws); + reject(new Error(`Could not connect to "${this.url}"`)); + }; + ws.on('error', connectionErrorHandler); + ws.on('open', () => { + this.log(`Connected to "${this.url}"`); + ws.removeListener('error', connectionErrorHandler); + ws.on('error', () => { + this.disconnect(ws); + this.log(`Error, disconnected from "${this.url}"`); + this.dispatch('close', { error: true }); + }); + ws.on('close', () => { + this.disconnect(ws); + this.log(`Disconnected from "${this.url}"`); + this.dispatch('close', { error: false }); + }); + this.ws = ws; + resolve(true); + }); + }); + } + } + + async disconnect({ ws } = {}) { + this.log(`disconnect(${ws})`); + if (!ws || this.ws === ws) { + this.ws && this.ws.close(); + this.ws = null; + return true; + } + } + + async send(data) { + super.send(data); + this.ws.send(JSON.stringify(data)); + return true; + } +}