/// <reference lib="dom" />

import { isBrowserEnvironment } from '@openai/agents-core/_shims';
import {
  RealtimeTransportLayer,
  RealtimeTransportLayerConnectOptions,
} from './transportLayer';

import { UserError } from '@openai/agents-core';
import logger from './logger';
import { RealtimeClientMessage, RealtimeSessionConfig } from './clientMessages';
import {
  OpenAIRealtimeBase,
  OpenAIRealtimeBaseOptions,
} from './openaiRealtimeBase';
import { parseRealtimeEvent } from './openaiRealtimeEvents';
import { HEADERS } from './utils';

/**
 * The connection state of the WebRTC connection.
 *
 * The peer connection and data channel are only defined while the transport is
 * `connecting` or `connected`; both are `undefined` once `disconnected`.
 */
export type WebRTCState =
  | {
      status: 'disconnected';
      peerConnection: undefined;
      dataChannel: undefined;
    }
  | {
      status: 'connecting';
      peerConnection: RTCPeerConnection;
      dataChannel: RTCDataChannel;
    }
  | {
      status: 'connected';
      peerConnection: RTCPeerConnection;
      dataChannel: RTCDataChannel;
    };

/**
 * The options for the OpenAI Realtime WebRTC transport layer.
 */
export type OpenAIRealtimeWebRTCOptions = {
  /**
   * Override of the base URL for the Realtime API
   */
  baseUrl?: string;
  /**
   * The audio element to use for audio playback. If not provided, a new audio element will be
   * created.
   */
  audioElement?: HTMLAudioElement;
  /**
   * The media stream to use for audio input. If not provided, the default microphone will be used.
   */
  mediaStream?: MediaStream;
  /**
   * **Important**: Do not use this option unless you know what you are doing.
   *
   * Whether to use an insecure API key. This has to be set if you are trying to use a regular
   * OpenAI API key instead of a client ephemeral key.
   * @see https://platform.openai.com/docs/guides/realtime#creating-an-ephemeral-token
   */
  useInsecureApiKey?: boolean;
  /**
   * Optional hook invoked with the freshly created peer connection. Returning a
   * different connection will override the one created by the transport layer.
   * This is called right before the offer is created and can be asynchronous.
   */
  changePeerConnection?: (
    peerConnection: RTCPeerConnection,
  ) => RTCPeerConnection | Promise<RTCPeerConnection>;
} & OpenAIRealtimeBaseOptions;

/**
 * Transport layer that's handling the connection between the client and OpenAI's Realtime API
 * via WebRTC. While this transport layer is designed to be used within a RealtimeSession, it can
 * also be used standalone if you want to have a direct connection to the Realtime API.
 *
 * Unless you specify a `mediaStream` or `audioElement` option, the transport layer will
 * automatically configure the microphone and audio output to be used by the session.
 */
export class OpenAIRealtimeWebRTC
  extends OpenAIRealtimeBase
  implements RealtimeTransportLayer
{
  #url: string;
  #state: WebRTCState = {
    status: 'disconnected',
    peerConnection: undefined,
    dataChannel: undefined,
  };
  #useInsecureApiKey: boolean;
  // Tracks whether a `response.created` was seen without a matching `response.done`.
  #ongoingResponse = false;
  #muted = false;

  /**
   * @param options - Transport configuration; defaults to an empty object.
   * @throws Error if `RTCPeerConnection` is not defined in the current environment.
   */
  constructor(private readonly options: OpenAIRealtimeWebRTCOptions = {}) {
    if (typeof RTCPeerConnection === 'undefined') {
      throw new Error('WebRTC is not supported in this environment');
    }
    super(options);
    this.#url = options.baseUrl ?? `https://api.openai.com/v1/realtime`;
    this.#useInsecureApiKey = options.useInsecureApiKey ?? false;
  }

  /**
   * The current status of the WebRTC connection.
   */
  get status() {
    return this.#state.status;
  }

  /**
   * The current connection state of the WebRTC connection including the peer connection and data
   * channel.
   */
  get connectionState(): WebRTCState {
    return this.#state;
  }

  /**
   * Whether the session is muted.
   */
  get muted(): boolean {
    return this.#muted;
  }

  /**
   * Connect to the Realtime API. This will establish the connection to the OpenAI Realtime API
   * via WebRTC.
   *
   * If you are using a browser, the transport layer will also automatically configure the
   * microphone and audio output to be used by the session.
   *
   * The returned promise resolves once the data channel reports `open`. It rejects if the data
   * channel emits an error or if any step of the setup (media capture, offer creation, SDP
   * exchange) fails; in both cases the transport is closed first.
   *
   * @param options - The options for the connection.
   * @throws UserError (as a rejection) when running in a browser with a key that does not start
   * with `ek_` and `useInsecureApiKey` has not been enabled.
   */
  async connect(options: RealtimeTransportLayerConnectOptions): Promise<void> {
    if (this.#state.status === 'connected') {
      return;
    }

    // NOTE(review): this only warns; execution continues and a second connection attempt is
    // started while the first one is still in flight.
    if (this.#state.status === 'connecting') {
      logger.warn(
        'Realtime connection already in progress. Please await original promise',
      );
    }

    const model = options.model ?? this.currentModel;
    this.currentModel = model;
    const baseUrl = options.url ?? this.#url;
    const apiKey = await this._getApiKey(options);

    // Keys prefixed with `ek_` are treated as client (ephemeral) keys.
    const isClientKey = typeof apiKey === 'string' && apiKey.startsWith('ek_');
    if (isBrowserEnvironment() && !this.#useInsecureApiKey && !isClientKey) {
      throw new UserError(
        'Using the WebRTC connection in a browser environment requires an insecure API key. Please use a WebSocket connection instead or set the useInsecureApiKey option to true.',
      );
    }

    // The promise is settled from data channel event listeners, so the constructor form is
    // needed even though the setup itself is asynchronous.
    // eslint-disable-next-line no-async-promise-executor
    return new Promise<void>(async (resolve, reject) => {
      try {
        const userSessionConfig: Partial<RealtimeSessionConfig> = {
          ...(options.initialSessionConfig || {}),
          model: this.currentModel,
        };

        const connectionUrl = new URL(baseUrl);

        let peerConnection: RTCPeerConnection = new RTCPeerConnection();
        const dataChannel = peerConnection.createDataChannel('oai-events');

        this.#state = {
          status: 'connecting',
          peerConnection,
          dataChannel,
        };
        this.emit('connection_change', this.#state.status);

        dataChannel.addEventListener('open', () => {
          this.#state = {
            status: 'connected',
            peerConnection,
            dataChannel,
          };
          // Sending the session config again here once the channel is connected to ensure
          // that the session config is sent to the server before the first response is received
          // Setting it on connection should work but the config is not being validated on the
          // server. This triggers a validation error if the config is not valid.
          this.updateSessionConfig(userSessionConfig);
          this.emit('connection_change', this.#state.status);
          this._onOpen();
          resolve();
        });

        dataChannel.addEventListener('error', (event) => {
          this.close();
          this._onError(event);
          reject(event);
        });

        dataChannel.addEventListener('message', (event) => {
          this._onMessage(event);
          const { data: parsed, isGeneric } = parseRealtimeEvent(event);
          if (!parsed || isGeneric) {
            return;
          }

          // Keep track of in-flight responses so `interrupt()` knows whether to cancel one.
          if (parsed.type === 'response.created') {
            this.#ongoingResponse = true;
          } else if (parsed.type === 'response.done') {
            this.#ongoingResponse = false;
          }

          if (parsed.type === 'session.created') {
            this._tracingConfig = parsed.session.tracing;
            // Trying to turn on tracing after the session is created
            this._updateTracingConfig(userSessionConfig.tracing ?? 'auto');
          }
        });

        // set up audio playback
        const audioElement =
          this.options.audioElement ?? document.createElement('audio');
        audioElement.autoplay = true;
        peerConnection.ontrack = (event) => {
          audioElement.srcObject = event.streams[0];
        };

        // get microphone stream
        const stream =
          this.options.mediaStream ??
          (await navigator.mediaDevices.getUserMedia({
            audio: true,
          }));
        // Fail with an explicit message instead of letting `addTrack(undefined)` raise an
        // opaque TypeError when the stream carries no audio.
        const [audioTrack] = stream.getAudioTracks();
        if (!audioTrack) {
          throw new Error('The input media stream does not contain an audio track');
        }
        peerConnection.addTrack(audioTrack);

        // NOTE(review): the data channel and `ontrack` handler above were attached to the
        // original connection; if the hook returns a different instance they are not re-bound.
        if (this.options.changePeerConnection) {
          peerConnection =
            await this.options.changePeerConnection(peerConnection);
          this.#state = { ...this.#state, peerConnection };
        }

        const offer = await peerConnection.createOffer();
        await peerConnection.setLocalDescription(offer);

        if (!offer.sdp) {
          throw new Error('Failed to create offer');
        }

        const sessionConfig = {
          ...this._getMergedSessionConfig(userSessionConfig),
          model: this.currentModel,
        };

        const data = new FormData();
        data.append('sdp', offer.sdp);
        data.append('session', JSON.stringify(sessionConfig));

        const sdpResponse = await fetch(connectionUrl, {
          method: 'POST',
          body: data,
          headers: {
            Authorization: `Bearer ${apiKey}`,
            'X-OpenAI-Agents-SDK': HEADERS['X-OpenAI-Agents-SDK'],
          },
        });

        // A non-2xx body is an error payload, not an SDP answer. Surface the HTTP failure
        // directly rather than a confusing SDP parse error from `setRemoteDescription`.
        if (!sdpResponse.ok) {
          throw new Error(
            `Failed to negotiate the WebRTC session: server responded with HTTP ${sdpResponse.status} ${sdpResponse.statusText}`,
          );
        }

        const answer: RTCSessionDescriptionInit = {
          type: 'answer',
          sdp: await sdpResponse.text(),
        };

        await peerConnection.setRemoteDescription(answer);
      } catch (error) {
        this.close();
        this._onError(error);
        reject(error);
      }
    });
  }

  /**
   * Send an event to the Realtime API. This will stringify the event and send it directly to the
   * API. This can be used if you want to take control over the connection and send events manually.
   *
   * @param event - The event to send.
   * @throws Error if there is no data channel or its `readyState` is not `open`.
   */
  sendEvent(event: RealtimeClientMessage): void {
    if (
      !this.#state.dataChannel ||
      this.#state.dataChannel.readyState !== 'open'
    ) {
      throw new Error(
        'WebRTC data channel is not connected. Make sure you call `connect()` before sending events.',
      );
    }

    this.#state.dataChannel.send(JSON.stringify(event));
  }

  /**
   * Mute or unmute the session.
   *
   * The flag is always recorded; if a peer connection exists, every sender track on it is
   * enabled or disabled accordingly.
   *
   * @param muted - Whether to mute the session.
   */
  mute(muted: boolean) {
    this.#muted = muted;
    if (this.#state.peerConnection) {
      const peerConnection = this.#state.peerConnection;
      peerConnection.getSenders().forEach((sender) => {
        if (sender.track) {
          sender.track.enabled = !muted;
        }
      });
    }
  }

  /**
   * Close the connection to the Realtime API and disconnects the underlying WebRTC connection.
   *
   * Emits `connection_change` and invokes `_onClose()` only when the transport was not already
   * `disconnected`, so repeated calls do not produce duplicate notifications.
   */
  close() {
    if (this.#state.dataChannel) {
      this.#state.dataChannel.close();
    }

    if (this.#state.peerConnection) {
      const peerConnection = this.#state.peerConnection;
      // NOTE(review): this stops every sender track, including tracks that came from a
      // caller-supplied `mediaStream`.
      peerConnection.getSenders().forEach((sender) => {
        sender.track?.stop();
      });
      peerConnection.close();
    }

    if (this.#state.status !== 'disconnected') {
      this.#state = {
        status: 'disconnected',
        peerConnection: undefined,
        dataChannel: undefined,
      };
      this.emit('connection_change', this.#state.status);
      this._onClose();
    }
  }

  /**
   * Interrupt the current response if one is ongoing and clear the audio buffer so that the agent
   * stops talking.
   *
   * Both events go through `sendEvent`, so this throws if the data channel is not open.
   */
  interrupt() {
    if (this.#ongoingResponse) {
      this.sendEvent({
        type: 'response.cancel',
      });
      this.#ongoingResponse = false;
    }

    this.sendEvent({
      type: 'output_audio_buffer.clear',
    });
  }
}