/**
 * OpenAI Realtime API session provider for voice calls.
 *
 * Replaces the STT → LLM → TTS chain with native speech-to-speech via the
 * OpenAI Agents SDK RealtimeSession + TwilioRealtimeTransportLayer.
 *
 * The only tool registered in this file is the generic `run_command` shell
 * tool (see buildGenericTools).
 *
 * NOTE(review): an earlier version of this header also mentioned a
 * `web_search` tool and config-defined workspace tools; neither is wired up
 * in this file.
 */
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import type { WebSocket as NodeWebSocket } from "ws";
import { RealtimeAgent, RealtimeSession, tool, backgroundResult } from "@openai/agents/realtime";
import { TwilioRealtimeTransportLayer } from "@openai/agents-extensions";
import { z } from "zod";
import type { VoiceCallConfig } from "../config.js";

const execFileAsync = promisify(execFile);

/** Hard limit on how long a tool command may run before execFile kills it. */
const SCRIPT_TIMEOUT_MS = 15_000;

/** Maximum combined stdout/stderr size (512 KiB) accepted from a tool command. */
const SCRIPT_MAX_BUFFER_BYTES = 1024 * 512;

/** Instructions used when the voice config does not supply `responseSystemPrompt`. */
const DEFAULT_SYSTEM_PROMPT = `You are a voice assistant on a PHONE CALL. Keep responses brief — max 2 short sentences. This is spoken aloud, not text. Never use lists, bullet points, or markdown. Use your tools when asked for data or lookups. If the caller says bye, say farewell and include [END_CALL].`;

/**
 * Runs an executable and returns its output as text.
 *
 * Never rejects: any failure (non-zero exit, timeout, buffer overflow, spawn
 * error) is converted into an `"Error: …"` string so the result can always be
 * handed back to the model.
 *
 * @param command - Executable to run.
 * @param args - Arguments passed to the executable.
 * @returns Trimmed stdout, else trimmed stderr, else `"(no output)"`; or an
 *   `"Error: …"` string on failure.
 */
async function runCommand(command: string, args: string[]): Promise<string> {
  try {
    const { stdout, stderr } = await execFileAsync(command, args, {
      timeout: SCRIPT_TIMEOUT_MS,
      maxBuffer: SCRIPT_MAX_BUFFER_BYTES,
      // NOTE(review): the child inherits the full parent environment, including
      // any secrets present in it (e.g. OPENAI_API_KEY).
      env: process.env,
    });
    return stdout.trim() || stderr.trim() || "(no output)";
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    return `Error: ${msg}`;
  }
}

/**
 * Builds the tool list given to the realtime agent.
 *
 * @returns An array containing the single `run_command` tool.
 */
function buildGenericTools() {
  // SECURITY(review): the command string is produced by the model, which is in
  // turn steered by whatever the phone caller says. It is passed verbatim to
  // `bash -c`, so a caller can effectively request arbitrary shell execution
  // on this host. Confirm this is intended, or gate it behind an allowlist.
  const exec = tool({
    name: "run_command",
    description:
      "Execute a shell command. Use for any data lookup, script execution, or system check.",
    parameters: z.object({
      command: z.string().describe("Shell command to run"),
    }),
    execute: async ({ command }) => {
      return backgroundResult(await runCommand("bash", ["-c", command]));
    },
  });

  return [exec];
}

/** Handle returned to the caller for a live realtime voice session. */
export type RealtimeCallSession = {
  /** The underlying SDK session. */
  session: RealtimeSession;
  /** Closes the session; errors thrown by the SDK while closing are ignored. */
  close: () => void;
};

/**
 * Creates a realtime speech-to-speech session bridged to a Twilio media
 * stream WebSocket and starts connecting it in the background.
 *
 * The function returns before the connection is established. A connection
 * failure is only logged; it is not surfaced to the caller.
 *
 * @param params.twilioWebSocket - Twilio media stream socket for the call.
 * @param params.voiceConfig - Voice-call configuration (prompt, model, voice, API key).
 * @param params.callId - Call identifier, used in log lines and agent instructions.
 * @param params.from - Caller's phone number, included in agent instructions.
 * @returns The session plus a best-effort `close` function.
 * @throws Error if no OpenAI API key is available from config or environment.
 */
export function createRealtimeCallSession(params: {
  twilioWebSocket: WebSocket | NodeWebSocket;
  voiceConfig: VoiceCallConfig;
  callId: string;
  from: string;
}): RealtimeCallSession {
  const { twilioWebSocket, voiceConfig, callId, from } = params;

  // Fail fast: validate the credential before constructing the agent,
  // transport, and session so nothing is left half-initialized when we throw.
  const apiKey = voiceConfig.streaming?.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OpenAI API key required for realtime voice mode");
  }

  const systemPrompt = voiceConfig.responseSystemPrompt ?? DEFAULT_SYSTEM_PROMPT;

  const agent = new RealtimeAgent({
    name: "VoiceAgent",
    instructions: `${systemPrompt}\n\nThe caller's phone number is ${from}. Call ID: ${callId}.`,
    tools: buildGenericTools(),
  });

  // `||` (not `??`) is deliberate: an empty string in config also falls back.
  const voice = voiceConfig.realtimeVoice || "verse";
  const model = voiceConfig.realtimeModel || "gpt-4o-realtime-preview";

  const transport = new TwilioRealtimeTransportLayer({
    twilioWebSocket,
  });

  const session = new RealtimeSession(agent, {
    transport,
    model,
    config: {
      audio: {
        output: { voice },
      },
    },
    tracingDisabled: true,
  });

  // --- Diagnostic logging for session lifecycle events ---

  session.on("error", (error: unknown) => {
    console.error(`[voice-call] [realtime] Error for ${callId}:`, error);
  });

  session.on("agent_start", () => {
    console.log(`[voice-call] [realtime] Agent started for ${callId}`);
  });

  session.on("agent_end", (_ctx: unknown, _agent: unknown, output: string) => {
    // Truncate so long responses do not flood the log.
    console.log(`[voice-call] [realtime] Agent response for ${callId}: "${output.slice(0, 200)}"`);
  });

  session.on("agent_tool_start", (_ctx: unknown, _agent: unknown, t: any, details: any) => {
    const name = t?.name || "unknown";
    const args = JSON.stringify(details?.toolCall?.arguments ?? {}).slice(0, 200);
    console.log(`[voice-call] [realtime] Tool START for ${callId}: ${name}(${args})`);
  });

  session.on("agent_tool_end", (_ctx: unknown, _agent: unknown, t: any, result: string) => {
    const name = t?.name || "unknown";
    console.log(
      `[voice-call] [realtime] Tool END for ${callId}: ${name} → ${result.slice(0, 300)}`,
    );
  });

  session.on("audio_interrupted", () => {
    console.log(`[voice-call] [realtime] Audio interrupted (barge-in) for ${callId}`);
  });

  session.on("history_added", (item: any) => {
    // Only user messages are logged; the text is taken from the first content
    // part, preferring its transcript over its text field.
    if (item?.type === "message" && item?.role === "user") {
      const text = item?.content?.[0]?.transcript || item?.content?.[0]?.text || "";
      if (text) console.log(`[voice-call] [realtime] User said for ${callId}: "${text}"`);
    }
  });

  // Fire-and-forget: the caller gets the session handle immediately while the
  // connection is established in the background.
  void session
    .connect({ apiKey })
    .then(() => {
      console.log(
        `[voice-call] [realtime] Connected for call ${callId} (model=${model}, voice=${voice})`,
      );
    })
    .catch((err: unknown) => {
      console.error(`[voice-call] [realtime] Connection failed for call ${callId}:`, err);
    });

  return {
    session,
    close: () => {
      try {
        session.close();
      } catch {
        // best effort
      }
    },
  };
}