// Runtype Speech Engine (streaming TTS) // // Built-in `SpeechEngine` that backs the per-message "Read aloud" action (and // the auto-speak path) with Runtype-hosted text-to-speech. It is the client // half of the "Option A" HTTP synthesize design: a stateless // // POST {host}/v1/agents/:agentId/speak -> streamed PCM16 / 24 kHz / mono // // The `clientToken` is browser-safe (same one the chat widget uses, scoped by // `allowedOrigins`), so this calls Runtype directly from the page — no proxy. // The streamed PCM is fed chunk-by-chunk into a {@link PcmStreamPlayer}, so this // engine stays tiny: fetch -> enqueue chunks -> markStreamEnd. The player owns // prebuffering, gapless scheduling, graceful underrun, and pause/resume. // // By default the player is the in-bundle, main-thread `AudioPlaybackManager` // (no AudioWorklet module — keeps the main bundle lean). Consumers who want the // jitter-buffered AudioWorklet player inject `createPcmStreamPlayer` from // `@runtypelabs/persona/voice-worklet-player` via `createPlaybackEngine` (config: // `textToSpeech.createPlaybackEngine`); the worklet then lands in their bundle. // // Wired automatically by `textToSpeech: { provider: 'runtype' }` (see // `session.ts`), which derives `host`/`agentId`/`clientToken` from the widget // config and — unless `browserFallback: false` — wraps this in a // `FallbackSpeechEngine` so a missing endpoint or transient failure falls back // to the browser voice instead of erroring. import type { PcmStreamPlayer, SpeechCallbacks, SpeechEngine, SpeechRequest, } from "../types"; import { AudioPlaybackManager } from "./audio-playback-manager"; export interface RuntypeSpeechEngineOptions { /** * Runtype API host, e.g. `https://api.runtype.com` (typically the widget's * `apiUrl`). A trailing slash is tolerated. */ host: string; /** Agent whose configured voice synthesizes the text. */ agentId: string; /** Browser-safe client token — the same one the chat widget uses. */ clientToken: string; /** * Default voice id, used when a `SpeechRequest` doesn't carry its own. When * omitted the agent's configured voice is used. */ voice?: string; /** * Audio (ms) the player buffers before the first sample and after an * underrun. Runtype streams steadily, so the default (200) keeps first sound * close to time-to-first-byte while still riding out small hiccups. Applies to * the default {@link AudioPlaybackManager}; a custom `createPlaybackEngine` is * responsible for its own prebuffer. */ prebufferMs?: number; /** * Factory for the streaming PCM player. Defaults to the in-bundle, main-thread * {@link AudioPlaybackManager} (with `prebufferMs`). Pass `createPcmStreamPlayer` * from `@runtypelabs/persona/voice-worklet-player` for the jitter-buffered * AudioWorklet player (it then ships in your bundle, not Persona's). May be * async — it is resolved on first playback, inside the user gesture. */ createPlaybackEngine?: () => PcmStreamPlayer | Promise; /** * Optional hook for surfacing fetch/stream failures (a missing endpoint, an * expired token, an upstream 4xx) to a log or telemetry. The widget itself * only returns the read-aloud button to idle (or, with a fallback engine, * silently switches to the browser voice), so without this the reason is * invisible. */ onError?: (error: Error) => void; } /** Strip a trailing slash so `${host}/v1/...` never doubles up. */ function normalizeHost(host: string): string { return host.replace(/\/+$/, ""); } /** Streaming `SpeechEngine` backed by Runtype's `/v1/agents/:id/speak`. */ export class RuntypeSpeechEngine implements SpeechEngine { readonly id = "runtype-tts"; // The PCM player pauses/resumes via AudioContext.suspend() — solid, unlike // speechSynthesis.pause(). readonly supportsPause = true; private player: PcmStreamPlayer | null = null; private playerPromise: Promise | null = null; // Bumped on every speak()/stop() so a superseded request's async callbacks and // its in-flight stream read loop become no-ops. private generation = 0; constructor(private readonly opts: RuntypeSpeechEngineOptions) {} // Create one player lazily (a worklet engine's addModule is async), then reuse // it across speaks; flush() between replies clears the queue without tearing // down the AudioContext. Defaults to the in-bundle AudioPlaybackManager. private ensurePlayer(): Promise { return (this.playerPromise ??= Promise.resolve( this.opts.createPlaybackEngine ? this.opts.createPlaybackEngine() : new AudioPlaybackManager(24000, { prebufferMs: this.opts.prebufferMs ?? 200, }), ).then((player) => (this.player = player))); } speak(request: SpeechRequest, callbacks: SpeechCallbacks): void { const gen = ++this.generation; // Run the async fetch/stream without making speak() itself async — the // widget surfaces the time until audio starts as the "loading" state. void this.run(gen, request, callbacks); } private async run( gen: number, request: SpeechRequest, callbacks: SpeechCallbacks, ): Promise { try { const player = await this.ensurePlayer(); if (gen !== this.generation) return; // superseded while the worklet booted player.flush(); // drop any prior playback (and its callbacks) player.resume(); // clear a prior pause so this reply isn't stuck suspended // Drive read-aloud state from the player, not from chunk arrival: onStarted // fires when the prebuffer fills and audio is actually audible (loading -> // playing); onFinished when it drains (-> idle). An empty stream produces // no onStarted and an immediate onFinished, so the UI skips straight back // to idle without a phantom "playing". player.onStarted(() => { if (gen === this.generation) callbacks.onStart?.(); }); player.onFinished(() => { if (gen === this.generation) callbacks.onEnd?.(); }); const url = `${normalizeHost(this.opts.host)}/v1/agents/${encodeURIComponent( this.opts.agentId, )}/speak`; const res = await fetch(url, { method: "POST", headers: { "Content-Type": "application/json", // Match Runtype's client-token auth convention. Never placed in the // URL/query string. Authorization: `Bearer ${this.opts.clientToken}`, }, body: JSON.stringify({ text: request.text, voice: request.voice ?? this.opts.voice, format: "pcm", }), }); if (gen !== this.generation) return; // superseded while awaiting headers if (!res.ok || !res.body) throw new Error(await describeError(res)); const reader = res.body.getReader(); for (;;) { const { done, value } = await reader.read(); if (gen !== this.generation) { // A newer speak()/stop() won — stop pulling and release the stream. await reader.cancel().catch(() => {}); return; } if (done) break; if (value && value.byteLength > 0) player.enqueue(value); } player.markStreamEnd(); } catch (err) { if (gen !== this.generation) return; // error from a superseded request const error = err instanceof Error ? err : new Error(String(err)); this.opts.onError?.(error); // surface the reason (log, telemetry, …) callbacks.onError?.(error); // and let the widget (or fallback) react } } pause(): void { this.player?.pause(); } resume(): void { this.player?.resume(); } stop(): void { this.generation++; // invalidate any in-flight stream + pending onFinished this.player?.flush(); } destroy(): void { this.generation++; void this.player?.destroy(); this.player = null; this.playerPromise = null; } } /** Best-effort human-readable message from a non-OK speak response. */ async function describeError(res: Response): Promise { try { const data = (await res.json()) as { error?: string; detail?: string }; return data.detail ? `${data.error ?? `Runtype TTS ${res.status}`}: ${data.detail}` : data.error ?? `Runtype TTS request failed (${res.status})`; } catch { return `Runtype TTS request failed (${res.status})`; } }