/**
 * Public types for the SpeechRecognition tool.
 *
 * Design: a small `RecognitionEngine` interface lets consumers plug in
 * any STT backend (browser Web Speech, Deepgram, Whisper, custom WS).
 * The hooks/UI never depend on a specific engine.
 */

/** Session status exposed to consumers via `UseSpeechRecognitionReturn.status`. */
export type RecognitionStatus =
  | 'idle'
  | 'starting'
  | 'listening'
  | 'stopping'
  | 'error';

/** Engine-level state, reported by an engine through the `state` event. */
export type EngineState =
  | 'idle'
  | 'connecting'
  | 'listening'
  | 'closing'
  | 'closed'
  | 'error';

/** Machine-readable category attached to every `RecognitionError`. */
export type RecognitionErrorCode =
  | 'unsupported'
  | 'permission-denied'
  | 'no-microphone'
  | 'network'
  | 'aborted'
  | 'no-speech'
  | 'language'
  | 'engine'
  | 'unknown';

/** Normalised error shape shared by engines and the hook. */
export interface RecognitionError {
  code: RecognitionErrorCode;
  /** Human-readable description of the failure. */
  message: string;
  /** Underlying error/value, if any. Typed `unknown`: narrow before use. */
  cause?: unknown;
}

/** One recognised chunk of speech, either interim or final. */
export interface Segment {
  id: string;
  text: string;
  isFinal: boolean;
  /** Engine-provided confidence 0..1 if available. */
  confidence?: number;
  /** ms since session start. */
  startedAt: number;
  endedAt?: number;
  /**
   * Pass-through metadata from custom engines (diarization, lang, …).
   * Values are `unknown` so consumers must narrow before reading them.
   */
  metadata?: Record<string, unknown>;
}

/** Aggregated view of everything recognised in the current session. */
export interface Transcript {
  /** Latest interim text (not yet final). Empty string when none. */
  interim: string;
  /** Concatenated final text (all segments joined with " "). */
  final: string;
  /** Full segment list including the trailing interim segment if any. */
  segments: Segment[];
}

// ── engine contract ────────────────────────────────────────────────────────

/** Options handed to `RecognitionEngine.start`. */
export interface EngineStartOptions {
  language: string;
  /** Whether the engine should emit partial/interim results. */
  interim: boolean;
  deviceId?: string;
  signal?: AbortSignal;
}

/** Maps each engine event name to the listener signature it expects. */
export type EngineEventMap = {
  partial: (text: string, segmentId: string) => void;
  final: (text: string, segmentId: string, confidence?: number) => void;
  error: (err: RecognitionError) => void;
  state: (state: EngineState) => void;
};

/** Function returned by `RecognitionEngine.on`; call it to unsubscribe. */
export type Unsub = () => void;

/** Contract every speech-to-text backend must implement. */
export interface RecognitionEngine {
  readonly id: string;
  readonly isSupported: boolean;
  /**
   * Whether this engine captures audio through the browser microphone
   * (`navigator.mediaDevices.getUserMedia`). `true` (default when
   * omitted) for the browser-native engines — Web Speech, HTTP and
   * WebSocket all open a `getUserMedia` stream. `false` for engines
   * that own capture outside the browser (`createExternalEngine` —
   * Wails / Tauri / native whisper sidecar), so consumers must NOT gate
   * them on `getUserMedia` being present.
   */
  readonly usesMicrophone?: boolean;
  start(opts: EngineStartOptions): Promise<void>;
  stop(): Promise<void>;
  abort(): void;
  /**
   * Subscribe to an engine event. `K` ties the event name to its listener
   * signature in `EngineEventMap`, so `cb` is checked per event.
   *
   * @returns A function that removes the subscription.
   */
  on<K extends keyof EngineEventMap>(event: K, cb: EngineEventMap[K]): Unsub;
  /**
   * Optional — engines that capture mic audio themselves (HTTP / WS)
   * may expose the active `MediaStream` so consumers can wire up a
   * VU meter or waveform without owning a second `getUserMedia` call.
   */
  getStream?(): MediaStream | null;
}

// ── hook config ────────────────────────────────────────────────────────────

/** Conditions under which a session should stop automatically. */
export interface AutoStopOptions {
  /** Stop after this many ms of silence (RMS below threshold). */
  silenceMs?: number;
  /** Hard cap on session length. */
  maxMs?: number;
  /** RMS threshold below which we count "silence". 0..1. Default 0.02. */
  silenceThreshold?: number;
}

/** Configuration accepted by the speech-recognition hook. */
export interface UseSpeechRecognitionConfig {
  engine?: RecognitionEngine;
  language?: string;
  interim?: boolean;
  deviceId?: string;
  autoStop?: AutoStopOptions;
  onFinal?: (text: string, segment: Segment) => void;
  onPartial?: (text: string, segment: Segment) => void;
  onError?: (err: RecognitionError) => void;
  onStart?: () => void;
  onStop?: () => void;
}

/** Value returned by the speech-recognition hook. */
export interface UseSpeechRecognitionReturn {
  status: RecognitionStatus;
  isSupported: boolean;
  transcript: Transcript;
  error: RecognitionError | null;
  /** RMS level 0..1 for VU-meters. */
  level: number;
  start(): Promise<void>;
  stop(): Promise<void>;
  abort(): void;
  // NOTE(review): presumably starts when not listening and stops otherwise —
  // confirm against the hook implementation.
  toggle(): Promise<void>;
  reset(): void;
}