'use client';

import type * as React from 'react';
import { useCallback, useEffect, useRef } from 'react';
import { AlertCircle, Loader2, Mic } from 'lucide-react';
import { useCountdownFromSeconds, useNotificationSounds } from '@djangocfg/ui-core/hooks';
import { cn } from '@djangocfg/ui-core/lib';
import { Tooltip, TooltipContent, TooltipTrigger } from '@djangocfg/ui-core/components';
import { useActiveComposer } from '@djangocfg/ui-tools/composer-registry';
import { useSpeechRecognition } from '../hooks/useSpeechRecognition';
import { useVoiceSupport } from '../hooks/useVoiceSupport';
import { getSpeechLogger } from '../core/logger';
import { normaliseFinal } from '../core/transcript';
import { DEFAULT_VOICE_SOUNDS, type VoiceSoundEvent } from '../core/audio/defaults';
import { RecordingPulse } from '../components/RecordingPulse';
import type { RecognitionEngine } from '../types';

/** High-level visual state the composer mic button reflects. */
export type VoiceSlotState = 'idle' | 'listening' | 'processing' | 'error';

/** Shared speech logger; this file uses its `slot`, `composer` and `dictation` channels. */
const log = getSpeechLogger();

/** Props accepted by `VoiceComposerSlot`. Every prop is optional. */
export interface VoiceComposerSlotProps {
  /**
   * Controlled composer value. Optional — when omitted, the slot
   * reads/writes through the registered `ComposerHandle` (the chat's
   * built-in composer and any host that calls
   * `useRegisterComposer({ getValue, setValue })`). Pass explicitly
   * only for standalone usage outside a chat.
   */
  value?: string;
  /** Composer setter — see `value`. Optional when used inside a chat. */
  onChange?: (next: string) => void;
  /** Optional custom engine (Deepgram / HTTP / WS). Defaults to Web Speech. */
  engine?: RecognitionEngine;
  /** BCP-47 language override. Otherwise `useSpeechPrefs` decides. */
  language?: string;
  /**
   * Hide the button if the host wants to disable voice on phones.
   * @default false
   */
  hideOnMobile?: boolean;
  /**
   * Max session length in seconds before we auto-stop.
   * @default 90
   */
  maxSeconds?: number;
  /**
   * Auto-stop after this many ms of silence.
   * @default 2500
   */
  silenceMs?: number;
  /**
   * Button size.
   * @default 'md'
   */
  size?: 'sm' | 'md' | 'lg';
  /** Override classes on the button. */
  className?: string;
  /**
   * Fires when dictation finishes — useful for parent-side analytics.
   * Receives the composer text as resolved at stop time (typed prefix
   * plus dictated text), not only the newly spoken words.
   */
  onFinish?: (transcript: string) => void;
  /**
   * Start/stop earcons. `true` (default) plays the bundled sounds,
   * `false` disables them, or pass `{ start, stop }` data-URLs to
   * override. Master mute lives in `useNotificationSounds` localStorage
   * — users can silence everything via their own UI.
   */
  sounds?: boolean | { start?: string; stop?: string };
  /** Notified whenever the high-level visual state changes. */
  onStateChange?: (state: VoiceSlotState) => void;
}

/**
 * Per-size geometry. Mirrors the chat composer action-bar buttons
 * (`composer-kit/ComposerButton` `BUTTON_SIZE`) so the slot drops into
 * `composerSlots.inlineEnd` round and correctly sized with no host CSS.
 *
 * Keyed by the non-optional `size` union so adding a size to the props
 * without adding its classes here is a compile error.
 */
const SIZE_CLS: Record<NonNullable<VoiceComposerSlotProps['size']>, string> = {
  sm: 'h-7 w-7 [&_svg]:size-4',
  md: 'h-9 w-9 [&_svg]:size-[1.125rem]',
  lg: 'h-11 w-11 [&_svg]:size-5',
};

/** Storage key handed to `useNotificationSounds` for this slot's earcons. */
const STORAGE_KEY = 'djangocfg-stt:voice-sounds';

/**
 * Drop-in microphone slot for the chat composer — pass it as a raw node
 * via `composerSlots.inlineEnd` (inline layout) or
 * `composerSlots.blockStart`.
 *
 * Renders a microphone button — but only when the browser + device
 * combination can actually do speech recognition. Firefox, in-app
 * WebViews, and missing `getUserMedia` all collapse the component to
 * `null`, so the chat composer never shows a broken affordance.
 *
 * While listening, interim+final transcript is pushed live into the
 * composer's `value` — like dictation on iOS / Android keyboards. The
 * user's already-typed prefix is preserved (anchored on press). Stop
 * with the same button; while listening, Escape aborts the session and
 * Enter (without Shift) finishes it instead of sending the message.
 *
 * Soft start/stop earcons play by default. Pass `sounds={false}` to
 * mute, or `sounds={{ start, stop }}` to override the audio URLs.
*/
export function VoiceComposerSlot({
  value,
  onChange,
  engine,
  language,
  hideOnMobile = false,
  maxSeconds = 90,
  silenceMs = 2500,
  size = 'md',
  className,
  onFinish,
  sounds = true,
  onStateChange,
}: VoiceComposerSlotProps): React.ReactElement | null {
  // Capability probe for the chosen engine. Only consulted after every
  // hook below has run (see the early returns near the bottom).
  const support = useVoiceSupport(engine);

  // Read the active composer handle from the cross-tool registry
  // (`@djangocfg/ui-tools/composer-registry`). The built-in composer
  // (and TipTap hosts via `useRegisterComposer`) publish their handle
  // to this registry on mount. Falls back to a no-op when nothing is
  // registered (no composer in the tree).
  const activeComposer = useActiveComposer();
  // Mirror the handle into a ref, refreshed on every render, so the
  // memoised callbacks below read the latest handle without listing it
  // as a dependency.
  const composerHandleRef = useRef(activeComposer);
  composerHandleRef.current = activeComposer;

  // Debug trace of how the slot is wired.
  // NOTE(review): `value` and `onChange` are dependencies, so this
  // 'mount' entry is logged again whenever either changes (e.g. on each
  // controlled-value update), not only on mount — confirm that is wanted.
  useEffect(() => {
    log.slot.debug('mount', {
      supported: support.supported,
      reason: support.reason,
      hasComposerHandle: !!activeComposer,
      hasExplicitValue: value !== undefined,
      hasOnChange: !!onChange,
    });
  }, [support.supported, support.reason, activeComposer, value, onChange]);

  // Resolve value/onChange: prop wins; otherwise pull from the
  // registered composer handle. The slot can therefore be dropped into
  // `composerBlockStart` of `ChatRoot` with zero props.
  // Reader: returns the `value` prop when defined, else the handle's
  // `getValue()`, else the empty string.
  const resolvedGetValue = useCallback((): string => {
    if (value !== undefined) return value;
    return composerHandleRef.current?.getValue?.() ?? '';
  }, [value]);

  // Writer: `onChange` prop first; otherwise the handle's `setValue`.
  // With neither available the text is dropped and a warning is logged.
  const resolvedSetValue = useCallback(
    (next: string): void => {
      if (onChange) {
        log.composer.debug('setValue → onChange prop', { len: next.length });
        onChange(next);
        return;
      }
      const handle = composerHandleRef.current;
      if (!handle?.setValue) {
        // NOTE(review): the second sentence of this message reads as if
        // two names went missing ("Make sure … lives inside a …") —
        // confirm the intended wording against the original file.
        log.composer.warn(
          'setValue called but no composer handle is registered — text will be lost. ' +
            'Make sure lives inside a (or pass `value`/`onChange` props for standalone use).',
          { len: next.length },
        );
        return;
      }
      log.composer.debug('setValue → composer handle', { len: next.length });
      handle.setValue(next);
    },
    [onChange],
  );

  // Anchor: what was already in the textarea when the user pressed the
  // mic. Live transcript is appended to this baseline so manual typing
  // before pressing the button is never overwritten.
  const anchorRef = useRef('');
  // Latest `onFinish` kept in a ref so the recognition callbacks always
  // call the current prop.
  const onFinishRef = useRef(onFinish);
  onFinishRef.current = onFinish;

  // Push caret to the end on the next frame — after React commits the
  // new `value` into the DOM. Without the rAF the selection lands on
  // the old text length, leaving the cursor mid-string while the live
  // transcript visually keeps growing.
  // No-op when the handle does not expose `moveCursorToEnd`. The handle
  // is re-read inside the frame callback in case it changed meanwhile.
  const pinCaretToEnd = useCallback(() => {
    const handle = composerHandleRef.current;
    if (!handle?.moveCursorToEnd) return;
    requestAnimationFrame(() => {
      composerHandleRef.current?.moveCursorToEnd?.();
    });
  }, []);

  const [countdown, startCountdown] = useCountdownFromSeconds();

  // Earcon bus. `sounds === false` → no sound map is passed (`undefined`)
  // and `muted` is set, so the bus stays silent. Object overrides merge
  // with bundled defaults.
  const soundMap =
    sounds === false
      ? undefined
      : sounds === true
        ? DEFAULT_VOICE_SOUNDS
        : { ...DEFAULT_VOICE_SOUNDS, ...sounds };
  const audio = useNotificationSounds({
    storageKey: STORAGE_KEY,
    sounds: soundMap,
    muted: sounds === false,
    // Both earcons stay deliberately quiet — they're self-initiated
    // micro-confirmations, not notifications. Anything louder feels
    // attention-grabbing when the user pressed the button themselves.
    eventVolumes: { start: 0.35, stop: 0.5 },
  });

  // Interim result: show anchor + partial text in the composer. The
  // anchor itself is NOT advanced here, so the next partial replaces
  // this one instead of stacking on top of it.
  const handlePartial = useCallback(
    (text: string) => {
      log.dictation.debug('partial', {
        len: text.length,
        anchorLen: anchorRef.current.length,
      });
      const next = anchorRef.current ? `${anchorRef.current} ${text}` : text;
      resolvedSetValue(next);
      pinCaretToEnd();
    },
    [pinCaretToEnd, resolvedSetValue],
  );

  // Final result: normalise, append to the anchor, and promote the
  // merged text to be the new anchor so later results build on it.
  // Results that are empty after normalisation are ignored.
  const handleFinal = useCallback(
    (text: string) => {
      const clean = normaliseFinal(text);
      if (!clean) {
        log.dictation.debug('final dropped — empty after normalise', { raw: text });
        return;
      }
      const merged = anchorRef.current ? `${anchorRef.current} ${clean}` : clean;
      log.dictation.info('final merged', {
        len: clean.length,
        totalLen: merged.length,
      });
      anchorRef.current = merged;
      resolvedSetValue(merged);
      pinCaretToEnd();
    },
    [pinCaretToEnd, resolvedSetValue],
  );

  // Recognition session. Auto-stop limits come from the props:
  // `silenceMs` as given, `maxSeconds` converted to milliseconds.
  const rec = useSpeechRecognition({
    engine,
    language,
    interim: true,
    autoStop: { silenceMs, maxMs: maxSeconds * 1000 },
    onPartial: (text) => handlePartial(text),
    onFinal: (text) => handleFinal(text),
    onStart: () => {
      void audio.play('start');
      // Focus the composer + park caret at the end so the live
      // transcript visibly grows where the user expects it to.
      composerHandleRef.current?.focus();
      pinCaretToEnd();
    },
    onStop: () => {
      void audio.play('stop');
      // Re-focus on stop too — auto-stop on silence happens without
      // a user gesture, and we want the user to keep typing seamlessly.
      composerHandleRef.current?.focus();
      pinCaretToEnd();
      // Report the composer text as resolved right now (prop value or
      // handle value), i.e. prefix plus dictation.
      onFinishRef.current?.(resolvedGetValue());
    },
  });

  // Drive the countdown alongside the listening state.
  useEffect(() => {
    if (rec.status === 'listening') {
      startCountdown(maxSeconds);
    }
  }, [rec.status, maxSeconds, startCountdown]);

  // Hotkeys while listening:
  //   Esc   — cancel dictation (and stop event propagation so the
  //           outer chat doesn't *also* close — same convention as
  //           ChatGPT / Slack voice mode).
  //   Enter — finish dictation, KEEP what we already pushed into the
  //           composer, do NOT submit the chat (avoids accidental
  //           sends while the user is still talking). We block the
  //           Enter that would otherwise reach the composer textarea
  //           by listening in the capture phase.
  // `starting` counts as listening, so the hotkeys and the "stop"
  // behaviour of the button are active as soon as the user presses it.
  const listening = rec.status === 'listening' || rec.status === 'starting';
  useEffect(() => {
    if (!listening) return undefined;
    const onKey = (e: KeyboardEvent): void => {
      if (e.key === 'Escape') {
        e.preventDefault();
        e.stopPropagation();
        rec.abort();
        return;
      }
      // Shift+Enter is left alone and reaches the composer as usual.
      if (e.key === 'Enter' && !e.shiftKey) {
        // Block the chat composer's "Enter to send" while we're
        // dictating — finish recording instead.
        e.preventDefault();
        e.stopPropagation();
        void rec.stop();
      }
    };
    // `capture: true` so we run before the composer textarea's
    // keydown handler (which would otherwise close the chat on Esc
    // or submit the form on Enter).
    window.addEventListener('keydown', onKey, true);
    return () => {
      window.removeEventListener('keydown', onKey, true);
    };
    // NOTE(review): `rec` is a dependency, so the listener is re-attached
    // whenever its identity changes — presumably harmless; verify the
    // hook returns a stable object if the churn matters.
  }, [listening, rec]);

  // Button handler: stop an active (or starting) session; otherwise
  // snapshot the trimmed composer text as the anchor and start a new one.
  const toggle = useCallback(() => {
    if (rec.status === 'listening' || rec.status === 'starting') {
      void rec.stop();
      return;
    }
    anchorRef.current = resolvedGetValue().trim();
    void rec.start();
  }, [rec, resolvedGetValue]);

  // Derive the high-level visual state from the engine status. The
  // engine owns the truth — `processing` is the post-speech tail while
  // the final result settles (`stopping`), `error` is a recoverable
  // failure that resets to `idle` on the next press.
  const stopping = rec.status === 'stopping';
  const slotState: VoiceSlotState = listening
    ? 'listening'
    : stopping
      ? 'processing'
      : rec.status === 'error'
        ? 'error'
        : 'idle';

  // Notify the host on every transition (after render, ref-guarded so
  // identical states never fire twice).
  // NOTE(review): this ref is created without a type argument but later
  // receives `slotState`; it presumably needs
  // `useRef<VoiceSlotState | null>(null)` — confirm against the original.
  const lastStateRef = useRef(null);
  const onStateChangeRef = useRef(onStateChange);
  onStateChangeRef.current = onStateChange;
  useEffect(() => {
    if (lastStateRef.current === slotState) return;
    lastStateRef.current = slotState;
    onStateChangeRef.current?.(slotState);
  }, [slotState]);

  // Early exits sit below all hook calls so the hook order is the same
  // on every render. Render nothing when recognition is unsupported, or
  // on mobile when the host asked for the button to be hidden there.
  if (!support.supported) return null;
  if (hideOnMobile && support.isMobile) return null;

  // Tooltip: countdown + hotkey hint while listening, error copy on
  // failure, plain prompt otherwise.
  // Falls back to "<maxSeconds>s left" while the countdown label is empty.
  const tooltip =
    slotState === 'listening'
      ? `Listening — ${countdown.label || `${maxSeconds}s left`} · Enter to finish · Esc to cancel`
      : slotState === 'processing'
        ? 'Transcribing…'
        : slotState === 'error'
          ? rec.error?.message || 'Dictation failed — tap to retry'
          : 'Dictate message';

  // Accessible name: `processing` shares the idle wording.
  const ariaLabel =
    slotState === 'listening'
      ? 'Stop dictation'
      : slotState === 'error'
        ? 'Dictation failed — retry'
        : 'Dictate message';

  // NOTE(review): the element tags of the JSX tree below are not visible
  // in this view — only the expression containers and text children
  // remain. `toggle`, `ariaLabel`, `SIZE_CLS`, `size`, `className` and
  // the imported tooltip/icon components are presumably consumed by the
  // missing markup. Restore it from the original file; do not edit this
  // expression as shown.
  return (
    {slotState === 'listening' && countdown.label ? (
      {countdown.label}
    ) : null}
    {slotState === 'error' ? (
      Failed
    ) : null}
    {tooltip}
  );
}