import {Script, VisemeWeights, ZERO_VISEME} from 'xrblocks';

import {computeAudioFeatures} from './computeAudioFeatures';
import {FormantVisemeMapper} from './FormantVisemeMapper';

/** Default `AnalyserNode.fftSize` when the caller does not supply one. */
const DEFAULT_FFT_SIZE = 1024;

/** Default RMS level below which a frame counts as silent. */
const DEFAULT_SILENCE_THRESHOLD = 0.01;

/** Default continuous-silence duration (ms) before the mouth may close. */
const DEFAULT_SILENCE_HOLD_MS = 150;

/**
 * Once in silence, RMS must reach `silenceThreshold` times this factor
 * before the driver treats the signal as speech again.
 */
const SILENCE_EXIT_FACTOR = 1.25;

/** `AnalyserNode.smoothingTimeConstant` applied to the FFT magnitudes. */
const ANALYSER_SMOOTHING = 0.7;

/** Delta time (seconds) assumed on the first frame, before any history. */
const FIRST_FRAME_DT = 0.016;

/** Lower clamp (seconds) for the per-frame delta handed to the mapper. */
const MIN_DT = 0.001;

/** Upper clamp (seconds) for the per-frame delta handed to the mapper. */
const MAX_DT = 0.1;

/**
 * Minimal duck-typed target that {@link LipsyncMouth} drives every
 * frame. Anything with a `setVisemes` method satisfies the contract —
 * the canonical implementation is xrblocks core's {@link StylizedFace},
 * which sits on every netblocks `RemoteUserAvatar` out of the box.
 */
export interface VisemeTarget {
  setVisemes(visemes: VisemeWeights): void;
}

export interface LipsyncMouthOptions {
  /**
   * The face this lipsync driver animates. Required. Caller owns the
   * target — `LipsyncMouth.dispose()` will never dispose it. Pass the
   * `face` field on a netblocks avatar (`user.avatar.face`), or build
   * a standalone `StylizedFace` for a non-multiplayer puppet.
   */
  target: VisemeTarget;

  /**
   * Reuse an existing `AudioContext` instead of creating a new one.
   * Browsers cap the number of contexts per page (typically 6-8), so
   * when driving multiple peer streams (one driver per peer) pass the
   * shared context from `xb.core.sound.listener.context` or
   * `THREE.AudioContext.getContext()`. When provided, this class will
   * not close the context on `dispose()`.
   */
  audioContext?: AudioContext;

  /** AnalyserNode FFT size; must be a power of two. Defaults to 1024. */
  fftSize?: number;

  /**
   * Below this RMS the mouth enters its "silence" path. Default 0.01.
   */
  silenceThreshold?: number;

  /**
   * Minimum continuous silence duration (ms) before the mouth starts
   * closing. Brief sub-threshold gaps shorter than this (plosive
   * stops, breaths, syllable boundaries) leave the mouth held in place
   * so it doesn't jitter. Once exceeded, the mapper's natural
   * smoothing decays the mouth to rest. Default 150.
   */
  silenceHoldMs?: number;
}

/**
 * `LipsyncMouth` reads audio from a `MediaStream`, runs an FFT +
 * formant-based viseme mapper on it every frame, and writes the
 * resulting viseme weights to a {@link VisemeTarget} (typically a
 * {@link StylizedFace}). It owns no visual of its own — the face you
 * pass via `target` is the only thing on screen.
 *
 * Extends `Script` so the xrblocks scripts manager calls `init()` once
 * the instance is part of the active scene and `update(time)` every
 * frame. `dispose()` is called automatically by the scripts manager on
 * the next sync after the instance is removed from the scene graph; it
 * disconnects audio nodes and releases internal state. It
 * deliberately never stops the input `MediaStream` tracks (the caller
 * owns those), never closes a caller-supplied `AudioContext`, and
 * never disposes the target face (the avatar / host owns that too).
 *
 * Instances are one-shot: after `dispose()` runs (i.e. once the script
 * has been removed from the scene), do NOT re-add the same instance.
 * Construct a new `LipsyncMouth` for the next attachment.
 *
 * Standalone (e.g. puppet sample):
 *
 * ```ts
 * const face = new StylizedFace({showEyes: false});
 * puppetHead.add(face);
 * const driver = new LipsyncMouth(micStream, {target: face});
 * puppetHead.add(driver);
 * ```
 *
 * Multiplayer netblocks avatar:
 *
 * ```ts
 * session.voice.onTrack((peerId, stream) => {
 *   const user = session.users.get(peerId)!;
 *   const driver = new LipsyncMouth(stream, {
 *     target: user.avatar.face,
 *     audioContext: THREE.AudioContext.getContext(),
 *   });
 *   user.avatar.add(driver);
 * });
 * ```
 */
export class LipsyncMouth extends Script {
  /** The face this driver animates. Caller-owned. */
  readonly target: VisemeTarget;

  private readonly stream: MediaStream;
  private readonly fftSize: number;
  private readonly silenceThreshold: number;
  private readonly silenceHoldMs: number;
  /** True when the caller supplied the `AudioContext`; we must not close it. */
  private readonly externalContext: boolean;

  private ctx?: AudioContext;
  private source?: MediaStreamAudioSourceNode;
  private analyser?: AnalyserNode;

  // TS lib.dom expects Uint8Array<ArrayBuffer> on the analyser methods,
  // not the looser Uint8Array<ArrayBufferLike> that the default
  // `new Uint8Array(n)` produces. Pin the buffer type so the calls
  // below type-check under strict rollup-typescript.
  private freqData?: Uint8Array<ArrayBuffer>;
  private timeData?: Uint8Array<ArrayBuffer>;

  /** Muted `<audio>` element that keeps the stream flowing (see `init`). */
  private primer?: HTMLAudioElement;

  private readonly mapper = new FormantVisemeMapper();

  /**
   * Timestamp (ms) of the previous `update` call, or null before the
   * first frame. A null sentinel is used rather than 0 so that a
   * legitimate timestamp of 0 is not mistaken for "no previous frame".
   */
  private lastTime: number | null = null;

  /**
   * Timestamp (ms, on the same clock `update` uses) at which the current
   * silence run started, or null while not in silence.
   */
  private silenceSinceMs: number | null = null;

  /**
   * @param stream Audio source to analyse. Its tracks are never stopped
   *     by this class.
   * @param opts Driver configuration; `opts.target` is required.
   */
  constructor(stream: MediaStream, opts: LipsyncMouthOptions) {
    super();
    this.stream = stream;
    this.target = opts.target;
    this.fftSize = opts.fftSize ?? DEFAULT_FFT_SIZE;
    this.silenceThreshold = opts.silenceThreshold ?? DEFAULT_SILENCE_THRESHOLD;
    this.silenceHoldMs = opts.silenceHoldMs ?? DEFAULT_SILENCE_HOLD_MS;
    this.externalContext = !!opts.audioContext;
    this.ctx = opts.audioContext;
  }

  /**
   * Builds the audio graph (`MediaStreamAudioSourceNode` → `AnalyserNode`),
   * allocates the analysis buffers, and attaches a muted `<audio>`
   * primer element to the stream when a DOM is available.
   */
  override async init(): Promise<void> {
    if (!this.ctx) {
      // Fall back to a fresh context; prefer this only for single-mouth
      // demos. Multi-peer callers should pass a shared context.
      this.ctx = new AudioContext();
    }
    const ctx = this.ctx;

    // Browsers create the shared AudioContext suspended until a user
    // gesture. resume() is a no-op when already running.
    void ctx.resume?.().catch(() => undefined);

    const source = ctx.createMediaStreamSource(this.stream);
    const analyser = ctx.createAnalyser();
    analyser.fftSize = this.fftSize;
    // Higher smoothingTimeConstant = more time-averaging on the FFT
    // magnitudes upstream of everything; cuts the per-frame bin jitter
    // that destabilises formant peak picking on sustained vowels.
    analyser.smoothingTimeConstant = ANALYSER_SMOOTHING;
    source.connect(analyser);

    this.source = source;
    this.analyser = analyser;
    this.freqData = new Uint8Array(new ArrayBuffer(analyser.frequencyBinCount));
    this.timeData = new Uint8Array(new ArrayBuffer(analyser.fftSize));

    // Chromium WebRTC quirk: a MediaStreamAudioSourceNode built from a
    // remote stream stays silent unless the stream is also being pumped
    // by an HTMLMediaElement. Same fix SpatialVoice uses. Harmless for
    // local mic streams.
    if (typeof document !== 'undefined') {
      const primer = document.createElement('audio');
      primer.muted = true;
      primer.autoplay = true;
      primer.srcObject = this.stream;
      // play() returns a Promise on modern browsers, undefined on older
      // ones (and in jsdom). Optional-chain both.
      const playP = primer.play();
      playP?.catch?.(() => undefined);
      this.primer = primer;
    }
  }

  /**
   * Samples the analyser, derives audio features, applies the silence
   * hold, and pushes the mapper's viseme weights to the target.
   *
   * @param time Frame timestamp in milliseconds. Falls back to
   *     `performance.now()` when omitted.
   */
  override update(time?: number): void {
    const {analyser, freqData, timeData, ctx} = this;
    // Nothing to do before init() has run or after dispose().
    if (!analyser || !freqData || !timeData || !ctx) return;

    // xrblocks passes `time` in milliseconds (matches the rest of the
    // codebase; see e.g. netblocks samples). Convert to seconds so the
    // mapper's `1 - exp(-dt / tau)` smoothing stays frame-rate
    // independent across 60/72/90/120 Hz XR refresh.
    const nowMs = typeof time === 'number' ? time : performance.now();
    const dt =
      this.lastTime !== null
        ? Math.max(MIN_DT, Math.min(MAX_DT, (nowMs - this.lastTime) / 1000))
        : FIRST_FRAME_DT;
    this.lastTime = nowMs;

    analyser.getByteFrequencyData(freqData);
    analyser.getByteTimeDomainData(timeData);
    const features = computeAudioFeatures({freqData, timeData}, ctx.sampleRate);

    // Silence hysteresis: brief sub-threshold gaps (plosive stops,
    // breaths, syllable boundaries) hold the previous visemes so the
    // mouth doesn't jitter. Only after `silenceHoldMs` of continuous
    // silence do we let the mapper's smoothing close the mouth.
    // Schmitt-trigger style: once silent, we only exit silence when
    // RMS rises clearly above the threshold (× 1.25), so mic noise
    // hovering near `silenceThreshold` doesn't keep resetting the
    // hold timer and prevent the mouth from ever closing.
    const inSilence = this.silenceSinceMs !== null;
    const activeThreshold = inSilence
      ? this.silenceThreshold * SILENCE_EXIT_FACTOR
      : this.silenceThreshold;

    if (features.rms < activeThreshold) {
      if (this.silenceSinceMs === null) this.silenceSinceMs = nowMs;
      // Still inside the hold window: leave the target untouched.
      if (nowMs - this.silenceSinceMs < this.silenceHoldMs) return;
    } else {
      this.silenceSinceMs = null;
    }

    const visemes = this.mapper.update(features, dt);
    this.target.setVisemes(visemes);
  }

  /**
   * Tears down the audio graph and primer element, closes the
   * `AudioContext` only if this instance created it, and resets the
   * target to `ZERO_VISEME`. Every step is best-effort: failures are
   * ignored so that one failing step cannot block the rest of the teardown.
   */
  override dispose(): void {
    try {
      this.source?.disconnect();
    } catch {
      // ignore
    }
    try {
      this.analyser?.disconnect();
    } catch {
      // ignore
    }

    if (this.primer) {
      try {
        this.primer.pause();
      } catch {
        // ignore
      }
      this.primer.srcObject = null;
      this.primer = undefined;
    }

    if (this.ctx && !this.externalContext) {
      // Only close contexts we created.
      void this.ctx.close?.().catch(() => undefined);
    }

    this.source = undefined;
    this.analyser = undefined;
    this.freqData = undefined;
    this.timeData = undefined;
    this.ctx = undefined;

    // Reset the target to its rest pose. Without this, a peer who
    // disabled their mic mid-vowel would leave their avatar's mouth
    // frozen open until something else drove the face. We never
    // dispose the target itself — the caller owns its lifetime.
    try {
      this.target.setVisemes(ZERO_VISEME);
    } catch {
      // ignore — target may already be disposed by the host (e.g.
      // the avatar was removed in the same frame).
    }
  }
}