import type {VisemeWeights} from './BlendshapeReducer'; import {ZERO_VISEME} from './BlendshapeReducer'; /** * Per-frame audio features the FormantVisemeMapper consumes. Produced by * the audio pipeline (AnalyserNode + FFT analysis) and shared between the * formant heuristic and any optional learned mapper. */ export interface AudioFeatures { /** Root-mean-square amplitude in [0, 1]; used for voicing + jaw drive. */ rms: number; /** Spectral centroid in Hz; used as a brightness proxy for sibilance. */ centroid: number; /** Low-band, mid-band, and high-band energy fractions. */ low: number; mid: number; high: number; /** Estimated first and second formant in Hz; 0 when unknown. */ f1Hz: number; f2Hz: number; /** Voicing decision (true when periodic energy is present). */ voiced: boolean; } export interface FormantVisemeMapperOptions { /** * Time constants (seconds) for the exponential smoothing of each output * channel. Smaller means snappier. Independent of frame rate. */ vowelTau?: number; consonantTau?: number; } const DEFAULT_VOWEL_TAU = 0.1; const DEFAULT_CONSONANT_TAU = 0.07; const DEFAULT_FORMANT_TAU = 0.1; // Seconds of contiguous silence after which the smoothed F1/F2 cache // is reset, so the first voiced frame of a new utterance doesn't blend // from the previous vowel's stale formants. const FORMANT_DECAY_AFTER_SILENCE_S = 0.25; /** * Heuristic audio-to-viseme mapper based on the first two formants. Vowel * identity in speech is set by F1/F2: * * "aa" = F1 high (~700-900 Hz) * "ee" = F1 low (~250-400 Hz) + F2 high (~2000-2500 Hz) * "oo" = F1 low (~300-450 Hz) + F2 low (~700-1000 Hz) * * Consonants are characterised by high-band sibilance (fricatives) or * very low RMS during stops. * * Smoothing uses `1 - exp(-dt / tau)`, which gives the same time-to-target * regardless of frame rate (important on XR devices that run at 60, 72, * 90, or 120 Hz). The `dt` argument to `update()` is the seconds since * the previous frame. */ export class FormantVisemeMapper { private current: VisemeWeights = {...ZERO_VISEME}; private smoothF1 = 0; private smoothF2 = 0; /** Seconds of contiguous unvoiced input; resets to 0 on any voiced frame. */ private silentFor = 0; private readonly vowelTau: number; private readonly consonantTau: number; constructor(opts: FormantVisemeMapperOptions = {}) { this.vowelTau = opts.vowelTau ?? DEFAULT_VOWEL_TAU; this.consonantTau = opts.consonantTau ?? DEFAULT_CONSONANT_TAU; } update(features: AudioFeatures, dt: number): VisemeWeights { if (!features) return this.current; const {rms, centroid, low, mid, high, f1Hz, f2Hz, voiced} = features; // 1. Voicing gate so background noise doesn't drive the mouth. const voicingGate = voiced ? 1 : smoothstep(0.02, 0.05, rms); // 2. Jaw drive = scaled RMS. const jawTarget = clamp01(voicingGate * Math.min(1, rms * 6)); // 3. Consonant: high-band dominance. const fricRatio = high / (low + mid + high + 0.001); const brightness = clamp01((centroid - 1500) / 2500); const consonantTarget = clamp01( voicingGate * (0.55 * brightness + 0.7 * fricRatio) ); // 4. Smooth the raw F1/F2 peaks so per-frame bin jitter doesn't // knock sustained vowels (especially /oo/, whose true F2 sits // very close to a noisy 2x harmonic) out of their classification. // Skip the update on unvoiced frames so silence doesn't pull the // smoothed values toward zero. After a long contiguous silence, // clear the cached formants so the first voiced frame of a new // utterance doesn't smooth from the previous vowel's stale // F1/F2 and briefly emit the wrong viseme. if (voicingGate > 0.5 && f1Hz > 0 && f2Hz > 0) { this.silentFor = 0; const formantAlpha = 1 - Math.exp(-dt / DEFAULT_FORMANT_TAU); this.smoothF1 = this.smoothF1 ? lerp(this.smoothF1, f1Hz, formantAlpha) : f1Hz; this.smoothF2 = this.smoothF2 ? lerp(this.smoothF2, f2Hz, formantAlpha) : f2Hz; } else { this.silentFor += dt; if (this.silentFor > FORMANT_DECAY_AFTER_SILENCE_S) { this.smoothF1 = 0; this.smoothF2 = 0; } } const sF1 = this.smoothF1; const sF2 = this.smoothF2; // 5. Vowel identity from smoothed F1/F2. Compete the three // membership scores so /aa/ doesn't also light up /oo/. const vowelMass = clamp01(voicingGate * (1 - consonantTarget)); let aaScore = 0; let eeScore = 0; let ooScore = 0; if (vowelMass > 0.1 && sF1 > 0 && sF2 > 0) { aaScore = smoothstep(550, 850, sF1); const f1Low = 1 - smoothstep(350, 600, sF1); const f2High = smoothstep(1700, 2400, sF2); eeScore = f1Low * f2High; // Widen the f2Low band: /oo/ has its true F2 right at the bottom // of the F2 search range, so even with smoothing F2 can drift up // to ~1500-1700 Hz on noisy frames. The looser cutoff lets /oo/ // partially survive that drift rather than cliff-falling to zero. const f2Low = 1 - smoothstep(1100, 1700, sF2); ooScore = f1Low * f2Low; } const sum = aaScore + eeScore + ooScore + 0.001; aaScore = (aaScore / sum) * vowelMass; eeScore = (eeScore / sum) * vowelMass; ooScore = (ooScore / sum) * vowelMass; // 6. Frame-rate-independent smoothing of the final viseme weights. const vowelAlpha = 1 - Math.exp(-dt / this.vowelTau); const consAlpha = 1 - Math.exp(-dt / this.consonantTau); this.current = { jawOpen: lerp(this.current.jawOpen, jawTarget, vowelAlpha), aa: lerp(this.current.aa, aaScore, vowelAlpha), oo: lerp(this.current.oo, ooScore, vowelAlpha), // Formant heuristic doesn't have a separate /oh/ signal; the model // mapper supplies it instead. oh: 0, ee: lerp(this.current.ee, eeScore, vowelAlpha), consonant: lerp(this.current.consonant, consonantTarget, consAlpha), }; return this.current; } reset(): void { this.current = {...ZERO_VISEME}; this.smoothF1 = 0; this.smoothF2 = 0; this.silentFor = 0; } } function clamp01(x: number): number { return Math.min(1, Math.max(0, x)); } function smoothstep(edge0: number, edge1: number, x: number): number { const t = clamp01((x - edge0) / (edge1 - edge0)); return t * t * (3 - 2 * t); } function lerp(current: number, target: number, alpha: number): number { return current + (target - current) * alpha; }