import type {AudioFeatures} from './FormantVisemeMapper';

/** Raw analyser buffers consumed by {@link computeAudioFeatures}. */
export interface AudioFeatureInputs {
  /** `analyser.getByteFrequencyData()` output. */
  freqData: Uint8Array;
  /**
   * `analyser.getFloatFrequencyData()` output (dB), same length as
   * `freqData`. Reserved for downstream consumers (e.g. ML mappers
   * computing MFCC); the heuristic path doesn't read it.
   */
  freqDataFloat?: Float32Array;
  /** `analyser.getByteTimeDomainData()` output, length == `analyser.fftSize`. */
  timeData: Uint8Array;
  /**
   * Optional 13-element MFCC vector. Passed through unchanged in the
   * returned features so downstream consumers (e.g. a future
   * ModelMapper) see the same numbers; the formant-based mapper
   * doesn't consume it.
   */
  mfcc?: Float32Array;
}

/**
 * Pure-function feature extractor. Given the raw analyser buffers and the
 * audio context's sample rate, returns the per-frame features the viseme
 * mappers consume. Extracted from `LipsyncMouth` so the math is testable
 * without a real `AudioContext` / `AnalyserNode`.
 *
 * @param inputs Analyser buffers for the current frame. Only `freqData`,
 *   `timeData` and `mfcc` are read here.
 * @param sampleRate Sample rate in Hz. Each `freqData` bin is taken to be
 *   `sampleRate / 2 / freqData.length` Hz wide.
 * @returns Per-frame features:
 *   - `rms`: root-mean-square of the time-domain samples after mapping
 *     bytes to roughly [-1, 1); `0` when `timeData` is empty.
 *   - `centroid`: energy-weighted mean frequency in Hz, `0` when the
 *     spectrum carries no energy.
 *   - `low` / `mid` / `high`: summed bin energy below 500 Hz, in
 *     [500, 2000) Hz and in [2000, 8000) Hz, each divided by 50 and
 *     capped at 1.
 *   - `f1Hz` / `f2Hz`: smoothed spectral peak within 200-1000 Hz and
 *     800-3000 Hz respectively, or `0` when no peak is strong enough.
 *   - `voiced`: heuristic flag, see the inline comment.
 *   - `mfcc`: the same reference that was passed in, if any.
 */
export function computeAudioFeatures(
  inputs: AudioFeatureInputs,
  sampleRate: number
): AudioFeatures & {mfcc?: Float32Array} {
  const {freqData, timeData, mfcc} = inputs;

  // RMS from time domain. timeData is unsigned 8-bit: 128 == silence.
  let sumSq = 0;
  for (let i = 0; i < timeData.length; i++) {
    const v = timeData[i] / 128 - 1;
    sumSq += v * v;
  }
  // An empty buffer would evaluate 0 / 0 and leak NaN into every consumer
  // of `rms`; report silence instead.
  const rms = timeData.length > 0 ? Math.sqrt(sumSq / timeData.length) : 0;

  // Spectral bands and centroid.
  const binHz = sampleRate / 2 / freqData.length;
  let totalEnergy = 0;
  let weightedSum = 0;
  let low = 0;
  let mid = 0;
  let high = 0;
  for (let i = 0; i < freqData.length; i++) {
    const energy = freqData[i] / 255;
    const hz = i * binHz;
    totalEnergy += energy;
    weightedSum += hz * energy;
    // Bins at or above 8 kHz still count towards the centroid but belong
    // to none of the three bands.
    if (hz < 500) low += energy;
    else if (hz < 2000) mid += energy;
    else if (hz < 8000) high += energy;
  }
  const centroid = totalEnergy > 0 ? weightedSum / totalEnergy : 0;

  // Band sums are unbounded (they grow with the bin count); squash them
  // into [0, 1] with a fixed divisor of 50.
  const norm = (x: number) => Math.min(1, x / 50);

  const f1Hz = peakHzInRange(freqData, binHz, 200, 1000);
  const f2Hz = peakHzInRange(freqData, binHz, 800, 3000);

  // Voiced == audible (rms), dominated by low+mid energy rather than
  // high-band energy, and with a non-trivial amount of low+mid energy.
  // Uses the raw (un-normalised) band sums.
  const lowMid = low + mid;
  const voiced = rms > 0.02 && lowMid > high * 1.2 && lowMid > 1;

  return {
    rms,
    centroid,
    low: norm(low),
    mid: norm(mid),
    high: norm(high),
    f1Hz,
    f2Hz,
    voiced,
    mfcc,
  };
}

/**
 * Finds the frequency of the strongest smoothed spectral peak inside
 * `[lowHz, highHz]`.
 *
 * @param freqData Byte magnitude spectrum (0-255 per bin).
 * @param binHz Width of one bin in Hz.
 * @param lowHz Lower edge of the search range in Hz.
 * @param highHz Upper edge of the search range in Hz.
 * @returns `bin * binHz` for the winning bin, or `0` when the best smoothed
 *   magnitude is below 20 or the range contains no bins.
 */
function peakHzInRange(
  freqData: Uint8Array,
  binHz: number,
  lowHz: number,
  highHz: number
): number {
  const loBin = Math.max(0, Math.floor(lowHz / binHz));
  const hiBin = Math.min(freqData.length - 1, Math.ceil(highHz / binHz));

  // 5-bin smoothed envelope rather than raw per-bin max. Single-bin
  // peaks frequently come from individual harmonics of F0, not the
  // vocal-tract formant envelope. Averaging ±2 bins picks the wider
  // envelope peak so e.g. /oo/'s true F2 around 1000 Hz survives even
  // when a louder harmonic spike sits at 2400 Hz. Out-of-range bins
  // are treated as zero (not skipped) so the edges of the search
  // range aren't artificially inflated by a smaller window divisor.
  const WIN = 5;
  let bestBin = -1;
  let bestVal = 0;
  for (let i = loBin; i <= hiBin; i++) {
    let sum = 0;
    for (let k = -2; k <= 2; k++) {
      const j = i + k;
      if (j >= loBin && j <= hiBin) sum += freqData[j];
    }
    const avg = sum / WIN;
    // Strict comparison: on a plateau the lowest bin wins.
    if (avg > bestVal) {
      bestVal = avg;
      bestBin = i;
    }
  }

  // Too quiet to call a peak. This also covers the "nothing found" case,
  // where bestBin is still -1 and bestVal is 0.
  if (bestVal < 20) return 0;
  return bestBin * binHz;
}