import {
  TranscribeStreamingClient,
  StartStreamTranscriptionCommand,
  LanguageCode,
} from '@aws-sdk/client-transcribe-streaming';
import {
  ISpeechRecognizer,
  ISpeechRecoResult,
  ISpeechRecoItem,
} from '../../interfaces/ISpeechRecognizer';

/**
 * Target audio sample rate used for PCM encoding.
 * AWS Transcribe Streaming supports 8000-48000 Hz; 16000 is optimal for speech.
 */
const TARGET_SAMPLE_RATE = 16000;

/**
 * Implements speech recognition services using Amazon Transcribe Streaming
 * @see {@link ISpeechRecognizer}
 */
export class AwsSpeechRecognizer implements ISpeechRecognizer {
  private accessKeyId: string;
  private secretAccessKey: string;
  private sessionToken: string | undefined;
  private region: string;
  private languageCode: LanguageCode;
  private client: TranscribeStreamingClient;
  private recoStart: Date;
  private isListening: boolean;

  /** Active microphone MediaStream (released on stop) */
  private mediaStream: MediaStream | null;
  /** Web Audio context used for PCM capture */
  private audioContext: AudioContext | null;
  /** Audio worklet / script-processor node used to grab raw samples */
  private processorNode: ScriptProcessorNode | null;
  /** Source node wired from the microphone MediaStream */
  private sourceNode: MediaStreamAudioSourceNode | null;
  /** Queue of PCM-encoded chunks waiting to be consumed by the async audio generator */
  private audioQueue: ArrayBuffer[];
  /** Resolves the next pending dequeue when a chunk arrives */
  private audioQueueResolve: (() => void) | null;
  /** When true the generator should stop yielding */
  private audioStreamDone: boolean;
  /** The AbortController used to cancel a running transcription session */
  private abortController: AbortController | null;

  //#region Construction / initialization

  /**
   * Constructs an AWS Transcribe Streaming speech recognizer
   * @param accessKeyId - AWS IAM access key ID (or Cognito-derived key)
   * @param secretAccessKey - AWS IAM secret access key
   * @param region - AWS region for the Transcribe service (e.g. 'us-east-1')
   * @param sessionToken - Optional session token (when using temporary credentials / Cognito)
   * @param recoLanguage - Optional language code. Default is 'en-US'
   */
  constructor(
    accessKeyId: string,
    secretAccessKey: string,
    region: string,
    sessionToken?: string,
    recoLanguage?: string
  ) {
    this.accessKeyId = accessKeyId;
    this.secretAccessKey = secretAccessKey;
    this.sessionToken = sessionToken;
    this.region = region;
    this.languageCode = (recoLanguage ?? 'en-US') as LanguageCode;
    this.client = new TranscribeStreamingClient({
      region: this.region,
      credentials: {
        accessKeyId: this.accessKeyId,
        secretAccessKey: this.secretAccessKey,
        sessionToken: this.sessionToken,
      },
    });
    this.isListening = false;
    this.recoStart = new Date();
    this.mediaStream = null;
    this.audioContext = null;
    this.processorNode = null;
    this.sourceNode = null;
    this.audioQueue = [];
    this.audioQueueResolve = null;
    this.audioStreamDone = false;
    this.abortController = null;
  }

  //#endregion Construction / initialization
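  // Illustrative construction only; the credential strings and region below are
  // placeholder values, not part of this module:
  //
  //   const recognizer = new AwsSpeechRecognizer(
  //     '<access-key-id>',       // IAM access key ID (placeholder)
  //     '<secret-access-key>',   // secret access key (placeholder)
  //     'us-east-1',             // Transcribe region
  //     undefined,               // no session token (long-lived credentials)
  //     'en-US'                  // recognition language
  //   );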
  //#region Events

  /**
   * Event handler invoked whenever the recognizer has a complete phrase to return
   */
  onRecognized: ((result: ISpeechRecoResult | null) => void) | undefined;

  /**
   * Optional event handler invoked whenever the recognizer has a partial recognition available
   */
  onRecognizing: ((snippet: string) => void) | undefined;

  /**
   * Optional event handler invoked when there is a recognition error
   */
  onError: ((error: Error) => void) | undefined;

  //#endregion Events

  //#region Single-shot recognition

  /**
   * Activate the microphone and attempt to recognize speech in the next few seconds.
   * Returns the first final transcription result.
   * @param maxRetries - Number of times to retry before returning an error
   * @returns Recognized items/hypotheses, or null if nothing was recognized
   */
  async recognizeOnce(maxRetries?: number): Promise<ISpeechRecoResult | null> {
    const delay = 250;
    if (!maxRetries) {
      maxRetries = 2000 / delay;
    }
    for (let i = 0; i < maxRetries; i++) {
      try {
        const result = await this.trySingleReco();
        return result;
      } catch (_e) {
        // Ignore and retry
      }
      if (i < maxRetries - 1) {
        await new Promise((r) => setTimeout(r, delay));
      }
    }
    const err = new Error('Failed to recognize speech');
    this.onError?.call(this, err);
    throw err;
  }

  /**
   * Performs a single recognition attempt: starts transcription, waits for the
   * first final result, then tears down the stream.
   */
  private async trySingleReco(): Promise<ISpeechRecoResult | null> {
    this.recoStart = new Date();
    await this.startMicrophone();

    const audioStream = this.createAudioStream();
    const command = new StartStreamTranscriptionCommand({
      LanguageCode: this.languageCode,
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: TARGET_SAMPLE_RATE,
      AudioStream: audioStream,
    });

    try {
      this.abortController = new AbortController();
      const response = await this.client.send(command, {
        abortSignal: this.abortController.signal,
      });
      if (!response.TranscriptResultStream) {
        this.cleanup();
        return null;
      }
      // Iterate over transcript events; return on the first final result
      for await (const event of response.TranscriptResultStream) {
        if (event.TranscriptEvent) {
          const results = event.TranscriptEvent.Transcript?.Results ?? [];
          for (const result of results) {
            if (result.IsPartial) {
              // Emit partial / intermediate
              const snippet = (result.Alternatives ?? [])
                .map((alt) =>
                  (alt.Items ?? []).map((it) => it.Content).join(' ')
                )
                .join(' ');
              this.onRecognizing?.call(this, snippet);
            } else {
              // Final result
              const recoResult = this.convertResult(result);
              this.onRecognized?.call(this, recoResult);
              this.cleanup();
              return recoResult;
            }
          }
        }
      }
      // Stream ended with no final result
      this.cleanup();
      return null;
    } catch (e: any) {
      this.cleanup();
      throw e;
    }
  }

  //#endregion Single-shot recognition
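  // Single-shot usage sketch (assumes a recognizer constructed as above; the
  // handler wiring shown is illustrative):
  //
  //   recognizer.onRecognizing = (snippet) => console.log('partial:', snippet);
  //   const phrase = await recognizer.recognizeOnce();
  //   console.log(phrase?.results[0]?.text ?? 'nothing recognized');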
  //#region "Continuous" recognition

  /**
   * Start the recognition process. Intermediate results are returned via the onRecognizing event;
   * final results (phrase) are returned via the onRecognized event.
   * The expectation is that speech recognition is started at the beginning of a sketch, and stopped
   * sometime after the end of the sketch.
   */
  startRecognizing(): void {
    if (this.isListening) {
      return;
    }
    this.isListening = true;
    this.recoStart = new Date();

    // Fire-and-forget the async streaming loop
    this.runContinuousRecognition().catch((e) => {
      this.isListening = false;
      if (this.onError) {
        this.onError.call(this, e instanceof Error ? e : new Error(String(e)));
      } else {
        this.onRecognized?.call(this, null);
      }
    });
  }

  /**
   * Internal async loop that drives continuous recognition.
   */
  private async runContinuousRecognition(): Promise<void> {
    await this.startMicrophone();

    const audioStream = this.createAudioStream();
    const command = new StartStreamTranscriptionCommand({
      LanguageCode: this.languageCode,
      MediaEncoding: 'pcm',
      MediaSampleRateHertz: TARGET_SAMPLE_RATE,
      AudioStream: audioStream,
    });

    this.abortController = new AbortController();
    const response = await this.client.send(command, {
      abortSignal: this.abortController.signal,
    });
    if (!response.TranscriptResultStream) {
      this.cleanup();
      return;
    }

    try {
      for await (const event of response.TranscriptResultStream) {
        if (!this.isListening) break;
        if (event.TranscriptEvent) {
          const results = event.TranscriptEvent.Transcript?.Results ?? [];
          for (const result of results) {
            if (result.IsPartial) {
              const snippet = (result.Alternatives ?? [])
                .map((alt) =>
                  (alt.Items ?? []).map((it) => it.Content).join(' ')
                )
                .join(' ');
              this.onRecognizing?.call(this, snippet);
            } else {
              const recoResult = this.convertResult(result);
              this.onRecognized?.call(this, recoResult);
            }
          }
        }
      }
    } catch (e: any) {
      // AbortError is expected when we call stopRecognizing
      if (e.name !== 'AbortError') {
        throw e;
      }
    } finally {
      this.cleanup();
    }
  }

  /**
   * Stop the recognition process. Normally called at the end of a sketch action.
   * @param wait Time in milliseconds to wait before stopping recognition
   */
  stopRecognizing(wait?: number): void {
    if (wait && wait > 0) {
      setTimeout(() => this.doStop(), wait);
    } else {
      this.doStop();
    }
  }

  private doStop(): void {
    this.isListening = false;

    // Signal the audio generator to stop
    this.audioStreamDone = true;
    if (this.audioQueueResolve) {
      this.audioQueueResolve();
      this.audioQueueResolve = null;
    }

    // Abort the running transcription request
    if (this.abortController) {
      this.abortController.abort();
      this.abortController = null;
    }

    this.stopMicrophone();
  }

  //#endregion "Continuous" recognition
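  // Continuous usage sketch (illustrative; the 500 ms grace period passed to
  // stopRecognizing is an arbitrary example value):
  //
  //   recognizer.onRecognized = (res) => {
  //     if (res) console.log('final:', res.results[0].text);
  //   };
  //   recognizer.startRecognizing();
  //   // ...later, once the sketch action completes:
  //   recognizer.stopRecognizing(500);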
  //#region Microphone capture and PCM encoding

  /**
   * Starts capturing audio from the default microphone and feeds PCM-encoded
   * chunks into {@link audioQueue}.
   */
  private async startMicrophone(): Promise<void> {
    this.audioQueue = [];
    this.audioQueueResolve = null;
    this.audioStreamDone = false;

    this.mediaStream = await navigator.mediaDevices.getUserMedia({
      audio: true,
      video: false,
    });

    // Use a standard AudioContext; fall back to webkitAudioContext on Safari
    const AudioCtx =
      (window as any).AudioContext || (window as any).webkitAudioContext;
    this.audioContext = new AudioCtx({
      sampleRate: TARGET_SAMPLE_RATE,
    }) as AudioContext;
    this.sourceNode = this.audioContext.createMediaStreamSource(
      this.mediaStream
    );

    // Use ScriptProcessorNode (widely supported). Buffer 4096 frames, mono.
    const bufferSize = 4096;
    this.processorNode = this.audioContext.createScriptProcessor(
      bufferSize,
      1,
      1
    );
    this.processorNode.onaudioprocess = (event: AudioProcessingEvent) => {
      if (this.audioStreamDone) return;
      const inputData = event.inputBuffer.getChannelData(0);
      const pcm = this.pcmEncode(inputData);
      this.audioQueue.push(pcm);
      // Wake up the async generator if it is waiting
      if (this.audioQueueResolve) {
        this.audioQueueResolve();
        this.audioQueueResolve = null;
      }
    };
    this.sourceNode.connect(this.processorNode);
    this.processorNode.connect(this.audioContext.destination);
  }

  /**
   * Releases the microphone and tears down the audio graph.
   */
  private stopMicrophone(): void {
    if (this.processorNode) {
      this.processorNode.disconnect();
      this.processorNode.onaudioprocess = null;
      this.processorNode = null;
    }
    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }
    if (this.audioContext) {
      this.audioContext.close().catch(() => {});
      this.audioContext = null;
    }
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach((t) => t.stop());
      this.mediaStream = null;
    }
  }

  /**
   * Common cleanup called after a transcription session ends.
   */
  private cleanup(): void {
    this.isListening = false;
    this.audioStreamDone = true;
    this.stopMicrophone();
  }

  /**
   * Encodes a Float32Array of audio samples into 16-bit signed PCM (little-endian).
   * @param float32 - Raw audio samples, range [-1, 1]
   * @returns ArrayBuffer containing the PCM data
   */
  private pcmEncode(float32: Float32Array): ArrayBuffer {
    const buffer = new ArrayBuffer(float32.length * 2);
    const view = new DataView(buffer);
    for (let i = 0; i < float32.length; i++) {
      // Clamp to [-1, 1], then scale to the signed 16-bit range
      const s = Math.max(-1, Math.min(1, float32[i]));
      view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true);
    }
    return buffer;
  }

  /**
   * Creates an async generator that yields audio chunks in the shape expected
   * by StartStreamTranscriptionCommand.
   */
  private createAudioStream(): AsyncGenerator<
    { AudioEvent: { AudioChunk: Uint8Array } },
    void,
    unknown
  > {
    const self = this;
    return (async function* () {
      while (!self.audioStreamDone) {
        // Wait for data if the queue is empty
        if (self.audioQueue.length === 0) {
          await new Promise<void>((resolve) => {
            self.audioQueueResolve = resolve;
          });
        }
        // Drain all available chunks
        while (self.audioQueue.length > 0) {
          const chunk = self.audioQueue.shift()!;
          yield {
            AudioEvent: {
              AudioChunk: new Uint8Array(chunk),
            },
          };
        }
      }
    })();
  }

  //#endregion Microphone capture and PCM encoding
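  // pcmEncode worked examples (values only illustrate the scaling above):
  //   sample  0.5 ->  0.5 * 0x7fff = 16383.5, stored as 16383 (setInt16 truncates)
  //   sample -0.5 -> -0.5 * 0x8000 = -16384
  //   sample  1.2 -> clamped to 1.0, stored as 32767 (0x7fff)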
  //#region Result conversion

  /**
   * Converts an AWS Transcribe result into the plugin-standard ISpeechRecoResult.
   * AWS Transcribe does not provide timing offsets that map neatly to the Azure
   * model, so we approximate start/end times relative to the session start.
   */
  private convertResult(result: any): ISpeechRecoResult | null {
    const alternatives = result.Alternatives ?? [];
    if (alternatives.length === 0) return null;

    // Build the items array from each alternative
    const items: ISpeechRecoItem[] = [];
    for (const alt of alternatives) {
      const text = (alt.Items ?? []).map((it: any) => it.Content).join(' ');
      // AWS provides a per-item confidence; average them for the phrase
      const confidences = (alt.Items ?? [])
        .map((it: any) => it.Confidence)
        .filter((c: any) => c != null);
      const avgConfidence =
        confidences.length > 0
          ? confidences.reduce((a: number, b: number) => a + b, 0) /
            confidences.length
          : 1.0;
      items.push(new SpeechRecoItem(text, avgConfidence));
    }

    // Determine timing from the first alternative's items
    const firstAlt = alternatives[0];
    const firstItems = firstAlt.Items ?? [];
    const startSec =
      firstItems.length > 0 && firstItems[0].StartTime != null
        ? firstItems[0].StartTime
        : 0;
    const endSec =
      firstItems.length > 0 &&
      firstItems[firstItems.length - 1].EndTime != null
        ? firstItems[firstItems.length - 1].EndTime
        : 0;

    const recoResult = new SpeechRecoResult();
    recoResult.startTime = new Date(this.recoStart.getTime() + startSec * 1000);
    recoResult.endTime = new Date(this.recoStart.getTime() + endSec * 1000);
    recoResult.results = items.sort((a, b) => b.confidence - a.confidence);

    // Add acronym hypotheses (matching Azure plugin behavior)
    const basicConversion = Array.from(items);
    for (const item of basicConversion) {
      // String of one or more occurrences of a letter followed by a space, ended with a letter
      if (item.text.search(/^([a-zA-Z]\s)+[a-zA-Z]$/) >= 0) {
        const acronym = item.text.replace(/\s/g, '');
        const conf = item.confidence * 0.9;
        recoResult.results.push(new SpeechRecoItem(acronym, conf));
      } else if (item.text.search(/^([a-zA-Z]\s)+[a-zA-Z][a-zA-Z]+$/) >= 0) {
        const parts = item.text.match(/^(([a-zA-Z]\s)+)([a-zA-Z][a-zA-Z]+)$/);
        if (parts && parts.length === 4) {
          const acronym = parts[1].replace(/\s/g, '');
          const designator = parts[3];
          const conf = item.confidence * 0.85;
          recoResult.results.push(
            new SpeechRecoItem(acronym + ' ' + designator, conf)
          );
        }
      }
    }

    // Re-sort after adding acronyms
    recoResult.results.sort((a, b) => b.confidence - a.confidence);
    return recoResult;
  }

  //#endregion Result conversion
}

/**
 * Recognition results - items recognized and time interval
 */
class SpeechRecoResult implements ISpeechRecoResult {
  results: ISpeechRecoItem[];
  startTime: Date;
  endTime: Date;

  constructor() {
    this.results = [];
    this.startTime = new Date();
    this.endTime = new Date();
  }
}

/**
 * Recognition item, including recognized text and confidence
 */
class SpeechRecoItem implements ISpeechRecoItem {
  text: string;
  confidence: number;

  constructor(text: string, confidence: number) {
    this.text = text;
    this.confidence = confidence;
  }
}
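// Acronym hypothesis examples for convertResult (illustrative inputs only):
//   "f b i"       -> adds "fbi"       at confidence * 0.9
//   "a b c delta" -> adds "abc delta" at confidence * 0.85
//   "hello world" -> no extra hypothesis (multi-letter words throughout)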