import { AudioConfig, BotFrameworkConfig, PropertyId, DialogServiceConnector } from 'microsoft-cognitiveservices-speech-sdk';
import { createSpeechRecognitionPonyfillFromRecognizer } from 'web-speech-cognitive-services';
import coreJSWithResolvers from 'core-js-pure/features/promise/with-resolvers.js';
import EventTarget, { getEventAttributeValue, setEventAttributeValue } from 'event-target-shim';
import { decode } from 'base64-arraybuffer';
import { EventAsPromise } from 'event-as-promise';
import Observable$1 from 'core-js/features/observable/index.js';
import random from 'math-random';

/**
 * Thin wrapper around the core-js "Promise.withResolvers" ponyfill.
 *
 * @returns {{ promise: Promise, resolve: Function, reject: Function }} A promise and its settle functions.
 */
function withResolvers() {
  return coreJSWithResolvers();
}

/**
 * Creates a queue that runs tasks one after another.
 *
 * A task is a function returning `{ abort, result }`, where `result` is a promise and `abort` is an
 * optional function that stops the task while it is running.
 *
 * The returned queue exposes:
 * - `push(fn)`: enqueues a task and returns `{ cancel, result }`;
 * - `cancelAll()`: cancels every task that is either running or waiting;
 * - `length`: number of tasks that are running or waiting.
 *
 * @returns {{ cancelAll: Function, push: Function, length: number }} The task queue.
 */
function createTaskQueue() {
  let queueWithCurrent = [];

  const queue = {
    cancelAll: () => {
      queueWithCurrent.forEach(({ cancel }) => cancel());
    },
    push: fn => {
      const cancelWithResolvers = withResolvers();
      const resultWithResolvers = withResolvers();
      const entry = { promise: resultWithResolvers.promise };
      let abort;
      let task = fn;

      // The cancellation promise may be rejected long before "start" races against it.
      // Attach a no-op handler so it is never reported as an unhandled rejection in the meantime.
      cancelWithResolvers.promise.catch(() => {});

      const cancel = (entry.cancel = () => {
        // Override the task so we don't call the actual "fn" later.
        // In this approach, we can reuse the logic inside "start" to handle post-cancellation.
        task = () => ({ result: Promise.reject(new Error('cancelled before start')) });

        // Abort the task if it is currently running.
        abort && abort();

        cancelWithResolvers.reject(new Error('cancelled in the midway'));
      });

      const start = async () => {
        try {
          // The task is called inside the try-block: if it throws synchronously, the result must still be
          // rejected and the entry removed, otherwise every task queued after it would wait forever.
          const { abort: abortFn, result } = task();

          abort = abortFn;

          // Either wait for the actual result, or the task is being cancelled.
          resultWithResolvers.resolve(await Promise.race([result, cancelWithResolvers.promise]));
        } catch (error) {
          resultWithResolvers.reject(error);
        }

        queueWithCurrent = queueWithCurrent.filter(e => e !== entry);
      };

      const lastEntry = queueWithCurrent[queueWithCurrent.length - 1];
      const lastPromise = (lastEntry && lastEntry.promise) || Promise.resolve();

      queueWithCurrent.push(entry);

      // After the last promise resolved/rejected, we will start this task.
      // We will start even if the last promise rejected.
      lastPromise.then(start, start);

      return { cancel, result: resultWithResolvers.promise };
    }
  };

  Object.defineProperty(queue, 'length', { get: () => queueWithCurrent.length });

  return queue;
}

// Currently, Web Chat uses a triple-buffer approach.
const NUM_BUFFER = 3;

/**
 * Fills every channel of the buffer with zeroes (silence).
 *
 * @param {AudioBuffer} buffer - The buffer to clear.
 */
function zeroBuffer(buffer) {
  const channels = buffer.numberOfChannels;

  for (let channel = 0; channel < channels; channel++) {
    buffer.getChannelData(channel).fill(0);
  }
}

/**
 * Copies per-channel sample data into the buffer.
 *
 * @param {AudioBuffer} buffer - The destination buffer.
 * @param {Float32Array[]} multiChannelArray - One array of samples per channel, indexed by channel number.
 */
function copyBuffer(buffer, multiChannelArray) {
  const channels = buffer.numberOfChannels;

  for (let channel = 0; channel < channels; channel++) {
    const float32Array = multiChannelArray[channel];

    // Note that Safari does not support AudioBuffer.copyToChannel yet.
    if (buffer.copyToChannel) {
      buffer.copyToChannel(float32Array, channel);
    } else {
      const perChannelBuffer = buffer.getChannelData(channel);

      // Samples that do not fit in the destination are dropped, same as writing out of bounds one by one.
      perChannelBuffer.set(float32Array.subarray(0, perChannelBuffer.length));
    }
  }
}

// This is a multi-buffering player. Users can keep pushing buffer to Web Chat.
// The buffer, realized as BufferSource, is queued to AudioContext.
// Data will be queued as quickly and frequently as possible.
// Web Chat does not support progressive buffering (pushing a partial buffer) and there are currently no plans to implement.
/**
 * Creates a player that plays pushed audio back-to-back using a fixed pool of NUM_BUFFER AudioBuffer objects.
 *
 * The returned player exposes:
 * - `push(multiChannelArray)`: queues one buffer of per-channel samples; throws if the argument is falsy;
 * - `flush()`: returns a promise that resolves once everything pushed before the call has finished playing
 *   (it resolves right away when nothing is queued or playing);
 * - `cancelAll()`: drops everything that is queued, including pending flushes (their promises never settle),
 *   and stops every BufferSource already scheduled on the AudioContext.
 *
 * @param {AudioContext} audioContext - The context used to create buffers and to play them.
 * @param {{ channels: number, samplesPerSec: number }} format - Channel count and sample rate of each buffer.
 * @param {number} numSamplePerBuffer - Number of samples, per channel, each pooled buffer can hold.
 * @returns {{ cancelAll: Function, flush: Function, push: Function }} The player.
 */
function createMultiBufferingPlayer(audioContext, { channels, samplesPerSec }, numSamplePerBuffer) {
  const freeBuffers = Array.from({ length: NUM_BUFFER }, () =>
    audioContext.createBuffer(channels, numSamplePerBuffer, samplesPerSec)
  );
  let queuedBufferSources = [];
  let nextSchedule;
  const queue = [];

  // Copies the data into the buffer and schedules it right after the previously scheduled buffer.
  const scheduleBuffer = (buffer, multiChannelArray) => {
    const bufferSource = audioContext.createBufferSource();

    zeroBuffer(buffer);
    copyBuffer(buffer, multiChannelArray);

    bufferSource.buffer = buffer;
    bufferSource.connect(audioContext.destination);

    // If playback has run dry, the previous schedule is already in the past.
    // Restart from the current time, otherwise the following buffers would be scheduled on top of this one.
    nextSchedule =
      typeof nextSchedule === 'number' ? Math.max(nextSchedule, audioContext.currentTime) : audioContext.currentTime;

    bufferSource.start(nextSchedule);

    // All BufferSource data that is currently queued will be stored at the AudioContext, via bufferSource.start().
    // This is for cancelAll() to effectively cancel all BufferSource queued at the AudioContext.
    queuedBufferSources.push(bufferSource);

    nextSchedule += buffer.duration;

    bufferSource.addEventListener('ended', () => {
      queuedBufferSources = queuedBufferSources.filter(target => target !== bufferSource);

      // Declare the buffer is free to pick up on the next iteration.
      freeBuffers.push(buffer);

      playNext();
    });
  };

  // Processes as many queued items as possible, in order.
  const playNext = () => {
    while (queue.length) {
      const [head] = queue;

      if (typeof head === 'function') {
        // If the queued item is a function, it is because the user called "flush".
        // It must wait until every BufferSource scheduled before it has ended.
        // When one of them ends, it will call playNext() again.
        if (queuedBufferSources.length) {
          return;
        }

        queue.shift();
        head();
      } else {
        const nextBuffer = freeBuffers.shift();

        // If all buffers are currently occupied, leave the data in the queue.
        // When one of the buffers finishes, it will call playNext() again to pick up items from the queue.
        if (!nextBuffer) {
          return;
        }

        queue.shift();
        scheduleBuffer(nextBuffer, head);
      }
    }
  };

  return {
    cancelAll: () => {
      queue.splice(0);

      // Although all buffers are cleared, there are still some BufferSources queued at the AudioContext that need to be stopped.
      queuedBufferSources.forEach(bufferSource => bufferSource.stop());
    },
    flush: () =>
      new Promise(resolve => {
        queue.push(resolve);

        // Nothing else will call playNext() if the player is idle, so the flush must be processed now.
        playNext();
      }),
    push: multiChannelArray => {
      if (!multiChannelArray) {
        throw new Error('multiChannelArray must not be falsy.');
      }

      queue.push(multiChannelArray);
      playNext();
    }
  };
}

/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, 8, 16, 32, 128, 1000, 16000, 32768, 96000, 2147483648] }] */
/* eslint no-await-in-loop: "off" */
/* eslint prefer-destructuring: "off" */

// Safari requires an audio buffer with a sample rate of at least 22050 Hz.
// Using a minimum sample rate of 44100 Hz as an example, the Speech SDK's default 16000 Hz will be upsampled to 48000 Hz.
const MIN_SAMPLE_RATE = 44100;

// The Speech SDK is hardcoded to chop packets to 4096 bytes.
// Web Chat's multi-buffering player is set up with 3 buffers; each is 4096 bytes (2048 16-bit samples).
// For simplicity, the multi-buffer player currently does not support progressive buffering.
// Progressive buffering allows queuing at any sample size and will be concatenated.
// With progressive buffering, if 1000 samples are queued, then 1048 samples are queued, they would be concatenated into a single buffer of size 2048.
// Without it, the data will be queued to two buffers:
// the first buffer is 1000 samples followed by 1048 zeroes, and the second buffer is 1048 samples followed by 1000 zeroes.
// There is no plan to support progressive buffering until the Speech SDK chops data at dynamic size.
const DEFAULT_BUFFER_SIZE = 4096;

/**
 * Computes the arithmetic mean of the values.
 *
 * @param {number[]} array - Values to average; an empty array yields NaN.
 * @returns {number} The mean.
 */
function average(array) {
  return array.reduce((sum, value) => sum + value, 0) / array.length;
}

/**
 * Converts integer samples into floating-point samples by dividing each one by `maxValue`.
 *
 * @param {ArrayLike<number>} audioData - Integer samples.
 * @param {number} maxValue - Divisor applied to every sample.
 * @returns {Float32Array} The scaled samples.
 */
function formatTypedBitArrayToFloatArray(audioData, maxValue) {
  return Float32Array.from(audioData, sample => sample / maxValue);
}

/**
 * Reads an ArrayBuffer as signed 8/16/32-bit integer samples and converts them to floating-point samples.
 *
 * A trailing partial sample (for example, an odd number of bytes of 16-bit audio) is ignored,
 * rather than making the typed array constructor throw.
 *
 * @param {{ bitsPerSample: number }} format - Audio format; only 8, 16 and 32 bits per sample are supported.
 * @param {ArrayBuffer} arrayBuffer - Raw sample data.
 * @returns {Float32Array} The samples, each divided by 128, 32768 or 2147483648 respectively.
 * @throws {Error} If `bitsPerSample` is not 8, 16 or 32.
 */
function formatAudioDataArrayBufferToFloatArray({ bitsPerSample }, arrayBuffer) {
  // Views only the whole samples of the buffer.
  const viewAs = TypedArray =>
    new TypedArray(arrayBuffer, 0, Math.floor(arrayBuffer.byteLength / TypedArray.BYTES_PER_ELEMENT));

  switch (bitsPerSample) {
    case 8:
      return formatTypedBitArrayToFloatArray(viewAs(Int8Array), 128);

    case 16:
      return formatTypedBitArrayToFloatArray(viewAs(Int16Array), 32768);

    case 32:
      return formatTypedBitArrayToFloatArray(viewAs(Int32Array), 2147483648);

    default:
      throw new Error('Only WAVE_FORMAT_PCM (8/16/32 bps) format supported at this time');
  }
}

/**
 * Returns a promise that rejects with an "aborted" error when the signal's `onabort` handler is called.
 *
 * This assigns `signal.onabort`, replacing any handler previously set on that property.
 * The promise never settles if `onabort` is never called, including when the signal was aborted beforehand.
 *
 * @param {{ onabort?: Function }} signal - An AbortSignal, or any object whose `onabort` will be called on abort.
 * @returns {Promise<never>} A promise that only ever rejects.
 */
function abortToReject(signal) {
  return new Promise((_, reject) => {
    signal.onabort = () => reject(new Error('aborted'));
  });
}

/**
 * Splits channel-interleaved samples into one array per channel.
 *
 * In a 2 channel audio (e.g. A/B), the data arrives as interleaved, like "ABABABABAB".
 * This function will take "ABABABABAB" and return an array ["AAAAA", "BBBBB"].
 * Trailing samples that do not form a complete frame are ignored.
 *
 * @param {ArrayLike<number>} channelInterleavedAudioData - Interleaved samples.
 * @param {{ channels: number }} format - Number of channels in the data.
 * @returns {Float32Array[]} One array of samples per channel.
 */
function deinterleave(channelInterleavedAudioData, { channels }) {
  const frameSize = Math.floor(channelInterleavedAudioData.length / channels);

  return Array.from({ length: channels }, (_, channel) => {
    const audioData = new Float32Array(frameSize);

    for (let offset = 0; offset < frameSize; offset++) {
      audioData[offset] = channelInterleavedAudioData[offset * channels + channel];
    }

    return audioData;
  });
}

// This function upsamples the audio data via an integer multiplier.
// Web Chat uses simple anti-aliasing. For simplicity, the anti-aliasing does not roll over to next buffer.
/**
 * Upsamples one channel of audio by an integer multiplier.
 *
 * Every source sample is repeated `sampleRateMultiplier` times, and each output sample is the average of
 * the last `sampleRateMultiplier` repeated values, which smooths the steps between source samples.
 *
 * @param {Float32Array} source - Samples of a single channel.
 * @param {number} sampleRateMultiplier - Integer factor; when it is 1, `source` is returned as-is.
 * @returns {Float32Array} The upsampled samples, `sampleRateMultiplier` times longer than `source`.
 */
function multiplySampleRate(source, sampleRateMultiplier) {
  if (sampleRateMultiplier === 1) {
    return source;
  }

  // Sliding window over the most recent values, seeded with the first sample.
  const lastValues = new Array(sampleRateMultiplier).fill(source[0]);
  const target = new Float32Array(source.length * sampleRateMultiplier);

  for (let sourceOffset = 0; sourceOffset < source.length; sourceOffset++) {
    const value = source[sourceOffset];
    const targetOffset = sourceOffset * sampleRateMultiplier;

    for (let multiplierIndex = 0; multiplierIndex < sampleRateMultiplier; multiplierIndex++) {
      lastValues.shift();
      lastValues.push(value);

      target[targetOffset + multiplierIndex] = average(lastValues);
    }
  }

  return target;
}

/**
 * Plays a Cognitive Services audio stream through the AudioContext until the stream ends or is aborted.
 *
 * @param {AudioContext} audioContext - The context used for playback.
 * @param {object} stream - Audio stream exposing `format` and `read(arrayBuffer)`;
 *   `read` must return a promise of the number of bytes written to the buffer (falsy at the end of stream).
 * @param {{ signal?: AbortSignal }} [options] - `signal` aborts the playback.
 * @returns {Promise<void>} Resolves when everything queued has been flushed by the player.
 * @throws {Error} If an argument is invalid, or with the message "aborted" when the signal is aborted.
 */
async function playCognitiveServicesStream(audioContext, stream, { signal = {} } = {}) {
  if (!audioContext) {
    throw new Error('botframework-directlinespeech-sdk: audioContext must be specified.');
  } else if (!stream) {
    throw new Error('botframework-directlinespeech-sdk: stream must be specified.');
  } else if (!stream.format) {
    throw new Error('botframework-directlinespeech-sdk: stream is missing format.');
  } else if (typeof stream.read !== 'function') {
    throw new Error('botframework-directlinespeech-sdk: stream is missing read().');
  }

  const { format } = stream;
  const abortPromise = abortToReject(signal);
  const array = new Uint8Array(DEFAULT_BUFFER_SIZE);

  // Resolves to the bytes read, or to undefined at the end of stream or on abort.
  const read = () =>
    Promise.race([
      abortPromise.catch(() => {
        // Abort will gracefully end the queue. We will check signal.aborted later to throw abort exception.
      }),
      stream
        .read(array.buffer)
        .then(numBytes => (numBytes === array.byteLength ? array : numBytes ? array.slice(0, numBytes) : undefined))
    ]);

  if (signal.aborted) {
    throw new Error('aborted');
  }

  let { samplesPerSec } = format;

  // TODO: [P0] #3692 Remove the following if-condition block when the underlying bugs are resolved.
  //       There is a bug in Speech SDK 1.15.0 that returns 24kHz instead of 16kHz.
  //       Even if we explicitly specify the output audio format to 16kHz, there is another bug that ignored it.
  //       In short, DLSpeech service currently always streams in RIFF WAV format, instead of MP3.
  //       https://github.com/microsoft/cognitive-services-speech-sdk-js/issues/313
  //       https://github.com/microsoft/cognitive-services-speech-sdk-js/issues/314
  if (format.requestAudioFormatString === 'audio-24khz-48kbitrate-mono-mp3') {
    samplesPerSec = 16000;
  }

  let newSamplesPerSec = samplesPerSec;
  let sampleRateMultiplier = 1;

  // Safari requires a minimum sample rate of 22050 Hz.
  // A multiplier is calculated so that the data meets the minimum sample rate (MIN_SAMPLE_RATE).
  // The multiplier is an integer to simplify our upsampler.
  // For security, data will only be upsampled up to 96000 Hz.
  while (newSamplesPerSec < MIN_SAMPLE_RATE && newSamplesPerSec < 96000) {
    sampleRateMultiplier++;
    newSamplesPerSec = samplesPerSec * sampleRateMultiplier;
  }

  // The third parameter is the number of samples per buffer.
  // For example, if the Speech SDK sends Web Chat 4096 bytes of 16-bit samples, there will be 2048 samples per channel.
  // The multi-buffering player is set up to handle 2048 samples per buffer.
  // If the multiplier is 3x, it will handle 6144 samples per buffer.
  const player = createMultiBufferingPlayer(
    audioContext,
    { ...format, samplesPerSec: newSamplesPerSec },
    (DEFAULT_BUFFER_SIZE / (format.bitsPerSample / 8)) * sampleRateMultiplier
  );

  // For security, the maximum number of chunks handled will be 1000.
  for (
    let chunk = await read(), maxChunks = 0;
    chunk && maxChunks < 1000 && !signal.aborted;
    chunk = await read(), maxChunks++
  ) {
    // Data received from Speech SDK is interleaved; 2 channels (e.g. A and B) will be sent as "ABABABABAB"
    // And each sample (A/B) will be an 8 to 32-bit number.

    // Convert the 8 - 32-bit number into a floating-point number, as required by Web Audio API.
    const interleavedArray = formatAudioDataArrayBufferToFloatArray(format, chunk.buffer);

    // Deinterleave data back into two array buffer, e.g. "AAAAA" and "BBBBB".
    const multiChannelArray = deinterleave(interleavedArray, format);

    // Upsample data if necessary. If the multiplier is 2x, "AAAAA" will be upsampled to "AAAAAAAAAA" (with anti-alias).
    const upsampledMultiChannelArray = multiChannelArray.map(channelData =>
      multiplySampleRate(channelData, sampleRateMultiplier)
    );

    // Queue to the buffering player.
    player.push(upsampledMultiChannelArray);
  }

  // Stop everything already handed to the player as soon as the signal is aborted.
  abortPromise.catch(() => player.cancelAll());

  if (signal.aborted) {
    throw new Error('aborted');
  }

  await Promise.race([abortPromise, player.flush()]);
}

const EMPTY_MP3_BASE64 =
  'SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU3LjU2LjEwMQAAAAAAAAAAAAAA//tAwAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAACAAABhgC7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7//////////////////////////////////////////////////////////////////8AAAAATGF2YzU3LjY0AAAAAAAAAAAAAAAAJAUHAAAAAAAAAYYoRBqpAAAAAAD/+xDEAAPAAAGkAAAAIAAANIAAAARMQU1FMy45OS41VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVf/7EMQpg8AAAaQAAAAgAAA0gAAABFVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV';

/**
 * Adds an event listener and returns a function that removes it.
 *
 * @param {EventTarget} target - The object to listen to.
 * @param {string} name - The event name.
 * @param {Function} handler - The event listener.
 * @returns {Function} Call it to remove the listener.
 */
function subscribeEvent(target, name, handler) {
  target.addEventListener(name, handler);

  return () => target.removeEventListener(name, handler);
}

/**
 * Promise-based wrapper of `audioContext.decodeAudioData`, supporting both its callback and promise forms.
 *
 * @param {AudioContext} audioContext - The context used for decoding.
 * @param {ArrayBuffer} arrayBuffer - Encoded audio data.
 * @returns {Promise<AudioBuffer>} The decoded audio.
 */
function asyncDecodeAudioData(audioContext, arrayBuffer) {
  return new Promise((resolve, reject) => {
    const promise = audioContext.decodeAudioData(arrayBuffer, resolve, reject);

    // Newer implementation of "decodeAudioData" will return a Promise
    promise && typeof promise.then === 'function' && resolve(promise);
  });
}

/**
 * Plays a decoded buffer through the given source node.
 *
 * The "statechange" listener stays subscribed until playback is over; it is only removed after either the
 * source has ended or the AudioContext has been closed.
 *
 * @param {AudioContext} audioContext - The context the source is connected to.
 * @param {AudioBuffer} audioBuffer - The audio to play.
 * @param {AudioBufferSourceNode} source - The node used for playback.
 * @returns {Promise} Resolves when the source has ended or the AudioContext is closed, whichever comes first;
 *   rejects if the playback cannot be started.
 */
async function playDecoded(audioContext, audioBuffer, source) {
  const audioContextClosed = new EventAsPromise();
  const sourceEnded = new EventAsPromise();
  const unsubscribe = subscribeEvent(
    audioContext,
    'statechange',
    ({ target: { state } }) => state === 'closed' && audioContextClosed.eventListener()
  );

  try {
    source.buffer = audioBuffer;

    // "ended" may not fire if the underlying AudioContext is closed prematurely
    source.onended = sourceEnded.eventListener;

    source.connect(audioContext.destination);
    source.start(0);

    return await Promise.race([audioContextClosed.upcoming(), sourceEnded.upcoming()]);
  } finally {
    unsubscribe();
  }
}

/**
 * Decodes and plays the embedded EMPTY_MP3_BASE64 clip.
 *
 * @param {AudioContext} audioContext - The context used for decoding and playback.
 * @returns {Promise<void>} Resolves when the playback is over.
 */
async function playWhiteNoise(audioContext) {
  const source = audioContext.createBufferSource();
  const audioBuffer = await asyncDecodeAudioData(audioContext, decode(EMPTY_MP3_BASE64));

  await playDecoded(audioContext, audioBuffer, source);
}

/**
 * An utterance backed by a Cognitive Services audio stream, with "boundary", "end", "error" and "start"
 * event attributes.
 */
class SpeechSynthesisAudioStreamUtterance extends EventTarget {
  /**
   * @param {object} [audioStream] - If specified, it must expose `format` and `read()`.
   * @throws {Error} If `audioStream` is specified but is missing `format` or `read()`.
   */
  constructor(audioStream) {
    super();

    if (audioStream && !(audioStream.format && typeof audioStream.read === 'function')) {
      throw new Error(
        'botframework-directlinespeech-sdk: If the first argument is specified, it must be a Cognitive Services audio stream.'
      );
    }

    this.audioStream = audioStream;
  }

  get onboundary() {
    return getEventAttributeValue(this, 'boundary');
  }

  set onboundary(value) {
    setEventAttributeValue(this, 'boundary', value);
  }

  get onend() {
    return getEventAttributeValue(this, 'end');
  }

  set onend(value) {
    setEventAttributeValue(this, 'end', value);
  }

  get onerror() {
    return getEventAttributeValue(this, 'error');
  }

  set onerror(value) {
    setEventAttributeValue(this, 'error', value);
  }

  get onstart() {
    return getEventAttributeValue(this, 'start');
  }

  set onstart(value) {
    setEventAttributeValue(this, 'start', value);
  }
}

/* eslint class-methods-use-this: ["error", { "exceptMethods": ["cancel", "getVoices", "speak"] }] */

/**
 * Creates a factory of Web Speech API ponyfill built on top of the recognizer.
 *
 * @param {object} options
 * @param {AudioContext} [options.audioContext] - Context used for playback; created on the first factory call if omitted.
 * @param {*} [options.enableTelemetry] - Passed through to the speech recognition ponyfill.
 * @param {{ AudioContext: Function }} [options.ponyfill] - Defaults to the AudioContext of the browser.
 * @param {object} options.recognizer - The recognizer used for speech recognition.
 * @param {*} [options.textNormalization] - Passed through to the speech recognition ponyfill.
 * @returns {Function} A function returning `{ SpeechGrammarList, SpeechRecognition, speechSynthesis, SpeechSynthesisUtterance }`,
 *   or an empty object if Web Audio API is not available.
 */
function createWebSpeechPonyfillFactory({
  audioContext,
  enableTelemetry,
  ponyfill = {
    AudioContext: window.AudioContext || window.webkitAudioContext
  },
  recognizer,
  textNormalization
}) {
  if (!ponyfill.AudioContext) {
    console.warn(
      'botframework-directlinespeech-sdk: This browser does not support Web Audio API. Speech support is disabled.'
    );

    return () => ({});
  }

  return () => {
    const { SpeechGrammarList, SpeechRecognition } = createSpeechRecognitionPonyfillFromRecognizer({
      createRecognizer: () => recognizer,
      enableTelemetry,
      looseEvents: true,
      textNormalization
    });

    if (!audioContext) {
      audioContext = new ponyfill.AudioContext();
    }

    const { cancelAll, push } = createTaskQueue();

    class SpeechSynthesis extends EventTarget {
      cancel() {
        cancelAll();
      }

      // Returns an empty array.
      // Synthesis is done on the bot side, the content of the voice list is not meaningful on the client side.
      getVoices() {
        return [];
      }

      speak(utterance) {
        const { result } = push(() => {
          const controller = new AbortController();
          const { signal } = controller;

          return {
            abort: controller.abort.bind(controller),
            result: (async () => {
              utterance.dispatchEvent(new Event('start'));

              try {
                if (utterance.audioStream) {
                  await playCognitiveServicesStream(audioContext, utterance.audioStream, { signal });
                } else {
                  await playWhiteNoise(audioContext);
                }
              } catch (error) {
                // Either dispatch "end" or "error" event, but not both
                if (error.message !== 'aborted') {
                  // The first argument of ErrorEvent is the event type; the error itself goes into the init dictionary.
                  return utterance.dispatchEvent(new ErrorEvent('error', { error, message: error.message }));
                }
              }

              utterance.dispatchEvent(new Event('end'));
            })()
          };
        });

        // Catching the error to prevent uncaught promise error due to cancellation.
        result.catch(error => {
          if (!/^cancelled/iu.test(error.message)) {
            throw error;
          }
        });
      }

      get onvoiceschanged() {
        return getEventAttributeValue(this, 'voiceschanged');
      }

      set onvoiceschanged(value) {
        setEventAttributeValue(this, 'voiceschanged', value);
      }
    }

    return {
      SpeechGrammarList,
      SpeechRecognition,
      speechSynthesis: new SpeechSynthesis(),
      SpeechSynthesisUtterance: SpeechSynthesisAudioStreamUtterance
    };
  };
}

/* global Observable */

/**
 * Shares a single subscription to the source observable among all observers.
 *
 * The source is subscribed when the first observer arrives and unsubscribed when the last one leaves.
 *
 * @param {Observable} observable - The source observable.
 * @returns {Observable} An observable forwarding `next`, `error` and `complete` to every current observer.
 */
function shareObservable(observable) {
  let observers = [];
  let subscription;

  return new Observable(observer => {
    if (!subscription) {
      subscription = observable.subscribe({
        complete() {
          observers.forEach(observer => observer.complete());
        },
        error(err) {
          observers.forEach(observer => observer.error(err));
        },
        next(value) {
          observers.forEach(observer => observer.next(value));
        }
      });
    }

    observers.push(observer);

    return () => {
      observers = observers.filter(o => o !== observer);

      if (!observers.length) {
        subscription.unsubscribe();
        subscription = null;
      }
    };
  });
}

/* eslint no-magic-numbers: ["error", { "ignore": [0, 1, 2, 4, 36] }] */

/**
 * Generates a random base-36 string to be used as a pseudo activity ID.
 *
 * @returns {string} The fractional digits of a random number, in base 36.
 */
function randomActivityId() {
  return random().toString(36).slice(2);
}

/**
 * Adapter exposing a DialogServiceConnector through `activity$`, `connectionStatus$`, `end()` and `postActivity()`.
 *
 * NOTE(review): `connectionStatus$` must be subscribed before `activity$`, because subscribing to
 * `activity$` pushes statuses to the observer captured by the `connectionStatus$` subscription.
 */
class DirectLineSpeech {
  /**
   * @param {{ dialogServiceConnector: object }} options - The connector to adapt.
   */
  constructor({ dialogServiceConnector }) {
    let connectionStatusObserver;

    this.dialogServiceConnector = dialogServiceConnector;

    this.activity$ = shareObservable(
      new Observable$1(observer => {
        this._activityObserver = observer;

        connectionStatusObserver.next(0);
        connectionStatusObserver.next(1);

        dialogServiceConnector.connect(
          () => {
            connectionStatusObserver.next(2);
          },
          error => {
            connectionStatusObserver.next(4);

            console.warn('botframework-directlinespeech-sdk: Failed to connect', { error });
          }
        );
      })
    );

    this.connectionStatus$ = shareObservable(
      new Observable$1(observer => {
        connectionStatusObserver = observer;
      })
    );

    dialogServiceConnector.activityReceived = (_, { activity, audioStream }) => {
      try {
        this._activityObserver &&
          this._activityObserver.next({
            ...activity,
            channelData: {
              ...activity.channelData,
              speechSynthesisUtterance: new SpeechSynthesisAudioStreamUtterance(audioStream)
            },
            from: {
              ...activity.from,
              // Since DLSpeech service never ACK our outgoing activity, this activity must be from bot.
              role: 'bot'
            },
            // Since DLSpeech never ACK our outgoing activity, the "replyToId" will rarely able to point to an existing activity.
            replyToId: undefined,
            // Direct Line Speech server currently do not timestamp outgoing activities.
            // Thus, it will be easier to just re-timestamp every incoming/outgoing activities using local time.
            timestamp: new Date().toISOString()
          });
      } catch (error) {
        console.error(error);
      }
    };
  }

  /**
   * Closes the connector. Closing a connector that is already disposed is not an error.
   */
  end() {
    try {
      this.dialogServiceConnector.close();
    } catch (err) {
      if (!err.message.includes('already disposed')) {
        throw err;
      }
    }
  }

  /**
   * Sends the activity through the connector, unless it came from speech, and echoes it to `activity$`.
   *
   * @param {object} activity - The outgoing activity; its `from` is replaced with `{ role: 'user' }`.
   * @returns {Observable} An observable of the pseudo activity ID, or an observable that errors if sending threw.
   */
  postActivity(activity) {
    // Currently, Web Chat set user ID on all outgoing activities.
    // As Direct Line Speech maintains its own user ID, Web Chat should not set the user ID.
    // TODO: [P2] We should move user ID into options of DirectLineJS, instead of Web Chat.
    activity = { ...activity, from: { role: 'user' } };

    try {
      // TODO: [P1] Direct Line Speech server currently do not ack the outgoing activities with any activity ID or timestamp.
      const pseudoActivityId = randomActivityId();
      const isSpeech = !!activity.channelData?.speech;

      // Do not send the activity if it was from speech.
      if (!isSpeech) {
        // Starting from Speech SDK 1.13.0, they accept JSON text instead of JavaScript object.
        // https://github.com/microsoft/cognitive-services-speech-sdk-js/commit/2f3a35446692b6d492a6c68e3237a48de67e293f
        this.dialogServiceConnector.sendActivityAsync(JSON.stringify(activity));
      }

      this._activityObserver &&
        this._activityObserver.next({
          ...activity,
          id: pseudoActivityId,
          timestamp: new Date().toISOString()
        });

      return Observable$1.of(pseudoActivityId);
    } catch (err) {
      return new Observable$1(observer => observer.error(err));
    }
  }
}

// Interfaces not yet implemented in Web Chat:
//   referenceGrammarId?: string,
//   getSessionId? : () => Observable

/**
 * Patches a function to add pre-processing of arguments and post-processing of result.
 *
 * @param {Function} fn - The function to wrap.
 * @param {Function} [pre] - Receives the arguments and returns the array of arguments passed to `fn`.
 * @param {Function} [post] - Receives the result of `fn` and returns the final result.
 * @returns {Function} The wrapped function.
 */
function patchFunction(fn, pre, post) {
  return (...args) => {
    const patchedArgs = pre ? pre(...args) : args;
    const result = fn(...patchedArgs);

    return post ? post(result) : result;
  };
}

/**
 * Patches the DialogServiceConnector by modifying the object.
 * The patches are intended to fill-in features to make DialogServiceConnector object works like the full-fledged Recognizer object.
 *
 * @param {object} dialogServiceConnector - The connector to patch in place.
 * @returns {object} The same connector.
 */
function patchDialogServiceConnectorInline(dialogServiceConnector) {
  let lastRecognitionWithResolvers;

  dialogServiceConnector.listenOnceAsync = patchFunction(
    dialogServiceConnector.listenOnceAsync.bind(dialogServiceConnector),
    (resolve, reject, ...args) => {
      lastRecognitionWithResolvers = withResolvers();

      return [
        patchFunction(resolve, null, result => {
          lastRecognitionWithResolvers.resolve(result);

          return result;
        }),
        patchFunction(reject, null, error => {
          lastRecognitionWithResolvers.reject(error);

          return error;
        }),
        ...args
      ];
    }
  );

  // TODO: [P1] #2664 startContinuousRecognitionAsync is not working yet in Speech SDK 1.15.0.
  //       We need to polyfill to use listenOnceAsync instead, and disable stopContinuousRecognitionAsync.
  dialogServiceConnector.startContinuousRecognitionAsync = (resolve, reject) => {
    dialogServiceConnector.listenOnceAsync(
      () => {
        // We will resolve the Promise in a setTimeout.
      },
      err => {
        resolve = null;
        reject && reject(err);
      }
    );

    setTimeout(() => {
      reject = null;
      resolve && resolve();
    }, 0);
  };

  // TODO: stopContinuousRecognitionAsync is not working yet.
  //       We will leave out the implementation as falsy, Web Chat will disable the microphone button after start dictate.
  //       This will prevent user from aborting speech recognition.
  // dialogServiceConnector.stopContinuousRecognitionAsync = resolve => {
  // };

  return dialogServiceConnector;
}

/**
 * Helper function for fetching network resource as JSON.
 *
 * @param {string} url - The resource to fetch.
 * @param {object} [options] - Options passed to `fetch`; the "accept" header is always set to JSON.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws {Error} If the response status is not OK.
 */
async function fetchJSON(url, options = {}) {
  const res = await fetch(url, {
    ...options,
    headers: {
      ...options.headers,
      accept: 'application/json'
    }
  });

  if (!res.ok) {
    throw new Error(`Failed to fetch JSON from server due to ${res.status}`);
  }

  return res.json();
}

/**
 * Refreshes the given Direct Line token.
 *
 * @param {string} token - The token to refresh.
 * @returns {Promise<string>} The `token` field of the response.
 */
async function refreshDirectLineToken(token) {
  const { token: refreshedToken } = await fetchJSON(
    'https://directline.botframework.com/v3/directline/tokens/refresh',
    {
      headers: {
        authorization: `Bearer ${token}`
      },
      method: 'POST'
    }
  );

  return refreshedToken;
}

/**
 * Calls the argument if it is a function, otherwise returns it as-is.
 *
 * @param {*} fnOrValue - A value, or a function returning the value.
 * @returns {*} The value.
 */
function resolveFunctionOrReturnValue(fnOrValue) {
  return typeof fnOrValue === 'function' ? fnOrValue() : fnOrValue;
}

/* eslint complexity: ["error", 33] */

const DIRECT_LINE_TOKEN_RENEWAL_INTERVAL = 900000; // 15 minutes
const TOKEN_RENEWAL_INTERVAL = 120000;

/**
 * Creates the Direct Line Speech adapters.
 *
 * @param {object} options
 * @param {object} [options.audioConfig] - Audio input configuration; takes precedence over `audioInputDeviceId`.
 * @param {AudioContext} [options.audioContext] - Context used for speech synthesis playback.
 * @param {string} [options.audioInputDeviceId] - Microphone device ID, used when `audioConfig` is not specified.
 * @param {boolean} [options.enableInternalHTTPSupport] - Connects through the Direct Line endpoint; requires "directLineToken".
 * @param {*} [options.enableTelemetry] - Not supported yet; a warning is logged if specified.
 * @param {Function|object} options.fetchCredentials - Credentials, or a function returning them (or a promise of them).
 *   Must provide either "authorizationToken" or "subscriptionKey", and either "directLineSpeechHostname" or "region".
 * @param {string} [options.speechRecognitionEndpointId] - Not supported yet; ignored with a warning.
 * @param {string} [options.speechRecognitionLanguage] - Defaults to the browser language, or "en-US".
 * @param {string} [options.speechSynthesisDeploymentId] - Not supported yet; ignored with a warning.
 * @param {string} [options.speechSynthesisOutputFormat] - Not supported yet; ignored with a warning.
 * @param {*} [options.textNormalization] - A warning is logged if specified; the value is passed to the ponyfill factory.
 * @param {string} [options.userID] - A warning is logged if specified; the value is set as the conversation "from" ID.
 * @param {string} [options.username] - Not supported yet; ignored with a warning.
 * @returns {Promise<{ directLine: DirectLineSpeech, webSpeechPonyfillFactory: Function }>} The adapters.
 * @throws {Error} If `fetchCredentials` is missing or returns an invalid combination of credentials.
 */
// eslint-disable-next-line complexity
async function create({
  audioConfig,
  audioContext,
  audioInputDeviceId,
  enableInternalHTTPSupport,
  enableTelemetry,
  fetchCredentials,
  speechRecognitionEndpointId,
  speechRecognitionLanguage = (typeof window !== 'undefined' &&
    typeof window.navigator !== 'undefined' &&
    window.navigator.language) ||
    'en-US',
  speechSynthesisDeploymentId,
  speechSynthesisOutputFormat,
  textNormalization,
  userID,
  username
}) {
  if (!fetchCredentials) {
    throw new Error('"fetchCredentials" must be specified.');
  }

  const { authorizationToken, directLineToken, directLineSpeechHostname, region, subscriptionKey } =
    await resolveFunctionOrReturnValue(fetchCredentials);

  if (
    (!authorizationToken && !subscriptionKey) ||
    (authorizationToken && subscriptionKey) ||
    (authorizationToken && typeof authorizationToken !== 'string') ||
    (subscriptionKey && typeof subscriptionKey !== 'string') ||
    (enableInternalHTTPSupport && !directLineToken)
  ) {
    throw new Error(
      '"fetchCredentials" must return either "authorizationToken" or "subscriptionKey" as a non-empty string only. If enableInternalHTTPSupport is set to true, then it should also return a non-empty "directLineToken"'
    );
  }

  if (typeof enableTelemetry !== 'undefined') {
    console.warn(
      'botframework-directlinespeech: Telemetry options are not yet supported. Please refer to Cognitive Services documentation for details.'
    );
  }

  if ((directLineSpeechHostname && region) || (!directLineSpeechHostname && !region)) {
    throw new Error(
      '"fetchCredentials" must return either "directLineSpeechHostname" or "region" and it must not be an empty string.'
    );
  }

  if (directLineSpeechHostname) {
    if (typeof directLineSpeechHostname !== 'string') {
      throw new Error('"fetchCredentials" must return "directLineSpeechHostname" as a string.');
    }

    if (enableInternalHTTPSupport) {
      throw new Error(
        '"fetchCredentials" must not return "directLineSpeechHostname" if "enableInternalHTTPSupport" is set.'
      );
    }
  } else {
    if (typeof region !== 'string') {
      throw new Error('"fetchCredentials" must return "region" as a string.');
    }
  }

  if (audioConfig && audioInputDeviceId) {
    console.warn(
      'botframework-directlinespeech-sdk: Only "audioConfig" or "audioInputDeviceId" can be specified, but not both; ignoring "audioInputDeviceId".'
    );
  } else if (!audioConfig) {
    if (audioInputDeviceId) {
      audioConfig = AudioConfig.fromMicrophoneInput(audioInputDeviceId);
    } else {
      audioConfig = AudioConfig.fromDefaultMicrophoneInput();
    }
  }

  if (speechRecognitionEndpointId) {
    console.warn(
      'botframework-directlinespeech: Custom Speech is currently not supported; ignoring "speechRecognitionEndpointId".'
    );
  }

  if (speechSynthesisDeploymentId) {
    console.warn(
      'botframework-directlinespeech: Custom Voice is currently not supported; ignoring "speechSynthesisDeploymentId".'
    );
  }

  if (speechSynthesisOutputFormat) {
    console.warn(
      'botframework-directlinespeech: Synthesis output format is currently not supported; ignoring "speechSynthesisOutputFormat".'
    );
  }

  if (textNormalization) {
    console.warn(
      'botframework-directlinespeech: Text normalization is currently not supported; ignoring "textNormalization".'
    );
  }

  if (userID || username) {
    console.warn(
      'botframework-directlinespeech: Custom "userId" and "username" are currently not supported and are ignored.'
    );
  }

  let config;

  if (directLineSpeechHostname) {
    if (authorizationToken) {
      config = BotFrameworkConfig.fromHost(new URL(`wss://${directLineSpeechHostname}`));

      config.setProperty(PropertyId.SpeechServiceAuthorization_Token, authorizationToken);
    } else {
      config = BotFrameworkConfig.fromHost(new URL(`wss://${directLineSpeechHostname}`), subscriptionKey);
    }
  } else {
    if (authorizationToken) {
      config = BotFrameworkConfig.fromAuthorizationToken(authorizationToken, region);
    } else {
      config = BotFrameworkConfig.fromSubscription(subscriptionKey, region);
    }

    // If internal HTTP support is enabled, switch the endpoint to Direct Line on Direct Line Speech service.
    if (enableInternalHTTPSupport) {
      config.setProperty(
        PropertyId.SpeechServiceConnection_Endpoint,
        `wss://${encodeURI(region)}.convai.speech.microsoft.com/directline/api/v1`
      );

      config.setProperty(PropertyId.Conversation_Agent_Connection_Id, directLineToken);
    }
  }

  // Supported options can be found in DialogConnectorFactory.js.

  // Set the language used for recognition.
  config.setProperty(PropertyId.SpeechServiceConnection_RecoLanguage, speechRecognitionLanguage);

  // The following code sets the output format.
  // As advised by the Speech team, this API may be subject to future changes.
  // We are not enabling output format option because it does not send detailed output format to the bot, rendering this option useless.
  // config.setProperty(PropertyId.SpeechServiceResponse_OutputFormatOption, OutputFormat[OutputFormat.Detailed]);

  // Set the user ID for starting the conversation.
  userID && config.setProperty(PropertyId.Conversation_From_Id, userID);

  // Set Custom Speech and Custom Voice.
  // The following code is copied from C#, and it is not working yet.
  // https://github.com/Azure-Samples/Cognitive-Services-Direct-Line-Speech-Client/blob/master/DLSpeechClient/MainWindow.xaml.cs
  // speechRecognitionEndpointId && config.setServiceProperty('cid', speechRecognitionEndpointId, ServicePropertyChannel.UriQueryParameter);
  // speechSynthesisDeploymentId && config.setProperty(PropertyId.conversation_Custom_Voice_Deployment_Ids, speechSynthesisDeploymentId);

  const dialogServiceConnector = patchDialogServiceConnectorInline(new DialogServiceConnector(config, audioConfig));

  // Renew token per interval.
  if (authorizationToken) {
    const interval = setInterval(async () => {
      // #2660 If the connector has been disposed, we should stop renewing the token.
      // TODO: We should use a public implementation if Speech SDK has one related to "privIsDisposed".
      if (dialogServiceConnector.privIsDisposed) {
        clearInterval(interval);

        // Do not fetch credentials for, or touch, a disposed connector.
        return;
      }

      try {
        const {
          authorizationToken: nextAuthorizationToken,
          directLineSpeechHostname: nextDirectLineSpeechHostname,
          region: nextRegion
        } = await resolveFunctionOrReturnValue(fetchCredentials);

        if (!nextAuthorizationToken) {
          return console.warn(
            'botframework-directlinespeech-sdk: Renew token failed because "fetchCredentials" call returned no authorization token.'
          );
        }

        if (directLineSpeechHostname) {
          if (directLineSpeechHostname !== nextDirectLineSpeechHostname) {
            return console.warn(
              'botframework-directlinespeech-sdk: "directLineSpeechHostname" change is not supported for renewed token. Authorization token is not renewed.'
            );
          }
        } else {
          if (region !== nextRegion) {
            return console.warn(
              'botframework-directlinespeech-sdk: Region change is not supported for renewed token. Authorization token is not renewed.'
            );
          }
        }

        dialogServiceConnector.authorizationToken = nextAuthorizationToken; // eslint-disable-line require-atomic-updates
      } catch (error) {
        // Nothing awaits the interval callback; report the failure instead of leaving an unhandled rejection.
        console.warn('botframework-directlinespeech-sdk: Renew token failed because of an unexpected error.', {
          error
        });
      }
    }, TOKEN_RENEWAL_INTERVAL);
  }

  // Renew token per interval.
  if (enableInternalHTTPSupport) {
    // Always refresh the most recent token: the one returned by "fetchCredentials" will eventually expire.
    let currentDirectLineToken = directLineToken;

    const interval = setInterval(async () => {
      // #2660 If the connector has been disposed, we should stop renewing the token.
      // TODO: We should use a public implementation if Speech SDK has one related to "privIsDisposed".
      if (dialogServiceConnector.privIsDisposed) {
        clearInterval(interval);

        // Do not refresh the token for, or reconnect, a disposed connector.
        return;
      }

      try {
        const refreshedDirectLineToken = await refreshDirectLineToken(currentDirectLineToken);

        if (!refreshedDirectLineToken) {
          return console.warn(
            'botframework-directlinespeech-sdk: Renew token failed because call to refresh token Direct Line API did not return a new token.'
          );
        }

        currentDirectLineToken = refreshedDirectLineToken; // eslint-disable-line require-atomic-updates

        config.setProperty(PropertyId.Conversation_Agent_Connection_Id, refreshedDirectLineToken);

        dialogServiceConnector.properties.setProperty(
          PropertyId.Conversation_Agent_Connection_Id,
          refreshedDirectLineToken
        );
        dialogServiceConnector.connect();
      } catch (error) {
        // Nothing awaits the interval callback; report the failure instead of leaving an unhandled rejection.
        console.warn('botframework-directlinespeech-sdk: Renew token failed because of an unexpected error.', {
          error
        });
      }
    }, DIRECT_LINE_TOKEN_RENEWAL_INTERVAL);
  }

  const directLine = new DirectLineSpeech({ dialogServiceConnector });

  const webSpeechPonyfillFactory = createWebSpeechPonyfillFactory({
    audioContext,
    enableTelemetry,
    recognizer: dialogServiceConnector,
    textNormalization
  });

  return {
    directLine,
    webSpeechPonyfillFactory
  };
}

/* global process:readonly */

const buildTool = process.env.build_tool;
const moduleFormat = process.env.module_format;
const version = process.env.npm_package_version;

const buildInfo = { buildTool, moduleFormat, version };

// When running in a browser, publish the version and build information as <meta> tags.
if (typeof HTMLDocument !== 'undefined' && typeof document !== 'undefined' && document instanceof HTMLDocument) {
  const versionMeta = document.createElement('meta');

  versionMeta.setAttribute('name', 'botframework-directlinespeech:version');
  versionMeta.setAttribute('content', version);

  document.head.appendChild(versionMeta);

  const packageMeta = document.createElement('meta');

  packageMeta.setAttribute('name', 'botframework-directlinespeech');
  packageMeta.setAttribute('content', `version=${version}; build_tool=${buildTool}; format=${moduleFormat}`);

  document.head.appendChild(packageMeta);
}

export { buildInfo, create as createAdapters };