/**
 * A single chunk emitted while streaming TTS results.
 * Carries either raw audio bytes or word-boundary timing metadata.
 */
type TTSChunk = {
  /** Discriminator: "audio" for audio data, "WordBoundary" for timing metadata */
  type: "audio" | "WordBoundary";
  /** Raw audio bytes (only on "audio" chunks) */
  data?: Buffer;
  /** Word duration in 100-nanosecond ticks (only on "WordBoundary" chunks) */
  duration?: number;
  /** Offset from stream start in 100-nanosecond ticks (only on "WordBoundary" chunks) */
  offset?: number;
  /** The word being spoken (only on "WordBoundary" chunks) */
  text?: string;
};
/**
 * Voice characteristics and personality tags reported by the
 * Microsoft Edge TTS service.
 */
type VoiceTag = {
  /** Content categories the voice is tuned for */
  ContentCategories: ("Cartoon" | "Conversation" | "Copilot" | "Dialect" | "General" | "News" | "Novel" | "Sports")[];
  /** Personality traits describing how the voice sounds */
  VoicePersonalities: ("Approachable" | "Authentic" | "Authority" | "Bright" | "Caring" | "Casual" | "Cheerful" | "Clear" | "Comfort" | "Confident" | "Considerate" | "Conversational" | "Cute" | "Expressive" | "Friendly" | "Honest" | "Humorous" | "Lively" | "Passion" | "Pleasant" | "Positive" | "Professional" | "Rational" | "Reliable" | "Sincere" | "Sunshine" | "Warm")[];
};
/**
 * A voice entry exactly as returned by the Microsoft Edge TTS service.
 */
type Voice = {
  /** Full voice name identifier */
  Name: string;
  /** Short voice name */
  ShortName: string;
  /** Voice gender */
  Gender: "Female" | "Male";
  /** Locale code (e.g. "en-US", "zh-CN") */
  Locale: string;
  /** Codec the service recommends for this voice */
  SuggestedCodec: "audio-24khz-48kbitrate-mono-mp3";
  /** Human-readable display name */
  FriendlyName: string;
  /** Availability status */
  Status: "GA";
  /** Characteristics and personality traits */
  VoiceTag: VoiceTag;
};
/**
 * Voice augmented with a plain language code, used by the VoicesManager.
 */
type VoicesManagerVoice = Voice & {
  /** Language code taken from the locale (e.g. "en" out of "en-US") */
  Language: string;
};
/**
 * Filter criteria accepted by the VoicesManager when searching voices.
 */
type VoicesManagerFind = {
  /** Match on voice gender */
  Gender?: "Female" | "Male";
  /** Match on full locale code */
  Locale?: string;
  /** Match on language code */
  Language?: string;
};
/**
 * Internal mutable state of the Communicate class while streaming.
 */
type CommunicateState = {
  /** Buffered partial text data */
  partialText: Buffer;
  /** Timing offset compensation across multiple requests */
  offsetCompensation: number;
  /** Last duration offset seen, used for timing calculations */
  lastDurationOffset: number;
  /** Whether stream() has already been invoked */
  streamWasCalled: boolean;
};
/**
 * Audio output formats accepted by the Microsoft Edge TTS service.
 */
type AudioOutputFormat = 'audio-16khz-32kbitrate-mono-mp3' | 'audio-16khz-64kbitrate-mono-mp3' | 'audio-16khz-128kbitrate-mono-mp3' | 'audio-24khz-48kbitrate-mono-mp3' | 'audio-24khz-96kbitrate-mono-mp3' | 'audio-24khz-160kbitrate-mono-mp3' | 'audio-48khz-96kbitrate-mono-mp3' | 'audio-48khz-192kbitrate-mono-mp3' | 'audio-16khz-16bit-32kbps-mono-opus' | 'audio-24khz-16bit-24kbps-mono-opus' | 'audio-24khz-16bit-48kbps-mono-opus' | 'ogg-16khz-16bit-mono-opus' | 'ogg-24khz-16bit-mono-opus' | 'ogg-48khz-16bit-mono-opus' | 'webm-16khz-16bit-mono-opus' | 'webm-24khz-16bit-24kbps-mono-opus' | 'webm-24khz-16bit-mono-opus' | 'raw-8khz-8bit-mono-alaw' | 'raw-8khz-8bit-mono-mulaw' | 'raw-8khz-16bit-mono-pcm' | 'raw-16khz-16bit-mono-pcm' | 'raw-22050hz-16bit-mono-pcm' | 'raw-24khz-16bit-mono-pcm' | 'raw-44100hz-16bit-mono-pcm' | 'raw-48khz-16bit-mono-pcm' | 'amr-wb-16000hz' | 'g722-16khz-64kbps' | 'raw-16khz-16bit-mono-truesilk' | 'raw-24khz-16bit-mono-truesilk';
/**
 * Environment-independent counterpart of TTSChunk: audio bytes are a
 * Uint8Array so the shape works in browsers as well as Node.js.
 */
interface IsomorphicTTSChunk {
  /** Discriminator: "audio" or "WordBoundary" */
  type: "audio" | "WordBoundary";
  /** Raw audio bytes (only on "audio" chunks) */
  data?: Uint8Array;
  /** Word duration in 100-nanosecond ticks (only on "WordBoundary" chunks) */
  duration?: number;
  /** Offset from stream start in 100-nanosecond ticks (only on "WordBoundary" chunks) */
  offset?: number;
  /** The word being spoken (only on "WordBoundary" chunks) */
  text?: string;
}
/**
Configuration options for the isomorphic Communicate class. */ interface IsomorphicCommunicateOptions { /** Voice to use for synthesis (e.g., "en-US-EmmaMultilingualNeural") */ voice?: string; /** Speech rate adjustment (e.g., "+20%", "-10%") */ rate?: string; /** Volume level adjustment (e.g., "+50%", "-25%") */ volume?: string; /** Pitch adjustment in Hz (e.g., "+5Hz", "-10Hz") */ pitch?: string; /** Audio output format (default: "audio-24khz-48kbitrate-mono-mp3") */ format?: AudioOutputFormat; /** Proxy URL for requests (Node.js only) */ proxy?: string; /** WebSocket connection timeout in milliseconds */ connectionTimeout?: number; } /** * Isomorphic Communicate class that works in both Node.js and browsers. * Uses isomorphic packages to provide consistent functionality across environments. * * @example * ```typescript * // Works in both Node.js and browsers (with CORS considerations) * const communicate = new IsomorphicCommunicate('Hello, world!', { * voice: 'en-US-EmmaMultilingualNeural', * }); * * for await (const chunk of communicate.stream()) { * if (chunk.type === 'audio' && chunk.data) { * // Handle audio data * } * } * ``` */ declare class IsomorphicCommunicate { private readonly ttsConfig; private readonly texts; private readonly format; private state; /** * Creates a new isomorphic Communicate instance for text-to-speech synthesis. * * @param text - The text to synthesize * @param options - Configuration options for synthesis * @param options.format - Audio output format (default: "audio-24khz-48kbitrate-mono-mp3") */ constructor(text: string, options?: IsomorphicCommunicateOptions); private parseMetadata; private createWebSocket; private _stream; /** * Streams text-to-speech synthesis results using isomorphic WebSocket. * Works in both Node.js and browsers (subject to CORS policy). 
* * @yields TTSChunk - Audio data or word boundary information * @throws {Error} If called more than once * @throws {NoAudioReceived} If no audio data is received * @throws {WebSocketError} If WebSocket connection fails */ stream(): AsyncGenerator; } /** * Error class for fetch-related errors (isomorphic equivalent of AxiosError) */ declare class FetchError extends Error { response?: { status: number; headers: Record; }; constructor(message: string, response?: { status: number; headers: Record; }); } /** * Fetches all available voices from the Microsoft Edge TTS service (isomorphic version). * Works in both Node.js and browsers (subject to CORS policy). * * @param proxy - Optional proxy URL for the request (limited browser support) * @returns Promise resolving to array of available voices */ declare function listVoices(proxy?: string): Promise; /** * Isomorphic utility class for finding and filtering available voices. * Works in both Node.js and browsers (subject to CORS policy). * * @example * ```typescript * const voicesManager = await IsomorphicVoicesManager.create(); * const englishVoices = voicesManager.find({ Language: 'en' }); * ``` */ declare class IsomorphicVoicesManager { private voices; private calledCreate; /** * Creates a new IsomorphicVoicesManager instance. * * @param customVoices - Optional custom voice list instead of fetching from API * @param proxy - Optional proxy URL for API requests (limited browser support) * @returns Promise resolving to IsomorphicVoicesManager instance */ static create(customVoices?: Voice[], proxy?: string): Promise; /** * Finds voices matching the specified criteria. * * @param filter - Filter criteria for voice selection * @returns Array of voices matching the filter * @throws {Error} If called before create() */ find(filter: VoicesManagerFind): VoicesManagerVoice[]; } /** * Isomorphic DRM class that works in both Node.js and browsers. * Uses appropriate crypto APIs based on the environment. 
*/ declare class IsomorphicDRM { private static clockSkewSeconds; static adjClockSkewSeconds(skewSeconds: number): void; static getUnixTimestamp(): number; static parseRfc2616Date(date: string): number | null; static handleClientResponseError(response: { status: number; headers: any; }): void; static generateSecMsGec(): Promise; } /** * Options for controlling the voice prosody (rate, pitch, volume). */ interface ProsodyOptions { /** * The speaking rate of the voice. * Examples: "+10.00%", "-20.00%" */ rate?: string; /** * The speaking volume of the voice. * Examples: "+15.00%", "-10.00%" */ volume?: string; /** * The speaking pitch of the voice. * Examples: "+20Hz", "-10Hz" */ pitch?: string; } /** * Represents a single word boundary with its timing and text. * The API provides timing in 100-nanosecond units. */ interface WordBoundary { /** * The offset from the beginning of the audio stream in 100-nanosecond units. */ offset: number; /** * The duration of the word in 100-nanosecond units. */ duration: number; /** * The text of the spoken word. */ text: string; } /** * The final result of the synthesis process. */ interface SynthesisResult { /** * The generated audio as a Blob, which can be used in an