/**
 * A chunk emitted while streaming TTS output.
 * Carries either raw audio bytes or word-boundary timing metadata.
 */
type TTSChunk = {
  /** Discriminates between audio payloads and word-boundary metadata. */
  type: "audio" | "WordBoundary";
  /** Raw audio bytes (present on "audio" chunks). */
  data?: Buffer;
  /** Word duration in 100-nanosecond units (present on "WordBoundary" chunks). */
  duration?: number;
  /** Offset from the beginning in 100-nanosecond units (present on "WordBoundary" chunks). */
  offset?: number;
  /** The spoken text (present on "WordBoundary" chunks). */
  text?: string;
};
/**
 * Category and personality descriptors a voice carries in the
 * Microsoft Edge TTS service catalogue.
 */
type VoiceTag = {
  /** Content categories the voice is optimized for. */
  ContentCategories: (
    | "Cartoon"
    | "Conversation"
    | "Copilot"
    | "Dialect"
    | "General"
    | "News"
    | "Novel"
    | "Sports"
  )[];
  /** Personality traits describing the voice's characteristics. */
  VoicePersonalities: (
    | "Approachable"
    | "Authentic"
    | "Authority"
    | "Bright"
    | "Caring"
    | "Casual"
    | "Cheerful"
    | "Clear"
    | "Comfort"
    | "Confident"
    | "Considerate"
    | "Conversational"
    | "Cute"
    | "Expressive"
    | "Friendly"
    | "Honest"
    | "Humorous"
    | "Lively"
    | "Passion"
    | "Pleasant"
    | "Positive"
    | "Professional"
    | "Rational"
    | "Reliable"
    | "Sincere"
    | "Sunshine"
    | "Warm"
  )[];
};
/**
 * A complete voice definition as returned by the Microsoft Edge TTS service.
 */
type Voice = {
  /** Full voice name identifier. */
  Name: string;
  /** Short name for the voice. */
  ShortName: string;
  /** Gender of the voice. */
  Gender: "Female" | "Male";
  /** Locale code (e.g., "en-US", "zh-CN"). */
  Locale: string;
  /** Recommended audio codec for this voice. */
  SuggestedCodec: "audio-24khz-48kbitrate-mono-mp3";
  /** Human-readable friendly name. */
  FriendlyName: string;
  /** Voice availability status. */
  Status: "GA";
  /** Voice characteristics and personality traits. */
  VoiceTag: VoiceTag;
};
/**
 * Extended voice type with language information for the VoicesManager.
*/ type VoicesManagerVoice = Voice & { /** Language code extracted from the locale (e.g., "en" from "en-US") */ Language: string; }; /** * Filter criteria for finding voices using the VoicesManager. */ type VoicesManagerFind = { /** Filter by voice gender */ Gender?: "Female" | "Male"; /** Filter by locale code */ Locale?: string; /** Filter by language code */ Language?: string; }; /** * Internal state tracking for the Communicate class during streaming. */ type CommunicateState = { /** Buffer for partial text data */ partialText: Buffer; /** Timing offset compensation for multi-request scenarios */ offsetCompensation: number; /** Last recorded duration offset for timing calculations */ lastDurationOffset: number; /** Flag indicating if the stream method has been called */ streamWasCalled: boolean; }; /** * Supported audio output formats for Microsoft Edge TTS service. */ type AudioOutputFormat = 'audio-16khz-32kbitrate-mono-mp3' | 'audio-16khz-64kbitrate-mono-mp3' | 'audio-16khz-128kbitrate-mono-mp3' | 'audio-24khz-48kbitrate-mono-mp3' | 'audio-24khz-96kbitrate-mono-mp3' | 'audio-24khz-160kbitrate-mono-mp3' | 'audio-48khz-96kbitrate-mono-mp3' | 'audio-48khz-192kbitrate-mono-mp3' | 'audio-16khz-16bit-32kbps-mono-opus' | 'audio-24khz-16bit-24kbps-mono-opus' | 'audio-24khz-16bit-48kbps-mono-opus' | 'ogg-16khz-16bit-mono-opus' | 'ogg-24khz-16bit-mono-opus' | 'ogg-48khz-16bit-mono-opus' | 'webm-16khz-16bit-mono-opus' | 'webm-24khz-16bit-24kbps-mono-opus' | 'webm-24khz-16bit-mono-opus' | 'raw-8khz-8bit-mono-alaw' | 'raw-8khz-8bit-mono-mulaw' | 'raw-8khz-16bit-mono-pcm' | 'raw-16khz-16bit-mono-pcm' | 'raw-22050hz-16bit-mono-pcm' | 'raw-24khz-16bit-mono-pcm' | 'raw-44100hz-16bit-mono-pcm' | 'raw-48khz-16bit-mono-pcm' | 'amr-wb-16000hz' | 'g722-16khz-64kbps' | 'raw-16khz-16bit-mono-truesilk' | 'raw-24khz-16bit-mono-truesilk'; /** * Configuration options for the Communicate class. 
*/ interface CommunicateOptions { /** Voice to use for synthesis (e.g., "en-US-EmmaMultilingualNeural") */ voice?: string; /** Speech rate adjustment (e.g., "+20%", "-10%") */ rate?: string; /** Volume level adjustment (e.g., "+50%", "-25%") */ volume?: string; /** Pitch adjustment in Hz (e.g., "+5Hz", "-10Hz") */ pitch?: string; /** Audio output format (default: "audio-24khz-48kbitrate-mono-mp3") */ format?: AudioOutputFormat; /** Proxy URL for requests */ proxy?: string; /** WebSocket connection timeout in milliseconds */ connectionTimeout?: number; } /** * Main class for text-to-speech synthesis using Microsoft Edge's online TTS service. * * @example * ```typescript * const communicate = new Communicate('Hello, world!', { * voice: 'en-US-EmmaMultilingualNeural', * }); * * for await (const chunk of communicate.stream()) { * if (chunk.type === 'audio' && chunk.data) { * // Handle audio data * } * } * ``` */ declare class Communicate { private readonly ttsConfig; private readonly texts; private readonly format; private readonly proxy?; private readonly connectionTimeout?; private state; /** * Creates a new Communicate instance for text-to-speech synthesis. * * @param text - The text to synthesize * @param options - Configuration options for synthesis * @param options.format - Audio output format (default: "audio-24khz-48kbitrate-mono-mp3") */ constructor(text: string, options?: CommunicateOptions); private parseMetadata; private _stream; /** * Streams text-to-speech synthesis results. * * Returns an async generator that yields audio chunks and word boundary events. * Can only be called once per Communicate instance. 
* * @yields TTSChunk - Audio data or word boundary information * @throws {Error} If called more than once * @throws {NoAudioReceived} If no audio data is received * @throws {WebSocketError} If WebSocket connection fails * * @example * ```typescript * for await (const chunk of communicate.stream()) { * if (chunk.type === 'audio') { * // Process audio data * } else if (chunk.type === 'WordBoundary') { * // Process subtitle timing * } * } * ``` */ stream(): AsyncGenerator; } /** * Utility class for generating SRT subtitles from WordBoundary events. * * @example * ```typescript * const subMaker = new SubMaker(); * * for await (const chunk of communicate.stream()) { * if (chunk.type === 'WordBoundary') { * subMaker.feed(chunk); * } * } * * const srt = subMaker.getSrt(); * ``` */ declare class SubMaker { private cues; /** * Adds a WordBoundary chunk to the subtitle maker. * * @param msg - Must be a WordBoundary type chunk with offset, duration, and text * @throws {ValueError} If chunk is not a WordBoundary with required fields */ feed(msg: TTSChunk): void; /** * Merges consecutive cues to create subtitle entries with multiple words. * This is useful for creating more readable subtitles instead of word-by-word display. * * @param words - Maximum number of words per merged cue * @throws {ValueError} If words parameter is invalid */ mergeCues(words: number): void; /** * Returns the subtitles in SRT format. * * @returns SRT formatted subtitles */ getSrt(): string; toString(): string; } /** * Fetches all available voices from the Microsoft Edge TTS service. * * @param proxy - Optional proxy URL for the request * @returns Promise resolving to array of available voices */ declare function listVoices$1(proxy?: string): Promise; /** * Utility class for finding and filtering available voices. 
* * @example * ```typescript * const voicesManager = await VoicesManager.create(); * const englishVoices = voicesManager.find({ Language: 'en' }); * ``` */ declare class VoicesManager { private voices; private calledCreate; /** * Creates a new VoicesManager instance. * * @param customVoices - Optional custom voice list instead of fetching from API * @param proxy - Optional proxy URL for API requests * @returns Promise resolving to VoicesManager instance */ static create(customVoices?: Voice[], proxy?: string): Promise; /** * Finds voices matching the specified criteria. * * @param filter - Filter criteria for voice selection * @returns Array of voices matching the filter * @throws {Error} If called before create() */ find(filter: VoicesManagerFind): VoicesManagerVoice[]; } /** * Options for controlling the voice prosody (rate, pitch, volume). */ interface ProsodyOptions$2 { /** * The speaking rate of the voice. * Examples: "+10.00%", "-20.00%" */ rate?: string; /** * The speaking volume of the voice. * Examples: "+15.00%", "-10.00%" */ volume?: string; /** * The speaking pitch of the voice. * Examples: "+20Hz", "-10Hz" */ pitch?: string; } /** * Represents a single word boundary with its timing and text. * The API provides timing in 100-nanosecond units. */ interface WordBoundary$2 { /** * The offset from the beginning of the audio stream in 100-nanosecond units. */ offset: number; /** * The duration of the word in 100-nanosecond units. */ duration: number; /** * The text of the spoken word. */ text: string; } /** * The final result of the synthesis process. */ interface SynthesisResult$2 { /** * The generated audio as a Blob, which can be used in an