import type * as SarvamAI from "../../../../index.js";

/**
 * Request payload for streamed text-to-speech synthesis.
 *
 * Only `text` is required; every other field is optional.
 *
 * @example
 *     {
 *         text: "x"
 *     }
 */
export interface TextToSpeechStreamRequest {
    /**
     * Input text that will be synthesized into streamed speech.
     *
     * **Features:**
     * - At most 3500 characters
     * - Code-mixed text (English together with Indic languages) is supported
     *
     * **Important Note:**
     * - Write numbers with more than 4 digits using commas (e.g., '10,000' rather than '10000')
     * - Doing so makes the value be pronounced as a single whole number
     */
    text: string;

    /** Language code, expressed in BCP-47 format. */
    target_language_code?: SarvamAI.TextToSpeechLanguage;

    /**
     * Voice (speaker) used for the generated audio.
     *
     * **Default:** shubh (for bulbul:v3), anushka (for bulbul:v2)
     *
     * **Note:** The chosen speaker must match the selected model version.
     *
     * **Important:** Speaker names are case-sensitive and must be written in lowercase (e.g., `ritu` not `Ritu`).
     */
    speaker?: SarvamAI.TextToSpeechSpeaker;

    /**
     * Pitch adjustment for the audio. Range: -0.75 to 0.75. Default is 0.0.
     *
     * **Note:** Supported only by bulbul:v2.
     */
    pitch?: number;

    /**
     * Speaking speed of the audio. Default is 1.0.
     *
     * **Model-specific ranges:**
     * - **bulbul:v3:** 0.5 to 2.0
     * - **bulbul:v2:** 0.3 to 3.0
     */
    pace?: number;

    /**
     * Loudness of the audio. Range: 0.3 to 3.0. Default is 1.0.
     *
     * **Note:** Supported only by bulbul:v2.
     */
    loudness?: number;

    /**
     * Sample rate of the output audio. Default is 22050 Hz.
     *
     * **Note:** The OPUS codec supports only 8000, 12000, 16000, 24000, 48000 Hz.
     */
    speech_sample_rate?: SarvamAI.SpeechSampleRate;

    /** Whether English words and numeric entities are normalized. Default is false. */
    enable_preprocessing?: boolean;

    /** Model used for the text-to-speech conversion. Default is bulbul:v2. */
    model?: SarvamAI.TextToSpeechModel;

    /**
     * Randomness of the output. Range: 0.01 to 1.0. Default is 0.6.
     *
     * **Note:** Supported only by bulbul:v3.
     */
    temperature?: number;

    /** Turns on caching for the request. Default is false. Currently in beta. */
    enable_cached_responses?: boolean;

    /**
     * ID of a pronunciation dictionary to apply during synthesis. When set, words in the
     * input text that match dictionary entries are replaced with their custom
     * pronunciations before speech is generated.
     *
     * Dictionaries are created and managed through the
     * [Pronunciation Dictionary API](https://docs.sarvam.ai/api-reference-docs/pronunciation-dictionary/create).
     * Only supported by **bulbul:v3**.
     */
    dict_id?: string;

    /** Codec of the streamed output audio (e.g., 'mp3'). */
    output_audio_codec?: SarvamAI.SpeechStreamCodec;

    /** Bitrate of the streamed output audio. Default is '128k'. */
    output_audio_bitrate?: SarvamAI.SpeechStreamBitrate;
}