// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

import { APIResource } from '../../core/resource';
import * as SpeechAPI from './speech';
import * as AudioAPI from './audio';
import { APIPromise } from '../../core/api-promise';
import { Stream } from '../../core/streaming';
import { buildHeaders } from '../../internal/headers';
import { RequestOptions } from '../../internal/request-options';

// NOTE(review): the generic type arguments below (`Response`, `Stream<AudioAPI.AudioSpeechStreamChunk>`,
// `Array<string>`) were reconstructed after being stripped from this file; confirm the chunk type
// against the declarations exported from './audio'.
export class Speech extends APIResource {
  /**
   * Generate audio from input text
   *
   * @example
   * ```ts
   * const speech = await client.audio.speech.create({
   *   input: 'input',
   *   model: 'canopylabs/orpheus-3b-0.1-ft',
   *   voice: 'voice',
   * });
   *
   * const content = await speech.blob();
   * console.log(content);
   * ```
   */
  create(body: SpeechCreateParamsNonStreaming, options?: RequestOptions): APIPromise<Response>;
  create(
    body: SpeechCreateParamsStreaming,
    options?: RequestOptions,
  ): APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk>>;
  create(
    body: SpeechCreateParamsBase,
    options?: RequestOptions,
  ): APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk> | Response>;
  create(
    body: SpeechCreateParams,
    options?: RequestOptions,
  ): APIPromise<Response> | APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk>> {
    return this._client.post('/audio/speech', {
      body,
      ...options,
      // The octet-stream Accept header is listed first and caller-supplied headers second,
      // presumably so that the caller's headers take precedence (depends on `buildHeaders`).
      headers: buildHeaders([{ Accept: 'application/octet-stream' }, options?.headers]),
      // An omitted `stream` flag is treated as a non-streaming request.
      stream: body.stream ?? false,
      __binaryResponse: true,
    }) as APIPromise<Response> | APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk>>;
  }
}

export type SpeechCreateParams = SpeechCreateParamsNonStreaming | SpeechCreateParamsStreaming;

export interface SpeechCreateParamsBase {
  /**
   * Input text to generate the audio for
   */
  input: string;

  /**
   * The name of the model to query.
   *
   * [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models)
   * The current supported tts models are: - cartesia/sonic - hexgrad/Kokoro-82M -
   * canopylabs/orpheus-3b-0.1-ft
   */
  model: 'cartesia/sonic' | 'hexgrad/Kokoro-82M' | 'canopylabs/orpheus-3b-0.1-ft' | (string & {});

  /**
   * The voice to use for generating the audio. The voices supported are different
   * for each model. For example, for canopylabs/orpheus-3b-0.1-ft, one of the voices
   * supported is tara, for hexgrad/Kokoro-82M, one of the voices supported is
   * af_alloy and for cartesia/sonic, one of the voices supported is "friendly
   * sidekick".
   *
   * You can view the voices supported for each model using the /v1/voices endpoint
   * sending the model name as the query parameter.
   * [View all supported voices here](https://docs.together.ai/docs/text-to-speech#supported-voices).
   *
   * `hexgrad/Kokoro-82M` additionally supports voice mixing, where two or more
   * voices are combined into a single blended voice by joining their names with `+`
   * (e.g. `af_bella+af_heart`). Optional per-voice weights can be provided in
   * parentheses (e.g. `af_bella(2)+af_heart(1)`). Other models require a single
   * voice name.
   */
  voice: string;

  /**
   * Bitrate of the MP3 audio output in bits per second. Only applicable when
   * response_format is mp3. Higher values produce better audio quality at larger
   * file sizes. Default is 128000. Currently supported on Cartesia models.
   */
  bit_rate?: 32000 | 64000 | 96000 | 128000 | 192000;

  /**
   * Additional model-specific parameters that fine-tune speech generation behavior.
   */
  extra_params?: SpeechCreateParams.ExtraParams;

  /**
   * Language or locale of input text. Accepts ISO 639-1 language codes (e.g., `en`,
   * `fr`, `es`, `zh`) as well as locale codes for region-specific variants. Locale
   * codes must be lowercase (e.g., `zh-hk` for Cantonese).
   */
  language?: string;

  /**
   * Audio encoding of response. Only applicable when response_format is raw or pcm.
   * Cartesia models respect this parameter and support all values. Orpheus, Kokoro,
   * and Minimax models always return pcm_s16le regardless of this setting.
   */
  response_encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_mulaw' | 'pcm_alaw';

  /**
   * The format of audio output. Supported formats are mp3, wav, raw if streaming is
   * false. If streaming is true, the only supported format is raw.
   */
  response_format?: 'mp3' | 'wav' | 'raw';

  /**
   * Sampling rate in Hz for the output audio. Cartesia and Minimax models respect
   * this parameter. Orpheus and Kokoro models always output at 24000 Hz regardless
   * of this setting.
   */
  sample_rate?: number;

  /**
   * If true, output is streamed for several characters at a time instead of waiting
   * for the full response. The stream terminates with `data: [DONE]`. If false,
   * return the encoded audio as octet stream
   */
  stream?: boolean;
}

export namespace SpeechCreateParams {
  /**
   * Additional model-specific parameters that fine-tune speech generation behavior.
   */
  export interface ExtraParams {
    /**
     * A list of pronunciation rules for specific characters or symbols. Each entry
     * uses the format `"<token>/<pronunciation>"` (e.g., `["omg/oh my god"]`) to
     * override how the model pronounces matching tokens.
     */
    pronunciation_dict?: Array<string>;
  }

  export type SpeechCreateParamsNonStreaming = SpeechAPI.SpeechCreateParamsNonStreaming;
  export type SpeechCreateParamsStreaming = SpeechAPI.SpeechCreateParamsStreaming;
}

export interface SpeechCreateParamsNonStreaming extends SpeechCreateParamsBase {
  /**
   * If true, output is streamed for several characters at a time instead of waiting
   * for the full response. The stream terminates with `data: [DONE]`. If false,
   * return the encoded audio as octet stream
   */
  stream?: false;
}

export interface SpeechCreateParamsStreaming extends SpeechCreateParamsBase {
  /**
   * If true, output is streamed for several characters at a time instead of waiting
   * for the full response. The stream terminates with `data: [DONE]`. If false,
   * return the encoded audio as octet stream
   */
  stream: true;
}

export declare namespace Speech {
  export {
    type SpeechCreateParams as SpeechCreateParams,
    type SpeechCreateParamsNonStreaming as SpeechCreateParamsNonStreaming,
    type SpeechCreateParamsStreaming as SpeechCreateParamsStreaming,
  };
}