// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

import { APIResource } from '../../core/resource';
import * as SpeechAPI from './speech';
import * as AudioAPI from './audio';
import { APIPromise } from '../../core/api-promise';
import { Stream } from '../../core/streaming';
import { buildHeaders } from '../../internal/headers';
import { RequestOptions } from '../../internal/request-options';

export class Speech extends APIResource {
  /**
   * Synthesizes speech audio for the given text by POSTing to `/audio/speech`.
   *
   * The resolved value depends on `body.stream`:
   * - `true`: a `Stream` of `AudioSpeechStreamChunk` items;
   * - `false` or omitted: the raw `Response`.
   *
   * @param body - Speech generation parameters; also sent as the request body.
   * @param options - Optional per-request overrides. Its `headers` are combined
   *   with a default `Accept: application/octet-stream` header.
   *
   * @example
   * ```ts
   * const speech = await client.audio.speech.create({
   *   input: 'input',
   *   model: 'canopylabs/orpheus-3b-0.1-ft',
   *   voice: 'voice',
   * });
   *
   * const content = await speech.blob();
   * console.log(content);
   * ```
   */
  create(body: SpeechCreateParamsNonStreaming, options?: RequestOptions): APIPromise<Response>;
  create(
    body: SpeechCreateParamsStreaming,
    options?: RequestOptions,
  ): APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk>>;
  create(
    body: SpeechCreateParamsBase,
    options?: RequestOptions,
  ): APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk> | Response>;
  create(
    body: SpeechCreateParams,
    options?: RequestOptions,
  ): APIPromise<Response> | APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk>> {
    // Default Accept header first, caller-supplied headers second.
    const headers = buildHeaders([{ Accept: 'application/octet-stream' }, options?.headers]);

    // An omitted `stream` flag means a non-streaming request.
    const stream = body.stream ?? false;

    // `options` is spread before the keys below, so `headers`, `stream` and
    // `__binaryResponse` always take precedence over same-named keys in `options`.
    const pending = this._client.post('/audio/speech', {
      body,
      ...options,
      headers,
      stream,
      __binaryResponse: true,
    });

    return pending as APIPromise<Response> | APIPromise<Stream<AudioAPI.AudioSpeechStreamChunk>>;
  }
}

/**
 * Request parameters accepted by `Speech.create`: either the non-streaming or
 * the streaming variant, which differ only in the type of their `stream` field.
 */
export type SpeechCreateParams = SpeechCreateParamsNonStreaming | SpeechCreateParamsStreaming;

/**
 * Fields common to every speech-generation request. The streaming and
 * non-streaming parameter interfaces extend this one and narrow `stream`.
 */
export interface SpeechCreateParamsBase {
  /**
   * Input text to generate the audio for.
   */
  input: string;

  /**
   * The name of the model to query.
   *
   * [See all of Together AI's audio models](https://docs.together.ai/docs/serverless-models#audio-models).
   * The currently supported TTS models are:
   *
   * - cartesia/sonic
   * - hexgrad/Kokoro-82M
   * - canopylabs/orpheus-3b-0.1-ft
   *
   * Any other string is also accepted by the type.
   */
  model: 'cartesia/sonic' | 'hexgrad/Kokoro-82M' | 'canopylabs/orpheus-3b-0.1-ft' | (string & {});

  /**
   * The voice to use for generating the audio. The supported voices differ per
   * model. For example, canopylabs/orpheus-3b-0.1-ft supports `tara`,
   * hexgrad/Kokoro-82M supports `af_alloy`, and cartesia/sonic supports
   * "friendly sidekick".
   *
   * You can list the voices supported by a model using the /v1/voices endpoint,
   * sending the model name as the query parameter.
   * [View all supported voices here](https://docs.together.ai/docs/text-to-speech#supported-voices).
   *
   * `hexgrad/Kokoro-82M` additionally supports voice mixing, where two or more
   * voices are combined into a single blended voice by joining their names with `+`
   * (e.g. `af_bella+af_heart`). Optional per-voice weights can be provided in
   * parentheses (e.g. `af_bella(2)+af_heart(1)`). Other models require a single
   * voice name.
   */
  voice: string;

  /**
   * Bitrate of the MP3 audio output in bits per second. Only applicable when
   * `response_format` is `mp3`. Higher values produce better audio quality at
   * larger file sizes. Default is 128000. Currently supported on Cartesia models.
   */
  bit_rate?: 32000 | 64000 | 96000 | 128000 | 192000;

  /**
   * Additional model-specific parameters that fine-tune speech generation behavior.
   */
  extra_params?: SpeechCreateParams.ExtraParams;

  /**
   * Language or locale of the input text. Accepts ISO 639-1 language codes (e.g.,
   * `en`, `fr`, `es`, `zh`) as well as locale codes for region-specific variants.
   * Locale codes must be lowercase (e.g., `zh-hk` for Cantonese).
   */
  language?: string;

  /**
   * Audio encoding of the response. Only applicable when `response_format` is raw
   * or pcm. Cartesia models respect this parameter and support all values. Orpheus,
   * Kokoro, and Minimax models always return pcm_s16le regardless of this setting.
   *
   * NOTE(review): `pcm` is not one of the values declared for `response_format`
   * in this interface; confirm against the API specification.
   */
  response_encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_mulaw' | 'pcm_alaw';

  /**
   * The format of the audio output. Supported formats are mp3, wav, and raw when
   * `stream` is false. When `stream` is true, the only supported format is raw.
   */
  response_format?: 'mp3' | 'wav' | 'raw';

  /**
   * Sampling rate in Hz for the output audio. Cartesia and Minimax models respect
   * this parameter. Orpheus and Kokoro models always output at 24000 Hz regardless
   * of this setting.
   */
  sample_rate?: number;

  /**
   * If true, output is streamed several characters at a time instead of waiting
   * for the full response. The stream terminates with `data: [DONE]`. If false,
   * the encoded audio is returned as an octet stream.
   */
  stream?: boolean;
}

/**
 * Types nested under the `SpeechCreateParams` name: the shape of `extra_params`
 * plus aliases for the streaming and non-streaming parameter variants.
 */
export namespace SpeechCreateParams {
  /**
   * Additional model-specific parameters that fine-tune speech generation behavior.
   */
  export interface ExtraParams {
    /**
     * A list of pronunciation rules for specific characters or symbols. Each entry
     * uses the format `"<source>/<replacement>"` (e.g., `["omg/oh my god"]`) to
     * override how the model pronounces matching tokens.
     */
    pronunciation_dict?: Array<string>;
  }

  // Aliases for the variant types exported by `./speech`, so they can also be
  // referenced as `SpeechCreateParams.SpeechCreateParamsNonStreaming` and
  // `SpeechCreateParams.SpeechCreateParamsStreaming`.
  export type SpeechCreateParamsNonStreaming = SpeechAPI.SpeechCreateParamsNonStreaming;
  export type SpeechCreateParamsStreaming = SpeechAPI.SpeechCreateParamsStreaming;
}

/**
 * Parameters for a non-streaming speech request. Passing this variant to
 * `Speech.create` selects the overload that resolves to a raw `Response`.
 */
export interface SpeechCreateParamsNonStreaming extends SpeechCreateParamsBase {
  /**
   * Omit or set to `false` for this variant: the encoded audio is returned as an
   * octet stream once generation has finished, rather than being streamed several
   * characters at a time.
   */
  stream?: false;
}

/**
 * Parameters for a streaming speech request. Passing this variant to
 * `Speech.create` selects the overload that resolves to a `Stream` of
 * `AudioSpeechStreamChunk` items.
 */
export interface SpeechCreateParamsStreaming extends SpeechCreateParamsBase {
  /**
   * Must be `true` for this variant: output is streamed several characters at a
   * time instead of waiting for the full response. The stream terminates with
   * `data: [DONE]`.
   */
  stream: true;
}

/**
 * Type-only namespace sharing its name with the `Speech` class, so the request
 * parameter types can be referenced as `Speech.SpeechCreateParams`,
 * `Speech.SpeechCreateParamsNonStreaming` and `Speech.SpeechCreateParamsStreaming`.
 */
export declare namespace Speech {
  export {
    type SpeechCreateParams as SpeechCreateParams,
    type SpeechCreateParamsNonStreaming as SpeechCreateParamsNonStreaming,
    type SpeechCreateParamsStreaming as SpeechCreateParamsStreaming,
  };
}