{"version":3,"sources":["../src/audio.ts"],"sourcesContent":["/**\n * Copyright 2024 The Fire Company\n * Copyright 2024 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport type {\n  GenerateRequest,\n  GenerateResponseData,\n  ModelReference,\n} from 'genkit';\nimport { GenerationCommonConfigSchema, Message, modelRef, z } from 'genkit';\nimport type { ModelAction, ModelInfo } from 'genkit/model';\nimport { model } from 'genkit/plugin';\nimport OpenAI from 'openai';\nimport { Response } from 'openai/core.mjs';\nimport type {\n  SpeechCreateParams,\n  Transcription,\n  TranscriptionCreateParams,\n} from 'openai/resources/audio/index.mjs';\nimport { PluginOptions } from './index.js';\nimport { maybeCreateRequestScopedOpenAIClient, toModelName } from './utils.js';\n\nexport type SpeechRequestBuilder = (\n  req: GenerateRequest,\n  params: SpeechCreateParams\n) => void;\nexport type TranscriptionRequestBuilder = (\n  req: GenerateRequest,\n  params: TranscriptionCreateParams\n) => void;\n\nexport const TRANSCRIPTION_MODEL_INFO: ModelInfo = {\n  supports: {\n    media: true,\n    output: ['text', 'json'],\n    multiturn: false,\n    systemRole: false,\n    tools: false,\n  },\n};\n\nexport const SPEECH_MODEL_INFO: ModelInfo = {\n  supports: {\n    media: false,\n    output: ['media'],\n    multiturn: false,\n    systemRole: false,\n    tools: false,\n  },\n};\n\nconst ChunkingStrategySchema = z.object({\n  type: z.string(),\n  prefix_padding_ms: z.number().int().optional(),\n  silence_duration_ms: z.number().int().optional(),\n  threshold: z.number().min(0).max(1.0).optional(),\n});\nexport const TranscriptionConfigSchema = GenerationCommonConfigSchema.pick({\n  temperature: true,\n}).extend({\n  chunking_strategy: z\n    .union([z.literal('auto'), ChunkingStrategySchema])\n    .optional(),\n  include: z.array(z.any()).optional(),\n  language: z.string().optional(),\n  timestamp_granularities: z.array(z.enum(['word', 'segment'])).optional(),\n  response_format: z\n    .enum(['json', 'text', 'srt', 'verbose_json', 'vtt'])\n    .optional(),\n  // TODO stream support\n});\n\nexport const SpeechConfigSchema = z.object({\n  voice: z\n    .enum(['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'])\n    .default('alloy'),\n  speed: z.number().min(0.25).max(4.0).optional(),\n  response_format: z\n    .enum(['mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'])\n    .optional(),\n});\n\n/**\n * Supported media formats for Audio generation\n */\nexport const RESPONSE_FORMAT_MEDIA_TYPES = {\n  mp3: 'audio/mpeg',\n  opus: 'audio/opus',\n  aac: 'audio/aac',\n  flac: 'audio/flac',\n  wav: 'audio/wav',\n  pcm: 'audio/L16',\n};\n\nexport function toTTSRequest(\n  modelName: string,\n  request: GenerateRequest,\n  requestBuilder?: SpeechRequestBuilder\n): SpeechCreateParams {\n  const {\n    voice,\n    version: modelVersion,\n    temperature,\n    maxOutputTokens,\n    stopSequences,\n    topK,\n    topP,\n    ...restOfConfig\n  } = request.config ?? {};\n\n  let options: SpeechCreateParams = {\n    model: modelVersion ?? modelName,\n    input: new Message(request.messages[0]).text,\n    voice: voice ?? 'alloy',\n  };\n  if (requestBuilder) {\n    requestBuilder(request, options);\n  } else {\n    options = {\n      ...options,\n      ...restOfConfig, // passthorugh rest of the config\n    };\n  }\n  for (const k in options) {\n    if (options[k] === undefined) {\n      delete options[k];\n    }\n  }\n  return options;\n}\n\nexport async function speechToGenerateResponse(\n  response: Response,\n  responseFormat: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm' = 'mp3'\n): Promise<GenerateResponseData> {\n  const resultArrayBuffer = await response.arrayBuffer();\n  const resultBuffer = Buffer.from(new Uint8Array(resultArrayBuffer));\n  const mediaType = RESPONSE_FORMAT_MEDIA_TYPES[responseFormat];\n  return {\n    message: {\n      role: 'model',\n      content: [\n        {\n          media: {\n            contentType: mediaType,\n            url: `data:${mediaType};base64,${resultBuffer.toString('base64')}`,\n          },\n        },\n      ],\n    },\n    finishReason: 'stop',\n    raw: response,\n  };\n}\n\n/**\n * Method to define a new Genkit Model that is compatible with the Open AI Audio\n * API. \n *\n * These models are to be used to create audio speech from a given request.\n * @param params An object containing parameters for defining the OpenAI speech\n * model.\n * @param params.ai The Genkit AI instance.\n * @param params.name The name of the model.\n * @param params.client The OpenAI client instance.\n * @param params.modelRef Optional reference to the model's configuration and\n * custom options.\n\n * @returns the created {@link ModelAction}\n */\nexport function defineCompatOpenAISpeechModel<\n  CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n>(params: {\n  name: string;\n  client: OpenAI;\n  modelRef?: ModelReference<CustomOptions>;\n  requestBuilder?: SpeechRequestBuilder;\n  pluginOptions: PluginOptions;\n}): ModelAction {\n  const {\n    name,\n    client: defaultClient,\n    pluginOptions,\n    modelRef,\n    requestBuilder,\n  } = params;\n  const modelName = toModelName(name, pluginOptions?.name);\n  const actionName = `${pluginOptions?.name ?? 'compat-oai'}/${modelName}`;\n\n  return model(\n    {\n      name: actionName,\n      ...modelRef?.info,\n      configSchema: modelRef?.configSchema,\n    },\n    async (request, { abortSignal }) => {\n      const ttsRequest = toTTSRequest(modelName, request, requestBuilder);\n      const client = maybeCreateRequestScopedOpenAIClient(\n        pluginOptions,\n        request,\n        defaultClient\n      );\n      const result = await client.audio.speech.create(ttsRequest, {\n        signal: abortSignal,\n      });\n      return await speechToGenerateResponse(result, ttsRequest.response_format);\n    }\n  );\n}\n\n/** Speech generation ModelRef helper, with reasonable defaults for\n * OpenAI-compatible providers */\nexport function compatOaiSpeechModelRef<\n  CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n>(params: {\n  name: string;\n  info?: ModelInfo;\n  configSchema?: CustomOptions;\n  config?: any;\n  namespace?: string;\n}) {\n  const {\n    name,\n    info = SPEECH_MODEL_INFO,\n    configSchema,\n    config = undefined,\n    namespace,\n  } = params;\n  return modelRef({\n    name,\n    configSchema: configSchema || (SpeechConfigSchema as any),\n    info,\n    config,\n    namespace,\n  });\n}\n\nexport function toSttRequest(\n  modelName: string,\n  request: GenerateRequest,\n  requestBuilder?: TranscriptionRequestBuilder\n): TranscriptionCreateParams {\n  const message = new Message(request.messages[0]);\n  const media = message.media;\n  if (!media?.url) {\n    throw new Error('No media found in the request');\n  }\n  const mediaBuffer = Buffer.from(\n    media.url.slice(media.url.indexOf(',') + 1),\n    'base64'\n  );\n  const mediaFile = new File([mediaBuffer], 'input', {\n    type:\n      media.contentType ??\n      media.url.slice('data:'.length, media.url.indexOf(';')),\n  });\n  const {\n    temperature,\n    version: modelVersion,\n    maxOutputTokens,\n    stopSequences,\n    topK,\n    topP,\n    ...restOfConfig\n  } = request.config ?? {};\n\n  let options: TranscriptionCreateParams = {\n    model: modelVersion ?? modelName,\n    file: mediaFile,\n    prompt: message.text,\n    temperature,\n  };\n  if (requestBuilder) {\n    requestBuilder(request, options);\n  } else {\n    options = {\n      ...options,\n      ...restOfConfig, // passthrough rest of the config\n    };\n  }\n  const outputFormat = request.output?.format as 'json' | 'text' | 'media';\n  const customFormat = request.config?.response_format;\n  if (outputFormat && customFormat) {\n    if (\n      outputFormat === 'json' &&\n      customFormat !== 'json' &&\n      customFormat !== 'verbose_json'\n    ) {\n      throw new Error(\n        `Custom response format ${customFormat} is not compatible with output format ${outputFormat}`\n      );\n    }\n  }\n  if (outputFormat === 'media') {\n    throw new Error(`Output format ${outputFormat} is not supported.`);\n  }\n  options.response_format = customFormat || outputFormat || 'text';\n  for (const k in options) {\n    if (options[k] === undefined) {\n      delete options[k];\n    }\n  }\n  return options;\n}\n\nexport function transcriptionToGenerateResponse(\n  result: Transcription | string\n): GenerateResponseData {\n  return {\n    message: {\n      role: 'model',\n      content: [\n        {\n          text: typeof result === 'string' ? result : result.text,\n        },\n      ],\n    },\n    finishReason: 'stop',\n    raw: result,\n  };\n}\n\n/**\n * Method to define a new Genkit Model that is compatible with Open AI\n * Transcriptions API. \n *\n * These models are to be used to transcribe audio to text.\n *\n * @param params An object containing parameters for defining the OpenAI\n * transcription model.\n * @param params.ai The Genkit AI instance.\n * @param params.name The name of the model.\n * @param params.client The OpenAI client instance.\n * @param params.modelRef Optional reference to the model's configuration and\n * custom options.\n\n * @returns the created {@link ModelAction}\n */\nexport function defineCompatOpenAITranscriptionModel<\n  CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n>(params: {\n  name: string;\n  client: OpenAI;\n  pluginOptions?: PluginOptions;\n  modelRef?: ModelReference<CustomOptions>;\n  requestBuilder?: TranscriptionRequestBuilder;\n}): ModelAction {\n  const {\n    name,\n    pluginOptions,\n    client: defaultClient,\n    modelRef,\n    requestBuilder,\n  } = params;\n  const modelName = toModelName(name, pluginOptions?.name);\n  const actionName =\n    modelRef?.name ?? `${pluginOptions?.name ?? 'compat-oai'}/${modelName}`;\n\n  return model(\n    {\n      name: actionName,\n      ...modelRef?.info,\n      configSchema: modelRef?.configSchema,\n    },\n    async (request, { abortSignal }) => {\n      const params = toSttRequest(modelName, request, requestBuilder);\n      const client = maybeCreateRequestScopedOpenAIClient(\n        pluginOptions,\n        request,\n        defaultClient\n      );\n      // Explicitly setting stream to false ensures we use the non-streaming overload\n      const result = await client.audio.transcriptions.create(\n        {\n          ...params,\n          stream: false,\n        },\n        { signal: abortSignal }\n      );\n      return transcriptionToGenerateResponse(result);\n    }\n  );\n}\n\n/** Transcription ModelRef helper, with reasonable defaults for\n * OpenAI-compatible providers */\nexport function compatOaiTranscriptionModelRef<\n  CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n>(params: {\n  name: string;\n  info?: ModelInfo;\n  configSchema?: CustomOptions;\n  config?: any;\n  namespace?: string;\n}) {\n  const {\n    name,\n    info = TRANSCRIPTION_MODEL_INFO,\n    configSchema,\n    config = undefined,\n    namespace,\n  } = params;\n  return modelRef({\n    name,\n    configSchema: configSchema || (TranscriptionConfigSchema as any),\n    info,\n    config,\n    namespace,\n  });\n}\n"],"mappings":"AAsBA,SAAS,8BAA8B,SAAS,UAAU,SAAS;AAEnE,SAAS,aAAa;AAStB,SAAS,sCAAsC,mBAAmB;AAW3D,MAAM,2BAAsC;AAAA,EACjD,UAAU;AAAA,IACR,OAAO;AAAA,IACP,QAAQ,CAAC,QAAQ,MAAM;AAAA,IACvB,WAAW;AAAA,IACX,YAAY;AAAA,IACZ,OAAO;AAAA,EACT;AACF;AAEO,MAAM,oBAA+B;AAAA,EAC1C,UAAU;AAAA,IACR,OAAO;AAAA,IACP,QAAQ,CAAC,OAAO;AAAA,IAChB,WAAW;AAAA,IACX,YAAY;AAAA,IACZ,OAAO;AAAA,EACT;AACF;AAEA,MAAM,yBAAyB,EAAE,OAAO;AAAA,EACtC,MAAM,EAAE,OAAO;AAAA,EACf,mBAAmB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAC7C,qBAAqB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EAC/C,WAAW,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAG,EAAE,SAAS;AACjD,CAAC;AACM,MAAM,4BAA4B,6BAA6B,KAAK;AAAA,EACzE,aAAa;AACf,CAAC,EAAE,OAAO;AAAA,EACR,mBAAmB,EAChB,MAAM,CAAC,EAAE,QAAQ,MAAM,GAAG,sBAAsB,CAAC,EACjD,SAAS;AAAA,EACZ,SAAS,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS;AAAA,EACnC,UAAU,EAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,yBAAyB,EAAE,MAAM,EAAE,KAAK,CAAC,QAAQ,SAAS,CAAC,CAAC,EAAE,SAAS;AAAA,EACvE,iBAAiB,EACd,KAAK,CAAC,QAAQ,QAAQ,OAAO,gBAAgB,KAAK,CAAC,EACnD,SAAS;AAAA;AAEd,CAAC;AAEM,MAAM,qBAAqB,EAAE,OAAO;AAAA,EACzC,OAAO,EACJ,KAAK,CAAC,SAAS,QAAQ,SAAS,QAAQ,QAAQ,SAAS,CAAC,EAC1D,QAAQ,OAAO;AAAA,EAClB,OAAO,EAAE,OAAO,EAAE,IAAI,IAAI,EAAE,IAAI,CAAG,EAAE,SAAS;AAAA,EAC9C,iBAAiB,EACd,KAAK,CAAC,OAAO,QAAQ,OAAO,QAAQ,OAAO,KAAK,CAAC,EACjD,SAAS;AACd,CAAC;AAKM,MAAM,8BAA8B;AAAA,EACzC,KAAK;AAAA,EACL,MAAM;AAAA,EACN,KAAK;AAAA,EACL,MAAM;AAAA,EACN,KAAK;AAAA,EACL,KAAK;AACP;AAEO,SAAS,aACd,WACA,SACA,gBACoB;AACpB,QAAM;AAAA,IACJ;AAAA,IACA,SAAS;AAAA,IACT;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG;AAAA,EACL,IAAI,QAAQ,UAAU,CAAC;AAEvB,MAAI,UAA8B;AAAA,IAChC,OAAO,gBAAgB;AAAA,IACvB,OAAO,IAAI,QAAQ,QAAQ,SAAS,CAAC,CAAC,EAAE;AAAA,IACxC,OAAO,SAAS;AAAA,EAClB;AACA,MAAI,gBAAgB;AAClB,mBAAe,SAAS,OAAO;AAAA,EACjC,OAAO;AACL,cAAU;AAAA,MACR,GAAG;AAAA,MACH,GAAG;AAAA;AAAA,IACL;AAAA,EACF;AACA,aAAW,KAAK,SAAS;AACvB,QAAI,QAAQ,CAAC,MAAM,QAAW;AAC5B,aAAO,QAAQ,CAAC;AAAA,IAClB;AAAA,EACF;AACA,SAAO;AACT;AAEA,eAAsB,yBACpB,UACA,iBAAkE,OACnC;AAC/B,QAAM,oBAAoB,MAAM,SAAS,YAAY;AACrD,QAAM,eAAe,OAAO,KAAK,IAAI,WAAW,iBAAiB,CAAC;AAClE,QAAM,YAAY,4BAA4B,cAAc;AAC5D,SAAO;AAAA,IACL,SAAS;AAAA,MACP,MAAM;AAAA,MACN,SAAS;AAAA,QACP;AAAA,UACE,OAAO;AAAA,YACL,aAAa;AAAA,YACb,KAAK,QAAQ,SAAS,WAAW,aAAa,SAAS,QAAQ,CAAC;AAAA,UAClE;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,IACA,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;AAiBO,SAAS,8BAEd,QAMc;AACd,QAAM;AAAA,IACJ;AAAA,IACA,QAAQ;AAAA,IACR;AAAA,IACA,UAAAA;AAAA,IACA;AAAA,EACF,IAAI;AACJ,QAAM,YAAY,YAAY,MAAM,eAAe,IAAI;AACvD,QAAM,aAAa,GAAG,eAAe,QAAQ,YAAY,IAAI,SAAS;AAEtE,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,GAAGA,WAAU;AAAA,MACb,cAAcA,WAAU;AAAA,IAC1B;AAAA,IACA,OAAO,SAAS,EAAE,YAAY,MAAM;AAClC,YAAM,aAAa,aAAa,WAAW,SAAS,cAAc;AAClE,YAAM,SAAS;AAAA,QACb;AAAA,QACA;AAAA,QACA;AAAA,MACF;AACA,YAAM,SAAS,MAAM,OAAO,MAAM,OAAO,OAAO,YAAY;AAAA,QAC1D,QAAQ;AAAA,MACV,CAAC;AACD,aAAO,MAAM,yBAAyB,QAAQ,WAAW,eAAe;AAAA,IAC1E;AAAA,EACF;AACF;AAIO,SAAS,wBAEd,QAMC;AACD,QAAM;AAAA,IACJ;AAAA,IACA,OAAO;AAAA,IACP;AAAA,IACA,SAAS;AAAA,IACT;AAAA,EACF,IAAI;AACJ,SAAO,SAAS;AAAA,IACd;AAAA,IACA,cAAc,gBAAiB;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AACH;AAEO,SAAS,aACd,WACA,SACA,gBAC2B;AAC3B,QAAM,UAAU,IAAI,QAAQ,QAAQ,SAAS,CAAC,CAAC;AAC/C,QAAM,QAAQ,QAAQ;AACtB,MAAI,CAAC,OAAO,KAAK;AACf,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,cAAc,OAAO;AAAA,IACzB,MAAM,IAAI,MAAM,MAAM,IAAI,QAAQ,GAAG,IAAI,CAAC;AAAA,IAC1C;AAAA,EACF;AACA,QAAM,YAAY,IAAI,KAAK,CAAC,WAAW,GAAG,SAAS;AAAA,IACjD,MACE,MAAM,eACN,MAAM,IAAI,MAAM,QAAQ,QAAQ,MAAM,IAAI,QAAQ,GAAG,CAAC;AAAA,EAC1D,CAAC;AACD,QAAM;AAAA,IACJ;AAAA,IACA,SAAS;AAAA,IACT;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG;AAAA,EACL,IAAI,QAAQ,UAAU,CAAC;AAEvB,MAAI,UAAqC;AAAA,IACvC,OAAO,gBAAgB;AAAA,IACvB,MAAM;AAAA,IACN,QAAQ,QAAQ;AAAA,IAChB;AAAA,EACF;AACA,MAAI,gBAAgB;AAClB,mBAAe,SAAS,OAAO;AAAA,EACjC,OAAO;AACL,cAAU;AAAA,MACR,GAAG;AAAA,MACH,GAAG;AAAA;AAAA,IACL;AAAA,EACF;AACA,QAAM,eAAe,QAAQ,QAAQ;AACrC,QAAM,eAAe,QAAQ,QAAQ;AACrC,MAAI,gBAAgB,cAAc;AAChC,QACE,iBAAiB,UACjB,iBAAiB,UACjB,iBAAiB,gBACjB;AACA,YAAM,IAAI;AAAA,QACR,0BAA0B,YAAY,yCAAyC,YAAY;AAAA,MAC7F;AAAA,IACF;AAAA,EACF;AACA,MAAI,iBAAiB,SAAS;AAC5B,UAAM,IAAI,MAAM,iBAAiB,YAAY,oBAAoB;AAAA,EACnE;AACA,UAAQ,kBAAkB,gBAAgB,gBAAgB;AAC1D,aAAW,KAAK,SAAS;AACvB,QAAI,QAAQ,CAAC,MAAM,QAAW;AAC5B,aAAO,QAAQ,CAAC;AAAA,IAClB;AAAA,EACF;AACA,SAAO;AACT;AAEO,SAAS,gCACd,QACsB;AACtB,SAAO;AAAA,IACL,SAAS;AAAA,MACP,MAAM;AAAA,MACN,SAAS;AAAA,QACP;AAAA,UACE,MAAM,OAAO,WAAW,WAAW,SAAS,OAAO;AAAA,QACrD;AAAA,MACF;AAAA,IACF;AAAA,IACA,cAAc;AAAA,IACd,KAAK;AAAA,EACP;AACF;AAkBO,SAAS,qCAEd,QAMc;AACd,QAAM;AAAA,IACJ;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,IACR,UAAAA;AAAA,IACA;AAAA,EACF,IAAI;AACJ,QAAM,YAAY,YAAY,MAAM,eAAe,IAAI;AACvD,QAAM,aACJA,WAAU,QAAQ,GAAG,eAAe,QAAQ,YAAY,IAAI,SAAS;AAEvE,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,GAAGA,WAAU;AAAA,MACb,cAAcA,WAAU;AAAA,IAC1B;AAAA,IACA,OAAO,SAAS,EAAE,YAAY,MAAM;AAClC,YAAMC,UAAS,aAAa,WAAW,SAAS,cAAc;AAC9D,YAAM,SAAS;AAAA,QACb;AAAA,QACA;AAAA,QACA;AAAA,MACF;AAEA,YAAM,SAAS,MAAM,OAAO,MAAM,eAAe;AAAA,QAC/C;AAAA,UACE,GAAGA;AAAA,UACH,QAAQ;AAAA,QACV;AAAA,QACA,EAAE,QAAQ,YAAY;AAAA,MACxB;AACA,aAAO,gCAAgC,MAAM;AAAA,IAC/C;AAAA,EACF;AACF;AAIO,SAAS,+BAEd,QAMC;AACD,QAAM;AAAA,IACJ;AAAA,IACA,OAAO;AAAA,IACP;AAAA,IACA,SAAS;AAAA,IACT;AAAA,EACF,IAAI;AACJ,SAAO,SAAS;AAAA,IACd;AAAA,IACA,cAAc,gBAAiB;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AACH;","names":["modelRef","params"]}