import type {
  IntentDynamicStepExecutionDefinition,
  IntentOptionDefinition,
} from '../intentRuntime.ts'
import type { SemanticIntentDescriptor, SemanticIntentPresentation } from './index.ts'
import { parseOptionalEnumValue } from './parsing.ts'

/** Provider identifiers accepted by `--provider`. */
const speechTranscribeProviders = ['aws', 'gcp', 'replicate'] as const

/** Output format identifiers accepted by `--format`. */
const speechTranscribeFormats = ['text', 'json', 'srt', 'webvtt'] as const

type SpeechTranscribeProvider = (typeof speechTranscribeProviders)[number]
type SpeechTranscribeFormat = (typeof speechTranscribeFormats)[number]

// `satisfies` keeps the literal type while checking that each default is a member of its list.
const defaultSpeechTranscribeProvider = 'replicate' satisfies SpeechTranscribeProvider
const defaultSpeechTranscribeFormat = 'text' satisfies SpeechTranscribeFormat

/**
 * Execution definition for the speech-transcribe intent: a dynamic step whose
 * four optional string options are declared below.
 *
 * Each field's `name` is the key that `createSpeechTranscribeStep` reads from
 * its `rawValues` argument.
 */
const speechTranscribeExecutionDefinition = {
  kind: 'dynamic-step',
  handler: 'speech-transcribe',
  resultStepName: 'transcribe',
  fields: [
    {
      name: 'provider',
      kind: 'string',
      propertyName: 'provider',
      optionFlags: '--provider',
      description: 'Provider to use for transcription. Defaults to replicate.',
      required: false,
      exampleValue: defaultSpeechTranscribeProvider,
    },
    {
      name: 'format',
      kind: 'string',
      propertyName: 'format',
      optionFlags: '--format',
      description: 'Output format. Defaults to text.',
      required: false,
      exampleValue: defaultSpeechTranscribeFormat,
    },
    {
      name: 'source_language',
      kind: 'string',
      propertyName: 'sourceLanguage',
      optionFlags: '--source-language',
      description:
        'Spoken language as a BCP-47 code, for providers that support explicit source languages.',
      required: false,
      exampleValue: 'en-US',
    },
    {
      name: 'target_language',
      kind: 'string',
      propertyName: 'targetLanguage',
      optionFlags: '--target-language',
      description: 'Target written language for providers that support translation.',
      required: false,
      exampleValue: 'en-US',
    },
  ] as const satisfies readonly IntentOptionDefinition[],
} satisfies IntentDynamicStepExecutionDefinition

/** Help text and usage examples for the speech-transcribe command. */
const speechTranscribeCommandPresentation = {
  description: 'Transcribe speech in audio or video files',
  details:
    'Runs `/speech/transcribe` with a text-first default and writes the transcript to `--output`.',
  // Each example is a [label, command line] pair.
  examples: [
    [
      'Transcribe an audio file to text',
      'transloadit speech transcribe --input voice.opus --output voice.txt',
    ],
    [
      'Generate subtitles',
      'transloadit speech transcribe --input clip.mp4 --format webvtt --output captions.vtt',
    ],
  ] as Array<[string, string]>,
} as const satisfies SemanticIntentPresentation

/**
 * Resolves the `--provider` value, falling back to the default provider when
 * `parseOptionalEnumValue` yields `null` or `undefined`.
 *
 * NOTE(review): validation of unsupported values is delegated to
 * `parseOptionalEnumValue` (./parsing.ts); presumably it throws for values
 * outside `speechTranscribeProviders` — confirm there.
 *
 * @param value - Raw, unvalidated option value.
 * @returns One of the supported provider identifiers.
 */
function parseProvider(value: unknown): SpeechTranscribeProvider {
  return (
    parseOptionalEnumValue({
      flagName: '--provider',
      supportedValues: speechTranscribeProviders,
      value,
    }) ?? defaultSpeechTranscribeProvider
  )
}

/**
 * Resolves the `--format` value, falling back to the default format when
 * `parseOptionalEnumValue` yields `null` or `undefined`.
 *
 * NOTE(review): validation of unsupported values is delegated to
 * `parseOptionalEnumValue` (./parsing.ts); presumably it throws for values
 * outside `speechTranscribeFormats` — confirm there.
 *
 * @param value - Raw, unvalidated option value.
 * @returns One of the supported format identifiers.
 */
function parseFormat(value: unknown): SpeechTranscribeFormat {
  return (
    parseOptionalEnumValue({
      flagName: '--format',
      supportedValues: speechTranscribeFormats,
      value,
    }) ?? defaultSpeechTranscribeFormat
  )
}

/**
 * Normalizes an optional free-form string option.
 *
 * @param value - Raw option value.
 * @param flagName - Flag name used in the error message.
 * @returns The trimmed string, or `null` when the value is `null`, `undefined`,
 *   empty, or whitespace-only.
 * @throws Error when the value is present but is not a string.
 */
function parseOptionalString(value: unknown, flagName: string): string | null {
  if (value == null || value === '') {
    return null
  }

  if (typeof value !== 'string') {
    throw new Error(`${flagName} must be a string`)
  }

  const trimmed = value.trim()
  return trimmed.length > 0 ? trimmed : null
}

/**
 * Builds the `/speech/transcribe` step from raw option values.
 *
 * Reads the `provider`, `format`, `source_language` and `target_language` keys
 * of `rawValues`. `provider` and `format` are always present in the result
 * (defaults applied); the two language keys are only emitted when a non-blank
 * value was supplied.
 *
 * @param rawValues - Unvalidated option values keyed by field name.
 * @param _context - Unused here; kept to preserve the existing signature.
 * @returns The step definition object.
 * @throws Error when a language option is present but not a string.
 */
function createSpeechTranscribeStep(
  // `Record` needs both type arguments; the bare `Record` here previously did not compile.
  rawValues: Record<string, unknown>,
  _context: { hasInputs: boolean },
): Record<string, unknown> {
  const provider = parseProvider(rawValues.provider)
  const format = parseFormat(rawValues.format)
  const sourceLanguage = parseOptionalString(rawValues.source_language, '--source-language')
  const targetLanguage = parseOptionalString(rawValues.target_language, '--target-language')

  return {
    robot: '/speech/transcribe',
    use: ':original',
    result: true,
    provider,
    format,
    // Omit the language keys entirely rather than sending null values.
    ...(sourceLanguage != null ? { source_language: sourceLanguage } : {}),
    ...(targetLanguage != null ? { target_language: targetLanguage } : {}),
  }
}

/**
 * Semantic intent descriptor for speech transcription: wires together the step
 * factory, the execution definition and the command presentation, and requires
 * input.
 */
export const speechTranscribeSemanticIntentDescriptor = {
  createStep: createSpeechTranscribeStep,
  defaultOutputPath: 'output.txt',
  execution: speechTranscribeExecutionDefinition,
  inputPolicy: { kind: 'required' },
  outputDescription: 'Write the transcript to this path or directory',
  presentation: speechTranscribeCommandPresentation,
  runnerKind: 'watchable',
} as const satisfies SemanticIntentDescriptor