import { ArrayMap, type Conv2d, Embedding, type Layer, LayerNorm, Linear, Tensor, TinyJit, UOp, type Variable } from '@jsgrad/jsgrad/base';
import { Tokenizer } from "./tokenizer.ts";

/** The language codes Whisper supports (local alias; structurally identical to the inline literal type). */
type LanguageCode =
  | 'en' | 'zh' | 'de' | 'es' | 'ru' | 'ko' | 'fr' | 'ja' | 'pt' | 'tr'
  | 'pl' | 'ca' | 'nl' | 'ar' | 'sv' | 'it' | 'id' | 'hi' | 'fi' | 'vi'
  | 'he' | 'uk' | 'el' | 'ms' | 'cs' | 'ro' | 'da' | 'hu' | 'ta' | 'no'
  | 'th' | 'ur' | 'hr' | 'bg' | 'lt' | 'la' | 'mi' | 'ml' | 'cy' | 'sk'
  | 'te' | 'fa' | 'lv' | 'bn' | 'sr' | 'az' | 'sl' | 'kn' | 'et' | 'mk'
  | 'br' | 'eu' | 'is' | 'hy' | 'ne' | 'mn' | 'bs' | 'kk' | 'sq' | 'sw'
  | 'gl' | 'mr' | 'pa' | 'si' | 'km' | 'sn' | 'yo' | 'so' | 'af' | 'oc'
  | 'ka' | 'be' | 'tg' | 'sd' | 'gu' | 'am' | 'yi' | 'lo' | 'uz' | 'fo'
  | 'ht' | 'ps' | 'tk' | 'nn' | 'mt' | 'sa' | 'lb' | 'my' | 'bo' | 'tl'
  | 'mg' | 'as' | 'tt' | 'haw' | 'ln' | 'ha' | 'ba' | 'jw' | 'su';

/** Map from language code to a string value (presumably the language's display name — see the implementation). */
export declare const LANGUAGES: Record<LanguageCode, string>;

/** Architecture hyper-parameters carried by every checkpoint entry in `MODELS`. */
type ModelDims = {
  n_mels: number;
  n_vocab: number;
  n_audio_ctx: number;
  n_audio_state: number;
  n_audio_head: number;
  n_audio_layer: number;
  n_text_ctx: number;
  n_text_state: number;
  n_text_head: number;
  n_text_layer: number;
};

/** One downloadable checkpoint: its URL plus the dimensions needed to instantiate the net. */
type ModelSpec = {
  url: string;
  dims: ModelDims;
};

/** Every available Whisper checkpoint, keyed by its public model name. */
export declare const MODELS: {
  'tiny.en': ModelSpec;
  tiny: ModelSpec;
  'base.en': ModelSpec;
  base: ModelSpec;
  'small.en': ModelSpec;
  small: ModelSpec;
  'medium.en': ModelSpec;
  medium: ModelSpec;
  'large-v2': ModelSpec;
  large: ModelSpec;
};

/** Union of the valid model-name keys of `MODELS`. */
export type WhisperModel = keyof typeof MODELS;

/** The dims shape shared by all checkpoints (derived from `MODELS` as in the original). */
type Dims = typeof MODELS['tiny.en']['dims'];
export declare class MultiHeadAttention { n_head: number; kv_caching?: "cross" | "self" | undefined; max_self_attn_cache_len?: number | undefined; query: Linear; key: Linear; value: Linear; out: Linear; cache_v?: Tensor; cache_k?: Tensor; constructor(n_state: number, n_head: number, kv_caching?: "cross" | "self" | undefined, max_self_attn_cache_len?: number | undefined); call: (x: Tensor, xa?: Tensor, mask?: Tensor, len?: Variable | number) => Promise; } export declare class ResidualAttentionBlock { attn: MultiHeadAttention; attn_ln: LayerNorm; cross_attn?: MultiHeadAttention; cross_attn_ln?: LayerNorm; mlp: Layer[]; mlp_ln: LayerNorm; constructor(n_state: number, n_head: number, is_decoder_block?: boolean, max_self_attn_cache_len?: number); call: (x: Tensor, xa?: Tensor, mask?: Tensor, len?: Variable | number) => Promise; } export declare class AudioEncoder { conv1: Conv2d; conv2: Conv2d; blocks: ResidualAttentionBlock[]; ln_post: LayerNorm; positional_embedding: Tensor; encode: TinyJit<[x: Tensor], Tensor>; constructor({ n_mels, n_audio_ctx, n_audio_state, n_audio_head, n_audio_layer }: Dims); call: (x: Tensor) => Promise; } declare class TextDecoder { max_tokens_to_sample: number; max_self_attn_cache_len: number; token_embedding: Embedding; positional_embedding: Tensor; blocks: ResidualAttentionBlock[]; ln: LayerNorm; mask: Tensor; getjitted: ArrayMap, [number[], TinyJit<[x: Tensor, pos: number | UOp, encoded_audio: Tensor], Tensor>]>; static init: ({ n_vocab, n_text_ctx, n_text_state, n_text_head, n_text_layer }: Dims) => Promise; call: (x: Tensor, pos: number, encoded_audio: Tensor) => Promise; forward: (x: Tensor, pos: Variable | number, encoded_audio: Tensor) => Promise; output_tok: (x: Tensor) => Promise; } declare class Whisper { encoder: AudioEncoder; decoder: TextDecoder; is_multilingual: boolean; batch_size: number; static init: (dims: Dims, batch_size?: number) => Promise; } /** * param waveforms: A list of possibly variable length 16000Hz audio 
samples * param batch_size: The batch_size associated with the Whisper model being used to transcribe the audio. Used to prevent JIT mismatch errors since the encoder does not accept symbolic shapes * param truncate: If true, truncates (or pads) audio to exactly 30s for a single encoder pass * return: mel spectrogram of the given waveforms */ export declare const prep_audio: (_waveforms: Float32Array[], batch_size: number, truncate?: boolean) => Promise; export declare const init_whisper: (model_name: WhisperModel, batch_size?: number) => Promise<[Whisper, Tokenizer]>; export declare const load_file_waveform: (filename: string) => Promise; export declare const transcribe_file: (model: any, enc: Tokenizer, filename: string, language?: string) => Promise; export {};