import { ArrayMap, Embedding, LayerNorm, Linear, Tensor } from '@jsgrad/jsgrad/base';

/**
 * Namespace for CLIP Text Tokenizer components.
 *
 * NOTE(review): the generic arguments on `Record`/`Promise` throughout this
 * file were missing (likely stripped by an HTML-like sanitizer — note that
 * `Set<[string, string]>` and `ArrayMap<[string, string], number>`, whose
 * argument lists start with `<[`, survived intact). They have been restored
 * from the standard CLIP BPE-tokenizer contract; confirm against the
 * implementation file.
 */
export declare class Tokenizer {
  /**
   * Return set of symbol pairs in a word.
   * Word is represented as tuple of symbols (symbols being variable-length strings).
   */
  static get_pairs: (word: string[]) => Set<[string, string]>;
  /** Collapses runs of whitespace in `text` (per its name; implementation not visible here). */
  static whitespace_clean: (text: string) => string;
  /**
   * Returns list of utf-8 byte and a corresponding list of unicode strings.
   * The reversible bpe codes work on unicode strings.
   * This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
   * When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
   * This is a significant percentage of your normal, say, 32K bpe vocab.
   * To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
   * And avoids mapping to whitespace/control characters the bpe code barfs on.
   */
  static bytes_to_unicode: () => Record<number, string>;
}

/** Byte-pair-encoding tokenizer for CLIP text input. */
export declare class ClipTokenizer {
  /** Byte value → printable unicode string; see {@link Tokenizer.bytes_to_unicode}. */
  byte_encoder: Record<number, string>;
  /** BPE token string → vocabulary id. */
  encoder: Record<string, number>;
  /** Merge-pair → rank; lower rank means the pair is merged earlier. */
  bpe_ranks: ArrayMap<[string, string], number>;
  /** Memoization of {@link bpe} results, keyed by the input token. */
  cache: Record<string, string>;
  /** Pre-tokenization split pattern applied to input text. */
  pat: RegExp;
  /**
   * Async factory (e.g. loads the vocab/merges before construction).
   * NOTE(review): assumed to resolve to a ready-to-use ClipTokenizer — confirm
   * against the implementation.
   */
  static init: () => Promise<ClipTokenizer>;
  /** Applies byte-pair encoding to a single pre-token, returning the merged form. */
  bpe: (token: string) => string;
  /** Encodes `text` to token ids; `pad_with_zeros` presumably pads to the context length — verify. */
  encode: (text: string, pad_with_zeros?: boolean) => number[];
}

/** Feed-forward (MLP) sub-block of a CLIP encoder layer. */
export declare class ClipMlp {
  fc1: Linear;
  fc2: Linear;
  call: (h: Tensor) => Tensor;
}

/** Multi-head self-attention sub-block of a CLIP encoder layer. */
export declare class ClipAttention {
  embed_dim: number;
  num_heads: number;
  /** Per-head dimension (embed_dim / num_heads by convention — confirm in impl). */
  head_dim: number;
  k_proj: Linear;
  v_proj: Linear;
  q_proj: Linear;
  out_proj: Linear;
  call: (hidden_states: Tensor, causal_attention_mask: Tensor) => Tensor;
}

/** One transformer encoder layer: self-attention + MLP, each with a LayerNorm. */
export declare class ClipEncoderLayer {
  self_attn: ClipAttention;
  layer_norm1: LayerNorm;
  mlp: ClipMlp;
  layer_norm2: LayerNorm;
  call: (hidden_states: Tensor, causal_attention_mask: Tensor) => Tensor;
}

/** Token + position embeddings for the CLIP text model. */
export declare class ClipTextEmbeddings {
  token_embedding: Embedding;
  position_embedding: Embedding;
  call: (input_ids: Tensor, position_ids: Tensor) => Tensor;
}

/** Stack of {@link ClipEncoderLayer}s; `ret_layer_idx` lets callers stop at an intermediate layer. */
export declare class ClipEncoder {
  layers: ClipEncoderLayer[];
  constructor(layer_count?: number);
  call: (x: Tensor, causal_attention_mask: Tensor, ret_layer_idx?: number) => Tensor;
}

/** Full CLIP text transformer: embeddings → encoder → final LayerNorm. */
export declare class ClipTextTransformer {
  /** When set, the encoder returns this layer's hidden states instead of the last layer's. */
  ret_layer_idx?: number | undefined;
  embeddings: ClipTextEmbeddings;
  encoder: ClipEncoder;
  final_layer_norm: LayerNorm;
  constructor(ret_layer_idx?: number | undefined);
  call: (input_ids: Tensor) => Tensor;
}

/** Thin wrapper exposing the text transformer under the HF-style `text_model` name. */
export declare class ClipTextModel {
  text_model: ClipTextTransformer;
  constructor(ret_layer_idx?: number);
}