import { ArrayMap, Embedding, LayerNorm, Linear, Tensor } from '@jsgrad/jsgrad/base';

/**
 * Namespace for CLIP Text Tokenizer components.
 *
 * NOTE(review): the generic arguments on `Record`/`Promise` throughout this
 * file were missing (likely stripped by an HTML-like sanitizer — note that
 * `Set<[string, string]>` and `ArrayMap<[string, string], number>`, whose
 * argument lists start with `<[`, survived intact). They have been restored
 * from the standard CLIP BPE-tokenizer contract; confirm against the
 * implementation file.
 */
export declare class Tokenizer {
  /**
   * Return set of symbol pairs in a word.
   * Word is represented as tuple of symbols (symbols being variable-length strings).
   */
  static get_pairs: (word: string[]) => Set<[string, string]>;
  /** Collapses runs of whitespace in `text` (per its name; implementation not visible here). */
  static whitespace_clean: (text: string) => string;
  /**
   * Returns list of utf-8 byte and a corresponding list of unicode strings.
   * The reversible bpe codes work on unicode strings.
   * This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
   * When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
   * This is a significant percentage of your normal, say, 32K bpe vocab.
   * To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
   * And avoids mapping to whitespace/control characters the bpe code barfs on.
   */
  static bytes_to_unicode: () => Record<number, string>;
}

/** Byte-pair-encoding tokenizer for CLIP text input. */
export declare class ClipTokenizer {
  /** Byte value → printable unicode string; see {@link Tokenizer.bytes_to_unicode}. */
  byte_encoder: Record<number, string>;
  /** BPE token string → vocabulary id. */
  encoder: Record<string, number>;
  /** Merge-pair → rank; lower rank means the pair is merged earlier. */
  bpe_ranks: ArrayMap<[string, string], number>;
  /** Memoization of {@link bpe} results, keyed by the input token. */
  cache: Record<string, string>;
  /** Pre-tokenization split pattern applied to input text. */
  pat: RegExp;
  /**
   * Async factory (e.g. loads the vocab/merges before construction).
   * NOTE(review): assumed to resolve to a ready-to-use ClipTokenizer — confirm
   * against the implementation.
   */
  static init: () => Promise<ClipTokenizer>;
  /** Applies byte-pair encoding to a single pre-token, returning the merged form. */
  bpe: (token: string) => string;
  /** Encodes `text` to token ids; `pad_with_zeros` presumably pads to the context length — verify. */
  encode: (text: string, pad_with_zeros?: boolean) => number[];
}

/** Feed-forward (MLP) sub-block of a CLIP encoder layer. */
export declare class ClipMlp {
  fc1: Linear;
  fc2: Linear;
  call: (h: Tensor) => Tensor;
}

/** Multi-head self-attention sub-block of a CLIP encoder layer. */
export declare class ClipAttention {
  embed_dim: number;
  num_heads: number;
  /** Per-head dimension (embed_dim / num_heads by convention — confirm in impl). */
  head_dim: number;
  k_proj: Linear;
  v_proj: Linear;
  q_proj: Linear;
  out_proj: Linear;
  call: (hidden_states: Tensor, causal_attention_mask: Tensor) => Tensor;
}

/** One transformer encoder layer: self-attention + MLP, each with a LayerNorm. */
export declare class ClipEncoderLayer {
  self_attn: ClipAttention;
  layer_norm1: LayerNorm;
  mlp: ClipMlp;
  layer_norm2: LayerNorm;
  call: (hidden_states: Tensor, causal_attention_mask: Tensor) => Tensor;
}

/** Token + position embeddings for the CLIP text model. */
export declare class ClipTextEmbeddings {
  token_embedding: Embedding;
  position_embedding: Embedding;
  call: (input_ids: Tensor, position_ids: Tensor) => Tensor;
}

/** Stack of {@link ClipEncoderLayer}s; `ret_layer_idx` lets callers stop at an intermediate layer. */
export declare class ClipEncoder {
  layers: ClipEncoderLayer[];
  constructor(layer_count?: number);
  call: (x: Tensor, causal_attention_mask: Tensor, ret_layer_idx?: number) => Tensor;
}

/** Full CLIP text transformer: embeddings → encoder → final LayerNorm. */
export declare class ClipTextTransformer {
  /** When set, the encoder returns this layer's hidden states instead of the last layer's. */
  ret_layer_idx?: number | undefined;
  embeddings: ClipTextEmbeddings;
  encoder: ClipEncoder;
  final_layer_norm: LayerNorm;
  constructor(ret_layer_idx?: number | undefined);
  call: (input_ids: Tensor) => Tensor;
}

/** Thin wrapper exposing the text transformer under the HF-style `text_model` name. */
export declare class ClipTextModel {
  text_model: ClipTextTransformer;
  constructor(ret_layer_idx?: number);
}