/**
 * Input accepted by the tokenizer helpers: either raw bytes or text.
 */
export type TokenizeInput = Uint8Array | string;

/**
 * Fields shared by every raw tokenize result, regardless of input kind.
 *
 * NOTE(review): the element types of `offsets` and `tokens` were missing in
 * this declaration and have been restored as `number` (byte offsets / token
 * IDs) — confirm against the implementation.
 */
export type RawTokenizeResultBase = {
  /**
   * Array of byte starting offsets for every token after the first one.
   *
   * Offsets are measured against the tokenizer's effective UTF-8 text
   * representation after its built-in normalization and preprocessing. That
   * usually matches the provided input, but tokenizers that normalize or
   * discard characters can shift the reported positions.
   *
   * The array length is equal to the number of tokens minus one, since the
   * first token always starts at byte offset `0` within that effective
   * tokenizer input.
   */
  offsets: Array<number>;
  /**
   * Array of token IDs in token order.
   */
  tokens: Array<number>;
};

/**
 * Extra fields present on a result produced from text (`string`) input.
 */
export type RawTextTokenizeResultExtension = {
  /**
   * The actual input passed to the tokenizer after applying all
   * preprocessing/normalization steps.
   *
   * If no such steps are defined by the corresponding tokenizer or they
   * didn't modify the input, this property is absent.
   */
  processedInput?: string;
};

/**
 * Extra fields present on a result produced from binary (`Uint8Array`) input.
 */
export type RawBinaryTokenizeResultExtension = {
  /**
   * The actual input passed to the tokenizer after applying all
   * preprocessing/normalization steps.
   *
   * If no such steps are defined by the corresponding tokenizer or they
   * didn't modify the input, this property is absent.
   */
  processedInput?: Uint8Array;
};

/**
 * Raw tokenize result whose `processedInput` type follows the input kind:
 * `Uint8Array` input selects the binary extension, anything else (i.e.
 * `string`) selects the text extension.
 *
 * @typeParam InputGeneric - kind of input the result was produced from
 */
export type RawTokenizeResult<InputGeneric extends TokenizeInput> =
  RawTokenizeResultBase &
    (InputGeneric extends Uint8Array
      ? RawBinaryTokenizeResultExtension
      : RawTextTokenizeResultExtension);

/** Raw tokenize result for binary (`Uint8Array`) input. */
export type RawBinaryTokenizeResult = RawTokenizeResult<Uint8Array>;

/** Raw tokenize result for text (`string`) input. */
export type RawTextTokenizeResult = RawTokenizeResult<string>;

/**
 * Produces the text form of a tokenize input.
 *
 * NOTE(review): implementation is not visible from this declaration;
 * presumably binary input is decoded as UTF-8 — confirm.
 *
 * @param input - bytes or text to convert
 * @returns the input as a string
 */
export declare const toTokenizeText: (input: TokenizeInput) => string;

/**
 * Returns a byte length for the given text.
 *
 * NOTE(review): going by the name this is the UTF-8 encoded length, not the
 * UTF-16 `string.length` — confirm against the implementation.
 *
 * @param text - text to measure
 * @returns number of bytes
 */
export declare const getUtf8ByteLength: (text: string) => number;

/**
 * Builds a raw tokenize result from token IDs and token start offsets.
 *
 * NOTE(review): whether `tokenStartOffsets` includes the leading `0` offset
 * of the first token cannot be told from this declaration — verify.
 *
 * @typeParam InputGeneric - kind of input the tokens were produced from
 * @param tokens - token IDs in token order
 * @param tokenStartOffsets - start offsets of the tokens
 * @param processedInput - optional preprocessed tokenizer input
 * @returns the assembled raw result
 */
export declare const toRawTokenizeResult: <
  InputGeneric extends TokenizeInput,
>(
  tokens: ReadonlyArray<number>,
  tokenStartOffsets: ReadonlyArray<number>,
  processedInput?: InputGeneric,
) => RawTokenizeResult<InputGeneric>;

/**
 * Builds a raw tokenize result from token IDs and per-token byte lengths.
 *
 * NOTE(review): presumably the offsets are derived by accumulating the byte
 * lengths — confirm against the implementation.
 *
 * @typeParam InputGeneric - kind of input the tokens were produced from
 * @param tokens - token IDs in token order
 * @param tokenByteLengths - byte length of each token
 * @param processedInput - optional preprocessed tokenizer input
 * @returns the assembled raw result
 */
export declare const toRawTokenizeResultFromTokenByteLengths: <
  InputGeneric extends TokenizeInput,
>(
  tokens: ReadonlyArray<number>,
  tokenByteLengths: ReadonlyArray<number>,
  processedInput?: InputGeneric,
) => RawTokenizeResult<InputGeneric>;