/**
 * @license
 * Copyright 2026 Steven Roussey
 * SPDX-License-Identifier: Apache-2.0
 */
/**
 * Pluggable tokenizer for text indexes. Implementations must be deterministic
 * and side-effect free: indexing and querying always pass through the same
 * tokenizer, so any non-determinism produces silent recall regressions.
 */
export interface Tokenizer {
  /**
   * Splits raw text into tokens.
   *
   * @param input - Text to tokenize.
   * @returns The tokens extracted from `input`.
   */
  tokenize(input: string): string[];
}
/**
 * Default English stopwords. Exported so callers can extend (rather than
 * replace) the default list:
 *
 * ```ts
 * const stopwords = new Set([...DEFAULT_ENGLISH_STOPWORDS, "foo", "bar"]);
 * ```
 */
export declare const DEFAULT_ENGLISH_STOPWORDS: ReadonlySet<string>;
/**
 * Options accepted by {@link createDefaultTokenizer}. Every field is optional.
 */
export interface DefaultTokenizerOptions {
  /**
   * Stopwords for the tokenizer to drop.
   *
   * NOTE(review): presumably falls back to `DEFAULT_ENGLISH_STOPWORDS` when
   * omitted; the fallback is not visible in this declaration file, so confirm
   * against the implementation.
   */
  readonly stopwords?: ReadonlySet<string>;
  /**
   * Tokens shorter than this length are dropped.
   *
   * NOTE(review): the value used when omitted is not visible in this
   * declaration file; confirm against the implementation.
   */
  readonly minTokenLength?: number;
}
/**
 * Default tokenizer: lowercase, split on Unicode non-letter/non-digit
 * characters, drop tokens shorter than `minTokenLength`, drop stopwords.
 *
 * No stemming. Identical at index and query time.
 *
 * @param options - Optional stopword set and minimum token length.
 * @returns A {@link Tokenizer} applying the rules above.
 */
export declare function createDefaultTokenizer(options?: DefaultTokenizerOptions): Tokenizer;
//# sourceMappingURL=Tokenizer.d.ts.map