/**
 * BM25 (Best Matching 25) implementation for keyword-based search.
 *
 * BM25 is a probabilistic ranking function used for information retrieval.
 * It ranks documents based on the query terms appearing in each document,
 * taking into account term frequency and document length normalization.
 */
import type { LineRange } from '../line-utils.js';

/**
 * BM25 configuration parameters
 */
export interface BM25Config {
  /**
   * Controls term frequency saturation.
   * Higher values give more weight to term frequency.
   * Typical range: 1.2 - 2.0
   * @default 1.5
   */
  k1?: number;
  /**
   * Controls document length normalization.
   * 0 = no length normalization, 1 = full normalization
   * @default 0.75
   */
  b?: number;
}

/**
 * Represents a document in the BM25 index
 */
export interface BM25Document {
  /** Document identifier */
  id: string;
  /** Document content */
  content: string;
  /** Pre-computed tokens for the document */
  tokens: string[];
  /** Token frequency map */
  termFrequencies: Map<string, number>;
  /** Total number of tokens */
  length: number;
  /** Optional metadata */
  metadata?: Record<string, unknown>;
}

/**
 * Result from a BM25 search
 */
export interface BM25SearchResult {
  /** Document identifier */
  id: string;
  /** Document content */
  content: string;
  /** BM25 score (higher is more relevant) */
  score: number;
  /** Optional metadata */
  metadata?: Record<string, unknown>;
  /** Line range where query terms were found (if computed) */
  lineRange?: LineRange;
}

/**
 * Tokenization options
 */
export interface TokenizeOptions {
  /** Convert to lowercase */
  lowercase?: boolean;
  /** Remove punctuation */
  removePunctuation?: boolean;
  /** Minimum token length */
  minLength?: number;
  /** Custom stopwords to remove */
  stopwords?: Set<string>;
  /** Custom split pattern (default: /\s+/) */
  splitPattern?: RegExp;
}

/**
 * Default English stopwords
 */
export declare const DEFAULT_STOPWORDS: Set<string>;

/**
 * Tokenize text into an array of terms
 *
 * @param text - The text to tokenize
 * @param options - Tokenization options
 * @returns The resulting terms
 */
export declare function tokenize(text: string, options?: TokenizeOptions): string[];

// Line utilities re-exported from the shared line-utils module.
export {
  extractLines,
  extractLinesWithLimit,
  formatWithLineNumbers,
  replaceString,
  StringNotFoundError,
  StringNotUniqueError,
} from '../line-utils.js';

/**
 * Find the line range where query terms appear in content.
 * Returns the range spanning from the first to the last line containing any query term.
 *
 * @param content - The document content
 * @param queryTerms - Tokenized query terms to find
 * @param options - Tokenization options (should match indexing options)
 * @returns LineRange if terms found, undefined otherwise
 */
export declare function findLineRange(
  content: string,
  queryTerms: string[],
  options?: TokenizeOptions,
): LineRange | undefined;

/**
 * BM25 Index for keyword-based document retrieval
 */
export declare class BM25Index {
  // Declaration-emit marker: the class has ECMAScript private (`#`) members
  // that are not part of its public surface.
  #private;
  /** BM25 k1 parameter */
  readonly k1: number;
  /** BM25 b parameter */
  readonly b: number;
  /**
   * @param config - BM25 parameters (`k1`, `b`)
   * @param tokenizeOptions - Tokenization options for the index
   */
  constructor(config?: BM25Config, tokenizeOptions?: TokenizeOptions);
  /**
   * Add a document to the index
   *
   * @param id - Document identifier
   * @param content - Document content
   * @param metadata - Optional metadata to associate with the document
   */
  add(id: string, content: string, metadata?: Record<string, unknown>): void;
  /**
   * Remove a document from the index
   *
   * @param id - Document identifier
   */
  remove(id: string): boolean;
  /**
   * Clear all documents from the index
   */
  clear(): void;
  /**
   * Search for documents matching the query
   *
   * @param query - The search query text
   * @param topK - Optional result-count parameter
   * @param minScore - Optional score parameter
   * @returns Search results carrying a BM25 score (higher is more relevant)
   */
  search(query: string, topK?: number, minScore?: number): BM25SearchResult[];
  /**
   * Get a document by ID
   *
   * @param id - Document identifier
   * @returns The document, or undefined
   */
  get(id: string): BM25Document | undefined;
  /**
   * Check if a document exists in the index
   *
   * @param id - Document identifier
   */
  has(id: string): boolean;
  /**
   * Get the number of documents in the index
   */
  get size(): number;
  /**
   * Get all document IDs
   */
  get documentIds(): string[];
  /**
   * Serialize the index to a JSON-compatible object
   */
  serialize(): BM25IndexData;
  /**
   * Deserialize an index from a JSON object
   *
   * @param data - Serialized index data
   * @param tokenizeOptions - Tokenization options for the restored index
   */
  static deserialize(data: BM25IndexData, tokenizeOptions?: TokenizeOptions): BM25Index;
}

/**
 * Serialized document format for persistence
 *
 * Mirrors {@link BM25Document}, with the term-frequency `Map` represented
 * as a plain object.
 */
interface SerializedBM25Document {
  id: string;
  content: string;
  tokens: string[];
  termFrequencies: Record<string, number>;
  length: number;
  metadata?: Record<string, unknown>;
}

/**
 * Serialized index data for persistence
 */
export interface BM25IndexData {
  k1: number;
  b: number;
  documents: SerializedBM25Document[];
  avgDocLength: number;
}
//# sourceMappingURL=bm25.d.ts.map