/**
 * @license
 * Copyright 2026 Steven Roussey
 * SPDX-License-Identifier: Apache-2.0
 */
import type { ITextIndex, TextFields, TextSearchOptions, TextSearchResult } from "./ITextIndex";
import type { Tokenizer } from "./Tokenizer";
/**
 * Default per-field weights for {@link BM25Index} when indexing
 * {@link ChunkRecord}-shaped data. Tuned for hierarchical chunks — title and
 * section headings are heavier than the body to reflect their navigational
 * value, summaries are slightly above body, parent summaries are de-weighted
 * because they are inherited context rather than direct content.
 */
// NOTE(review): the generic arguments in this file were lost in extraction and
// have been reconstructed as field-name → numeric-weight maps; confirm against
// the implementation source.
export declare const DEFAULT_CHUNK_FIELD_WEIGHTS: Readonly<Record<string, number>>;
/** Construction-time options for {@link BM25Index}. Every option is optional. */
export interface BM25IndexOptions {
    /** Tokenizer used at index- and query-time. Defaults to {@link createDefaultTokenizer}. */
    readonly tokenizer?: Tokenizer;
    /**
     * Per-field weight map. Fields not in this map are ignored at index time.
     * Defaults to {@link DEFAULT_CHUNK_FIELD_WEIGHTS}.
     */
    readonly fieldWeights?: Readonly<Record<string, number>>;
    /** BM25 term-saturation parameter. Typical values 1.2–2.0. */
    readonly k1?: number;
    /** BM25 length-normalisation parameter. Typical values 0.5–0.75. */
    readonly b?: number;
}
/**
 * JSON-serialisable snapshot returned by {@link BM25Index.toJSON}. It carries
 * the scoring parameters together with the postings and per-document stats;
 * the tokenizer is not part of the snapshot.
 */
interface SerialisedBM25State {
    /** Snapshot format version; only `1` is declared. */
    readonly version: 1;
    /** BM25 term-saturation parameter in effect when the snapshot was taken. */
    readonly k1: number;
    /** BM25 length-normalisation parameter in effect when the snapshot was taken. */
    readonly b: number;
    /** Per-field weights in effect when the snapshot was taken. */
    readonly fieldWeights: Record<string, number>;
    /**
     * Per-field aggregate statistics.
     *
     * NOTE(review): the value type was stripped from this declaration and cannot
     * be recovered from what is visible here. Presumably it holds whatever is
     * needed to derive `avgLen_f` (e.g. a total length and a chunk count per
     * field) — replace `unknown` with the real shape from the implementation.
     */
    readonly fieldStats: Record<string, unknown>;
    /**
     * Inverted index, three string-keyed levels ending in a number.
     * Presumably term → chunk id → field → term frequency — TODO confirm key order.
     */
    readonly postings: Record<string, Record<string, Record<string, number>>>;
    /**
     * Two string-keyed levels ending in a number.
     * Presumably chunk id → field → token count (`len_f(d)`) — TODO confirm.
     */
    readonly docLengths: Record<string, Record<string, number>>;
    /** Presumably chunk id → owning document id — TODO confirm. */
    readonly chunkToDoc: Record<string, string>;
}
/**
 * In-memory BM25F text index. State is JSON-serialisable via
 * {@link toJSON} / {@link fromJSON}. The serialised state captures all
 * scoring inputs — `k1`, `b`, `fieldWeights`, postings, and per-document
 * stats — so a `fromJSON` round-trip reproduces search results exactly. The
 * tokenizer is *not* serialised; callers must restore an index configured
 * with the same tokenizer that produced the state, otherwise query
 * tokenisation will diverge from indexed terms.
 *
 * Scoring formula (Lucene-style BM25F):
 *
 * ```
 * idf(t)        = ln(1 + (N - df(t) + 0.5) / (df(t) + 0.5))
 * tilde_tf(t,d) = sum_f weight(f) * tf(t,d,f) / (1 - b + b * len_f(d) / avgLen_f)
 * score(t,d)    = idf(t) * tilde_tf(t,d) / (k1 + tilde_tf(t,d))
 * ```
 *
 * Document frequency `df(t)` is the number of distinct chunks containing `t`
 * in *any* field. `N` is the total number of chunks. `tf(t,d,f)` is raw
 * term frequency in field `f` of chunk `d`. `len_f(d)` is the field length
 * (token count) for that chunk, and `avgLen_f` is the average length over
 * chunks that have field `f` populated.
 */
export declare class BM25Index implements ITextIndex {
    private readonly tokenizer;
    private fieldWeights;
    private k1;
    private b;
    private postings;
    private docLengths;
    private fieldStats;
    private chunkToDoc;
    private docToChunks;
    private chunkPostings;
    private termDf;
    /** @param options Optional tokenizer, field weights and BM25 parameters. */
    constructor(options?: BM25IndexOptions);
    /** Number of entries currently indexed — presumably chunks (`N` above); confirm. */
    size(): number;
    /** Indexes the text `fields` of chunk `chunkId`, associating it with document `docId`. */
    add(chunkId: string, docId: string, fields: TextFields): void;
    /** Removes the chunk identified by `chunkId`. */
    remove(chunkId: string): void;
    /** Removes by document id — presumably every chunk added under `docId`; confirm. */
    removeByDocument(docId: string): void;
    /** Clears the index. */
    clear(): void;
    /** Runs `query` against the index and returns the matching results. */
    search(query: string, options?: TextSearchOptions): TextSearchResult[];
    /** Returns the JSON-serialisable state of the index; the tokenizer is not included. */
    toJSON(): SerialisedBM25State;
    /**
     * Restores the index from a state previously produced by {@link toJSON}.
     * The index must be configured with the same tokenizer that produced it.
     */
    fromJSON(state: unknown): void;
}
export {};
//# sourceMappingURL=BM25Index.d.ts.map