/**
 * VectorIndex
 *
 * Builds and queries a LanceDB vector index over the call graph functions.
 * Each function is represented by a document combining its signature, docstring,
 * file path, language, and topological metadata (fanIn/fanOut, hub, entry point).
 *
 * Storage: `<outputDir>/vector-index/` (LanceDB database folder)
 * Table name: "functions"
 *
 * Usage:
 *   // Build (after openlore analyze --embed)
 *   await VectorIndex.build(outputDir, nodes, signatures, hubIds, entryPointIds, embedSvc);
 *
 *   // Search
 *   const results = await VectorIndex.search(outputDir, "authenticate user with JWT", embedSvc);
 */
import type { FunctionNode } from './call-graph.js';
import type { FileSignatureMap } from './signature-extractor.js';
import type { EmbeddingService } from './embedding-service.js';

/** One indexed function: identity, source location, call-graph metadata and embedding payload. */
export interface FunctionRecord {
  id: string;
  name: string;
  filePath: string;
  className: string;
  language: string;
  signature: string;
  docstring: string;
  fanIn: number;
  fanOut: number;
  isHub: boolean;
  isEntryPoint: boolean;
  /** Concatenated text used for embedding */
  text: string;
  /** Embedding vector */
  vector: number[];
}

/** A single search hit: the matched record plus its relevance score. */
export interface SearchResult {
  /**
   * The matched function record.
   *
   * NOTE(review): the type arguments of `Omit` were missing from this declaration.
   * `'vector'` is assumed to be the omitted key (results carry the record without its
   * embedding) — confirm against the implementation.
   */
  record: Omit<FunctionRecord, 'vector'>;
  /**
   * Relevance score. For hybrid search (default): RRF score, higher = more relevant.
   * For dense-only search: cosine distance from LanceDB, lower = more similar.
   */
  score: number;
}

/**
 * Metadata describing a built index. `hasEmbeddings` is `false` for a
 * keyword-only (BM25) index built without an embedding service.
 */
export interface VectorIndexMeta {
  hasEmbeddings: boolean;
  /** Presumably the embedding dimension — TODO confirm against the implementation. */
  dim: number;
  model: string | null;
  builtAt: string;
  schemaVersion: number;
}

/** In-memory corpus statistics consumed by {@link bm25Score}. */
export interface Bm25Corpus {
  docs: Array<{
    id: string;
    /** Presumably term → occurrences of that term within this document — TODO confirm. */
    tfMap: Map<string, number>;
    length: number;
  }>;
  /** term → number of documents containing it */
  df: Map<string, number>;
  avgLength: number;
  N: number;
}

/** Splits `text` into the tokens used for BM25; the exact normalisation rules live in the implementation. */
export declare function tokenize(text: string): string[];

/** Builds a {@link Bm25Corpus} from `{ id, text }` records. */
export declare function buildBm25Corpus(
  records: Array<{
    id: string;
    text: string;
  }>,
): Bm25Corpus;

/**
 * Computes the BM25 score of one corpus document for the given query tokens.
 *
 * @param docIdx Presumably an index into `corpus.docs` — verify against the implementation.
 */
export declare function bm25Score(corpus: Bm25Corpus, queryTokens: string[], docIdx: number): number;

/** Test-only: clear in-memory BM25 + LanceDB caches to force cold path. */
export declare function _resetVectorIndexCachesForTesting(): void;

export declare class VectorIndex {
  /**
   * Build (or rebuild) the vector index from call graph nodes + signatures.
   *
   * When `incremental` is true and an existing index is found, only functions
   * whose text has changed since the last build are re-embedded. Unchanged
   * functions reuse their cached vectors. Pass `incremental: false` (or omit
   * when no index exists) to do a full rebuild.
   *
   * Returns a summary of how many functions were embedded vs reused.
   *
   * When `embedSvc` is null, builds a **keyword-only (BM25)** index: the corpus
   * rows are written without a `vector` column and the meta sidecar records
   * `hasEmbeddings: false`. Search then serves BM25 results and never attempts
   * ANN. Re-building a previously-embedded index with `embedSvc=null` downgrades
   * it to BM25-only (overwrite + meta update), and vice-versa upgrades it.
   */
  static build(
    outputDir: string,
    nodes: FunctionNode[],
    signatures: FileSignatureMap[],
    hubIds: Set<string>,
    entryPointIds: Set<string>,
    embedSvc: EmbeddingService | null,
    /** Optional map of filePath → source content for skeleton-based body indexing */
    fileContents?: Map<string, string>,
    /** When true, reuse cached vectors for unchanged functions */
    incremental?: boolean,
  ): Promise<{
    embedded: number;
    reused: number;
    total: number;
    hasEmbeddings: boolean;
  }>;

  /**
   * Watch-mode incremental update (Spec 13.1). Replace only the rows for the
   * changed files with freshly-built records — a row-level delete+add instead of
   * the full-corpus read+overwrite that build() performs. The cold build() path
   * is untouched, protecting the `analyze --embed` contract (G7).
   *
   * - Embedded index: reuse existing vectors for rows whose embed-text is
   *   unchanged (queried for the changed files only, not the whole corpus),
   *   embed just the new/changed texts, then delete the changed files' old rows
   *   and add the rebuilt ones. The LanceDB table handle in _tableCache stays
   *   valid across row ops, so search() does not pay a reconnect.
   * - BM25-only index: delete+add the changed files' documents and patch the
   *   cached BM25 corpus in place rather than dropping the whole corpus cache.
   */
  static updateFiles(
    outputDir: string,
    nodes: FunctionNode[],
    changedFilePaths: Set<string>,
    signatures: FileSignatureMap[],
    hubIds: Set<string>,
    entryPointIds: Set<string>,
    embedSvc: EmbeddingService | null | undefined,
    fileContents?: Map<string, string>,
  ): Promise<{
    embedded: number;
    reused: number;
    total: number;
    hasEmbeddings: boolean;
  }>;

  /**
   * Hybrid search over the index: dense (ANN) + sparse (BM25) merged via RRF.
   *
   * Dense recall fetches top `limit*5` candidates from the vector index.
   * Sparse recall scores the full corpus with BM25 (cached per session).
   * Reciprocal Rank Fusion (RRF) combines both rankings into a single list.
   *
   * Set `hybrid: false` to use dense-only search (original behaviour).
   * Returns up to `limit` results sorted by relevance (highest first).
   */
  static search(
    outputDir: string,
    query: string,
    embedSvc: EmbeddingService | null | undefined,
    opts?: {
      limit?: number;
      language?: string;
      minFanIn?: number;
      /** Enable hybrid dense+sparse retrieval via RRF (default: true when embedSvc available) */
      hybrid?: boolean;
    },
  ): Promise<SearchResult[]>;

  /**
   * BM25-only search: used when no embedding service is available.
   * Scores the full corpus with BM25 and returns the top `limit` results.
   */
  private static _bm25Only;

  /**
   * Returns true if a vector index has been built for this output directory.
   */
  static exists(outputDir: string): boolean;
}
//# sourceMappingURL=vector-index.d.ts.map