/**
 * Embedding Pipeline Module
 *
 * Orchestrates the background embedding process:
 * 1. Query embeddable nodes from LadybugDB
 * 2. Generate text representations with enriched metadata
 * 3. Chunk long nodes, batch embed
 * 4. Update LadybugDB with chunk-aware embeddings
 * 5. Create vector index for semantic search
 *
 * NOTE(review): the generic type arguments in this declaration file were
 * missing (e.g. `Array>`, bare `Promise` / `Partial` / `Set` / `Map`), which
 * made it unparseable. They have been restored from the imported types and the
 * surrounding documentation; the row/result types of the query callbacks are
 * presumed to be `any[]` / `void` — confirm against the implementation source.
 */
import {
  type EmbeddingProgress,
  type EmbeddingConfig,
  type EmbeddableNode,
  type SemanticSearchResult,
  type EmbeddingContext,
} from './types.js';
import type { ExtensionInstallPolicy } from '../lbug/extension-loader.js';

/**
 * Resolve the extension-install policy for the embedding WRITE path (analyze).
 *
 * Generating embeddings is an explicit opt-in to a feature that requires the
 * VECTOR extension, so when the operator has NOT pinned a policy we default to
 * `auto` (one bounded, out-of-process INSTALL) — matching the documented
 * "auto = default for analyze" intent in extension-loader.ts. An explicit
 * GITNEXUS_LBUG_EXTENSION_INSTALL=load-only|never|auto always wins, so an
 * offline or locked-down operator is never silently forced onto the network
 * (the #1153 regression caused by hard-coding `auto` here). Read on every call
 * (not memoized) so test env stubbing works.
 *
 * @returns The install policy to use for the embedding write path.
 */
export declare const resolveEmbeddingInstallPolicy: () => ExtensionInstallPolicy;

/**
 * Bump this when the embedding text template changes in a way that should
 * invalidate existing vectors, such as metadata/header shape changes,
 * structural container context changes, or preceding-context formatting rules.
 */
export declare const EMBEDDING_TEXT_VERSION = "v2";

/**
 * Compute a stable content fingerprint for an embeddable node.
 * Used to detect when the underlying text has changed so stale vectors
 * can be replaced (DELETE-then-INSERT, the Kuzu-sanctioned pattern for
 * vector-indexed rows).
 *
 * @param node - The node to fingerprint.
 * @param config - Optional partial embedding configuration override.
 * @returns The content hash as a string.
 */
export declare const contentHashForNode: (
  node: EmbeddableNode,
  config?: Partial<EmbeddingConfig>,
) => string;

/**
 * Progress callback type
 */
export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;

/**
 * Batch INSERT chunk-aware embeddings into CodeEmbedding table
 *
 * @param executeWithReusedStatement - Function to execute with reused prepared
 *   statement; receives the Cypher text and one parameter record per row.
 * @param updates - One entry per chunk: owning node ID, chunk index, line
 *   range, embedding vector, and an optional content hash.
 */
export declare const batchInsertEmbeddings: (
  executeWithReusedStatement: (
    cypher: string,
    paramsList: Array<Record<string, any>>,
  ) => Promise<void>,
  updates: Array<{
    nodeId: string;
    chunkIndex: number;
    startLine: number;
    endLine: number;
    embedding: number[];
    contentHash?: string;
  }>,
) => Promise<void>;

/**
 * Summary returned by {@link runEmbeddingPipeline}.
 */
export interface EmbeddingPipelineResult {
  nodesProcessed: number;
  chunksProcessed: number;
  vectorIndexReady: boolean;
  semanticMode: 'vector-index' | 'exact-scan';
}

/**
 * Run the embedding pipeline
 *
 * @param executeQuery - Function to execute Cypher queries against LadybugDB
 * @param executeWithReusedStatement - Function to execute with reused prepared statement
 * @param onProgress - Callback for progress updates
 * @param config - Optional configuration override
 * @param skipNodeIds - Optional set of node IDs that already have embeddings (incremental mode)
 * @param context - Optional repo/server context for metadata enrichment
 * @param existingEmbeddings - Optional map of nodeId → contentHash for incremental mode.
 *   Nodes whose hash matches are skipped; nodes with a changed hash are DELETE'd
 *   and re-embedded; nodes not in the map are embedded fresh.
 * @returns Counts of processed nodes/chunks plus the resulting search mode.
 */
export declare const runEmbeddingPipeline: (
  executeQuery: (cypher: string) => Promise<any[]>,
  executeWithReusedStatement: (
    cypher: string,
    paramsList: Array<Record<string, any>>,
  ) => Promise<void>,
  onProgress: EmbeddingProgressCallback,
  config?: Partial<EmbeddingConfig>,
  skipNodeIds?: Set<string>,
  context?: EmbeddingContext,
  existingEmbeddings?: Map<string, string>,
) => Promise<EmbeddingPipelineResult>;

/**
 * Perform semantic search using the vector index with chunk deduplication
 *
 * @param executeQuery - Function to execute Cypher queries against LadybugDB
 * @param query - The search text.
 * @param k - Optional result-count limit (presumably top-k; default not visible here).
 * @param maxDistance - Optional distance cutoff (default not visible here).
 * @returns The matching search results.
 */
export declare const semanticSearch: (
  executeQuery: (cypher: string) => Promise<any[]>,
  query: string,
  k?: number,
  maxDistance?: number,
) => Promise<SemanticSearchResult[]>;

/**
 * Semantic search with graph expansion (flattened results)
 *
 * @param executeQuery - Function to execute Cypher queries against LadybugDB
 * @param query - The search text.
 * @param k - Optional result-count limit (presumably top-k; default not visible here).
 * @param _hops - Optional hop count; the underscore prefix suggests it is
 *   currently unused — confirm against the implementation.
 * @returns Flattened result rows (element shape not visible here).
 */
export declare const semanticSearchWithContext: (
  executeQuery: (cypher: string) => Promise<any[]>,
  query: string,
  k?: number,
  _hops?: number,
) => Promise<any[]>;