/**
 * Semantic Embedding Index
 *
 * Optional ANN (approximate nearest neighbor) layer on top of the artifact
 * index. Embeds node summaries/titles into dense vectors using a small local
 * model and stores them in an HNSW index for fast similarity queries.
 *
 * Install: npm install @xenova/transformers hnswlib-node
 *
 * @implements #730
 * @source @src/artifacts/types.ts
 * @tests @test/unit/artifacts/embedding-index.test.ts
 */
import type { MetadataEntry } from './types.js';

/**
 * Default embedding model (all-MiniLM-L6-v2: ~22MB, 384 dims, ~5ms/embedding on CPU)
 */
export declare const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";

/** Vector dimensionality produced by {@link DEFAULT_EMBEDDING_MODEL}. */
export declare const DEFAULT_EMBEDDING_DIMS = 384;

/**
 * Embedding index manifest stored alongside the HNSW index
 */
export interface EmbeddingManifest {
    /** Model identifier used for embedding */
    model: string;
    /** Vector dimensionality */
    dims: number;
    /** Ordered list of node IDs (position → node ID) */
    nodeIds: string[];
    /** ISO timestamp of last build */
    builtAt: string;
    /**
     * Checksums at build time for incremental detection
     * (presumably keyed by node ID — confirm against the implementation).
     */
    checksums: Record<string, string>;
}

/**
 * Configuration for the embedding index (from .aiwg/config.yaml)
 */
export interface EmbeddingConfig {
    /** Enable embedding index for this graph */
    enabled: boolean;
    /** Model to use (default: Xenova/all-MiniLM-L6-v2) */
    model?: string;
    /** Number of results for semantic queries */
    topK?: number;
    /** When to rebuild: 'content-change' | 'always' | 'never' */
    rebuildOn?: 'content-change' | 'always' | 'never';
}

/**
 * Semantic search result
 */
export interface SemanticResult {
    /** Node ID (artifact path or REF identifier) */
    nodeId: string;
    /** Cosine similarity score (0-1, higher is more similar) */
    score: number;
}

/**
 * Check if embedding dependencies are available.
 *
 * @returns `available` flag plus the list of `missing` dependency names
 */
export declare function checkEmbeddingDeps(): Promise<{
    available: boolean;
    missing: string[];
}>;

/**
 * Build an embedding index from artifact metadata entries.
 *
 * Embeds each entry's title + summary into a dense vector and stores
 * them in an HNSW index for fast approximate nearest-neighbor queries.
 *
 * @param entries - Map of node ID → MetadataEntry
 * @param outputDir - Directory to write embeddings/ subfolder
 * @param model - Transformer model identifier
 * @returns Number of entries embedded
 */
export declare function buildEmbeddingIndex(
    entries: Record<string, MetadataEntry>,
    outputDir: string,
    model?: string,
): Promise<number>;

/**
 * Load an embedding manifest from an index directory.
 *
 * @param indexDir - Directory to load the manifest from
 * @returns The manifest, or `null` when none could be loaded
 */
export declare function loadEmbeddingManifest(indexDir: string): EmbeddingManifest | null;

/**
 * Query the embedding index for semantically similar artifacts.
 *
 * @param query - Natural language query string
 * @param indexDir - Directory containing the embeddings/ subfolder
 * @param topK - Number of results to return
 * @returns Ranked list of semantic results
 */
export declare function semanticQuery(
    query: string,
    indexDir: string,
    topK?: number,
): Promise<SemanticResult[]>;

/**
 * Get semantic neighbors of a specific node.
 *
 * @param nodeId - Node to find neighbors for
 * @param entries - Metadata entries to get the node's text
 * @param indexDir - Directory containing the embeddings/ subfolder
 * @param topK - Number of results
 * @returns List of semantic results for the node's neighbors
 */
export declare function semanticNeighbors(
    nodeId: string,
    entries: Record<string, MetadataEntry>,
    indexDir: string,
    topK?: number,
): Promise<SemanticResult[]>;

/**
 * Determine which entries need re-embedding based on checksum changes.
 *
 * @param entries - Current metadata entries
 * @param manifest - Existing embedding manifest
 * @returns Object listing the node IDs that changed, were added, and were removed
 */
export declare function detectEmbeddingChanges(
    entries: Record<string, MetadataEntry>,
    manifest: EmbeddingManifest,
): {
    changed: string[];
    added: string[];
    removed: string[];
};

/** A near-duplicate pair surfaced by the embedding index. */
export interface DedupPair {
    /** First node id (lexicographically smaller). */
    a: string;
    /** Second node id. */
    b: string;
    /** Cosine similarity (0-1). */
    score: number;
}

/**
 * Near-duplicate report (#1493). Reads the prebuilt embedding index and, using
 * each stored vector (no re-embedding), finds node pairs whose cosine
 * similarity is at or above `threshold`. Pairs are de-duplicated (unordered)
 * and returned most-similar-first.
 *
 * Requires a built embedding index (`aiwg index embed`). Throws if absent.
 *
 * @param indexDir - Directory holding the prebuilt embedding index
 * @param threshold - Minimum cosine similarity for a pair to be reported
 * @param topK - Optional result-count knob; NOTE(review): exact semantics
 *   (per-node neighbor count vs. overall cap) are not visible here — confirm
 *   against the implementation
 * @returns Near-duplicate pairs, most similar first
 * @throws If no built embedding index is present
 */
export declare function dedupReport(
    indexDir: string,
    threshold?: number,
    topK?: number,
): Promise<DedupPair[]>;
//# sourceMappingURL=embedding-index.d.ts.map