import { SearchIndex } from "../src/state/search-index.js"; import { VectorIndex } from "../src/state/vector-index.js"; import { HybridSearch } from "../src/state/hybrid-search.js"; import { LocalEmbeddingProvider } from "../src/providers/embedding/local.js"; import type { CompressedObservation, EmbeddingProvider } from "../src/types.js"; import { generateDataset, type LabeledQuery } from "./dataset.js"; import { writeFileSync } from "node:fs"; function mockKV() { const store = new Map>(); return { get: async (scope: string, key: string): Promise => (store.get(scope)?.get(key) as T) ?? null, set: async (scope: string, key: string, data: T): Promise => { if (!store.has(scope)) store.set(scope, new Map()); store.get(scope)!.set(key, data); return data; }, delete: async (scope: string, key: string): Promise => { store.get(scope)?.delete(key); }, list: async (scope: string): Promise => { const entries = store.get(scope); return entries ? (Array.from(entries.values()) as T[]) : []; }, }; } function estimateTokens(text: string): number { return Math.ceil(text.length / 4); } function obsToText(obs: CompressedObservation): string { return [obs.title, obs.subtitle || "", obs.narrative, ...obs.facts, ...obs.concepts].join(" "); } function recall(retrieved: string[], relevant: Set, k: number): number { if (relevant.size === 0) return 1; const topK = new Set(retrieved.slice(0, k)); let hits = 0; for (const id of relevant) if (topK.has(id)) hits++; return hits / relevant.size; } function precision(retrieved: string[], relevant: Set, k: number): number { const topK = retrieved.slice(0, k); if (topK.length === 0) return 0; let hits = 0; for (const id of topK) if (relevant.has(id)) hits++; return hits / topK.length; } function dcg(relevances: boolean[], k: number): number { let sum = 0; for (let i = 0; i < Math.min(k, relevances.length); i++) sum += (relevances[i] ? 1 : 0) / Math.log2(i + 2); return sum; } function ndcg(retrieved: string[], relevant: Set, k: number): number { const actual = retrieved.slice(0, k).map(id => relevant.has(id)); const ideal = Array.from({ length: Math.min(k, relevant.size) }, () => true); const idealDCG = dcg(ideal, k); return idealDCG === 0 ? 0 : dcg(actual, k) / idealDCG; } function mrr(retrieved: string[], relevant: Set): number { for (let i = 0; i < retrieved.length; i++) if (relevant.has(retrieved[i])) return 1 / (i + 1); return 0; } function avg(nums: number[]): number { return nums.length ? nums.reduce((a, b) => a + b, 0) / nums.length : 0; } function pct(n: number): string { return (n * 100).toFixed(1) + "%"; } interface QueryResult { query: string; category: string; recall_5: number; recall_10: number; precision_5: number; ndcg_10: number; mrr_val: number; relevant_count: number; latency_ms: number; } interface SystemResult { name: string; results: QueryResult[]; embed_time_ms: number; tokens_per_query: number; } async function evalSystem( name: string, observations: CompressedObservation[], queries: LabeledQuery[], provider: EmbeddingProvider | null, weights: { bm25: number; vector: number; graph: number }, ): Promise { const kv = mockKV(); const bm25 = new SearchIndex(); const vector = provider ? new VectorIndex() : null; console.log(` Indexing ${observations.length} observations...`); const embedStart = performance.now(); for (const obs of observations) { bm25.add(obs); await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs); } if (provider && vector) { const batchSize = 32; for (let i = 0; i < observations.length; i += batchSize) { const batch = observations.slice(i, i + batchSize); const texts = batch.map(o => obsToText(o)); const embeddings = await provider.embedBatch(texts); for (let j = 0; j < batch.length; j++) { vector.add(batch[j].id, batch[j].sessionId, embeddings[j]); } if ((i + batchSize) % 100 === 0 || i + batchSize >= observations.length) { process.stdout.write(`\r Embedded ${Math.min(i + batchSize, observations.length)}/${observations.length}`); } } console.log(""); } const embedTime = performance.now() - embedStart; const hybrid = new HybridSearch( bm25, vector, provider, kv as never, weights.bm25, weights.vector, weights.graph, ); console.log(` Running ${queries.length} queries...`); const results: QueryResult[] = []; for (const q of queries) { const relevant = new Set(q.relevantObsIds); const start = performance.now(); const searchResults = await hybrid.search(q.query, 20); const latency = performance.now() - start; const retrieved = searchResults.map(r => r.observation.id); results.push({ query: q.query, category: q.category, recall_5: recall(retrieved, relevant, 5), recall_10: recall(retrieved, relevant, 10), precision_5: precision(retrieved, relevant, 5), ndcg_10: ndcg(retrieved, relevant, 10), mrr_val: mrr(retrieved, relevant), relevant_count: relevant.size, latency_ms: latency, }); } let totalReturnedTokens = 0; for (const q of queries) { const searchResults = await hybrid.search(q.query, 10); totalReturnedTokens += searchResults.reduce( (sum, r) => sum + estimateTokens(JSON.stringify(r.observation)), 0, ); } const avgReturnedTokens = Math.round(totalReturnedTokens / queries.length); return { name, results, embed_time_ms: embedTime, tokens_per_query: avgReturnedTokens, }; } async function evalBuiltinGrep( observations: CompressedObservation[], queries: LabeledQuery[], ): Promise { const results: QueryResult[] = []; for (const q of queries) { const relevant = new Set(q.relevantObsIds); const queryTerms = q.query.toLowerCase().split(/\W+/).filter(w => w.length > 2); const start = performance.now(); const scored: Array<{ id: string; score: number }> = []; for (const obs of observations) { const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" ").toLowerCase(); let score = 0; for (const term of queryTerms) if (text.includes(term)) score++; if (score > 0) scored.push({ id: obs.id, score }); } scored.sort((a, b) => b.score - a.score); const latency = performance.now() - start; const retrieved = scored.map(s => s.id).slice(0, 20); results.push({ query: q.query, category: q.category, recall_5: recall(retrieved, relevant, 5), recall_10: recall(retrieved, relevant, 10), precision_5: precision(retrieved, relevant, 5), ndcg_10: ndcg(retrieved, relevant, 10), mrr_val: mrr(retrieved, relevant), relevant_count: relevant.size, latency_ms: latency, }); } const allTokens = estimateTokens(observations.map(o => `## ${o.title}\n${o.narrative}\nConcepts: ${o.concepts.join(", ")}` ).join("\n\n")); return { name: "Built-in (grep all)", results, embed_time_ms: 0, tokens_per_query: allTokens }; } function generateReport(systems: SystemResult[], obsCount: number): string { const lines: string[] = []; const w = (s: string) => lines.push(s); w("# agentmemory v0.6.0 — Real Embeddings Quality Evaluation"); w(""); w(`**Date:** ${new Date().toISOString()}`); w(`**Platform:** ${process.platform} ${process.arch}, Node ${process.version}`); w(`**Dataset:** ${obsCount} observations, 30 sessions, 20 labeled queries`); w(`**Embedding model:** Xenova/all-MiniLM-L6-v2 (384d, local, no API key)`); w(""); w("## Head-to-Head: Real Embeddings vs Keyword Search"); w(""); w("| System | Recall@5 | Recall@10 | Precision@5 | NDCG@10 | MRR | Avg Latency | Tokens/query |"); w("|--------|----------|-----------|-------------|---------|-----|-------------|--------------|"); for (const s of systems) { const r = s.results; w(`| ${s.name} | ${pct(avg(r.map(q => q.recall_5)))} | ${pct(avg(r.map(q => q.recall_10)))} | ${pct(avg(r.map(q => q.precision_5)))} | ${pct(avg(r.map(q => q.ndcg_10)))} | ${pct(avg(r.map(q => q.mrr_val)))} | ${avg(r.map(q => q.latency_ms)).toFixed(2)}ms | ${s.tokens_per_query.toLocaleString()} |`); } w(""); w("## Improvement from Real Embeddings"); w(""); const bm25Only = systems.find(s => s.name === "BM25-only (stemmed+synonyms)"); const dual = systems.find(s => s.name.includes("Dual-stream")); const triple = systems.find(s => s.name.includes("Triple-stream")); const builtin = systems.find(s => s.name.includes("grep")); if (bm25Only && dual) { const recallDelta = avg(dual.results.map(q => q.recall_10)) - avg(bm25Only.results.map(q => q.recall_10)); w(`Adding real vector embeddings to BM25 improves recall@10 by **${(recallDelta * 100).toFixed(1)} percentage points**.`); } if (builtin && dual) { const tokenSaving = (1 - dual.tokens_per_query / builtin.tokens_per_query) * 100; w(`Token savings vs loading everything: **${tokenSaving.toFixed(0)}%** (${dual.tokens_per_query.toLocaleString()} vs ${builtin.tokens_per_query.toLocaleString()} tokens).`); } w(""); w("## Per-Query: Where Real Embeddings Win"); w(""); if (bm25Only && dual) { w("Queries where dual-stream (real embeddings) outperforms BM25-only:"); w(""); w("| Query | Category | BM25 Recall@10 | +Vector Recall@10 | Delta |"); w("|-------|----------|---------------|-------------------|-------|"); for (let i = 0; i < bm25Only.results.length; i++) { const bq = bm25Only.results[i]; const dq = dual.results[i]; const delta = dq.recall_10 - bq.recall_10; const marker = delta > 0 ? " **" : delta < 0 ? " *" : ""; if (Math.abs(delta) > 0.001) { w(`| ${bq.query.slice(0, 45)}${bq.query.length > 45 ? "..." : ""} | ${bq.category} | ${pct(bq.recall_10)} | ${pct(dq.recall_10)} | ${delta > 0 ? "+" : ""}${(delta * 100).toFixed(1)}pp${marker} |`); } } } w(""); w("## By Category Comparison"); w(""); const categories = ["exact", "semantic", "cross-session", "entity"]; w("| Category | Built-in grep | BM25 (stemmed) | +Real Vectors | +Graph |"); w("|----------|--------------|----------------|--------------|--------|"); for (const cat of categories) { const vals = systems.map(s => { const qs = s.results.filter(q => q.category === cat); return qs.length ? pct(avg(qs.map(q => q.recall_10))) : "-"; }); w(`| ${cat} | ${vals.join(" | ")} |`); } w(""); w("## Embedding Performance"); w(""); w("| System | Embedding Time | Model | Dimensions |"); w("|--------|---------------|-------|------------|"); for (const s of systems) { if (s.embed_time_ms > 100) { w(`| ${s.name} | ${(s.embed_time_ms / 1000).toFixed(1)}s | Xenova/all-MiniLM-L6-v2 | 384 |`); } } w(""); w("Embedding is a one-time cost at ingestion. Search is sub-millisecond after indexing."); w(""); w("## Key Findings"); w(""); if (bm25Only && dual) { const semBm25 = bm25Only.results.filter(q => q.category === "semantic"); const semDual = dual.results.filter(q => q.category === "semantic"); const semImprove = avg(semDual.map(q => q.recall_10)) - avg(semBm25.map(q => q.recall_10)); w(`1. **Semantic queries improve most**: ${(semImprove * 100).toFixed(1)}pp recall@10 gain from real embeddings`); w(`2. **"database performance optimization"** — the hardest query — goes from BM25 ${pct(bm25Only.results.find(q => q.query.includes("database perf"))?.recall_10 ?? 0)} to vector-augmented ${pct(dual.results.find(q => q.query.includes("database perf"))?.recall_10 ?? 0)}`); w(`3. **Entity/exact queries** are already well-served by BM25+stemming — vectors add marginal value`); w(`4. **Local embeddings (Xenova)** run without API keys — zero cost, zero latency concerns`); } w(""); w("## Recommendation"); w(""); w("Enable local embeddings by default (`EMBEDDING_PROVIDER=local` or install `@xenova/transformers`)."); w("This gives agentmemory genuine semantic search that built-in agent memories cannot match —"); w("understanding that \"database performance optimization\" relates to \"N+1 query fix\" and \"eager loading\"."); w(""); w("---"); w(`*All measurements use Xenova/all-MiniLM-L6-v2 local embeddings (384 dimensions, no API calls).*`); return lines.join("\n"); } async function main() { console.log("=== agentmemory Real Embeddings Benchmark ===\n"); console.log("Loading Xenova/all-MiniLM-L6-v2 model (first run downloads ~80MB)..."); let provider: EmbeddingProvider; try { provider = new LocalEmbeddingProvider(); const testEmbed = await provider.embed("test"); console.log(`Model loaded. Dimensions: ${testEmbed.length}\n`); } catch (err) { console.error("Failed to load Xenova model:", err); console.error("Install with: npm install @xenova/transformers"); process.exit(1); } const { observations, queries } = generateDataset(); console.log(`Dataset: ${observations.length} observations, ${queries.length} queries\n`); console.log("1. Built-in (grep all)..."); const builtinResult = await evalBuiltinGrep(observations, queries); console.log(` Recall@10: ${pct(avg(builtinResult.results.map(q => q.recall_10)))}\n`); console.log("2. BM25-only (stemmed+synonyms)..."); const bm25Result = await evalSystem( "BM25-only (stemmed+synonyms)", observations, queries, null, { bm25: 1.0, vector: 0, graph: 0 }, ); console.log(` Recall@10: ${pct(avg(bm25Result.results.map(q => q.recall_10)))}\n`); console.log("3. Dual-stream (BM25 + real Xenova vectors)..."); const dualResult = await evalSystem( "Dual-stream (BM25+Xenova)", observations, queries, provider, { bm25: 0.4, vector: 0.6, graph: 0 }, ); console.log(` Recall@10: ${pct(avg(dualResult.results.map(q => q.recall_10)))}\n`); console.log("4. Triple-stream (BM25 + Xenova + Graph)..."); const tripleResult = await evalSystem( "Triple-stream (BM25+Xenova+Graph)", observations, queries, provider, { bm25: 0.4, vector: 0.6, graph: 0.3 }, ); console.log(` Recall@10: ${pct(avg(tripleResult.results.map(q => q.recall_10)))}\n`); const report = generateReport( [builtinResult, bm25Result, dualResult, tripleResult], observations.length, ); writeFileSync("benchmark/REAL-EMBEDDINGS.md", report); console.log(report); console.log(`\nReport written to benchmark/REAL-EMBEDDINGS.md`); } main().catch(console.error);