{"version":3,"file":"bm25.cjs","sources":["../../../src/algorithms/bm25.ts"],"sourcesContent":["/**\n * BM25 (Best Matching 25) Scoring Algorithm\n * Industry-standard probabilistic ranking function used by search engines\n *\n * BM25 considers:\n * - Term frequency (TF): How often does the term appear?\n * - Inverse document frequency (IDF): How rare is the term?\n * - Document length normalization: Penalize very long documents\n *\n * Formula: BM25(D, Q) = Σ IDF(qi) * (f(qi, D) * (k1 + 1)) / (f(qi, D) + k1 * (1 - b + b * |D| / avgdl))\n *\n * Where:\n * - D = document\n * - Q = query\n * - qi = query term i\n * - f(qi, D) = frequency of qi in D\n * - |D| = length of document D\n * - avgdl = average document length\n * - k1 = term frequency saturation parameter (default: 1.2)\n * - b = length normalization parameter (default: 0.75)\n */\n\nexport interface BM25Config {\n  /** Term frequency saturation parameter (typical: 1.2-2.0) */\n  k1: number;\n  /** Length normalization parameter (typical: 0.5-0.8) */\n  b: number;\n  /** Minimum IDF value to prevent negative scores */\n  minIDF: number;\n}\n\nexport const DEFAULT_BM25_CONFIG: BM25Config = {\n  k1: 1.2,\n  b: 0.75,\n  minIDF: 0.1,\n};\n\n/**\n * Document statistics for BM25 calculation\n */\nexport interface DocumentStats {\n  /** Document ID */\n  docId: number;\n  /** Document length (number of terms) */\n  length: number;\n  /** Term frequencies in this document */\n  termFrequencies: Map<string, number>;\n}\n\n/**\n * Corpus statistics for BM25 calculation\n */\nexport interface CorpusStats {\n  /** Total number of documents */\n  totalDocs: number;\n  /** Average document length */\n  avgDocLength: number;\n  /** Document frequency for each term (how many docs contain the term) */\n  documentFrequencies: Map<string, number>;\n}\n\n/**\n * Calculate IDF (Inverse Document Frequency) for a term\n * IDF = log((N - df + 0.5) / (df + 0.5) + 1)\n *\n * Where:\n * - N = total number of documents\n * - df = document frequency (number of documents containing the term)\n */\nexport function calculateIDF(term: string, corpusStats: CorpusStats, config: BM25Config = DEFAULT_BM25_CONFIG): number {\n  const df = corpusStats.documentFrequencies.get(term) || 0;\n  const N = corpusStats.totalDocs;\n\n  // Prevent division by zero and negative IDF\n  if (df === 0 || N === 0) {\n    return config.minIDF;\n  }\n\n  // BM25 IDF formula with smoothing\n  const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1);\n\n  // Ensure minimum IDF\n  return Math.max(idf, config.minIDF);\n}\n\n/**\n * Calculate BM25 score for a single term in a document\n */\nexport function calculateTermScore(term: string, docStats: DocumentStats, corpusStats: CorpusStats, config: BM25Config = DEFAULT_BM25_CONFIG): number {\n  const tf = docStats.termFrequencies.get(term) || 0;\n\n  // If term not in document, score is 0\n  if (tf === 0) {\n    return 0;\n  }\n\n  const idf = calculateIDF(term, corpusStats, config);\n  const docLength = docStats.length;\n  const avgDocLength = corpusStats.avgDocLength;\n\n  // BM25 formula\n  const numerator = tf * (config.k1 + 1);\n  const denominator = tf + config.k1 * (1 - config.b + config.b * (docLength / avgDocLength));\n\n  return idf * (numerator / denominator);\n}\n\n/**\n * Calculate BM25 score for a query against a document\n * Returns the sum of BM25 scores for all query terms\n */\nexport function calculateBM25Score(queryTerms: string[], docStats: DocumentStats, corpusStats: CorpusStats, config: BM25Config = DEFAULT_BM25_CONFIG): number {\n  let totalScore = 0;\n\n  for (const term of queryTerms) {\n    totalScore += calculateTermScore(term, docStats, corpusStats, config);\n  }\n\n  return totalScore;\n}\n\n/**\n * Build corpus statistics from documents\n * This should be called once during index building\n */\nexport function buildCorpusStats(documents: DocumentStats[]): CorpusStats {\n  const totalDocs = documents.length;\n  let totalLength = 0;\n  const documentFrequencies = new Map<string, number>();\n\n  for (const doc of documents) {\n    totalLength += doc.length;\n\n    // Count unique terms per document (for document frequency)\n    const uniqueTerms = new Set(doc.termFrequencies.keys());\n    for (const term of uniqueTerms) {\n      documentFrequencies.set(term, (documentFrequencies.get(term) || 0) + 1);\n    }\n  }\n\n  const avgDocLength = totalDocs > 0 ? totalLength / totalDocs : 0;\n\n  return {\n    totalDocs,\n    avgDocLength,\n    documentFrequencies,\n  };\n}\n\n/**\n * Normalize BM25 score to 0-1 range for consistency with existing scoring\n * Uses sigmoid function for smooth normalization\n */\nexport function normalizeBM25Score(score: number, maxScore: number = 10): number {\n  if (maxScore === 0) return 0;\n\n  // Sigmoid normalization: 1 / (1 + e^(-x))\n  // Scale score to reasonable range first\n  const scaledScore = (score / maxScore) * 6 - 3; // Map to [-3, 3] range\n  return 1 / (1 + Math.exp(-scaledScore));\n}\n\n/**\n * Combine BM25 score with fuzzy match score\n * Provides a hybrid scoring approach\n */\nexport function combineScores(bm25Score: number, fuzzyScore: number, bm25Weight: number = 0.6, fuzzyWeight: number = 0.4): number {\n  // Normalize weights\n  const totalWeight = bm25Weight + fuzzyWeight;\n  const normalizedBM25Weight = bm25Weight / totalWeight;\n  const normalizedFuzzyWeight = fuzzyWeight / totalWeight;\n\n  // Weighted combination\n  return normalizedBM25Weight * bm25Score + normalizedFuzzyWeight * fuzzyScore;\n}\n"],"names":[],"mappings":";;AA+BO,MAAM,sBAAkC;AAAA,EAC7C,IAAI;AAAA,EACJ,GAAG;AAAA,EACH,QAAQ;AACV;AAkCO,SAAS,aAAa,MAAc,aAA0B,SAAqB,qBAA6B;AACrH,QAAM,KAAK,YAAY,oBAAoB,IAAI,IAAI,KAAK;AACxD,QAAM,IAAI,YAAY;AAGtB,MAAI,OAAO,KAAK,MAAM,GAAG;AACvB,WAAO,OAAO;AAAA,EAChB;AAGA,QAAM,MAAM,KAAK,KAAK,IAAI,KAAK,QAAQ,KAAK,OAAO,CAAC;AAGpD,SAAO,KAAK,IAAI,KAAK,OAAO,MAAM;AACpC;AAKO,SAAS,mBAAmB,MAAc,UAAyB,aAA0B,SAAqB,qBAA6B;AACpJ,QAAM,KAAK,SAAS,gBAAgB,IAAI,IAAI,KAAK;AAGjD,MAAI,OAAO,GAAG;AACZ,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,aAAa,MAAM,aAAa,MAAM;AAClD,QAAM,YAAY,SAAS;AAC3B,QAAM,eAAe,YAAY;AAGjC,QAAM,YAAY,MAAM,OAAO,KAAK;AACpC,QAAM,cAAc,KAAK,OAAO,MAAM,IAAI,OAAO,IAAI,OAAO,KAAK,YAAY;AAE7E,SAAO,OAAO,YAAY;AAC5B;AAMO,SAAS,mBAAmB,YAAsB,UAAyB,aAA0B,SAAqB,qBAA6B;AAC5J,MAAI,aAAa;AAEjB,aAAW,QAAQ,YAAY;AAC7B,kBAAc,mBAAmB,MAAM,UAAU,aAAa,MAAM;AAAA,EACtE;AAEA,SAAO;AACT;AAMO,SAAS,iBAAiB,WAAyC;AACxE,QAAM,YAAY,UAAU;AAC5B,MAAI,cAAc;AAClB,QAAM,0CAA0B,IAAA;AAEhC,aAAW,OAAO,WAAW;AAC3B,mBAAe,IAAI;AAGnB,UAAM,cAAc,IAAI,IAAI,IAAI,gBAAgB,MAAM;AACtD,eAAW,QAAQ,aAAa;AAC9B,0BAAoB,IAAI,OAAO,oBAAoB,IAAI,IAAI,KAAK,KAAK,CAAC;AAAA,IACxE;AAAA,EACF;AAEA,QAAM,eAAe,YAAY,IAAI,cAAc,YAAY;AAE/D,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,EAAA;AAEJ;AAMO,SAAS,mBAAmB,OAAe,WAAmB,IAAY;AAC/E,MAAI,aAAa,EAAG,QAAO;AAI3B,QAAM,cAAe,QAAQ,WAAY,IAAI;AAC7C,SAAO,KAAK,IAAI,KAAK,IAAI,CAAC,WAAW;AACvC;AAMO,SAAS,cAAc,WAAmB,YAAoB,aAAqB,KAAK,cAAsB,KAAa;AAEhI,QAAM,cAAc,aAAa;AACjC,QAAM,uBAAuB,aAAa;AAC1C,QAAM,wBAAwB,cAAc;AAG5C,SAAO,uBAAuB,YAAY,wBAAwB;AACpE;;;;;;;;"}