All files BM25.ts

100% Statements 30/30
100% Branches 20/20
100% Functions 11/11
100% Lines 29/29

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91  1x 16x       1x 16x       1x   4x 12x   4x                                                                           4x 4x 4x 12x     12x 4x 4x 4x     4x 12x   12x 12x 12x 12x           12x 12x 6x   6x     4x 2x   2x    
/** Gets word count. */
export const getWordCount = (corpus: string) => {
  return ((corpus || "").match(/\w+/g) || []).length;
};
 
/** Number of occurences of a word in a string. */
export const getTermFrequency = (term: string, corpus: string) => {
  return ((corpus || "").match(new RegExp(term, "g")) || []).length;
};
 
/** Inverse document frequency. */
export const getIDF = (term: string, documents: string[]) => {
  // Number of relevant documents.
  const relevantDocuments = documents.filter((document: string) =>
    document.includes(term)
  ).length;
  return Math.log(
    (documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1
  );
};
 
/** Represents a document; useful when sorting results.
 */
export interface BMDocument {
  /** The document is originally scoreed. */
  document: string;
  /** The score that the document recieves. */
  score: number;
}
 
/** Constants that are free parameters used in BM25, specifically when generating inverse document frequency. */
export interface BMConstants {
  /** Free parameter. Is 0.75 by default.  */
  b?: number;
  /** Free parameter. Is 1.2 by default. Generally in range [1.2, 2.0] */
  k1?: number;
}
 
/** If returns positive, the sorting results in secondEl coming before firstEl, else, firstEl comes before secondEL  */
export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
 
/** Implementation of Okapi BM25 algorithm.
 *  @param documents: Collection of documents.
 *  @param keywords: query terms.
 *  @param constants: Contains free parameters k1 and b. b=0.75 and k1=1.2 by default.
 *  @param sort: A function that allows you to sort queries by a given rule. If not provided, returns results corresponding to the original order. 
 * If this option is provided, the return type will not be an array of scores but an array of documents with their scores.
 */
export default function BM25(
  documents: string[],
  keywords: string[],
  constants?: BMConstants,
  sorter?: BMSorter
): number[] | BMDocument[] {
  const b = constants && constants.b ? constants.b : 0.75;
  const k1 = constants && constants.k1 ? constants.k1 : 1.2;
  const documentLengths = documents.map((document: string) =>
    getWordCount(document)
  );
  const averageDocumentLength =
    documentLengths.reduce((a, b) => a + b, 0) / documents.length;
  const idfByKeyword = keywords.reduce((obj, keyword) => {
    obj.set(keyword, getIDF(keyword, documents));
    return obj;
  }, new Map<string, number>());
 
  const scores = documents.map((document: string, index: number) => {
    const score = keywords
      .map((keyword: string) => {
        const inverseDocumentFrequency = idfByKeyword.get(keyword);
        const termFrequency = getTermFrequency(keyword, document);
        const documentLength = documentLengths[index];
        return (
          (inverseDocumentFrequency * (termFrequency * (k1 + 1))) /
          (termFrequency +
            k1 * (1 - b + (b * documentLength) / averageDocumentLength))
        );
      })
      .reduce((a: number, b: number) => a + b, 0);
    if (sorter) {
      return { score, document} as BMDocument
    }
    return score;
  });
  // sort the results
  if (sorter) {
    return (scores as BMDocument[]).sort(sorter);
  }
  return scores as number[];
}