Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | 1x 16x 1x 16x 1x 4x 12x 4x 4x 4x 4x 12x 12x 4x 4x 4x 4x 12x 12x 12x 12x 12x 12x 12x 6x 6x 4x 2x 2x | /** Gets word count. */
export const getWordCount = (corpus: string) => {
return ((corpus || "").match(/\w+/g) || []).length;
};
/** Number of occurences of a word in a string. */
export const getTermFrequency = (term: string, corpus: string) => {
return ((corpus || "").match(new RegExp(term, "g")) || []).length;
};
/** Inverse document frequency. */
export const getIDF = (term: string, documents: string[]) => {
// Number of relevant documents.
const relevantDocuments = documents.filter((document: string) =>
document.includes(term)
).length;
return Math.log(
(documents.length - relevantDocuments + 0.5) / (relevantDocuments + 0.5) + 1
);
};
/** Represents a document; useful when sorting results.
*/
export interface BMDocument {
/** The document is originally scoreed. */
document: string;
/** The score that the document recieves. */
score: number;
}
/** Constants that are free parameters used in BM25, specifically when generating inverse document frequency. */
export interface BMConstants {
/** Free parameter. Is 0.75 by default. */
b?: number;
/** Free parameter. Is 1.2 by default. Generally in range [1.2, 2.0] */
k1?: number;
}
/** If returns positive, the sorting results in secondEl coming before firstEl, else, firstEl comes before secondEL */
export type BMSorter = (firstEl: BMDocument, secondEl: BMDocument) => number;
/** Implementation of Okapi BM25 algorithm.
* @param documents: Collection of documents.
* @param keywords: query terms.
* @param constants: Contains free parameters k1 and b. b=0.75 and k1=1.2 by default.
* @param sort: A function that allows you to sort queries by a given rule. If not provided, returns results corresponding to the original order.
* If this option is provided, the return type will not be an array of scores but an array of documents with their scores.
*/
export default function BM25(
documents: string[],
keywords: string[],
constants?: BMConstants,
sorter?: BMSorter
): number[] | BMDocument[] {
const b = constants && constants.b ? constants.b : 0.75;
const k1 = constants && constants.k1 ? constants.k1 : 1.2;
const documentLengths = documents.map((document: string) =>
getWordCount(document)
);
const averageDocumentLength =
documentLengths.reduce((a, b) => a + b, 0) / documents.length;
const idfByKeyword = keywords.reduce((obj, keyword) => {
obj.set(keyword, getIDF(keyword, documents));
return obj;
}, new Map<string, number>());
const scores = documents.map((document: string, index: number) => {
const score = keywords
.map((keyword: string) => {
const inverseDocumentFrequency = idfByKeyword.get(keyword);
const termFrequency = getTermFrequency(keyword, document);
const documentLength = documentLengths[index];
return (
(inverseDocumentFrequency * (termFrequency * (k1 + 1))) /
(termFrequency +
k1 * (1 - b + (b * documentLength) / averageDocumentLength))
);
})
.reduce((a: number, b: number) => a + b, 0);
if (sorter) {
return { score, document} as BMDocument
}
return score;
});
// sort the results
if (sorter) {
return (scores as BMDocument[]).sort(sorter);
}
return scores as number[];
}
|