import { stopwords } from './stopwords.js'; /** * Filters out tokens based on a set of exclusion tokens. * * @param tokens The array of tokens to filter. * @param exclusions A set containing tokens to exclude. * @returns An array of filtered tokens. */ function filterTokens( tokens: readonly string[], exclusions: ReadonlySet ): string[] { return tokens.filter((token) => !exclusions.has(token)); } /** * Counts the occurrences of each token in an array of tokens. * * This function supports the preprocessing step for NLP tasks like text similarity * and classification by transforming text into a bag-of-words model, facilitating * the comparison of different texts based on their content. * * @param tokens An array of string tokens. * @returns A Counter object mapping each token to its count. */ function countTokens(tokens: readonly string[]): Record { const counter: Record = {}; for (const token of tokens) { counter[token] = (counter[token] || 0) + 1; } return counter; } /** * Normalizes text by lowercasing, removing punctuation, and squashing multiple spaces. * * This normalization is crucial in NLP for reducing the complexity of the text data, * minimizing the variance between words that should be considered the same for analysis * purposes (e.g., "Dog!" and "dog" are treated as the same word). * * @param s A string to be normalized. * @returns A normalized string. */ function normalizeText(s: string): string { let normalized = s.normalize('NFD'); normalized = normalized.replace(/\b(a|an|the)\b/g, ' '); normalized = normalized.split(/\s+/).join(' '); normalized = normalized.replace(/[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]/g, ''); return normalized.toLowerCase(); } /** * Calculates the Exact Match (EM) score between a prediction and ground truth. * * The EM score is a strict metric used in machine learning to assess if the predicted * answer matches the ground truth exactly, commonly used in tasks like question answering. * * @param prediction The predicted text. * @param groundTruth The actual correct text. * @returns A number (1.0 for exact match, 0.0 otherwise). */ function emScore(prediction: string, groundTruth: string): number { return normalizeText(prediction) === normalizeText(groundTruth) ? 1.0 : 0.0; } /** * Calculates the F1 score between a prediction and ground truth. * * The F1 score is a harmonic mean of precision and recall, widely used in NLP to measure * a model's accuracy in considering both false positives and false negatives, offering a * balance for evaluating classification models. * * @param prediction The predicted text. * @param groundTruth The actual correct text. * @returns The F1 score as a number. */ function f1Score(prediction: string, groundTruth: string): number { const predictionTokens = normalizeText(prediction).split(' '); const groundTruthTokens = normalizeText(groundTruth).split(' '); // Calculate the intersection of common tokens between prediction and ground truth const predictionCounts = countTokens(predictionTokens); const groundTruthCounts = countTokens(groundTruthTokens); let numSame = 0; for (const token in predictionCounts) { const v1 = predictionCounts[token] ?? 0; const v2 = groundTruthCounts[token] ?? 0; numSame += Math.min(v1, v2); } if (numSame === 0) { return 0; } const precision = numSame / predictionTokens.length; const recall = numSame / groundTruthTokens.length; return (2 * precision * recall) / (precision + recall); } /** * Calculates a novel F1 score, taking into account a history of interaction and excluding stopwords. * * This metric extends the F1 score by considering contextual relevance and filtering out common words * that might skew the assessment of the prediction's quality, especially in conversational models or * when historical context is relevant. * * @param history The historical context or preceding interactions. * @param prediction The predicted text. * @param groundTruth The actual correct text. * @param returnRecall Optionally return the recall score instead of F1. * @returns The novel F1 or recall score as a number. */ function novelF1ScoreOptimized( history: string, prediction: string, groundTruth: string, returnRecall = false ): number { // Normalize and split the input texts into tokens const historyTokens = normalizeText(history).split(' '); let predictionTokens = normalizeText(prediction).split(' '); let groundTruthTokens = normalizeText(groundTruth).split(' '); // Combine stopwords and history tokens for exclusion const exclusions = new Set([...stopwords, ...historyTokens]); // Filter prediction and ground truth tokens against the exclusions predictionTokens = filterTokens(predictionTokens, exclusions); groundTruthTokens = filterTokens(groundTruthTokens, exclusions); // Proceed with calculating common tokens, precision, recall, and F1 score as previously outlined // Placeholder for the calculation logic const numSame = 0; // This should be calculated as before const precision = numSame / predictionTokens.length; const recall = numSame / groundTruthTokens.length; const f1 = (2 * precision * recall) / (precision + recall); return returnRecall ? recall : f1; } export const AxEvalUtil = { emScore, f1Score, novelF1ScoreOptimized, };