import { map } from 'lodash'
import Matrix from './data_structures/word_matrix'
import { languageName } from './tools/guess_language'
import Parser from './tools/parser'
import Preprocessor from './tools/preprocessor'
import Stemmer from './tools/stemmer'
import load from './tools/stoplist'

/**
 * Settings that tune the algorithm; callers supply these when they want to
 * run it with something other than the defaults.
 */
export interface IOptions {
  /** Delimiters on which the corpus text is split into words. */
  delimiters: string[]
  /** Language used to pick the stemmer and the stopword list. */
  language: languageName
}

/**
 * Complete input for one RAKE run: the options plus the text to analyse.
 */
export interface IParameters extends IOptions {
  /** The text from which phrases are extracted. */
  corpus: string
}

/**
 * Runs the RAKE pipeline over `params.corpus`.
 *
 * The stages run strictly in sequence, each consuming state produced by the
 * one before it: preprocessing, parsing, co-occurrence scoring of stems, and
 * finally phrase scoring.
 *
 * @param params - corpus text together with the delimiters and language.
 * @returns whatever `Parser#bestPhrases()` yields once all phrases are scored.
 */
export function rake(params: IParameters): string[] {
  const { corpus, delimiters, language } = params

  // Stage 1: break the corpus into an array of words on the delimiters.
  const words = new Preprocessor(delimiters).process(corpus)

  // Stage 2: walk every word to produce ngrams, stems, phrases and metrics.
  const stemmer = new Stemmer(language)
  const stoplist = load(language)
  const parsed = new Parser(stemmer, stoplist).process(words)

  // Stage 3: feed each phrase's stems into a co-occurrence matrix.
  // The stem list is read from the stemmer only after parsing has finished.
  const cooccurrence = new Matrix(stemmer.getStems())
  for (const { stems } of parsed.phrases) {
    cooccurrence.process(stems)
  }
  const scoresByStem = cooccurrence.calculateScores()

  // Stage 4: score each phrase from the stem scores, merge duplicates,
  // then hand back the best-scoring phrases.
  for (const candidate of parsed.phrases) {
    candidate.calculateScore(scoresByStem)
  }
  parsed.joinDuplicates()
  return parsed.bestPhrases()
}