import { squaredEuclidean } from 'ml-distance-euclidean'; import { Matrix } from 'ml-matrix'; import { Random } from 'ml-random'; import { xSum } from 'ml-spectra-processing'; /** * Choose K different random points from the original data * @ignore * @param data - Points in the format to cluster [x,y,z,...] * @param K - number of clusters * @param seed - seed for random number generation * @returns - Initial random points */ export function random(data: number[][], K: number, seed?: number) { const random = new Random(seed); return random.choice(data, { size: K }); } /** * Chooses the most distant points to a first random pick * @ignore * @param data - Points in the format to cluster [x,y,z,...] * @param K - number of clusters * @param distanceMatrix - matrix with the distance values * @param seed - seed for random number generation * @returns - Initial random points */ export function mostDistant( data: number[][], K: number, distanceMatrix: number[][], seed?: number, ): number[][] { const random = new Random(seed); const ans = new Array(K); // chooses a random point as initial cluster ans[0] = Math.floor(random.random() * data.length); if (K > 1) { // chooses the more distant point const maxDist = { dist: -1, index: -1 }; for (let l = 0; l < data.length; ++l) { if (distanceMatrix[ans[0]][l] > maxDist.dist) { maxDist.dist = distanceMatrix[ans[0]][l]; maxDist.index = l; } } ans[1] = maxDist.index; if (K > 2) { // chooses the set of points that maximises the min distance for (let k = 2; k < K; ++k) { let center = { dist: -1, index: -1 }; for (let m = 0; m < data.length; ++m) { // minimum distance to centers let minDistCent = { dist: Number.MAX_VALUE, index: -1 }; if (!ans.includes(m)) { for (let n = 0; n < k; ++n) { if (distanceMatrix[ans[n]][m] < minDistCent.dist) { minDistCent = { dist: distanceMatrix[ans[n]][m], index: m, }; } } } if ( minDistCent.dist !== Number.MAX_VALUE && minDistCent.dist > center.dist ) { center = { ...minDistCent }; } } ans[k] = center.index; } } } return ans.map((index) => data[index]); } interface Options { seed: number; localTrials: number; } /** * Chooses the initial centers using the kmeans++ method. * Implementation inspired from scikit. * @ignore * @param X - Points in the format to cluster [x,y,z,...]. * @param K - Number of clusters. * @param options - Options for the kmeans++ initialization. * @returns Initial centers in format [x,y,z,...]. */ export function kmeanspp( X: number[][], K: number, options: Partial = {}, ) { const m = new Matrix(X); const nSamples = m.rows; const random = new Random(options.seed); // Set the number of trials const centers: number[][] = []; const localTrials = options.localTrials || 2 + Math.floor(Math.log(K)); // Pick the first center at random from the dataset const firstCenterIdx = random.randInt(nSamples); centers.push(m.getRow(firstCenterIdx)); // Init closest distances let closestDistSquared = new Matrix(1, m.rows); for (let i = 0; i < m.rows; i++) { closestDistSquared.set(0, i, squaredEuclidean(m.getRow(i), centers[0])); } let probabilities: Matrix = Matrix.mul( closestDistSquared, 1 / xSum(closestDistSquared.getRow(0)), ); // Iterate over the remaining centers for (let i = 1; i < K; i++) { const candidateIdx: number[] = random.choice(nSamples, { replace: true, size: localTrials, probabilities: probabilities.getRow(0), }); const candidates = m.subMatrixRow(candidateIdx); const distanceToCandidates = euclideanDistances(candidates, m); let bestCandidate = Infinity; let bestPot = Infinity; let bestDistSquared = closestDistSquared; for (let j = 0; j < localTrials; j++) { const newDistSquared = Matrix.min(closestDistSquared, [ distanceToCandidates.getRow(j), ]); const newPot = newDistSquared.sum(); if (newPot < bestPot) { bestCandidate = candidateIdx[j]; bestPot = newPot; bestDistSquared = newDistSquared; } } centers[i] = m.getRow(bestCandidate); closestDistSquared = bestDistSquared; probabilities = Matrix.mul( closestDistSquared, 1 / xSum(closestDistSquared.getRow(0)), ); } return centers; } function euclideanDistances(A: Matrix, B: Matrix) { const result = new Matrix(A.rows, B.rows); for (let i = 0; i < A.rows; i++) { for (let j = 0; j < B.rows; j++) { result.set(i, j, squaredEuclidean(A.getRow(i), B.getRow(j))); } } return result; }