import { NormalizedFileCache } from './normalized-file-cache';
import { FilePath } from './types';

/**
 * Scores how many lines each pair of files has in common.
 *
 * Every file is compared against each file that comes after it in `files`,
 * so each unordered pair is evaluated exactly once. For every file only the
 * `maxMatchesPerFile` highest-scoring pairs (among the files after it) are
 * kept; the combined list is then sorted from highest to lowest score.
 *
 * @param files - Paths of the files to compare.
 * @param cache - Source of each file's lines. Assumed to resolve to a
 *   `string[]` for every path in `files` (NOTE(review): the lines are
 *   presumably normalized and sorted by the cache — confirm against
 *   `NormalizedFileCache`).
 * @param maxMatchesPerFile - Upper bound on the number of results kept per
 *   file. Defaults to 10; negative values are treated as 0.
 * @returns All retained pair similarities, highest score first.
 */
export async function linePercentages(
  files: FilePath[],
  cache = new NormalizedFileCache(),
  maxMatchesPerFile = 10,
): Promise<FileLineSimilarity[]> {
  // A negative limit would make `slice` drop entries from the end instead of
  // capping the result, so clamp it.
  const limit = Math.max(0, maxMatchesPerFile);
  const all: FileLineSimilarity[] = [];

  for (let i = 0; i < files.length; i++) {
    // The non-null assertion mirrors the cache contract assumed above.
    const f0 = (await cache.get(files[i]))!;
    const perFile: FileLineSimilarity[] = [];

    // Start at i + 1 so a pair is never compared twice (or with itself).
    for (let j = i + 1; j < files.length; j++) {
      const f1 = (await cache.get(files[j]))!;
      perFile.push(commonLineCount(files[i], files[j], f0, f1));
    }

    sortHighestToLowest(perFile);
    all.push(...perFile.slice(0, limit));
  }

  sortHighestToLowest(all);
  return all;
}

/**
 * Sorts `arr` in place by descending `score` and returns the same array.
 */
function sortHighestToLowest(arr: FileLineSimilarity[]) {
  return arr.sort((a, b) => b.score - a.score);
}

/** Line-level similarity between two files. */
export type FileLineSimilarity = {
  /** First file of the pair. */
  filePath0: FilePath;
  /** Second file of the pair. */
  filePath1: FilePath;
  /** Number of lines in the first file. */
  lineCount0: number;
  /** Number of lines in the second file. */
  lineCount1: number;
  /** Number of lines found in both files. */
  commonLines: number;
  /** Average of the fraction of each file's lines that are shared (0 to 1). */
  score: number;
};

// Somewhat similar to the `comm` unix command, except it only returns the
// number of lines the files have in common.
//
// The score is tricky: what does it mean for a 7-line file to have 7 lines in
// common with a 70-line file? Roughly, 1 means 100% of the lines were shared
// and 0.5 means 50% of the lines were shared. It is computed as the average of
// "shared / lines in file 0" and "shared / lines in file 1". An empty file
// contributes a fraction of 0 rather than dividing by zero, so the score is
// always a finite number and safe to sort on.
//
// The matching relies on both line arrays being sorted the same way: a match
// in `lines1` is only searched for at or after the position following the
// previous match.
function commonLineCount(
  filePath0: FilePath,
  filePath1: FilePath,
  lines0: string[],
  lines1: string[],
): FileLineSimilarity {
  let both = 0;
  let nextStart = 0;

  for (let i = 0; i < lines0.length && nextStart < lines1.length; i++) {
    for (let j = nextStart; j < lines1.length; j++) {
      if (lines0[i] === lines1[j]) {
        both++;
        // We don't want to count the same line twice, so mark where to start
        // the next iteration. Since the input is sorted, we are guaranteed
        // not to skip over something.
        nextStart = j + 1;
        break;
      }
    }
  }

  // Guard against 0 / 0 (NaN) when a file has no lines.
  const sharedFraction = (total: number): number =>
    total === 0 ? 0 : both / total;

  return {
    filePath0,
    filePath1,
    lineCount0: lines0.length,
    lineCount1: lines1.length,
    commonLines: both,
    score: (sharedFraction(lines0.length) + sharedFraction(lines1.length)) / 2,
  };
}