/* This function goes through a sorted array and divides it into two different clusters: continuous samples and discrete samples. The discrete samples are stored in a mutable map. Samples are considered discrete if they have at least `minDiscreteWight` duplicates. Using a `minDiscreteWight` higher than 2 is important because sometimes common elements will be generated by regular operations. The final continuous array will be sorted. The method here is designed for high performance for fairly small `minDiscreteWight` values for both mostly-continuous and mostly-discrete inputs. For each position i it visits, it compares it to the place where a run starting at i would end. For continuous distributions, this comparison is always false, keeping branch prediction costs low. If the comparison is true, it finds the complete run with recursive doubling then a binary search, which skips over many elements for long runs. */ type splitSamples = { continuousSamples: number[]; discreteShape: { xs: number[]; ys: number[]; }; }; export const splitContinuousAndDiscrete = ( sortedArray: readonly number[], minDiscreteWeight: number ): splitSamples => { const continuous: number[] = []; const discreteCount: number[] = []; const discreteValue: number[] = []; if (!Number.isInteger(minDiscreteWeight)) { throw new Error( `Minimum discrete weight must be an integer. Got ${minDiscreteWeight}` ); } if (minDiscreteWeight < 2) { // Weight of 1 is pointless because it indicates only discrete values, // and causes an infinite loop in the doubling search used here. throw new Error("Minimum discrete weight must be at least 2"); } // In a run of exactly minDiscreteWeight, the first and last // element indices differ by minDistance. const minDistance = minDiscreteWeight - 1; const len = sortedArray.length; let i = 0; while (i < len - minDistance) { // We are interested in runs of elements equal to value const value = sortedArray[i]; if (value !== sortedArray[i + minDistance]) { // No long run starting at i, so it's continuous continuous.push(value); i++; } else { // Now we know that a run starts at i // Move i forward to next unequal value // That is, find iNext so that isEqualAt(iNext-1) and !isEqualAt(iNext) const iOrig = i; // Find base so that iNext is in (iOrig+base, iOrig+2*base] // This is where we start the binary search let base = minDistance; const isEqualAt = (ind: number) => ind < len && sortedArray[ind] === value; while (isEqualAt(iOrig + base * 2)) { base *= 2; } // Maintain iNext in (lo, i]. Once lo+1 === i, i is iNext. let lo = iOrig + base; i = Math.min(lo + base, len); while (i - lo > 1) { const mid = lo + Math.floor((i - lo) / 2); if (sortedArray[mid] === value) { lo = mid; } else { i = mid; } } discreteValue.push(value); discreteCount.push(i - iOrig); } } // Remaining values are continuous continuous.push(...sortedArray.slice(i)); return { continuousSamples: continuous, discreteShape: { xs: discreteValue, ys: discreteCount }, }; }; /* These two filter functions are meant to run after splitContinuousAndDiscrete(). They assume that this data is sorted. */ // If all the continous samples are the same, changes them to be part of the discrete data export const continuousAreSameFilter = ({ continuousSamples, discreteShape: { xs, ys }, }: splitSamples): splitSamples => { const allContinuousAreSame = continuousSamples.length > 0 && continuousSamples[0] === continuousSamples[continuousSamples.length - 1]; if (allContinuousAreSame) { return { continuousSamples: [], discreteShape: { xs: [...xs, continuousSamples[0]], ys: [...ys, continuousSamples.length], }, }; } return { continuousSamples, discreteShape: { xs, ys } }; }; // If there are fewer than N continuous samples, removes them export const minContinuousSamplesFilter = ( minSampleCount: number, { continuousSamples, discreteShape }: splitSamples ): splitSamples => { if (continuousSamples.length < minSampleCount) { return { continuousSamples: [], discreteShape }; } return { continuousSamples, discreteShape }; };