#ifndef SIGNALSMITH_STRETCH_H #define SIGNALSMITH_STRETCH_H #include // https://github.com/Signalsmith-Audio/linear #include #include #include #include #include #include #include namespace signalsmith { namespace stretch { namespace _impl { template static std::complex mul(const std::complex &a, const std::complex &b) { return conjugateSecond ? std::complex{ b.real()*a.real() + b.imag()*a.imag(), b.real()*a.imag() - b.imag()*a.real() } : std::complex{ a.real()*b.real() - a.imag()*b.imag(), a.real()*b.imag() + a.imag()*b.real() }; } template static V norm(const std::complex &a) { V r = a.real(), i = a.imag(); return r*r + i*i; } } template struct SignalsmithStretch { static constexpr size_t version[3] = {1, 3, 2}; SignalsmithStretch() : randomEngine(std::random_device{}()) {} SignalsmithStretch(long seed) : randomEngine(seed) {} // The difference between the internal position (centre of a block) and the input samples you're supplying int inputLatency() const { return int(stft.analysisLatency()); } int outputLatency() const { return int(stft.synthesisLatency() + _splitComputation*stft.defaultInterval()); } void reset() { stft.reset(0.1); stashedInput = stft.input; stashedOutput = stft.output; prevInputOffset = -1; channelBands.assign(channelBands.size(), Band()); silenceCounter = 0; didSeek = false; blockProcess = {}; freqEstimateWeighted = freqEstimateWeight = 0; } // Configures using a default preset void presetDefault(int nChannels, Sample sampleRate, bool splitComputation=false) { configure(nChannels, sampleRate*0.12, sampleRate*0.03, splitComputation); } void presetCheaper(int nChannels, Sample sampleRate, bool splitComputation=true) { configure(nChannels, sampleRate*0.1, sampleRate*0.04, splitComputation); } // Manual setup void configure(int nChannels, int blockSamples, int intervalSamples, bool splitComputation=false) { _splitComputation = splitComputation; channels = nChannels; stft.configure(channels, channels, blockSamples, intervalSamples + 1); stft.setInterval(intervalSamples, stft.kaiser); stft.reset(0.1); stashedInput = stft.input; stashedOutput = stft.output; bands = int(stft.bands()); channelBands.assign(bands*channels, Band()); peaks.reserve(bands/2); energy.resize(bands); smoothedEnergy.resize(bands); outputMap.resize(bands); channelPredictions.resize(channels*bands); blockProcess = {}; formantMetric.resize(bands + 2); tmpProcessBuffer.resize(blockSamples + intervalSamples); tmpPreRollBuffer.resize(outputLatency()*channels); } // For querying the existing config int blockSamples() const { return int(stft.blockSamples()); } int intervalSamples() const { return int(stft.defaultInterval()); } bool splitComputation() const { return _splitComputation; } /// Frequency multiplier, and optional tonality limit (as multiple of sample-rate) void setTransposeFactor(Sample multiplier, Sample tonalityLimit=0) { freqMultiplier = multiplier; if (tonalityLimit > 0) { freqTonalityLimit = tonalityLimit/std::sqrt(multiplier); // compromise between input and output limits } else { freqTonalityLimit = 1; } customFreqMap = nullptr; } void setTransposeSemitones(Sample semitones, Sample tonalityLimit=0) { setTransposeFactor(std::pow(2, semitones/12), tonalityLimit); } // Sets a custom frequency map - should be monotonically increasing void setFreqMap(std::function inputToOutput) { customFreqMap = inputToOutput; } void setFormantFactor(Sample multiplier, bool compensatePitch=false) { formantMultiplier = multiplier; invFormantMultiplier = 1/multiplier; formantCompensation = compensatePitch; } void setFormantSemitones(Sample semitones, bool compensatePitch=false) { setFormantFactor(std::pow(2, semitones/12), compensatePitch); } // Rough guesstimate of the fundamental frequency, used for formant analysis. 0 means attempting to detect the pitch void setFormantBase(Sample baseFreq=0) { formantBaseFreq = baseFreq; } // Provide previous input ("pre-roll") to smoothly change the input location without interrupting the output. This doesn't do any calculation, just copies intput to a buffer. // You should ideally feed it `seekLength()` frames of input, unless it's directly after a `.reset()` (in which case `.outputSeek()` might be a better choice) template void seek(Inputs &&inputs, int inputSamples, double playbackRate) { tmpProcessBuffer.resize(0); tmpProcessBuffer.resize(stft.blockSamples() + stft.defaultInterval()); int startIndex = std::max(0, inputSamples - int(tmpProcessBuffer.size())); // start position in input int padStart = int(tmpProcessBuffer.size() + startIndex) - inputSamples; // start position in tmpProcessBuffer Sample totalEnergy = 0; for (int c = 0; c < channels; ++c) { auto &&inputChannel = inputs[c]; for (int i = startIndex; i < inputSamples; ++i) { Sample s = inputChannel[i]; totalEnergy += s*s; tmpProcessBuffer[i - startIndex + padStart] = s; } stft.writeInput(c, tmpProcessBuffer.size(), tmpProcessBuffer.data()); } stft.moveInput(tmpProcessBuffer.size()); if (totalEnergy >= noiseFloor) { silenceCounter = 0; silenceFirst = true; } didSeek = true; seekTimeFactor = (playbackRate*stft.defaultInterval() > 1) ? 1/playbackRate : stft.defaultInterval(); } int seekLength() const { return int(stft.blockSamples() + stft.defaultInterval()); } // Moves the input position *and* pre-calculates some output, so that the next samples returned from `.process()` are aligned to the beginning of the sample. // The time-stretch rate is inferred from `inputLength`, so use `.outputSeekLength()` to get a correct value for that. template void outputSeek(Inputs &&inputs, int inputLength) { // TODO: add fade-out parameter to avoid clicks, instead of doing a full reset reset(); // Assume we've been handed enough surplus input to produce `outputLatency()` samples of pre-roll int surplusInput = std::max(inputLength - inputLatency(), 0); Sample playbackRate = surplusInput/Sample(outputLatency()); // Move the input position to the start of the sound int seekSamples = inputLength - surplusInput; seek(inputs, seekSamples, playbackRate); tmpPreRollBuffer.resize(outputLatency()*channels); struct BufferOutput { Sample *samples; int length; Sample * operator[](int c) { return samples + c*length; } } preRollOutput{tmpPreRollBuffer.data(), outputLatency()}; // Use the surplus input to produce pre-roll output OffsetIO offsetInput{inputs, seekSamples}; process(offsetInput, surplusInput, preRollOutput, preRollOutput.length); // put the thing down, flip it and reverse it for (auto &v : tmpPreRollBuffer) v = -v; for (int c = 0; c < channels; ++c) { std::reverse(preRollOutput[c], preRollOutput[c] + preRollOutput.length); stft.addOutput(c, preRollOutput.length, preRollOutput[c]); } } int outputSeekLength(Sample playbackRate) const { return inputLatency() + playbackRate*outputLatency(); } template void process(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) { #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_START SIGNALSMITH_STRETCH_PROFILE_PROCESS_START(inputSamples, outputSamples); #endif int prevCopiedInput = 0; auto copyInput = [&](int toIndex){ int length = std::min(int(stft.blockSamples() + stft.defaultInterval()), toIndex - prevCopiedInput); tmpProcessBuffer.resize(length); int offset = toIndex - length; for (int c = 0; c < channels; ++c) { auto &&inputBuffer = inputs[c]; for (int i = 0; i < length; ++i) { tmpProcessBuffer[i] = inputBuffer[i + offset]; } stft.writeInput(c, length, tmpProcessBuffer.data()); } stft.moveInput(length); prevCopiedInput = toIndex; }; Sample totalEnergy = 0; for (int c = 0; c < channels; ++c) { auto &&inputChannel = inputs[c]; for (int i = 0; i < inputSamples; ++i) { Sample s = inputChannel[i]; totalEnergy += s*s; } } if (totalEnergy < noiseFloor) { if (silenceCounter >= 2*stft.blockSamples()) { if (silenceFirst) { // first block of silence processing silenceFirst = false; //stft.reset(); blockProcess = {}; for (auto &b : channelBands) { b.input = b.prevInput = b.output = 0; b.inputEnergy = 0; } } if (inputSamples > 0) { // copy from the input, wrapping around if needed for (int outputIndex = 0; outputIndex < outputSamples; ++outputIndex) { int inputIndex = outputIndex%inputSamples; for (int c = 0; c < channels; ++c) { outputs[c][outputIndex] = inputs[c][inputIndex]; } } } else { for (int c = 0; c < channels; ++c) { auto &&outputChannel = outputs[c]; for (int outputIndex = 0; outputIndex < outputSamples; ++outputIndex) { outputChannel[outputIndex] = 0; } } } // Store input in history buffer copyInput(inputSamples); return; } else { silenceCounter += inputSamples; } } else { silenceCounter = 0; silenceFirst = true; } for (int outputIndex = 0; outputIndex < outputSamples; ++outputIndex) { bool newBlock = blockProcess.samplesSinceLast >= stft.defaultInterval(); if (newBlock) { blockProcess.step = 0; blockProcess.steps = 0; // how many processing steps this block will have blockProcess.samplesSinceLast = 0; // Time to process a spectrum! Where should it come from in the input? int inputOffset = std::round(outputIndex*Sample(inputSamples)/outputSamples); int inputInterval = inputOffset - prevInputOffset; prevInputOffset = inputOffset; copyInput(inputOffset); stashedInput = stft.input; // save the input state, since that's what we'll analyse later if (_splitComputation) { stashedOutput = stft.output; // save the current output, and read from it stft.moveOutput(stft.defaultInterval()); // the actual input jumps forward in time by one interval, ready for the synthesis } blockProcess.newSpectrum = didSeek || (inputInterval > 0); blockProcess.mappedFrequencies = customFreqMap || freqMultiplier != 1; if (blockProcess.newSpectrum) { // make sure the previous input is the correct distance in the past (give or take 1 sample) blockProcess.reanalysePrev = didSeek || std::abs(inputInterval - int(stft.defaultInterval())) > 1; if (blockProcess.reanalysePrev) blockProcess.steps += stft.analyseSteps() + 1; // analyse a new input blockProcess.steps += stft.analyseSteps() + 1; } blockProcess.processFormants = formantMultiplier != 1 || (formantCompensation && blockProcess.mappedFrequencies); blockProcess.timeFactor = didSeek ? seekTimeFactor : stft.defaultInterval()/std::max(1, inputInterval); didSeek = false; updateProcessSpectrumSteps(); blockProcess.steps += processSpectrumSteps; blockProcess.steps += stft.synthesiseSteps() + 1; } size_t processToStep = newBlock ? blockProcess.steps : 0; if (_splitComputation) { Sample processRatio = Sample(blockProcess.samplesSinceLast + 1)/stft.defaultInterval(); processToStep = std::min(blockProcess.steps, (blockProcess.steps + 0.999f)*processRatio); } while (blockProcess.step < processToStep) { size_t step = blockProcess.step++; #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP SIGNALSMITH_STRETCH_PROFILE_PROCESS_STEP(step, blockProcess.steps); #endif if (blockProcess.newSpectrum) { if (blockProcess.reanalysePrev) { // analyse past input if (step < stft.analyseSteps()) { stashedInput.swap(stft.input); stft.analyseStep(step, stft.defaultInterval()); stashedInput.swap(stft.input); continue; } step -= stft.analyseSteps(); if (step < 1) { // Copy previous analysis to our band objects for (int c = 0; c < channels; ++c) { auto channelBands = bandsForChannel(c); auto *spectrumBands = stft.spectrum(c); for (int b = 0; b < bands; ++b) { channelBands[b].prevInput = spectrumBands[b]; } } continue; } step -= 1; } // Analyse latest (stashed) input if (step < stft.analyseSteps()) { stashedInput.swap(stft.input); stft.analyseStep(step); stashedInput.swap(stft.input); continue; } step -= stft.analyseSteps(); if (step < 1) { // Copy analysed spectrum into our band objects for (int c = 0; c < channels; ++c) { auto channelBands = bandsForChannel(c); auto *spectrumBands = stft.spectrum(c); for (int b = 0; b < bands; ++b) { channelBands[b].input = spectrumBands[b]; } } continue; } step -= 1; } if (step < processSpectrumSteps) { processSpectrum(step); continue; } step -= processSpectrumSteps; if (step < 1) { // Copy band objects into spectrum for (int c = 0; c < channels; ++c) { auto channelBands = bandsForChannel(c); auto *spectrumBands = stft.spectrum(c); for (int b = 0; b < bands; ++b) { spectrumBands[b] = channelBands[b].output; } } continue; } step -= 1; if (step < stft.synthesiseSteps()) { stft.synthesiseStep(step); continue; } } #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP SIGNALSMITH_STRETCH_PROFILE_PROCESS_ENDSTEP(); #endif ++blockProcess.samplesSinceLast; if (_splitComputation) stashedOutput.swap(stft.output); for (int c = 0; c < channels; ++c) { auto &&outputChannel = outputs[c]; Sample v = 0; stft.readOutput(c, 1, &v); outputChannel[outputIndex] = v; } stft.moveOutput(1); if (_splitComputation) stashedOutput.swap(stft.output); } copyInput(inputSamples); prevInputOffset -= inputSamples; #ifdef SIGNALSMITH_STRETCH_PROFILE_PROCESS_END SIGNALSMITH_STRETCH_PROFILE_PROCESS_END(); #endif } // Read the remaining output, providing no further input. If `outputSamples` is more than one interval, it will compute additional blocks assuming a zero-valued input template void flush(Outputs &&outputs, int outputSamples, Sample playbackRate=0) { struct Zeros { struct Channel { Sample operator[](int) { return 0; } }; Channel operator[](int) { return {}; } } zeros; // If we're asked for more than an interval of extra output, then zero-pad the input int outputBlock = std::max(0, outputSamples - stft.defaultInterval()); if (outputBlock > 0) process(zeros, outputBlock*playbackRate, outputs, outputBlock); int tailSamples = outputSamples - outputBlock; // at most one interval tmpProcessBuffer.resize(tailSamples); stft.finishOutput(1); for (int c = 0; c < channels; ++c) { stft.readOutput(c, tailSamples, tmpProcessBuffer.data()); auto &&outputChannel = outputs[c]; for (int i = 0; i < tailSamples; ++i) { outputChannel[outputBlock + i] = tmpProcessBuffer[i]; } stft.readOutput(c, tailSamples, tailSamples, tmpProcessBuffer.data()); for (int i = 0; i < tailSamples; ++i) { outputChannel[outputBlock + tailSamples - 1 - i] -= tmpProcessBuffer[i]; } } stft.reset(0.1f); // Reset the phase-vocoder stuff, so the next block gets a fresh start for (int c = 0; c < channels; ++c) { auto channelBands = bandsForChannel(c); for (int b = 0; b < bands; ++b) { channelBands[b].prevInput = channelBands[b].output = 0; } } } // Process a complete audio buffer all in one go template bool exact(Inputs &&inputs, int inputSamples, Outputs &&outputs, int outputSamples) { Sample playbackRate = inputSamples/Sample(outputSamples); auto seekLength = outputSeekLength(playbackRate); if (inputSamples < seekLength) { // to short for this - zero the output just to be polite for (int c = 0; c < channels; ++c) { auto &&channel = outputs[c]; for (int i = 0; i < outputSamples; ++i) { channel[i] = 0; } } return false; } outputSeek(inputs, seekLength); int outputIndex = outputSamples - seekLength/playbackRate; OffsetIO offsetInput{inputs, seekLength}; process(offsetInput, inputSamples - seekLength, outputs, outputIndex); OffsetIO offsetOutput{outputs, outputIndex}; flush(offsetOutput, outputSamples - outputIndex, playbackRate); return true; } private: bool _splitComputation = false; struct { size_t samplesSinceLast = std::numeric_limits::max(); size_t steps = 0; size_t step = 0; bool newSpectrum = false; bool reanalysePrev = false; bool mappedFrequencies = false; bool processFormants = false; Sample timeFactor; } blockProcess; using Complex = std::complex; static constexpr Sample noiseFloor{1e-15}; static constexpr Sample maxCleanStretch{2}; // time-stretch ratio before we start randomising phases size_t silenceCounter = 0; bool silenceFirst = true; Sample freqMultiplier = 1, freqTonalityLimit = 0.5; std::function customFreqMap = nullptr; bool formantCompensation = false; // compensate for pitch/freq change Sample formantMultiplier = 1, invFormantMultiplier = 1; using STFT = signalsmith::linear::DynamicSTFT; STFT stft; typename STFT::Input stashedInput; typename STFT::Output stashedOutput; std::vector tmpProcessBuffer, tmpPreRollBuffer; int channels = 0, bands = 0; int prevInputOffset = -1; bool didSeek = false; Sample seekTimeFactor = 1; Sample bandToFreq(Sample b) const { return stft.binToFreq(b); } Sample freqToBand(Sample f) const { return stft.freqToBin(f); } struct Band { Complex input, prevInput{0}; Complex output{0}; Sample inputEnergy; }; std::vector channelBands; Band * bandsForChannel(int channel) { return channelBands.data() + channel*bands; } template Complex getBand(int channel, int index) { if (index < 0 || index >= bands) return 0; return channelBands[index + channel*bands].*member; } template Complex getFractional(int channel, int lowIndex, Sample fractional) { Complex low = getBand(channel, lowIndex); Complex high = getBand(channel, lowIndex + 1); return low + (high - low)*fractional; } template Complex getFractional(int channel, Sample inputIndex) { int lowIndex = std::floor(inputIndex); Sample fracIndex = inputIndex - lowIndex; return getFractional(channel, lowIndex, fracIndex); } template Sample getBand(int channel, int index) { if (index < 0 || index >= bands) return 0; return channelBands[index + channel*bands].*member; } template Sample getFractional(int channel, int lowIndex, Sample fractional) { Sample low = getBand(channel, lowIndex); Sample high = getBand(channel, lowIndex + 1); return low + (high - low)*fractional; } template Sample getFractional(int channel, Sample inputIndex) { int lowIndex = std::floor(inputIndex); Sample fracIndex = inputIndex - lowIndex; return getFractional(channel, lowIndex, fracIndex); } struct Peak { Sample input, output; }; std::vector peaks; std::vector energy, smoothedEnergy; struct PitchMapPoint { Sample inputBin, freqGrad; }; std::vector outputMap; struct Prediction { Sample energy = 0; Complex input; Complex makeOutput(Complex phase) { Sample phaseNorm = _impl::norm(phase); if (phaseNorm <= noiseFloor) { phase = input; // prediction is too weak, fall back to the input phaseNorm = _impl::norm(input) + noiseFloor; } return phase*std::sqrt(energy/phaseNorm); } }; std::vector channelPredictions; Prediction * predictionsForChannel(int c) { return channelPredictions.data() + c*bands; } // If RandomEngine=void, use std::default_random_engine; using RandomEngineImpl = typename std::conditional< std::is_void::value, std::default_random_engine, RandomEngine >::type; RandomEngineImpl randomEngine; size_t processSpectrumSteps = 0; static constexpr size_t splitMainPrediction = 8; // it's just heavy, since we're blending up to 4 different phase predictions void updateProcessSpectrumSteps() { processSpectrumSteps = 0; if (blockProcess.newSpectrum) processSpectrumSteps += channels; if (blockProcess.mappedFrequencies) { processSpectrumSteps += smoothEnergySteps; processSpectrumSteps += 1; // findPeaks } processSpectrumSteps += 1; // updating the output map processSpectrumSteps += channels; // preliminary phase-vocoder prediction processSpectrumSteps += splitMainPrediction; if (blockProcess.newSpectrum) processSpectrumSteps += 1; // .input -> .prevInput if (blockProcess.processFormants) processSpectrumSteps += 3; } void processSpectrum(size_t step) { Sample timeFactor = blockProcess.timeFactor; Sample smoothingBins = Sample(stft.fftSamples())/stft.defaultInterval(); int longVerticalStep = std::round(smoothingBins); timeFactor = std::max(timeFactor, 1/maxCleanStretch); bool randomTimeFactor = (timeFactor > maxCleanStretch); std::uniform_real_distribution timeFactorDist(maxCleanStretch*2*randomTimeFactor - timeFactor, timeFactor); if (blockProcess.newSpectrum) { if (step < size_t(channels)) { int channel = int(step); auto bins = bandsForChannel(channel); Complex rot = std::polar(Sample(1), bandToFreq(0)*stft.defaultInterval()*Sample(2*M_PI)); Sample freqStep = bandToFreq(1) - bandToFreq(0); Complex rotStep = std::polar(Sample(1), freqStep*stft.defaultInterval()*Sample(2*M_PI)); for (int b = 0; b < bands; ++b) { auto &bin = bins[b]; bin.output = _impl::mul(bin.output, rot); bin.prevInput = _impl::mul(bin.prevInput, rot); rot = _impl::mul(rot, rotStep); } return; } step -= channels; } if (blockProcess.mappedFrequencies) { if (step < smoothEnergySteps) { smoothEnergy(step, smoothingBins); return; } step -= smoothEnergySteps; if (step-- == 0) { findPeaks(); return; } } if (step-- == 0) { if (blockProcess.mappedFrequencies) { updateOutputMap(); } else { // we're not pitch-shifting, so no need to find peaks etc. for (int c = 0; c < channels; ++c) { Band *bins = bandsForChannel(c); for (int b = 0; b < bands; ++b) { bins[b].inputEnergy = _impl::norm(bins[b].input); } } for (int b = 0; b < bands; ++b) { outputMap[b] = {Sample(b), 1}; } } return; } if (blockProcess.processFormants) { if (step < 3) { updateFormants(step); return; } step -= 3; } // Preliminary output prediction from phase-vocoder if (step < size_t(channels)) { int c = int(step); Band *bins = bandsForChannel(c); auto *predictions = predictionsForChannel(c); for (int b = 0; b < bands; ++b) { auto mapPoint = outputMap[b]; int lowIndex = std::floor(mapPoint.inputBin); Sample fracIndex = mapPoint.inputBin - lowIndex; Prediction &prediction = predictions[b]; Sample prevEnergy = prediction.energy; prediction.energy = getFractional<&Band::inputEnergy>(c, lowIndex, fracIndex); prediction.energy *= std::max(0, mapPoint.freqGrad); // scale the energy according to local stretch factor prediction.input = getFractional<&Band::input>(c, lowIndex, fracIndex); auto &outputBin = bins[b]; Complex prevInput = getFractional<&Band::prevInput>(c, lowIndex, fracIndex); Complex freqTwist = _impl::mul(prediction.input, prevInput); Complex phase = _impl::mul(outputBin.output, freqTwist); outputBin.output = phase/(std::max(prevEnergy, prediction.energy) + noiseFloor); } return; } step -= channels; if (step < splitMainPrediction) { // Re-predict using phase differences between frequencies size_t chunk = step; int startB = int(bands*chunk/splitMainPrediction); int endB = int(bands*(chunk + 1)/splitMainPrediction); for (int b = startB; b < endB; ++b) { // Find maximum-energy channel and calculate that int maxChannel = 0; Sample maxEnergy = predictionsForChannel(0)[b].energy; for (int c = 1; c < channels; ++c) { Sample e = predictionsForChannel(c)[b].energy; if (e > maxEnergy) { maxChannel = c; maxEnergy = e; } } auto *predictions = predictionsForChannel(maxChannel); auto &prediction = predictions[b]; auto *bins = bandsForChannel(maxChannel); auto &outputBin = bins[b]; Complex phase = 0; auto mapPoint = outputMap[b]; // Upwards vertical steps if (b > 0) { Sample binTimeFactor = randomTimeFactor ? timeFactorDist(randomEngine) : timeFactor; Complex downInput = getFractional<&Band::input>(maxChannel, mapPoint.inputBin - binTimeFactor); Complex shortVerticalTwist = _impl::mul(prediction.input, downInput); auto &downBin = bins[b - 1]; phase += _impl::mul(downBin.output, shortVerticalTwist); if (b >= longVerticalStep) { Complex longDownInput = getFractional<&Band::input>(maxChannel, mapPoint.inputBin - longVerticalStep*binTimeFactor); Complex longVerticalTwist = _impl::mul(prediction.input, longDownInput); auto &longDownBin = bins[b - longVerticalStep]; phase += _impl::mul(longDownBin.output, longVerticalTwist); } } // Downwards vertical steps if (b < bands - 1) { auto &upPrediction = predictions[b + 1]; auto &upMapPoint = outputMap[b + 1]; Sample binTimeFactor = randomTimeFactor ? timeFactorDist(randomEngine) : timeFactor; Complex downInput = getFractional<&Band::input>(maxChannel, upMapPoint.inputBin - binTimeFactor); Complex shortVerticalTwist = _impl::mul(upPrediction.input, downInput); auto &upBin = bins[b + 1]; phase += _impl::mul(upBin.output, shortVerticalTwist); if (b < bands - longVerticalStep) { auto &longUpPrediction = predictions[b + longVerticalStep]; auto &longUpMapPoint = outputMap[b + longVerticalStep]; Complex longDownInput = getFractional<&Band::input>(maxChannel, longUpMapPoint.inputBin - longVerticalStep*binTimeFactor); Complex longVerticalTwist = _impl::mul(longUpPrediction.input, longDownInput); auto &longUpBin = bins[b + longVerticalStep]; phase += _impl::mul(longUpBin.output, longVerticalTwist); } } outputBin.output = prediction.makeOutput(phase); // All other bins are locked in phase for (int c = 0; c < channels; ++c) { if (c != maxChannel) { auto &channelBin = bandsForChannel(c)[b]; auto &channelPrediction = predictionsForChannel(c)[b]; Complex channelTwist = _impl::mul(channelPrediction.input, prediction.input); Complex channelPhase = _impl::mul(outputBin.output, channelTwist); channelBin.output = channelPrediction.makeOutput(channelPhase); } } } return; } step -= splitMainPrediction; if (blockProcess.newSpectrum) { if (step-- == 0) { for (auto &bin : channelBands) { bin.prevInput = bin.input; } } } } // Produces smoothed energy across all channels static constexpr size_t smoothEnergySteps = 3; Sample smoothEnergyState = 0; void smoothEnergy(size_t step, Sample smoothingBins) { Sample smoothingSlew = 1/(1 + smoothingBins*Sample(0.5)); if (step-- == 0) { for (auto &e : energy) e = 0; for (int c = 0; c < channels; ++c) { Band *bins = bandsForChannel(c); for (int b = 0; b < bands; ++b) { Sample e = _impl::norm(bins[b].input); bins[b].inputEnergy = e; // Used for interpolating prediction energy energy[b] += e; } } for (int b = 0; b < bands; ++b) { smoothedEnergy[b] = energy[b]; } smoothEnergyState = 0; return; } // The two other steps are repeated smoothing passes, down and up Sample e = smoothEnergyState; for (int b = bands - 1; b >= 0; --b) { e += (smoothedEnergy[b] - e)*smoothingSlew; smoothedEnergy[b] = e; } for (int b = 0; b < bands; ++b) { e += (smoothedEnergy[b] - e)*smoothingSlew; smoothedEnergy[b] = e; } smoothEnergyState = e; } Sample mapFreq(Sample freq) const { if (customFreqMap) return customFreqMap(freq); if (freq > freqTonalityLimit) { return freq + (freqMultiplier - 1)*freqTonalityLimit; } return freq*freqMultiplier; } // Identifies spectral peaks using energy across all channels void findPeaks() { peaks.resize(0); int start = 0; while (start < bands) { if (energy[start] > smoothedEnergy[start]) { int end = start; Sample bandSum = 0, energySum = 0; while (end < bands && energy[end] > smoothedEnergy[end]) { bandSum += end*energy[end]; energySum += energy[end]; ++end; } Sample avgBand = bandSum/energySum; Sample avgFreq = bandToFreq(avgBand); peaks.emplace_back(Peak{avgBand, freqToBand(mapFreq(avgFreq))}); start = end; } ++start; } } void updateOutputMap() { if (peaks.empty()) { for (int b = 0; b < bands; ++b) { outputMap[b] = {Sample(b), 1}; } return; } Sample bottomOffset = peaks[0].input - peaks[0].output; for (int b = 0; b < std::min(bands, std::ceil(peaks[0].output)); ++b) { outputMap[b] = {b + bottomOffset, 1}; } // Interpolate between points for (size_t p = 1; p < peaks.size(); ++p) { const Peak &prev = peaks[p - 1], &next = peaks[p]; Sample rangeScale = 1/(next.output - prev.output); Sample outOffset = prev.input - prev.output; Sample outScale = next.input - next.output - prev.input + prev.output; Sample gradScale = outScale*rangeScale; int startBin = std::max(0, std::ceil(prev.output)); int endBin = std::min(bands, std::ceil(next.output)); for (int b = startBin; b < endBin; ++b) { Sample r = (b - prev.output)*rangeScale; Sample h = r*r*(3 - 2*r); Sample outB = b + outOffset + h*outScale; Sample gradH = 6*r*(1 - r); Sample gradB = 1 + gradH*gradScale; outputMap[b] = {outB, gradB}; } } Sample topOffset = peaks.back().input - peaks.back().output; for (int b = std::max(0, peaks.back().output); b < bands; ++b) { outputMap[b] = {b + topOffset, 1}; } } // If we mapped formants the same way as mapFreq(), this would be the inverse Sample invMapFormant(Sample freq) const { if (freq*invFormantMultiplier > freqTonalityLimit) { return freq + (1 - formantMultiplier)*freqTonalityLimit; } return freq*invFormantMultiplier; } Sample freqEstimateWeighted = 0; Sample freqEstimateWeight = 0; Sample estimateFrequency() { // 3 highest peaks in the input std::array peakIndices{0, 0, 0}; for (int b = 1; b < bands - 1; ++b) { Sample e = formantMetric[b]; // local maxima only if (e < formantMetric[b - 1] || e <= formantMetric[b + 1]) continue; if (e > formantMetric[peakIndices[0]]) { if (e > formantMetric[peakIndices[1]]) { if (e > formantMetric[peakIndices[2]]) { peakIndices = {peakIndices[1], peakIndices[2], b}; } else { peakIndices = {peakIndices[1], b, peakIndices[2]}; } } else { peakIndices[0] = b; } } } // VERY rough pitch estimation int peakEstimate = peakIndices[2]; if (formantMetric[peakIndices[1]] > formantMetric[peakIndices[2]]*0.1) { int diff = std::abs(peakEstimate - peakIndices[1]); if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff; if (formantMetric[peakIndices[0]] > formantMetric[peakIndices[2]]*0.01) { int diff = std::abs(peakEstimate - peakIndices[0]); if (diff > peakEstimate/8 && diff < peakEstimate*7/8) peakEstimate = peakEstimate%diff; } } Sample weight = formantMetric[peakIndices[2]]; // Smooth it out a bit freqEstimateWeighted += (peakEstimate*weight - freqEstimateWeighted)*0.25; freqEstimateWeight += (weight - freqEstimateWeight)*0.25; return freqEstimateWeighted/(freqEstimateWeight + Sample(1e-30)); } Sample freqEstimate; std::vector formantMetric; Sample formantBaseFreq = 0; void updateFormants(size_t step) { if (step-- == 0) { for (auto &e : formantMetric) e = 0; for (int c = 0; c < channels; ++c) { Band *bins = bandsForChannel(c); for (int b = 0; b < bands; ++b) { formantMetric[b] += bins[b].inputEnergy; } } freqEstimate = freqToBand(formantBaseFreq); if (formantBaseFreq <= 0) freqEstimate = estimateFrequency(); } else if (step-- == 0) { Sample decay = 1 - 1/(freqEstimate*0.5 + 1); Sample e = 0; for (size_t repeat = 0; repeat < 2; ++repeat) { for (int b = bands - 1; b >= 0; --b) { e = std::max(formantMetric[b], e*decay); formantMetric[b] = e; } for (int b = 0; b < bands; ++b) { e = std::max(formantMetric[b], e*decay); formantMetric[b] = e; } } decay = 1/decay; for (size_t repeat = 0; repeat < 2; ++repeat) { for (int b = bands - 1; b >= 0; --b) { e = std::min(formantMetric[b], e*decay); formantMetric[b] = e; } for (int b = 0; b < bands; ++b) { e = std::min(formantMetric[b], e*decay); formantMetric[b] = e; } } } else { auto getFormant = [&](Sample band) -> Sample { if (band < 0) return 0; band = std::min(band, bands); int floorBand = std::floor(band); Sample fracBand = band - floorBand; Sample low = formantMetric[floorBand], high = formantMetric[floorBand + 1]; return low + (high - low)*fracBand; }; for (int b = 0; b < bands; ++b) { Sample inputF = bandToFreq(b); Sample outputF = formantCompensation ? mapFreq(inputF) : inputF; outputF = invMapFormant(outputF); Sample inputE = formantMetric[b]; Sample targetE = getFormant(freqToBand(outputF)); Sample formantRatio = targetE/(inputE + Sample(1e-30)); Sample energyRatio = formantRatio; for (int c = 0; c < channels; ++c) { Band *bins = bandsForChannel(c); // This is what's used to decide the output energy, so this affects the output bins[b].inputEnergy *= energyRatio; } } } } // Proxy class to avoid copying/allocating anything template struct OffsetIO { Io &io; int offset; struct Channel { Io &io; int channel; int offset; auto operator[](int i) -> decltype(io[0][0]) { return io[channel][i + offset]; } }; Channel operator[](int c) { return {io, c, offset}; } }; }; }} // namespace #endif // include guard