/* * Copyright (C) 2010, Google Inc. All rights reserved. * Copyright (C) 2020, Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #if defined(HAVE_ACCELERATE) #include #endif #if defined(HAVE_X86_SSE2) #include #endif #if defined(HAVE_ARM_NEON_INTRINSICS) #include #endif namespace audioapi::dsp { #ifdef HAVE_ACCELERATE void multiplyByScalar( const float *inputVector, float scalar, float *outputVector, size_t numberOfElementsToProcess) { vDSP_vsmul(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess); } void addScalar( const float *inputVector, float scalar, float *outputVector, size_t numberOfElementsToProcess) { vDSP_vsadd(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess); } void add( const float *inputVector1, const float *inputVector2, float *outputVector, size_t numberOfElementsToProcess) { vDSP_vadd(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess); } void subtract( const float *inputVector1, const float *inputVector2, float *outputVector, size_t numberOfElementsToProcess) { vDSP_vsub(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess); } void multiply( const float *inputVector1, const float *inputVector2, float *outputVector, size_t numberOfElementsToProcess) { vDSP_vmul(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess); } float maximumMagnitude(const float *inputVector, size_t numberOfElementsToProcess) { float maximumValue = 0; vDSP_maxmgv(inputVector, 1, &maximumValue, numberOfElementsToProcess); return maximumValue; } void multiplyByScalarThenAddToOutput( const float *inputVector, float scalar, float *outputVector, size_t numberOfElementsToProcess) { vDSP_vsma(inputVector, 1, &scalar, outputVector, 1, outputVector, 1, numberOfElementsToProcess); } void deinterleaveStereo( const float *__restrict inputInterleaved, float *__restrict outputLeft, float *__restrict outputRight, size_t numberOfFrames) { float zero = 0.0f; vDSP_vsadd(inputInterleaved, 2, &zero, outputLeft, 1, numberOfFrames); vDSP_vsadd(inputInterleaved + 1, 2, &zero, outputRight, 1, numberOfFrames); } void interleaveStereo( const float *__restrict inputLeft, const float *__restrict inputRight, float *__restrict outputInterleaved, size_t numberOfFrames) { float zero = 0.0f; vDSP_vsadd(inputLeft, 1, &zero, outputInterleaved, 2, numberOfFrames); vDSP_vsadd(inputRight, 1, &zero, outputInterleaved + 1, 2, numberOfFrames); } #else #ifdef HAVE_X86_SSE2 static inline bool is16ByteAligned(const float *vector) { return !(reinterpret_cast(vector) & 0x0F); } #endif void multiplyByScalar( const float *inputVector, float scalar, float *outputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; #ifdef HAVE_X86_SSE2 // If the inputVector address is not 16-byte aligned, the first several frames // (at most three) should be processed separately. while (!is16ByteAligned(inputVector) && n) { *outputVector = scalar * *inputVector; inputVector++; outputVector++; n--; } // Now the inputVector address is aligned and start to apply SSE. size_t group = n / 4; __m128 mScale = _mm_set_ps1(scalar); __m128 *pSource; __m128 *pDest; __m128 dest; if (!is16ByteAligned(outputVector)) { while (group--) { pSource = reinterpret_cast<__m128 *>(const_cast(inputVector)); dest = _mm_mul_ps(*pSource, mScale); _mm_storeu_ps(outputVector, dest); inputVector += 4; outputVector += 4; } } else { while (group--) { pSource = reinterpret_cast<__m128 *>(const_cast(inputVector)); pDest = reinterpret_cast<__m128 *>(outputVector); *pDest = _mm_mul_ps(*pSource, mScale); inputVector += 4; outputVector += 4; } } // Non-SSE handling for remaining frames which is less than 4. n %= 4; #elif defined(HAVE_ARM_NEON_INTRINSICS) size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; while (outputVector < endP) { float32x4_t source = vld1q_f32(inputVector); vst1q_f32(outputVector, vmulq_n_f32(source, scalar)); inputVector += 4; outputVector += 4; } n = tailFrames; #endif while (n--) { *outputVector = scalar * *inputVector; ++inputVector; ++outputVector; } } void addScalar( const float *inputVector, float scalar, float *outputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; #ifdef HAVE_X86_SSE2 // If the inputVector address is not 16-byte aligned, the first several frames // (at most three) should be processed separately. while (!is16ByteAligned(inputVector) && n) { *outputVector = *inputVector + scalar; inputVector++; outputVector++; n--; } // Now the inputVector address is aligned and start to apply SSE. size_t group = n / 4; __m128 mScalar = _mm_set_ps1(scalar); __m128 *pSource; __m128 *pDest; __m128 dest; bool destAligned = is16ByteAligned(outputVector); if (destAligned) { // all aligned while (group--) { pSource = reinterpret_cast<__m128 *>(const_cast(inputVector)); pDest = reinterpret_cast<__m128 *>(outputVector); *pDest = _mm_add_ps(*pSource, mScalar); inputVector += 4; outputVector += 4; } } else { while (group--) { pSource = reinterpret_cast<__m128 *>(const_cast(inputVector)); dest = _mm_add_ps(*pSource, mScalar); _mm_storeu_ps(outputVector, dest); inputVector += 4; outputVector += 4; } } // Non-SSE handling for remaining frames which is less than 4. n %= 4; #elif defined(HAVE_ARM_NEON_INTRINSICS) size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; float32x4_t scalarVector = vdupq_n_f32(scalar); while (outputVector < endP) { float32x4_t source = vld1q_f32(inputVector); vst1q_f32(outputVector, vaddq_f32(source, scalarVector)); inputVector += 4; outputVector += 4; } n = tailFrames; #endif while (n--) { *outputVector = *inputVector + scalar; ++inputVector; ++outputVector; } } void add( const float *inputVector1, const float *inputVector2, float *outputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; #ifdef HAVE_X86_SSE2 // If the inputVector address is not 16-byte aligned, the first several frames // (at most three) should be processed separately. while (!is16ByteAligned(inputVector1) && n) { *outputVector = *inputVector1 + *inputVector2; inputVector1++; inputVector2++; outputVector++; n--; } // Now the inputVector1 address is aligned and start to apply SSE. size_t group = n / 4; __m128 *pSource1; __m128 *pSource2; __m128 *pDest; __m128 source2; __m128 dest; bool source2Aligned = is16ByteAligned(inputVector2); bool destAligned = is16ByteAligned(outputVector); if (source2Aligned && destAligned) { // all aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); pSource2 = reinterpret_cast<__m128 *>(const_cast(inputVector2)); pDest = reinterpret_cast<__m128 *>(outputVector); *pDest = _mm_add_ps(*pSource1, *pSource2); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } else if (source2Aligned && !destAligned) { // source2 aligned but dest not // aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); pSource2 = reinterpret_cast<__m128 *>(const_cast(inputVector2)); dest = _mm_add_ps(*pSource1, *pSource2); _mm_storeu_ps(outputVector, dest); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } else if (!source2Aligned && destAligned) { // source2 not aligned but dest // aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); source2 = _mm_loadu_ps(inputVector2); pDest = reinterpret_cast<__m128 *>(outputVector); *pDest = _mm_add_ps(*pSource1, source2); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } else if (!source2Aligned && !destAligned) { // both source2 and dest not // aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); source2 = _mm_loadu_ps(inputVector2); dest = _mm_add_ps(*pSource1, source2); _mm_storeu_ps(outputVector, dest); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } // Non-SSE handling for remaining frames which is less than 4. n %= 4; #elif defined(HAVE_ARM_NEON_INTRINSICS) size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; while (outputVector < endP) { float32x4_t source1 = vld1q_f32(inputVector1); float32x4_t source2 = vld1q_f32(inputVector2); vst1q_f32(outputVector, vaddq_f32(source1, source2)); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } n = tailFrames; #endif while (n--) { *outputVector = *inputVector1 + *inputVector2; ++inputVector1; ++inputVector2; ++outputVector; } } void subtract( const float *inputVector1, const float *inputVector2, float *outputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; #ifdef HAVE_X86_SSE2 // If the inputVector address is not 16-byte aligned, the first several frames // (at most three) should be processed separately. while (!is16ByteAligned(inputVector1) && n) { *outputVector = *inputVector1 - *inputVector2; inputVector1++; inputVector2++; outputVector++; n--; } // Now the inputVector1 address is aligned and start to apply SSE. size_t group = n / 4; __m128 *pSource1; __m128 *pSource2; __m128 *pDest; __m128 source2; __m128 dest; bool source2Aligned = is16ByteAligned(inputVector2); bool destAligned = is16ByteAligned(outputVector); if (source2Aligned && destAligned) { // all aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); pSource2 = reinterpret_cast<__m128 *>(const_cast(inputVector2)); pDest = reinterpret_cast<__m128 *>(outputVector); *pDest = _mm_sub_ps(*pSource1, *pSource2); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } else if (source2Aligned && !destAligned) { // source2 aligned but dest not // aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); pSource2 = reinterpret_cast<__m128 *>(const_cast(inputVector2)); dest = _mm_sub_ps(*pSource1, *pSource2); _mm_storeu_ps(outputVector, dest); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } else if (!source2Aligned && destAligned) { // source2 not aligned but dest // aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); source2 = _mm_loadu_ps(inputVector2); pDest = reinterpret_cast<__m128 *>(outputVector); *pDest = _mm_sub_ps(*pSource1, source2); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } else if (!source2Aligned && !destAligned) { // both source2 and dest not // aligned while (group--) { pSource1 = reinterpret_cast<__m128 *>(const_cast(inputVector1)); source2 = _mm_loadu_ps(inputVector2); dest = _mm_sub_ps(*pSource1, source2); _mm_storeu_ps(outputVector, dest); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } } // Non-SSE handling for remaining frames which is less than 4. n %= 4; #elif defined(HAVE_ARM_NEON_INTRINSICS) size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; while (outputVector < endP) { float32x4_t source1 = vld1q_f32(inputVector1); float32x4_t source2 = vld1q_f32(inputVector2); vst1q_f32(outputVector, vsubq_f32(source1, source2)); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } n = tailFrames; #endif while (n--) { *outputVector = *inputVector1 - *inputVector2; ++inputVector1; ++inputVector2; ++outputVector; } } void multiply( const float *inputVector1, const float *inputVector2, float *outputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; #ifdef HAVE_X86_SSE2 // If the inputVector1 address is not 16-byte aligned, the first several // frames (at most three) should be processed separately. while (!is16ByteAligned(inputVector1) && n) { *outputVector = *inputVector1 * *inputVector2; inputVector1++; inputVector2++; outputVector++; n--; } // Now the inputVector1 address aligned and start to apply SSE. size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; __m128 pSource1; __m128 pSource2; __m128 dest; bool source2Aligned = is16ByteAligned(inputVector2); bool destAligned = is16ByteAligned(outputVector); #define SSE2_MULT(loadInstr, storeInstr) \ while (outputVector < endP) { \ pSource1 = _mm_load_ps(inputVector1); \ pSource2 = _mm_##loadInstr##_ps(inputVector2); \ dest = _mm_mul_ps(pSource1, pSource2); \ _mm_##storeInstr##_ps(outputVector, dest); \ inputVector1 += 4; \ inputVector2 += 4; \ outputVector += 4; \ } if (source2Aligned && destAligned) // Both aligned. SSE2_MULT(load, store) else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. SSE2_MULT(load, storeu) else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. SSE2_MULT(loadu, store) else // Neither aligned. SSE2_MULT(loadu, storeu) n = tailFrames; #elif defined(HAVE_ARM_NEON_INTRINSICS) size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; while (outputVector < endP) { float32x4_t source1 = vld1q_f32(inputVector1); float32x4_t source2 = vld1q_f32(inputVector2); vst1q_f32(outputVector, vmulq_f32(source1, source2)); inputVector1 += 4; inputVector2 += 4; outputVector += 4; } n = tailFrames; #endif while (n--) { *outputVector = *inputVector1 * *inputVector2; ++inputVector1; ++inputVector2; ++outputVector; } } float maximumMagnitude(const float *inputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; float max = 0; #ifdef HAVE_X86_SSE2 // If the inputVector address is not 16-byte aligned, the first several frames // (at most three) should be processed separately. while (!is16ByteAligned(inputVector) && n) { max = std::max(max, std::abs(*inputVector)); inputVector++; n--; } // Now the inputVector is aligned, use SSE. size_t tailFrames = n % 4; const float *endP = inputVector + n - tailFrames; __m128 source; __m128 mMax = _mm_setzero_ps(); int mask = 0x7FFFFFFF; __m128 mMask = _mm_set1_ps(*reinterpret_cast(&mask)); while (inputVector < endP) { source = _mm_load_ps(inputVector); // Calculate the absolute value by anding source with mask, the sign bit is // set to 0. source = _mm_and_ps(source, mMask); mMax = _mm_max_ps(mMax, source); inputVector += 4; } // Get max from the SSE results. const float *groupMaxP = reinterpret_cast(&mMax); max = std::max(max, groupMaxP[0]); max = std::max(max, groupMaxP[1]); max = std::max(max, groupMaxP[2]); max = std::max(max, groupMaxP[3]); n = tailFrames; #elif defined(c) size_t tailFrames = n % 4; const float *endP = inputVector + n - tailFrames; float32x4_t fourMax = vdupq_n_f32(0); while (inputVector < endP) { float32x4_t source = vld1q_f32(inputVector); fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); inputVector += 4; } float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); float groupMax[2]; vst1_f32(groupMax, twoMax); max = std::max(groupMax[0], groupMax[1]); n = tailFrames; #endif while (n--) { max = std::max(max, std::abs(*inputVector)); ++inputVector; } return max; } void multiplyByScalarThenAddToOutput( const float *inputVector, float scalar, float *outputVector, size_t numberOfElementsToProcess) { size_t n = numberOfElementsToProcess; #if HAVE_X86_SSE2 // If the inputVector address is not 16-byte aligned, the first several frames // (at most three) should be processed separately. while (!is16ByteAligned(inputVector) && n) { *outputVector += scalar * *inputVector; inputVector++; outputVector++; n--; } // Now the inputVector is aligned, use SSE. size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; __m128 pSource; __m128 dest; __m128 temp; __m128 mScale = _mm_set_ps1(scalar); bool destAligned = is16ByteAligned(outputVector); #define SSE2_MULT_ADD(loadInstr, storeInstr) \ while (outputVector < endP) { \ pSource = _mm_load_ps(inputVector); \ temp = _mm_mul_ps(pSource, mScale); \ dest = _mm_##loadInstr##_ps(outputVector); \ dest = _mm_add_ps(dest, temp); \ _mm_##storeInstr##_ps(outputVector, dest); \ inputVector += 4; \ outputVector += 4; \ } if (destAligned) SSE2_MULT_ADD(load, store) else SSE2_MULT_ADD(loadu, storeu) n = tailFrames; #elif HAVE_ARM_NEON_INTRINSICS size_t tailFrames = n % 4; const float *endP = outputVector + n - tailFrames; float32x4_t k = vdupq_n_f32(scalar); while (outputVector < endP) { float32x4_t source = vld1q_f32(inputVector); float32x4_t dest = vld1q_f32(outputVector); dest = vmlaq_f32(dest, source, k); vst1q_f32(outputVector, dest); inputVector += 4; outputVector += 4; } n = tailFrames; #endif while (n--) { *outputVector += *inputVector * scalar; ++inputVector; ++outputVector; } } void deinterleaveStereo( const float *__restrict inputInterleaved, float *__restrict outputLeft, float *__restrict outputRight, size_t numberOfFrames) { size_t n = numberOfFrames; #ifdef HAVE_ARM_NEON_INTRINSICS // process 4 frames (8 samples) at a time using NEON size_t group = n / 4; while (group--) { // vld2q_f32 deinterleaves L and R into separate registers in one hardware op float32x4x2_t v = vld2q_f32(inputInterleaved); vst1q_f32(outputLeft, v.val[0]); vst1q_f32(outputRight, v.val[1]); inputInterleaved += 8; outputLeft += 4; outputRight += 4; } n %= 4; #elif defined(HAVE_X86_SSE2) // process 4 frames (8 samples) at a time using SSE size_t group = n / 4; while (group--) { // load two 128-bit registers (8 floats total) __m128 s0 = _mm_loadu_ps(inputInterleaved); __m128 s1 = _mm_loadu_ps(inputInterleaved + 4); // use shuffle to group the Left samples and Right samples // mask 0x88 (2,0,2,0) picks indices 0 and 2 from both s0 and s1 // mask 0xDD (3,1,3,1) picks indices 1 and 3 from both s0 and s1 __m128 left_v = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0)); __m128 right_v = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1)); _mm_storeu_ps(outputLeft, left_v); _mm_storeu_ps(outputRight, right_v); inputInterleaved += 8; outputLeft += 4; outputRight += 4; } n %= 4; #endif while (n--) { *outputLeft++ = *inputInterleaved++; *outputRight++ = *inputInterleaved++; } } void interleaveStereo( const float *__restrict inputLeft, const float *__restrict inputRight, float *__restrict outputInterleaved, size_t numberOfFrames) { size_t n = numberOfFrames; #ifdef HAVE_ARM_NEON_INTRINSICS // process 4 frames (8 samples) at a time size_t group = n / 4; while (group--) { // load contiguous planar data float32x4_t vL = vld1q_f32(inputLeft); float32x4_t vR = vld1q_f32(inputRight); // vst2q_f32 takes two registers and interleaves them during the store: float32x4x2_t vOut = {vL, vR}; vst2q_f32(outputInterleaved, vOut); inputLeft += 4; inputRight += 4; outputInterleaved += 8; } n %= 4; #elif defined(HAVE_X86_SSE2) // process 4 frames (8 samples) at a time size_t group = n / 4; while (group--) { __m128 vL = _mm_loadu_ps(inputLeft); __m128 vR = _mm_loadu_ps(inputRight); // unpack low: Interleaves first two elements of each register __m128 vLow = _mm_unpacklo_ps(vL, vR); // unpack high: Interleaves last two elements of each register __m128 vHigh = _mm_unpackhi_ps(vL, vR); _mm_storeu_ps(outputInterleaved, vLow); _mm_storeu_ps(outputInterleaved + 4, vHigh); inputLeft += 4; inputRight += 4; outputInterleaved += 8; } n %= 4; #endif while (n--) { *outputInterleaved++ = *inputLeft++; *outputInterleaved++ = *inputRight++; } } #endif } // namespace audioapi::dsp