From 6f2f7270dd822670597917d62244d9117ead15cf Mon Sep 17 00:00:00 2001 From: XhmikosR Date: Tue, 10 Apr 2012 16:08:12 +0000 Subject: update SoundTouch to v1.7.0pre r142 git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@4331 10f7b99b-c216-0410-bff0-8a66a9350fd8 --- src/thirdparty/SoundTouch/include/BPMDetect.h | 2 +- src/thirdparty/SoundTouch/include/STTypes.h | 2 +- src/thirdparty/SoundTouch/include/SoundTouch.h | 4 +- .../SoundTouch/source/FIFOSampleBuffer.cpp | 3 +- src/thirdparty/SoundTouch/source/FIRFilter.cpp | 7 +- src/thirdparty/SoundTouch/source/PeakFinder.cpp | 64 +++- src/thirdparty/SoundTouch/source/PeakFinder.h | 4 + .../SoundTouch/source/RateTransposer.cpp | 6 +- src/thirdparty/SoundTouch/source/SoundTouch.cpp | 7 +- .../SoundTouch/source/SoundTouch.vcxproj | 2 +- .../SoundTouch/source/SoundTouch.vcxproj.filters | 2 +- src/thirdparty/SoundTouch/source/TDStretch.cpp | 340 ++++----------------- src/thirdparty/SoundTouch/source/TDStretch.h | 43 ++- .../SoundTouch/source/cpu_detect_x86.cpp | 139 +++++++++ .../SoundTouch/source/cpu_detect_x86_win.cpp | 137 --------- src/thirdparty/SoundTouch/source/mmx_optimized.cpp | 11 +- src/thirdparty/SoundTouch/source/sse_optimized.cpp | 79 +---- 17 files changed, 301 insertions(+), 551 deletions(-) create mode 100644 src/thirdparty/SoundTouch/source/cpu_detect_x86.cpp delete mode 100644 src/thirdparty/SoundTouch/source/cpu_detect_x86_win.cpp (limited to 'src') diff --git a/src/thirdparty/SoundTouch/include/BPMDetect.h b/src/thirdparty/SoundTouch/include/BPMDetect.h index ff1d3c44f..c5989cfc8 100644 --- a/src/thirdparty/SoundTouch/include/BPMDetect.h +++ b/src/thirdparty/SoundTouch/include/BPMDetect.h @@ -67,7 +67,7 @@ namespace soundtouch #define MIN_BPM 29 /// Maximum allowed BPM rate. Used to restrict accepted result below a reasonable limit. -#define MAX_BPM 230 +#define MAX_BPM 200 /// Class for calculating BPM rate for audio data. 
diff --git a/src/thirdparty/SoundTouch/include/STTypes.h b/src/thirdparty/SoundTouch/include/STTypes.h index 65772efcd..248683e7d 100644 --- a/src/thirdparty/SoundTouch/include/STTypes.h +++ b/src/thirdparty/SoundTouch/include/STTypes.h @@ -89,7 +89,7 @@ namespace soundtouch #endif - #ifndef _WIN64 // MPC-HC custom code: disable MMX for x64 + #ifndef _M_X64 // MPC-HC custom code: disable optimizations for x64; it fails when linking /// Define this to allow X86-specific assembler/intrinsic optimizations. /// Notice that library contains also usual C++ versions of each of these /// these routines, so if you're having difficulties getting the optimized diff --git a/src/thirdparty/SoundTouch/include/SoundTouch.h b/src/thirdparty/SoundTouch/include/SoundTouch.h index 164de19f5..3e4dbb89d 100644 --- a/src/thirdparty/SoundTouch/include/SoundTouch.h +++ b/src/thirdparty/SoundTouch/include/SoundTouch.h @@ -79,10 +79,10 @@ namespace soundtouch { /// Soundtouch library version string -#define SOUNDTOUCH_VERSION "1.6.1pre" +#define SOUNDTOUCH_VERSION "1.7.0" /// SoundTouch library version id -#define SOUNDTOUCH_VERSION_ID (10601) +#define SOUNDTOUCH_VERSION_ID (10700) // // Available setting IDs for the 'setSetting' & 'get_setting' functions: diff --git a/src/thirdparty/SoundTouch/source/FIFOSampleBuffer.cpp b/src/thirdparty/SoundTouch/source/FIFOSampleBuffer.cpp index 8393f7b0d..f9efee60e 100644 --- a/src/thirdparty/SoundTouch/source/FIFOSampleBuffer.cpp +++ b/src/thirdparty/SoundTouch/source/FIFOSampleBuffer.cpp @@ -47,7 +47,6 @@ #include #include #include -#include #include "FIFOSampleBuffer.h" @@ -175,7 +174,7 @@ void FIFOSampleBuffer::ensureCapacity(uint capacityRequirement) tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + 16 / sizeof(SAMPLETYPE)]; if (tempUnaligned == NULL) { - throw std::runtime_error("Couldn't allocate memory!\n"); + ST_THROW_RT_ERROR("Couldn't allocate memory!\n"); } // Align the buffer to begin at 16byte cache line boundary for 
optimal performance temp = (SAMPLETYPE *)(((ulong)tempUnaligned + 15) & (ulong)-16); diff --git a/src/thirdparty/SoundTouch/source/FIRFilter.cpp b/src/thirdparty/SoundTouch/source/FIRFilter.cpp index a2745625c..f882f7efd 100644 --- a/src/thirdparty/SoundTouch/source/FIRFilter.cpp +++ b/src/thirdparty/SoundTouch/source/FIRFilter.cpp @@ -43,7 +43,6 @@ #include #include #include -#include #include "FIRFilter.h" #include "cpu_detect.h" @@ -174,7 +173,7 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint uResultDivFactor) { assert(newLength > 0); - if (newLength % 8) throw std::runtime_error("FIR filter length not divisible by 8"); + if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8"); lengthDiv8 = newLength / 8; length = lengthDiv8 * 8; @@ -222,8 +221,8 @@ uint FIRFilter::evaluate(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSample void * FIRFilter::operator new(size_t s) { // Notice! don't use "new FIRFilter" directly, use "newInstance" to create a new instance instead! - throw std::runtime_error("Error in FIRFilter::new: Don't use 'new FIRFilter', use 'newInstance' member instead!"); - return NULL; + ST_THROW_RT_ERROR("Error in FIRFilter::new: Don't use 'new FIRFilter', use 'newInstance' member instead!"); + return newInstance(); } diff --git a/src/thirdparty/SoundTouch/source/PeakFinder.cpp b/src/thirdparty/SoundTouch/source/PeakFinder.cpp index 9ad601cd9..b122612d2 100644 --- a/src/thirdparty/SoundTouch/source/PeakFinder.cpp +++ b/src/thirdparty/SoundTouch/source/PeakFinder.cpp @@ -55,15 +55,46 @@ PeakFinder::PeakFinder() } +// Finds real 'top' of a peak hump from neighnourhood of the given 'peakpos'. 
+int PeakFinder::findTop(const float *data, int peakpos) const +{ + int i; + int start, end; + float refvalue; + + refvalue = data[peakpos]; + + // seek within ±10 points + start = peakpos - 10; + if (start < minPos) start = minPos; + end = peakpos + 10; + if (end > maxPos) end = maxPos; + + for (i = start; i <= end; i ++) + { + if (data[i] > refvalue) + { + peakpos = i; + refvalue = data[i]; + } + } + + // failure if max value is at edges of seek range => it's not peak, it's at slope. + if ((peakpos == start) || (peakpos == end)) return 0; + + return peakpos; +} + + // Finds 'ground level' of a peak hump by starting from 'peakpos' and proceeding // to direction defined by 'direction' until next 'hump' after minimum value will // begin int PeakFinder::findGround(const float *data, int peakpos, int direction) const { - float refvalue; int lowpos; int pos; int climb_count; + float refvalue; float delta; climb_count = 0; @@ -210,30 +241,41 @@ double PeakFinder::detectPeak(const float *data, int aminPos, int amaxPos) // Now check if the highest peak were in fact harmonic of the true base beat peak // - sometimes the highest peak can be Nth harmonic of the true base peak yet // just a slightly higher than the true base - for (i = 2; i < 10; i ++) + + int hp = (int)(highPeak + 0.5); + + for (i = 3; i < 10; i ++) { - double peaktmp, tmp; + double peaktmp, harmonic; int i1,i2; - peakpos = (int)(highPeak / (double)i + 0.5f); + harmonic = (double)i * 0.5; + peakpos = (int)(highPeak / harmonic + 0.5f); if (peakpos < minPos) break; + peakpos = findTop(data, peakpos); // seek true local maximum index + if (peakpos == 0) continue; // no local max here - // calculate mass-center of possible base peak + // calculate mass-center of possible harmonic peak peaktmp = getPeakCenter(data, peakpos); + // accept harmonic peak if + // (a) it is found + // (b) is within ±4% of the expected harmonic interval + // (c) has at least half x-corr value of the max.
peak + + double diff = harmonic * peaktmp / highPeak; + if ((diff < 0.96) || (diff > 1.04)) continue; // peak too afar from expected + // now compare to highest detected peak i1 = (int)(highPeak + 0.5); i2 = (int)(peaktmp + 0.5); - tmp = 2 * (data[i2] - data[i1]) / (data[i2] + data[i1]); - if (fabs(tmp) < 0.1) + if (data[i2] >= 0.5*data[i1]) { - // The highest peak is harmonic of almost as high base peak, - // thus use the base peak instead + // The harmonic is at least half as high primary peak, + // thus use the harmonic peak instead peak = peaktmp; } } return peak; } - - diff --git a/src/thirdparty/SoundTouch/source/PeakFinder.h b/src/thirdparty/SoundTouch/source/PeakFinder.h index a72b24f28..fce4dc1be 100644 --- a/src/thirdparty/SoundTouch/source/PeakFinder.h +++ b/src/thirdparty/SoundTouch/source/PeakFinder.h @@ -63,6 +63,10 @@ protected: int direction /// Direction where to proceed from the peak: 1 = right, -1 = left. ) const; + // Finds real 'top' of a peak hump from neighnourhood of the given 'peakpos'. + int findTop(const float *data, int peakpos) const; + + /// Finds the 'ground' level, i.e. smallest level between two neighbouring peaks, to right- /// or left-hand side of the given peak position. int findGround(const float *data, /// Data vector. diff --git a/src/thirdparty/SoundTouch/source/RateTransposer.cpp b/src/thirdparty/SoundTouch/source/RateTransposer.cpp index 2afc18750..772e95b21 100644 --- a/src/thirdparty/SoundTouch/source/RateTransposer.cpp +++ b/src/thirdparty/SoundTouch/source/RateTransposer.cpp @@ -42,11 +42,9 @@ #include #include #include -#include #include "RateTransposer.h" #include "AAFilter.h" -using namespace std; using namespace soundtouch; @@ -108,8 +106,8 @@ public: // depending on if we've a MMX/SSE/etc-capable CPU available or not. 
void * RateTransposer::operator new(size_t s) { - throw runtime_error("Error in RateTransoser::new: don't use \"new TDStretch\" directly, use \"newInstance\" to create a new instance instead!"); - return NULL; + ST_THROW_RT_ERROR("Error in RateTransoser::new: don't use \"new TDStretch\" directly, use \"newInstance\" to create a new instance instead!"); + return newInstance(); } diff --git a/src/thirdparty/SoundTouch/source/SoundTouch.cpp b/src/thirdparty/SoundTouch/source/SoundTouch.cpp index 6f7b9a894..b67dd2c89 100644 --- a/src/thirdparty/SoundTouch/source/SoundTouch.cpp +++ b/src/thirdparty/SoundTouch/source/SoundTouch.cpp @@ -73,7 +73,6 @@ #include #include #include -#include #include #include "SoundTouch.h" @@ -146,7 +145,7 @@ void SoundTouch::setChannels(uint numChannels) { if (numChannels != 1 && numChannels != 2) { - throw std::runtime_error("Illegal number of channels"); + ST_THROW_RT_ERROR("Illegal number of channels"); } channels = numChannels; pRateTransposer->setChannels((int)numChannels); @@ -295,11 +294,11 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples) { if (bSrateSet == FALSE) { - throw std::runtime_error("SoundTouch : Sample rate not defined"); + ST_THROW_RT_ERROR("SoundTouch : Sample rate not defined"); } else if (channels == 0) { - throw std::runtime_error("SoundTouch : Number of channels not defined"); + ST_THROW_RT_ERROR("SoundTouch : Number of channels not defined"); } // Transpose the rate of the new samples if necessary diff --git a/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj b/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj index 7e9bc2cac..fda474482 100644 --- a/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj +++ b/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj @@ -116,7 +116,7 @@ - + diff --git a/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj.filters b/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj.filters index 347a988b8..66c309a4e 100644 --- 
a/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj.filters +++ b/src/thirdparty/SoundTouch/source/SoundTouch.vcxproj.filters @@ -58,7 +58,7 @@ Source Files - + Source Files diff --git a/src/thirdparty/SoundTouch/source/TDStretch.cpp b/src/thirdparty/SoundTouch/source/TDStretch.cpp index 3ef7798a8..54aee423c 100644 --- a/src/thirdparty/SoundTouch/source/TDStretch.cpp +++ b/src/thirdparty/SoundTouch/source/TDStretch.cpp @@ -46,7 +46,6 @@ #include #include #include -#include #include "STTypes.h" #include "cpu_detect.h" @@ -91,7 +90,7 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) channels = 2; pMidBuffer = NULL; - pRefMidBufferUnaligned = NULL; + pMidBufferUnaligned = NULL; overlapLength = 0; bAutoSeqSetting = TRUE; @@ -111,8 +110,7 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer) TDStretch::~TDStretch() { - delete[] pMidBuffer; - delete[] pRefMidBufferUnaligned; + delete[] pMidBufferUnaligned; } @@ -196,12 +194,17 @@ void TDStretch::getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWind // Overlaps samples in 'midBuffer' with the samples in 'pInput' void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const { - int i, itemp; + int i; + SAMPLETYPE m1, m2; + + m1 = (SAMPLETYPE)0; + m2 = (SAMPLETYPE)overlapLength; for (i = 0; i < overlapLength ; i ++) { - itemp = overlapLength - i; - pOutput[i] = (pInput[i] * i + pMidBuffer[i] * itemp ) / overlapLength; // >> overlapDividerBits; + pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength; + m1 += 1; + m2 -= 1; } } @@ -247,35 +250,17 @@ BOOL TDStretch::isQuickSeekEnabled() const // Seeks for the optimal overlap-mixing position. 
int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos) { - if (channels == 2) + if (bQuickSeek) { - // stereo sound - if (bQuickSeek) - { - return seekBestOverlapPositionStereoQuick(refPos); - } - else - { - return seekBestOverlapPositionStereo(refPos); - } + return seekBestOverlapPositionQuick(refPos); } else { - // mono sound - if (bQuickSeek) - { - return seekBestOverlapPositionMonoQuick(refPos); - } - else - { - return seekBestOverlapPositionMono(refPos); - } + return seekBestOverlapPositionFull(refPos); } } - - // Overlaps samples in 'midBuffer' with the samples in 'pInputBuffer' at position // of 'ovlPos'. inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, uint ovlPos) const @@ -292,22 +277,18 @@ inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, ui - // Seeks for the optimal overlap-mixing position. The 'stereo' version of the // routine // // The best position is determined as the position where the two overlapped // sample sequences are 'most alike', in terms of the highest cross-correlation // value over the overlapping period -int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) +int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) { int bestOffs; double bestCorr, corr; int i; - // Slopes the amplitudes of the 'midBuffer' samples - precalcCorrReferenceStereo(); - bestCorr = FLT_MIN; bestOffs = 0; @@ -317,7 +298,7 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) { // Calculates correlation value for the mixing position corresponding // to 'i' - corr = (double)calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer); + corr = calcCrossCorr(refPos + channels * i, pMidBuffer); // heuristic rule to slightly favour values close to mid of the range double tmp = (double)(2 * i - seekLength) / (double)seekLength; corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); @@ -342,16 +323,13 @@ int TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE 
*refPos) // The best position is determined as the position where the two overlapped // sample sequences are 'most alike', in terms of the highest cross-correlation // value over the overlapping period -int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) +int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) { int j; int bestOffs; double bestCorr, corr; int scanCount, corrOffset, tempOffset; - // Slopes the amplitude of the 'midBuffer' samples - precalcCorrReferenceStereo(); - bestCorr = FLT_MIN; bestOffs = _scanOffsets[0][0]; corrOffset = 0; @@ -373,7 +351,7 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) // Calculates correlation value for the mixing position corresponding // to 'tempOffset' - corr = (double)calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer); + corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer); // heuristic rule to slightly favour values close to mid of the range double tmp = (double)(2 * tempOffset - seekLength) / seekLength; corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); @@ -396,111 +374,6 @@ int TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) -// Seeks for the optimal overlap-mixing position. The 'mono' version of the -// routine -// -// The best position is determined as the position where the two overlapped -// sample sequences are 'most alike', in terms of the highest cross-correlation -// value over the overlapping period -int TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) -{ - int bestOffs; - double bestCorr, corr; - int tempOffset; - const SAMPLETYPE *compare; - - // Slopes the amplitude of the 'midBuffer' samples - precalcCorrReferenceMono(); - - bestCorr = FLT_MIN; - bestOffs = 0; - - // Scans for the best correlation value by testing each possible position - // over the permitted range. 
- for (tempOffset = 0; tempOffset < seekLength; tempOffset ++) - { - compare = refPos + tempOffset; - - // Calculates correlation value for the mixing position corresponding - // to 'tempOffset' - corr = (double)calcCrossCorrMono(pRefMidBuffer, compare); - // heuristic rule to slightly favour values close to mid of the range - double tmp = (double)(2 * tempOffset - seekLength) / seekLength; - corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); - - // Checks for the highest correlation value - if (corr > bestCorr) - { - bestCorr = corr; - bestOffs = tempOffset; - } - } - // clear cross correlation routine state if necessary (is so e.g. in MMX routines). - clearCrossCorrState(); - - return bestOffs; -} - - -// Seeks for the optimal overlap-mixing position. The 'mono' version of the -// routine -// -// The best position is determined as the position where the two overlapped -// sample sequences are 'most alike', in terms of the highest cross-correlation -// value over the overlapping period -int TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) -{ - int j; - int bestOffs; - double bestCorr, corr; - int scanCount, corrOffset, tempOffset; - - // Slopes the amplitude of the 'midBuffer' samples - precalcCorrReferenceMono(); - - bestCorr = FLT_MIN; - bestOffs = _scanOffsets[0][0]; - corrOffset = 0; - tempOffset = 0; - - // Scans for the best correlation value using four-pass hierarchical search. - // - // The look-up table 'scans' has hierarchical position adjusting steps. - // In first pass the routine searhes for the highest correlation with - // relatively coarse steps, then rescans the neighbourhood of the highest - // correlation with better resolution and so on. 
- for (scanCount = 0;scanCount < 4; scanCount ++) - { - j = 0; - while (_scanOffsets[scanCount][j]) - { - tempOffset = corrOffset + _scanOffsets[scanCount][j]; - if (tempOffset >= seekLength) break; - - // Calculates correlation value for the mixing position corresponding - // to 'tempOffset' - corr = (double)calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer); - // heuristic rule to slightly favour values close to mid of the range - double tmp = (double)(2 * tempOffset - seekLength) / seekLength; - corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp)); - - // Checks for the highest correlation value - if (corr > bestCorr) - { - bestCorr = corr; - bestOffs = tempOffset; - } - j ++; - } - corrOffset = bestOffs; - } - // clear cross correlation routine state if necessary (is so e.g. in MMX routines). - clearCrossCorrState(); - - return bestOffs; -} - - /// clear cross correlation routine state if necessary void TDStretch::clearCrossCorrState() { @@ -713,15 +586,13 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength) if (overlapLength > prevOvl) { - delete[] pMidBuffer; - delete[] pRefMidBufferUnaligned; + delete[] pMidBufferUnaligned; - pMidBuffer = new SAMPLETYPE[overlapLength * 2]; - clearMidBuffer(); + pMidBufferUnaligned = new SAMPLETYPE[overlapLength * 2 + 16 / sizeof(SAMPLETYPE)]; + // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency + pMidBuffer = (SAMPLETYPE *)((((ulong)pMidBufferUnaligned) + 15) & (ulong)-16); - pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)]; - // ensure that 'pRefMidBuffer' is aligned to 16 byte boundary for efficiency - pRefMidBuffer = (SAMPLETYPE *)((((ulong)pRefMidBufferUnaligned) + 15) & (ulong)-16); + clearMidBuffer(); } } @@ -731,8 +602,8 @@ void TDStretch::acceptNewOverlapLength(int newOverlapLength) void * TDStretch::operator new(size_t s) { // Notice! don't use "new TDStretch" directly, use "newInstance" to create a new instance instead! 
- throw std::runtime_error("Error in TDStretch::new: Don't use 'new TDStretch' directly, use 'newInstance' member instead!"); - return NULL; + ST_THROW_RT_ERROR("Error in TDStretch::new: Don't use 'new TDStretch' directly, use 'newInstance' member instead!"); + return newInstance(); } @@ -778,43 +649,6 @@ TDStretch * TDStretch::newInstance() #ifdef SOUNDTOUCH_INTEGER_SAMPLES -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceStereo() -{ - int i, cnt2; - int temp, temp2; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = i * (overlapLength - i); - cnt2 = i * 2; - - temp2 = (pMidBuffer[cnt2] * temp) / slopingDivider; - pRefMidBuffer[cnt2] = (short)(temp2); - temp2 = (pMidBuffer[cnt2 + 1] * temp) / slopingDivider; - pRefMidBuffer[cnt2 + 1] = (short)(temp2); - } -} - - -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceMono() -{ - int i; - long temp; - long temp2; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = i * (overlapLength - i); - temp2 = (pMidBuffer[i] * temp) / slopingDivider; - pRefMidBuffer[i] = (short)temp2; - } -} - - // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo' // version of the routine. void TDStretch::overlapStereo(short *poutput, const short *input) const @@ -865,44 +699,32 @@ void TDStretch::calculateOverlapLength(int aoverlapMs) } -long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const +double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) const { long corr; long norm; int i; corr = norm = 0; - for (i = 1; i < overlapLength; i ++) + // Same routine for stereo and mono. For stereo, unroll loop for better + // efficiency and gives slightly better resolution against rounding. 
+ // For mono it same routine, just unrolls loop by factor of 4 + for (i = 0; i < channels * overlapLength; i += 4) { - corr += (mixingPos[i] * compare[i]) >> overlapDividerBits; - norm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBits; + corr += (mixingPos[i] * compare[i] + + mixingPos[i + 1] * compare[i + 1] + + mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits; + norm += (mixingPos[i] * mixingPos[i] + + mixingPos[i + 1] * mixingPos[i + 1] + + mixingPos[i + 2] * mixingPos[i + 2] + + mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits; } // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation if (norm == 0) norm = 1; // to avoid div by zero - return (long)((double)corr * SHRT_MAX / sqrt((double)norm)); -} - - -long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const -{ - long corr; - long norm; - int i; - - corr = norm = 0; - for (i = 2; i < 2 * overlapLength; i += 2) - { - corr += (mixingPos[i] * compare[i] + - mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits; - norm += (mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBits; - } - - // Normalize result by dividing by sqrt(norm) - this step is easiest - // done using floating point operation - if (norm == 0) norm = 1; // to avoid div by zero - return (long)((double)corr * SHRT_MAX / sqrt((double)norm)); + return (double)corr / sqrt((double)norm); } #endif // SOUNDTOUCH_INTEGER_SAMPLES @@ -914,57 +736,26 @@ long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare #ifdef SOUNDTOUCH_FLOAT_SAMPLES - -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceStereo() -{ - int i, cnt2; - float temp; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = (float)i * (float)(overlapLength - i); - cnt2 = i * 2; - pRefMidBuffer[cnt2] = 
(float)(pMidBuffer[cnt2] * temp); - pRefMidBuffer[cnt2 + 1] = (float)(pMidBuffer[cnt2 + 1] * temp); - } -} - - -// Slopes the amplitude of the 'midBuffer' samples so that cross correlation -// is faster to calculate -void TDStretch::precalcCorrReferenceMono() -{ - int i; - float temp; - - for (i=0 ; i < (int)overlapLength ;i ++) - { - temp = (float)i * (float)(overlapLength - i); - pRefMidBuffer[i] = (float)(pMidBuffer[i] * temp); - } -} - - // Overlaps samples in 'midBuffer' with the samples in 'pInput' void TDStretch::overlapStereo(float *pOutput, const float *pInput) const { int i; - int cnt2; - float fTemp; float fScale; - float fi; + float f1; + float f2; fScale = 1.0f / (float)overlapLength; - for (i = 0; i < (int)overlapLength ; i ++) + f1 = 0; + f2 = 1.0f; + + for (i = 0; i < 2 * (int)overlapLength ; i += 2) { - fTemp = (float)(overlapLength - i) * fScale; - fi = (float)i * fScale; - cnt2 = 2 * i; - pOutput[cnt2 + 0] = pInput[cnt2 + 0] * fi + pMidBuffer[cnt2 + 0] * fTemp; - pOutput[cnt2 + 1] = pInput[cnt2 + 1] * fi + pMidBuffer[cnt2 + 1] * fTemp; + pOutput[i + 0] = pInput[i + 0] * f1 + pMidBuffer[i + 0] * f2; + pOutput[i + 1] = pInput[i + 1] * f1 + pMidBuffer[i + 1] * f2; + + f1 += fScale; + f2 -= fScale; } } @@ -985,38 +776,29 @@ void TDStretch::calculateOverlapLength(int overlapInMsec) } - -double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const +double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare) const { double corr; double norm; int i; corr = norm = 0; - for (i = 1; i < overlapLength; i ++) - { - corr += mixingPos[i] * compare[i]; - norm += mixingPos[i] * mixingPos[i]; - } - - if (norm < 1e-9) norm = 1.0; // to avoid div by zero - return corr / sqrt(norm); -} - - -double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const -{ - double corr; - double norm; - int i; - - corr = norm = 0; - for (i = 2; i < 2 * overlapLength; i += 2) + // Same routine for stereo and mono. 
For Stereo, unroll by factor of 2. + // For mono it's same routine yet unrollsd by factor of 4. + for (i = 0; i < channels * overlapLength; i += 4) { corr += mixingPos[i] * compare[i] + mixingPos[i + 1] * compare[i + 1]; + norm += mixingPos[i] * mixingPos[i] + mixingPos[i + 1] * mixingPos[i + 1]; + + // unroll the loop for better CPU efficiency: + corr += mixingPos[i + 2] * compare[i + 2] + + mixingPos[i + 3] * compare[i + 3]; + + norm += mixingPos[i + 2] * mixingPos[i + 2] + + mixingPos[i + 3] * mixingPos[i + 3]; } if (norm < 1e-9) norm = 1.0; // to avoid div by zero diff --git a/src/thirdparty/SoundTouch/source/TDStretch.h b/src/thirdparty/SoundTouch/source/TDStretch.h index c236aa4e7..12ce2cf03 100644 --- a/src/thirdparty/SoundTouch/source/TDStretch.h +++ b/src/thirdparty/SoundTouch/source/TDStretch.h @@ -115,8 +115,7 @@ protected: float tempo; SAMPLETYPE *pMidBuffer; - SAMPLETYPE *pRefMidBuffer; - SAMPLETYPE *pRefMidBufferUnaligned; + SAMPLETYPE *pMidBufferUnaligned; int overlapLength; int seekLength; int seekWindowLength; @@ -127,8 +126,6 @@ protected: FIFOSampleBuffer outputBuffer; FIFOSampleBuffer inputBuffer; BOOL bQuickSeek; -// int outDebt; -// BOOL bMidBufferDirty; int sampleRate; int sequenceMs; @@ -142,13 +139,10 @@ protected: virtual void clearCrossCorrState(); void calculateOverlapLength(int overlapMs); - virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; - virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; + virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; - virtual int seekBestOverlapPositionStereo(const SAMPLETYPE *refPos); - virtual int seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos); - virtual int seekBestOverlapPositionMono(const SAMPLETYPE *refPos); - virtual int seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos); + virtual int seekBestOverlapPositionFull(const SAMPLETYPE 
*refPos); + virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos); int seekBestOverlapPosition(const SAMPLETYPE *refPos); virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const; @@ -157,9 +151,6 @@ protected: void clearMidBuffer(); void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const; - void precalcCorrReferenceMono(); - void precalcCorrReferenceStereo(); - void calcSeqParameters(); /// Changes the tempo of the given sound samples. @@ -234,17 +225,17 @@ public: ///< contains both channels if stereo ); - /// return nominal input sample requirement for triggering a processing batch - int getInputSampleReq() const - { - return (int)(nominalSkip + 0.5); - } - - /// return nominal output sample amount when running a processing batch - int getOutputBatchSize() const - { - return seekWindowLength - overlapLength; - } + /// return nominal input sample requirement for triggering a processing batch + int getInputSampleReq() const + { + return (int)(nominalSkip + 0.5); + } + + /// return nominal output sample amount when running a processing batch + int getOutputBatchSize() const + { + return seekWindowLength - overlapLength; + } }; @@ -256,7 +247,7 @@ public: class TDStretchMMX : public TDStretch { protected: - long calcCrossCorrStereo(const short *mixingPos, const short *compare) const; + double calcCrossCorr(const short *mixingPos, const short *compare) const; virtual void overlapStereo(short *output, const short *input) const; virtual void clearCrossCorrState(); }; @@ -268,7 +259,7 @@ public: class TDStretchSSE : public TDStretch { protected: - double calcCrossCorrStereo(const float *mixingPos, const float *compare) const; + double calcCrossCorr(const float *mixingPos, const float *compare) const; }; #endif /// SOUNDTOUCH_ALLOW_SSE diff --git a/src/thirdparty/SoundTouch/source/cpu_detect_x86.cpp b/src/thirdparty/SoundTouch/source/cpu_detect_x86.cpp new file mode 100644 index 000000000..d0b5ea73e --- /dev/null +++ 
b/src/thirdparty/SoundTouch/source/cpu_detect_x86.cpp @@ -0,0 +1,139 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Generic version of the x86 CPU extension detection routine. +/// +/// This file is for GNU & other non-Windows compilers, see 'cpu_detect_x86_win.cpp' +/// for the Microsoft compiler version. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai 'at' iki.fi +/// SoundTouch WWW: http://www.surina.net/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date$ +// File revision : $Revision: 4 $ +// +// $Id$ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include "cpu_detect.h" +#include "STTypes.h" + +#if defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS) + + #if defined(__GNUC__) && defined(__i386__) + // gcc + #include "cpuid.h" + #endif + + #if defined(_M_IX86) + // windows + #include + #define bit_MMX (1 << 23) + #define bit_SSE (1 << 25) + #define bit_SSE2 (1 << 26) + #endif + +#endif + + +////////////////////////////////////////////////////////////////////////////// +// +// processor instructions extension detection routines +// +////////////////////////////////////////////////////////////////////////////// + +// Flag variable indicating whick ISA extensions are disabled (for debugging) +static uint _dwDisabledISA = 0x00; // 0xffffffff; //<- use this to disable all extensions + +// Disables given set of instruction extensions. See SUPPORT_... defines. +void disableExtensions(uint dwDisableMask) +{ + _dwDisabledISA = dwDisableMask; +} + + + +/// Checks which instruction set extensions are supported by the CPU. +uint detectCPUextensions(void) +{ +/// If building for a 64bit system (no Itanium) and the user wants optimizations. +/// Return the OR of SUPPORT_{MMX,SSE,SSE2}. 11001 or 0x19. +/// Keep the _dwDisabledISA test (2 more operations, could be eliminated). +#if ((defined(__GNUC__) && defined(__x86_64__)) \ + || defined(_M_X64)) \ + && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS) + return 0x19 & ~_dwDisabledISA; + +/// If building for a 32bit system and the user wants optimizations. +/// Keep the _dwDisabledISA test (2 more operations, could be eliminated). 
+#elif ((defined(__GNUC__) && defined(__i386__)) \ + || defined(_M_IX86)) \ + && defined(SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS) + + if (_dwDisabledISA == 0xffffffff) return 0; + + uint res = 0; + +#if defined(__GNUC__) + // GCC version of cpuid. Requires GCC 4.3.0 or later for __cpuid intrinsic support. + uint eax, ebx, ecx, edx; // unsigned int is the standard type. uint is defined by the compiler and not guaranteed to be portable. + + // Check if no cpuid support. + if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)) return 0; // always disable extensions. + + if (edx & bit_MMX) res = res | SUPPORT_MMX; + if (edx & bit_SSE) res = res | SUPPORT_SSE; + if (edx & bit_SSE2) res = res | SUPPORT_SSE2; + +#else + // Window / VS version of cpuid. Notice that Visual Studio 2005 or later required + // for __cpuid intrinsic support. + int reg[4] = {-1}; + + // Check if no cpuid support. + __cpuid(reg,0); + if ((unsigned int)reg[0] == 0) return 0; // always disable extensions. + + __cpuid(reg,1); + if ((unsigned int)reg[3] & bit_MMX) res = res | SUPPORT_MMX; + if ((unsigned int)reg[3] & bit_SSE) res = res | SUPPORT_SSE; + if ((unsigned int)reg[3] & bit_SSE2) res = res | SUPPORT_SSE2; + +#endif + + return res & ~_dwDisabledISA; + +#else + +/// One of these is true: +/// 1) We don't want optimizations. +/// 2) Using an unsupported compiler. +/// 3) Running on a non-x86 platform. + return 0; + +#endif +} diff --git a/src/thirdparty/SoundTouch/source/cpu_detect_x86_win.cpp b/src/thirdparty/SoundTouch/source/cpu_detect_x86_win.cpp deleted file mode 100644 index 44e0520d6..000000000 --- a/src/thirdparty/SoundTouch/source/cpu_detect_x86_win.cpp +++ /dev/null @@ -1,137 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -/// -/// Win32 version of the x86 CPU detect routine. -/// -/// This file is to be compiled in Windows platform with Microsoft Visual C++ -/// Compiler. 
Please see 'cpu_detect_x86_gcc.cpp' for the gcc compiler version -/// for all GNU platforms. -/// -/// Author : Copyright (c) Olli Parviainen -/// Author e-mail : oparviai 'at' iki.fi -/// SoundTouch WWW: http://www.surina.net/soundtouch -/// -//////////////////////////////////////////////////////////////////////////////// -// -// Last changed : $Date$ -// File revision : $Revision: 4 $ -// -// $Id$ -// -//////////////////////////////////////////////////////////////////////////////// -// -// License : -// -// SoundTouch audio processing library -// Copyright (c) Olli Parviainen -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -//////////////////////////////////////////////////////////////////////////////// - -#include "cpu_detect.h" - -#include "STTypes.h" - -////////////////////////////////////////////////////////////////////////////// -// -// processor instructions extension detection routines -// -////////////////////////////////////////////////////////////////////////////// - -// Flag variable indicating whick ISA extensions are disabled (for debugging) -static uint _dwDisabledISA = 0x00; // 0xffffffff; //<- use this to disable all extensions - - -// Disables given set of instruction extensions. See SUPPORT_... defines. 
-void disableExtensions(uint dwDisableMask) -{ - _dwDisabledISA = dwDisableMask; -} - - - -/// Checks which instruction set extensions are supported by the CPU. -uint detectCPUextensions(void) -{ - uint res = 0; - - if (_dwDisabledISA == 0xffffffff) return 0; - -#ifndef _M_X64 - // 32bit compilation, detect CPU capabilities with inline assembler. - __asm - { - ; check if 'cpuid' instructions is available by toggling eflags bit 21 - ; - xor esi, esi ; clear esi = result register - - pushfd ; save eflags to stack - mov eax,dword ptr [esp] ; load eax from stack (with eflags) - mov ecx, eax ; save the original eflags values to ecx - xor eax, 0x00200000 ; toggle bit 21 - mov dword ptr [esp],eax ; store toggled eflags to stack - popfd ; load eflags from stack - - pushfd ; save updated eflags to stack - mov eax,dword ptr [esp] ; load eax from stack - popfd ; pop stack to restore stack pointer - - xor edx, edx ; clear edx for defaulting no mmx - cmp eax, ecx ; compare to original eflags values - jz end ; jumps to 'end' if cpuid not present - - ; cpuid instruction available, test for presence of mmx instructions - mov eax, 1 - cpuid - test edx, 0x00800000 - jz end ; branch if MMX not available - - or esi, SUPPORT_MMX ; otherwise add MMX support bit - - test edx, 0x02000000 - jz test3DNow ; branch if SSE not available - - or esi, SUPPORT_SSE ; otherwise add SSE support bit - - test3DNow: - ; test for precense of AMD extensions - mov eax, 0x80000000 - cpuid - cmp eax, 0x80000000 - jbe end ; branch if no AMD extensions detected - - ; test for precense of 3DNow! extension - mov eax, 0x80000001 - cpuid - test edx, 0x80000000 - jz end ; branch if 3DNow! not detected - - or esi, SUPPORT_3DNOW ; otherwise add 3DNow support bit - - end: - - mov res, esi - } - -#else - - // Visual C++ 64bit compilation doesn't support inline assembler. However, - // all x64 compatible CPUs support MMX & SSE extensions. 
- res = SUPPORT_MMX | SUPPORT_SSE | SUPPORT_SSE2; - -#endif - - return res & ~_dwDisabledISA; -} diff --git a/src/thirdparty/SoundTouch/source/mmx_optimized.cpp b/src/thirdparty/SoundTouch/source/mmx_optimized.cpp index feeab49a6..684bad0e9 100644 --- a/src/thirdparty/SoundTouch/source/mmx_optimized.cpp +++ b/src/thirdparty/SoundTouch/source/mmx_optimized.cpp @@ -68,7 +68,7 @@ using namespace soundtouch; // Calculates cross correlation of two buffers -long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const +double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const { const __m64 *pVec1, *pVec2; __m64 shifter; @@ -82,9 +82,9 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const shifter = _m_from_int(overlapDividerBits); normaccu = accu = _mm_setzero_si64(); - // Process 4 parallel sets of 2 * stereo samples each during each - // round to improve CPU-level parallellization. - for (i = 0; i < overlapLength / 8; i ++) + // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples + // during each round for improved CPU-level parallellization. + for (i = 0; i < channels * overlapLength / 16; i ++) { __m64 temp, temp2; @@ -126,7 +126,8 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const // Normalize result by dividing by sqrt(norm) - this step is easiest // done using floating point operation if (norm == 0) norm = 1; // to avoid div by zero - return (long)((double)corr * USHRT_MAX / sqrt((double)norm)); + + return (double)corr / sqrt((double)norm); // Note: Warning about the missing EMMS instruction is harmless // as it'll be called elsewhere. 
} diff --git a/src/thirdparty/SoundTouch/source/sse_optimized.cpp b/src/thirdparty/SoundTouch/source/sse_optimized.cpp index d989ad5aa..ddafb08e3 100644 --- a/src/thirdparty/SoundTouch/source/sse_optimized.cpp +++ b/src/thirdparty/SoundTouch/source/sse_optimized.cpp @@ -71,7 +71,7 @@ using namespace soundtouch; #include <math.h> // Calculates cross correlation of two buffers -double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const +double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const { int i; const float *pVec1; @@ -110,8 +110,9 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con pVec2 = (const __m128*)pV2; vSum = vNorm = _mm_setzero_ps(); - // Unroll the loop by factor of 4 * 4 operations - for (i = 0; i < overlapLength / 8; i ++) + // Unroll the loop by factor of 4 * 4 operations. Use same routine for + // stereo & mono, for mono it just means twice the amount of unrolling. + for (i = 0; i < channels * overlapLength / 16; i ++) { __m128 vTemp; // vSum += pV1[0..3] * pV2[0..3] @@ -152,7 +153,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors corr = norm = 0.0; - for (i = 0; i < overlapLength / 8; i ++) + for (i = 0; i < channels * overlapLength / 16; i ++) { corr += pV1[0] * pV2[0] + pV1[1] * pV2[1] + @@ -171,81 +172,13 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con pV1[14] * pV2[14] + pV1[15] * pV2[15]; - for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; + for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; pV1 += 16; pV2 += 16; } return corr / sqrt(norm); */ - - /* This is a bit outdated, corresponding routine in assembler. This may be teeny-weeny bit - faster than intrinsic version, but more difficult to maintain & get compiled on multiple - platforms. 
- - uint overlapLengthLocal = overlapLength; - float corr; - - _asm - { - // Very important note: data in 'pV2' _must_ be aligned to - // 16-byte boundary! - - // give prefetch hints to CPU of what data are to be needed soonish - // give more aggressive hints on pV1 as that changes while pV2 stays - // same between runs - prefetcht0 [pV1] - prefetcht0 [pV2] - prefetcht0 [pV1 + 32] - - mov eax, dword ptr pV1 - mov ebx, dword ptr pV2 - - xorps xmm0, xmm0 - - mov ecx, overlapLengthLocal - shr ecx, 3 // div by eight - - loop1: - prefetcht0 [eax + 64] // give a prefetch hint to CPU what data are to be needed soonish - prefetcht0 [ebx + 32] // give a prefetch hint to CPU what data are to be needed soonish - movups xmm1, [eax] - mulps xmm1, [ebx] - addps xmm0, xmm1 - - movups xmm2, [eax + 16] - mulps xmm2, [ebx + 16] - addps xmm0, xmm2 - - prefetcht0 [eax + 96] // give a prefetch hint to CPU what data are to be needed soonish - prefetcht0 [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish - - movups xmm3, [eax + 32] - mulps xmm3, [ebx + 32] - addps xmm0, xmm3 - - movups xmm4, [eax + 48] - mulps xmm4, [ebx + 48] - addps xmm0, xmm4 - - add eax, 64 - add ebx, 64 - - dec ecx - jnz loop1 - - // add the four floats of xmm0 together and return the result. - - movhlps xmm1, xmm0 // move 3 & 4 of xmm0 to 1 & 2 of xmm1 - addps xmm1, xmm0 - movaps xmm2, xmm1 - shufps xmm2, xmm2, 0x01 // move 2 of xmm2 as 1 of xmm2 - addss xmm2, xmm1 - movss corr, xmm2 - } - - return (double)corr; - */ } -- cgit v1.2.3