sse_optimized.cpp   sse_optimized.cpp 
skipping to change at line 26 skipping to change at line 26
/// ///
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
/// perform a search with keywords "processor pack". /// perform a search with keywords "processor pack".
/// ///
/// Author : Copyright (c) Olli Parviainen /// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi /// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch /// SoundTouch WWW: http://www.surina.net/soundtouch
/// ///
/////////////////////////////////////////////////////////////////////////// ///// /////////////////////////////////////////////////////////////////////////// /////
// //
// Last changed : $Date: 2015-02-21 21:24:29 +0000 (Sat, 21 Feb 2015) $ // Last changed : $Date: 2015-08-09 00:00:15 +0300 (Sun, 09 Aug 2015) $
// File revision : $Revision: 4 $ // File revision : $Revision: 4 $
// //
// $Id: sse_optimized.cpp 202 2015-02-21 21:24:29Z oparviai $ // $Id: sse_optimized.cpp 226 2015-08-08 21:00:15Z oparviai $
// //
/////////////////////////////////////////////////////////////////////////// ///// /////////////////////////////////////////////////////////////////////////// /////
// //
// License : // License :
// //
// SoundTouch audio processing library // SoundTouch audio processing library
// Copyright (c) Olli Parviainen // Copyright (c) Olli Parviainen
// //
// This library is free software; you can redistribute it and/or // This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public // modify it under the terms of the GNU Lesser General Public
skipping to change at line 74 skipping to change at line 74
// //
// implementation of SSE optimized functions of class 'TDStretchSSE' // implementation of SSE optimized functions of class 'TDStretchSSE'
// //
/////////////////////////////////////////////////////////////////////////// /// /////////////////////////////////////////////////////////////////////////// ///
#include "TDStretch.h" #include "TDStretch.h"
#include <xmmintrin.h> #include <xmmintrin.h>
#include <math.h> #include <math.h>
// Calculates cross correlation of two buffers // Calculates cross correlation of two buffers
double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) const double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm)
{ {
int i; int i;
const float *pVec1; const float *pVec1;
const __m128 *pVec2; const __m128 *pVec2;
__m128 vSum, vNorm; __m128 vSum, vNorm;
// Note. It means a major slow-down if the routine needs to tolerate // Note. It means a major slow-down if the routine needs to tolerate
// unaligned __m128 memory accesses. It's way faster if we can skip // unaligned __m128 memory accesses. It's way faster if we can skip
// unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps. // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
// This can mean up to ~ 10-fold difference (incl. part of which is // This can mean up to ~ 10-fold difference (incl. part of which is
skipping to change at line 184 skipping to change at line 184
for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j]; for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j];
pV1 += 16; pV1 += 16;
pV2 += 16; pV2 += 16;
} }
return corr / sqrt(norm); return corr / sqrt(norm);
*/ */
} }
double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm)
{ {
// call usual calcCrossCorr function because SSE does not show big benefit of // call usual calcCrossCorr function because SSE does not show big benefit of
// accumulating "norm" value, and also the "norm" rolling algorithm would get // accumulating "norm" value, and also the "norm" rolling algorithm would get
// complicated due to SSE-specific alignment-vs-nonexact correlation rules. // complicated due to SSE-specific alignment-vs-nonexact correlation rules.
return calcCrossCorr(pV1, pV2, norm); return calcCrossCorr(pV1, pV2, norm);
} }
/////////////////////////////////////////////////////////////////////////// /// /////////////////////////////////////////////////////////////////////////// ///
// //
// implementation of SSE optimized functions of class 'FIRFilter' // implementation of SSE optimized functions of class 'FIRFilter'
 End of changes. 4 change blocks. 
4 lines changed or deleted 4 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/