diff options
author | Torne (Richard Coles) <torne@google.com> | 2013-03-28 15:31:22 +0000 |
---|---|---|
committer | Torne (Richard Coles) <torne@google.com> | 2013-03-28 15:31:22 +0000 |
commit | 2a99a7e74a7f215066514fe81d2bfa6639d9eddd (patch) | |
tree | 7c2d04841fcd599fd83b0f0bb1100e1c89a35bae /media/base/sinc_resampler.cc | |
parent | 61c449bbbb53310a8c041d8cefdd6b01a126cc7e (diff) | |
download | chromium_org-2a99a7e74a7f215066514fe81d2bfa6639d9eddd.tar.gz |
Merge from Chromium at DEPS revision r190564
This commit was generated by merge_to_master.py.
Change-Id: Icadecbce29854b8fa25fd335b2c1949b5ca5d170
Diffstat (limited to 'media/base/sinc_resampler.cc')
-rw-r--r-- | media/base/sinc_resampler.cc | 126 |
1 files changed, 28 insertions, 98 deletions
diff --git a/media/base/sinc_resampler.cc b/media/base/sinc_resampler.cc index d836fc7cbc..00f9314c61 100644 --- a/media/base/sinc_resampler.cc +++ b/media/base/sinc_resampler.cc @@ -40,11 +40,6 @@ #include "base/cpu.h" #include "base/logging.h" -#include "build/build_config.h" - -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -#include <xmmintrin.h> -#endif #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) #include <arm_neon.h> @@ -52,33 +47,6 @@ namespace media { -namespace { - -enum { - // The kernel size can be adjusted for quality (higher is better) at the - // expense of performance. Must be a multiple of 32. - // TODO(dalecurtis): Test performance to see if we can jack this up to 64+. - kKernelSize = 32, - - // The number of destination frames generated per processing pass. Affects - // how often and for how much SincResampler calls back for input. Must be - // greater than kKernelSize. - kBlockSize = 512, - - // The kernel offset count is used for interpolation and is the number of - // sub-sample kernel shifts. Can be adjusted for quality (higher is better) - // at the expense of allocating more memory. - kKernelOffsetCount = 32, - kKernelStorageSize = kKernelSize * (kKernelOffsetCount + 1), - - // The size (in samples) of the internal buffer used by the resampler. - kBufferSize = kBlockSize + kKernelSize -}; - -} // namespace - -const int SincResampler::kMaximumLookAheadSize = kBufferSize; - SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) : io_sample_rate_ratio_(io_sample_rate_ratio), virtual_source_idx_(0), @@ -89,6 +57,9 @@ SincResampler::SincResampler(double io_sample_rate_ratio, const ReadCB& read_cb) base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))), input_buffer_(static_cast<float*>( base::AlignedAlloc(sizeof(float) * kBufferSize, 16))), +#if defined(ARCH_CPU_X86_FAMILY) && !defined(__SSE__) + convolve_proc_(base::CPU().has_sse() ? Convolve_SSE : Convolve_C), +#endif // Setup various region pointers in the buffer (see diagram above). r0_(input_buffer_.get() + kKernelSize / 2), r1_(input_buffer_.get()), @@ -168,6 +139,22 @@ void SincResampler::InitializeKernel() { } } +// If we know the minimum architecture avoid function hopping for CPU detection. +#if defined(ARCH_CPU_X86_FAMILY) +#if defined(__SSE__) +#define CONVOLVE_FUNC Convolve_SSE +#else +// X86 CPU detection required. |convolve_proc_| will be set upon construction. +// TODO(dalecurtis): Once Chrome moves to a SSE baseline this can be removed. +#define CONVOLVE_FUNC convolve_proc_ +#endif +#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) +#define CONVOLVE_FUNC Convolve_NEON +#else +// Unknown architecture. +#define CONVOLVE_FUNC Convolve_C +#endif + void SincResampler::Resample(float* destination, int frames) { int remaining_frames = frames; @@ -193,12 +180,17 @@ void SincResampler::Resample(float* destination, int frames) { float* k1 = kernel_storage_.get() + offset_idx * kKernelSize; float* k2 = k1 + kKernelSize; + // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be + // true so long as kKernelSize is a multiple of 16. + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); + DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); + // Initialize input pointer based on quantized |virtual_source_idx_|. float* input_ptr = r1_ + source_idx; // Figure out how much to weight each kernel's "convolution". double kernel_interpolation_factor = virtual_offset_idx - offset_idx; - *destination++ = Convolve( + *destination++ = CONVOLVE_FUNC( input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. @@ -222,7 +214,9 @@ void SincResampler::Resample(float* destination, int frames) { } } -int SincResampler::ChunkSize() { +#undef CONVOLVE_FUNC + +int SincResampler::ChunkSize() const { return kBlockSize / io_sample_rate_ratio_; } @@ -232,26 +226,6 @@ void SincResampler::Flush() { memset(input_buffer_.get(), 0, sizeof(*input_buffer_.get()) * kBufferSize); } -float SincResampler::Convolve(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - // Rely on function level static initialization to keep ConvolveProc selection - // thread safe. - typedef float (*ConvolveProc)(const float* src, const float* k1, - const float* k2, - double kernel_interpolation_factor); -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) - static const ConvolveProc kConvolveProc = - base::CPU().has_sse() ? Convolve_SSE : Convolve_C; -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) - static const ConvolveProc kConvolveProc = Convolve_NEON; -#else - static const ConvolveProc kConvolveProc = Convolve_C; -#endif - - return kConvolveProc(input_ptr, k1, k2, kernel_interpolation_factor); -} - float SincResampler::Convolve_C(const float* input_ptr, const float* k1, const float* k2, double kernel_interpolation_factor) { @@ -271,50 +245,6 @@ float SincResampler::Convolve_C(const float* input_ptr, const float* k1, + kernel_interpolation_factor * sum2; } -#if defined(ARCH_CPU_X86_FAMILY) && defined(__SSE__) -float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1, - const float* k2, - double kernel_interpolation_factor) { - // Ensure |k1|, |k2| are 16-byte aligned for SSE usage. Should always be true - // so long as kKernelSize is a multiple of 16. - DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F); - DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F); - - __m128 m_input; - __m128 m_sums1 = _mm_setzero_ps(); - __m128 m_sums2 = _mm_setzero_ps(); - - // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling - // these loops hurt performance in local testing. - if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_loadu_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } else { - for (int i = 0; i < kKernelSize; i += 4) { - m_input = _mm_load_ps(input_ptr + i); - m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i))); - m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i))); - } - } - - // Linearly interpolate the two "convolutions". - m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor)); - m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor)); - m_sums1 = _mm_add_ps(m_sums1, m_sums2); - - // Sum components together. - float result; - m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1); - _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps( - m_sums2, m_sums2, 1))); - - return result; -} -#endif - #if defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON) float SincResampler::Convolve_NEON(const float* input_ptr, const float* k1, const float* k2, |