diff options
Diffstat (limited to 'src/modules/audio_coding/codecs/isac/fix/source/filters_neon.c')
-rw-r--r-- | src/modules/audio_coding/codecs/isac/fix/source/filters_neon.c | 167 |
1 files changed, 167 insertions, 0 deletions
/*
 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * filters_neon.c
 *
 * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for
 * ARM Neon platform.
 *
 */

#include <arm_neon.h>
#include <assert.h>

#include "codec.h"

// Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// Computes r[k] = sum_{j} x[j] * x[j + k] for k = 0 .. order, with every
// lag shifted right by a common scaling factor chosen so that r[0] (the
// largest lag by Cauchy-Schwarz) fits in 32 bits.
//
// Parameters:
//   r     - [out] autocorrelation results; order + 1 entries are written.
//   x     - [in]  input samples (Q0 16-bit); at least N entries are read.
//   N     - number of input samples; must be a multiple of 4 (asserted).
//   order - highest lag to compute.
//   scale - [out] the right-shift (scaling) applied to every r[k].
//
// Returns: order + 1 (the number of values written to r).
int WebRtcIsacfix_AutocorrNeon(
    WebRtc_Word32* __restrict r,
    const WebRtc_Word16* __restrict x,
    WebRtc_Word16 N,
    WebRtc_Word16 order,
    WebRtc_Word16* __restrict scale) {

  // The 1st for loop assumed N % 4 == 0.
  assert(N % 4 == 0);

  int i = 0;
  int zeros_low = 0;
  int zeros_high = 0;
  int16_t scaling = 0;
  int32_t sum = 0;

  // Step 1, calculate r[0] and how much scaling is needed.

  int16x4_t reg16x4;
  int64x1_t reg64x1a;
  int64x1_t reg64x1b;
  int32x4_t reg32x4;
  int64x2_t reg64x2 = vdupq_n_s64(0); // zeros

  // Loop over the samples and do:
  // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
  // Four 16x16->32 squares per iteration, pairwise-accumulated into two
  // 64-bit lanes so the energy sum cannot overflow.
  for (i = 0; i < N; i += 4) {
    reg16x4 = vld1_s16(&x[i]);
    reg32x4 = vmull_s16(reg16x4, reg16x4);
    reg64x2 = vpadalq_s32(reg64x2, reg32x4);
  }
  // Fold the two 64-bit partial sums into a single 64-bit total.
  reg64x1a = vget_low_s64(reg64x2);
  reg64x1b = vget_high_s64(reg64x2);
  reg64x1a = vadd_s64(reg64x1a, reg64x1b);

  // Calculate the value of shifting (scaling).
  // Move the 64-bit energy (reg64x1a) into two core registers and count
  // leading zeros of each half; %P prints the NEON register as a D reg.
  __asm__ __volatile__(
    "vmov %[z_l], %[z_h], %P[reg]\n\t"
    "clz %[z_l], %[z_l]\n\t"
    "clz %[z_h], %[z_h]\n\t"
    :[z_l]"+r"(zeros_low),
     [z_h]"+r"(zeros_high)
    :[reg]"w"(reg64x1a)
  );
  // If the high word has significant bits, shift them down into 32 bits
  // (the extra +1 keeps a sign/headroom bit). If the energy exactly fills
  // the low word (zeros_low == 0), shift by 1 for the same reason.
  if (zeros_high != 32) {
    scaling = (32 - zeros_high + 1);
  } else if (zeros_low == 0) {
    scaling = 1;
  }
  // NOTE(review): scalar assignment to int64x1_t is a GCC extension of the
  // NEON types — presumably intentional here; vshl_s64 with a negative
  // shift amount performs a right shift by `scaling`.
  reg64x1b = -scaling;
  reg64x1a = vshl_s64(reg64x1a, reg64x1b);

  // Record the result.
  r[0] = (int32_t)vget_lane_s64(reg64x1a, 0);


  // Step 2, perform the actual correlation calculation.

  /* Original C code (for the rest of the function):
  for (i = 1; i < order + 1; i++) {
    prod = 0;
    for (j = 0; j < N - i; j++) {
      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
    }
    sum = (int32_t)(prod >> scaling);
    r[i] = sum;
  }
  */

  for (i = 1; i < order + 1; i++) {
    // 64-bit product accumulator for the scalar tail, kept as a manual
    // lower/upper register pair (see the adds/adc carry chain below).
    int32_t prod_lower = 0;
    int32_t prod_upper = 0;
    const int16_t* ptr0 = &x[0];      // x[j]
    const int16_t* ptr1 = &x[i];      // x[i + j]
    int32_t tmp = 0;

    // Initialize the sum (q9) to zero.
    // NOTE(review): q9 carries the vector accumulator BETWEEN the separate
    // asm statements below; the compiler must not touch d18/d19 in between
    // (they are listed as clobbers in each statement) — fragile but as
    // shipped.
    __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9");

    // Calculate the major block of the samples (a multiple of 8).
    // Each iteration: load 8 samples from each pointer (post-incremented),
    // do eight 16x16->32 multiplies, and pairwise-accumulate the 32-bit
    // products into the 64-bit lanes of q9.
    for (; ptr0 < &x[N - i - 7];) {
      __asm__ __volatile__(
        "vld1.16 {d20, d21}, [%[ptr0]]!\n\t"
        "vld1.16 {d22, d23}, [%[ptr1]]!\n\t"
        "vmull.s16 q12, d20, d22\n\t"
        "vmull.s16 q13, d21, d23\n\t"
        "vpadal.s32 q9, q12\n\t"
        "vpadal.s32 q9, q13\n\t"

        // Specify constraints.
        :[ptr0]"+r"(ptr0),
         [ptr1]"+r"(ptr1)
        :
        :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27"
      );
    }

    // Calculate the rest of the samples.
    // Scalar tail: smulbb multiplies the bottom halfwords; despite the
    // operand names, %[ptr0]/%[ptr1] here are the DEREFERENCED sample
    // values (*ptr0, *ptr1). adds/adc accumulate the sign-extended 32-bit
    // product into the 64-bit prod_upper:prod_lower pair.
    for (; ptr0 < &x[N - i]; ptr0++, ptr1++) {
      __asm__ __volatile__(
        "smulbb %[tmp], %[ptr0], %[ptr1]\n\t"
        "adds %[prod_lower], %[prod_lower], %[tmp]\n\t"
        "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t"

        // Specify constraints.
        :[prod_lower]"+r"(prod_lower),
         [prod_upper]"+r"(prod_upper),
         [tmp]"+r"(tmp)
        :[ptr0]"r"(*ptr0),
         [ptr1]"r"(*ptr1)
      );
    }

    // Sum the results up, and do shift.
    // d18+d19 = vector accumulator (q9) folded to one 64-bit sum; d17 adds
    // in the scalar-tail total; d16 holds the sign-extended shift amount
    // (-scaling), so vshl.s64 right-shifts the final sum by `scaling`.
    // The low 32 bits are then extracted as the result.
    __asm__ __volatile__(
      "vadd.i64 d18, d19\n\t"
      "vmov.32 d17[0], %[prod_lower]\n\t"
      "vmov.32 d17[1], %[prod_upper]\n\t"
      "vadd.i64 d17, d18\n\t"
      "mov %[tmp], %[scaling], asr #31\n\t"
      "vmov.32 d16, %[scaling], %[tmp]\n\t"
      "vshl.s64 d17, d16\n\t"
      "vmov.32 %[sum], d17[0]\n\t"

      // Specify constraints.
      :[sum]"=r"(sum),
       [tmp]"+r"(tmp)
      :[prod_upper]"r"(prod_upper),
       [prod_lower]"r"(prod_lower),
       [scaling]"r"(-scaling)
      :"d16", "d17", "d18", "d19"
    );

    // Record the result.
    r[i] = sum;
  }

  // Record the result.
  *scale = scaling;

  return(order + 1);
}