diff options
author | kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d> | 2012-11-09 00:39:45 +0000 |
---|---|---|
committer | kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d> | 2012-11-09 00:39:45 +0000 |
commit | d0ea5f0cdd6bee478e94877a564fedefa84cadd7 (patch) | |
tree | aaec9519863f0a3f3082add5c33811e524a79375 /modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S | |
parent | 09ea027a5c562d1b9a801553b880923211cd3718 (diff) | |
download | webrtc-d0ea5f0cdd6bee478e94877a564fedefa84cadd7.tar.gz |
Optimized function AllpassFilter2FixDec16() in isac fix for Android Neon platforms.
With an offline test, codec cycles were reduced by 4%.
Review URL: https://webrtc-codereview.appspot.com/936007
git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@3066 4adac7df-926f-26a2-2b94-8c16560cd09d
Diffstat (limited to 'modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S')
-rw-r--r-- | modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S | 270 |
1 files changed, 270 insertions, 0 deletions
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ Contains the function WebRtcIsacfix_AllpassFilter2FixDec16Neon() of the
@ iSAC codec, optimized for the ARM Neon platform. It is meant to be bit exact
@ with the C reference implementation in filterbanks.c; the prototype C code
@ is reproduced at the end of this file.

.arch armv7-a
.fpu neon
.global WebRtcIsacfix_AllpassFilter2FixDec16Neon
.align  2

@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
@    const int length,  // Length of the data buffers
@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
@    int32_t *filter_state_ch2);  // Filter state for channel 2, in Q16
@
@ Arguments on entry:
@   r0 = data_ch1, r1 = data_ch2, r2 = factor_ch1, r3 = factor_ch2
@   stack (offsets valid after the push below):
@     [sp, #16] = length, [sp, #20] = filter_state_ch1,
@     [sp, #24] = filter_state_ch2
@
@ Register roles inside the function:
@   r0, r1  running pointers into data_ch1 / data_ch2
@   r2      -2, post-index applied after each look-ahead load
@   r3      loop counter, starts at length - 2 and drops by 2 per iteration
@   r4      +4, post-index applied after each store inside the loop
@   r5, r6  filter_state_ch2 / filter_state_ch1
@   r7      saved and restored but never used (presumably only there to keep
@           the push an even number of registers -- TODO confirm)
@   d30     {factor_ch1[0], factor_ch1[1], factor_ch2[0], factor_ch2[1]}
@   d31     -d30
@   q8      {state0_ch1, state1_ch1, state0_ch2, state1_ch2}
@   d0      current 16-bit inputs of the two cascaded sections, both channels
@   q0-q3, q9 are scratch.
@
@ Every vector step handles section 0 of sample k + 1 and section 1 of sample
@ k for both channels at once; the odd 16-bit lanes (1 and 3) of d4/d6 hold
@ the finished output samples.
@
@ length is expected to be even and >= 2, like in the prototype C code, which
@ reads data_ch*[1] unconditionally and advances two samples at a time.

WebRtcIsacfix_AllpassFilter2FixDec16Neon:
  push {r4 - r7}

  ldr r5, [sp, #24]               @ filter_state_ch2
  ldr r6, [sp, #20]               @ filter_state_ch1

  @ Initialize the Neon registers.
  vld1.16 d0[0], [r0]!            @ data_ch1[0]
  vld1.16 d0[2], [r1]!            @ data_ch2[0]
  vld1.32 d30[0], [r2]            @ factor_ch1[0], factor_ch1[1]
  vld1.32 d30[1], [r3]            @ factor_ch2[0], factor_ch2[1]
  vld1.32 d16[0], [r6]!           @ filter_state_ch1[0]
  vld1.32 d17[0], [r5]!           @ filter_state_ch2[0]
  vneg.s16 d31, d30

  ldr r3, [sp, #16]               @ length
  mov r4, #4                      @ Post offset value for the loop
  mov r2, #-2                     @ Post offset value for the loop
  sub r3, #2                      @ Loop counter

  @ Loop unrolling pre-processing.
  @ Only the even lanes carry meaningful data here: section 0 of sample 0.
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d6, q2, #16
  vmull.s16 q1, d31, d6
  vshl.s32 q1, #1
  vqadd.s32 q8, q1, q0
  vld1.32 d16[1], [r6]            @ filter_state_ch1[1]
  vld1.32 d17[1], [r5]            @ filter_state_ch2[1]
  sub r6, #4                      @ &filter_state_ch1[0]
  sub r5, #4                      @ &filter_state_ch2[0]
  vld1.16 d6[1], [r0], r2         @ data_ch1[1]
  vld1.16 d6[3], [r1], r2         @ data_ch2[1]
  vrev32.16 d0, d6

  @ The loop below tests its counter at the bottom, so it always runs at
  @ least once. With length == 2 there is nothing left for it to do, and
  @ entering it would load and store data_ch*[2..3] past the end of the
  @ buffers. Skip it in that case, as the C loop (n < length - 2) does.
  cmp r3, #0
  ble POST_PROCESSING

FOR_LOOP:
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d4, q2, #16
  vmull.s16 q1, d31, d4
  vst1.16 d4[1], [r0], r4         @ Store data_ch1[n]
  vst1.16 d4[3], [r1], r4         @ Store data_ch2[n]
  vshl.s32 q1, #1
  vld1.16 d4[1], [r0], r2         @ Load data_ch1[n + 2]
  vld1.16 d4[3], [r1], r2         @ Load data_ch2[n + 2]
  vqadd.s32 q8, q1, q0
  vrev32.16 d0, d4
  vqdmull.s16 q1, d30, d0
  subs r3, #2                     @ Flags are consumed by the bgt below
  vqadd.s32 q2, q1, q8
  vshrn.i32 d6, q2, #16
  vmull.s16 q1, d31, d6
  vshll.s16 q0, d0, #16
  vst1.16 d6[1], [r0], r4         @ Store data_ch1[n + 1]
  vst1.16 d6[3], [r1], r4         @ Store data_ch2[n + 1]
  vshl.s32 q1, #1
  vld1.16 d6[1], [r0], r2         @ Load data_ch1[n + 3]
  vld1.16 d6[3], [r1], r2         @ Load data_ch2[n + 3]
  vqadd.s32 q8, q1, q0
  vrev32.16 d0, d6
  bgt FOR_LOOP

POST_PROCESSING:
  @ Loop unrolling post-processing.
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d4, q2, #16
  vmull.s16 q1, d31, d4
  vst1.16 d4[1], [r0]!            @ Store data_ch1[n]
  vst1.16 d4[3], [r1]!            @ Store data_ch2[n]
  vshl.s32 q1, #1
  vqadd.s32 q8, q1, q0
  vrev32.16 d0, d4
  vqdmull.s16 q1, d30, d0
  vshll.s16 q0, d0, #16
  vqadd.s32 q2, q1, q8
  vshrn.i32 d6, q2, #16
  vmull.s16 q1, d31, d6
  vst1.16 d6[1], [r0]             @ Store data_ch1[n + 1]
  vst1.16 d6[3], [r1]             @ Store data_ch2[n + 1]
  vshl.s32 q1, #1
  vst1.32 d16[0], [r6]!           @ Store filter_state_ch1[0]
  vqadd.s32 q9, q1, q0
  vst1.32 d17[0], [r5]!           @ Store filter_state_ch2[0]
  vst1.32 d18[1], [r6]            @ Store filter_state_ch1[1]
  vst1.32 d19[1], [r5]            @ Store filter_state_ch2[1]

  pop {r4 - r7}
  bx lr

@void AllpassFilter2FixDec16BothChannels(
@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
@    const int length,  // Length of the data buffers
@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
@    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
@  int n = 0;
@  int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
@  int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
@  int16_t sample0_ch1 = 0, sample0_ch2 = 0;
@  int16_t sample1_ch1 = 0, sample1_ch2 = 0;
@  int32_t a0_ch1 = 0, a0_ch2 = 0;
@  int32_t b0_ch1 = 0, b0_ch2 = 0;
@
@  int32_t a1_ch1 = 0, a1_ch2 = 0;
@  int32_t b1_ch1 = 0, b1_ch2 = 0;
@  int32_t b2_ch1 = 0, b2_ch2 = 0;
@
@  // Loop unrolling preprocessing.
@
@  sample0_ch1 = data_ch1[n];
@  sample0_ch2 = data_ch2[n];
@
@  a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample0_ch1) << 1;
@  a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample0_ch2) << 1;
@
@  b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state0_ch1);
@  b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state0_ch2); //Q16+Q16=Q16
@
@  a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b0_ch1 >> 16));
@  a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b0_ch2 >> 16));
@
@  state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
@  state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
@
@  sample1_ch1 = data_ch1[n + 1];
@  sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
@  sample1_ch2 = data_ch2[n + 1];
@  sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
@
@
@  for (n = 0; n < length - 2; n += 2) {
@    a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample1_ch1) << 1;
@    a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample0_ch1) << 1;
@    a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample1_ch2 ) << 1;
@    a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample0_ch2) << 1;
@
@    b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state0_ch1);
@    b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state1_ch1); //Q16+Q16=Q16
@    b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state0_ch2); //Q16+Q16=Q16
@    b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state1_ch2); //Q16+Q16=Q16
@
@    a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b1_ch1 >> 16));
@    a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b0_ch1 >> 16));
@    a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b1_ch2 >> 16));
@    a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b0_ch2 >> 16));
@
@    state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
@    state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
@    state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
@    state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
@
@    sample0_ch1 = data_ch1[n + 2];
@    sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
@    sample0_ch2 = data_ch2[n + 2];
@    sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
@
@    a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample0_ch1) << 1;
@    a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample1_ch1) << 1;
@    a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample0_ch2) << 1;
@    a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample1_ch2 ) << 1;
@
@    b2_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state0_ch1);
@    b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state1_ch1); //Q16+Q16=Q16
@    b2_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state0_ch2); //Q16+Q16=Q16
@    b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state1_ch2); //Q16+Q16=Q16
@
@    a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b2_ch1 >> 16));
@    a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b1_ch1 >> 16));
@    a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b2_ch2 >> 16));
@    a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b1_ch2 >> 16));
@
@    state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
@    state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
@    state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
@    state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
@
@
@    sample1_ch1 = data_ch1[n + 3];
@    sample0_ch1 = (int16_t) (b2_ch1 >> 16); //Save as Q0
@    sample1_ch2 = data_ch2[n + 3];
@    sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
@
@    data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
@    data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
@    data_ch2[n] = (int16_t) (b0_ch2 >> 16);
@    data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
@  }
@
@  // Loop unrolling post-processing.
@
@  a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample1_ch1) << 1;
@  a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample0_ch1) << 1;
@  a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample1_ch2 ) << 1;
@  a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample0_ch2) << 1;
@
@  b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state0_ch1);
@  b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state1_ch1);
@  b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state0_ch2);
@  b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state1_ch2);
@
@  a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b1_ch1 >> 16));
@  a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b0_ch1 >> 16));
@  a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b1_ch2 >> 16));
@  a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b0_ch2 >> 16));
@
@  state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
@  state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
@  state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
@  state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
@
@  data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
@  data_ch2[n] = (int16_t) (b0_ch2 >> 16);
@
@  sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
@  sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
@
@  a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample1_ch1) << 1;
@  a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample1_ch2 ) << 1;
@
@  b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state1_ch1); //Q16+Q16=Q16
@  b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state1_ch2); //Q16+Q16=Q16
@
@  a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b1_ch1 >> 16));
@  a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b1_ch2 >> 16));
@
@  state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
@  state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
@
@  data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
@  data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
@
@  filter_state_ch1[0] = state0_ch1;
@  filter_state_ch1[1] = state1_ch1;
@  filter_state_ch2[0] = state0_ch2;
@  filter_state_ch2[1] = state1_ch2;
@}