Optimized function AllpassFilter2FixDec16() in isac fix for Android Neon platforms.

With an offline test, codec cycles were reduced by 4%. Review URL: https://webrtc-codereview.appspot.com/936007 git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@3066 4adac7df-926f-26a2-2b94-8c16560cd09d
author: kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d> 2012-11-09 00:39:45 +0000
committer: kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d> 2012-11-09 00:39:45 +0000
commit: d0ea5f0cdd6bee478e94877a564fedefa84cadd7 (patch)
tree: aaec9519863f0a3f3082add5c33811e524a79375 /modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
parent: 09ea027a5c562d1b9a801553b880923211cd3718 (diff)
download: webrtc-d0ea5f0cdd6bee478e94877a564fedefa84cadd7.tar.gz
1 files changed, 270 insertions, 0 deletions
diff --git a/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S b/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
new file mode 100644
index 00000000..e915faba
--- /dev/null
+++ b/modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
@@ -0,0 +1,270 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS.  All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ Contains the function WebRtcIsacfix_AllpassFilter2FixDec16Neon() of the
+@ iSAC codec, optimized for the ARM Neon platform. Intended to be bit exact
+@ with the corresponding C implementation in filterbanks.c. Prototype
+@ C code is at the end of this file.
+
+.arch armv7-a          @ Assemble for the ARMv7-A architecture
+.fpu neon              @ Accept NEON (Advanced SIMD) instructions
+.global WebRtcIsacfix_AllpassFilter2FixDec16Neon  @ Export the entry point
+.align  2              @ On ARM gas this is 2^2 = 4-byte alignment
+
+@void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
+@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
+@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
+@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
+@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
+@    const int length,           // Length of the data buffers
+@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
+@    int32_t *filter_state_ch2); // Filter state for channel 2, in Q16
+
+WebRtcIsacfix_AllpassFilter2FixDec16Neon:  @ Filters both channels in place.
+  push {r4 - r7}              @ 16 bytes pushed: stack args start at [sp, #16]
+  @ On entry r0..r3 = data_ch1, data_ch2, factor_ch1, factor_ch2; r7 is never used.
+  ldr r5, [sp, #24]           @ filter_state_ch2
+  ldr r6, [sp, #20]           @ filter_state_ch1
+  @ Lanes 0-1 hold channel 1, lanes 2-3 channel 2; even lane = stage 0, odd = stage 1.
+  @ Initialize the Neon registers.
+  vld1.16 d0[0], [r0]!        @ data_ch1[0]; r0 -> &data_ch1[1]
+  vld1.16 d0[2], [r1]!        @ data_ch2[0]; r1 -> &data_ch2[1]
+  vld1.32 d30[0], [r2]        @ factor_ch1[0], factor_ch1[1]
+  vld1.32 d30[1], [r3]        @ factor_ch2[0], factor_ch2[1]
+  vld1.32 d16[0], [r6]!       @ filter_state_ch1[0]; r6 -> &filter_state_ch1[1]
+  vld1.32 d17[0], [r5]!       @ filter_state_ch2[0]; r5 -> &filter_state_ch2[1]
+  vneg.s16 d31, d30           @ d31 = -factors, used for the feedback products
+  @ The factor pointers are consumed; r2 and r3 are reused as offset and counter.
+  ldr r3, [sp, #16]           @ length
+  mov r4, #4                  @ Post-index after each store: +4 bytes (2 samples)
+  mov r2, #-2                 @ Post-index after each load: -2 bytes (1 sample)
+  sub r3, #2                  @ Loop counter = length - 2
+  @ NOTE(review): assumes length is even and >= 4 (loop body runs at least once).
+  @ Loop unrolling pre-processing: stage 0 only; odd lanes are don't-care here.
+  vqdmull.s16 q1, d30, d0     @ a = (factor * sample) << 1, saturating
+  vshll.s16 q0, d0, #16       @ sample << 16
+  vqadd.s32 q2, q1, q8        @ b = sat(a + state)
+  vshrn.i32 d6, q2, #16       @ d6 = b >> 16, narrowed to 16 bits
+  vmull.s16 q1, d31, d6       @ -factor * (b >> 16)
+  vshl.s32 q1, #1             @ ... << 1
+  vqadd.s32 q8, q1, q0        @ state = sat(feedback + (sample << 16))
+  vld1.32 d16[1], [r6]        @ filter_state_ch1[1] (overwrites don't-care lane)
+  vld1.32 d17[1], [r5]        @ filter_state_ch2[1] (overwrites don't-care lane)
+  sub r6, #4                  @ &filter_state_ch1[0]
+  sub r5, #4                  @ &filter_state_ch2[0]
+  vld1.16 d6[1], [r0], r2     @ data_ch1[1]; r0 -> &data_ch1[0]
+  vld1.16 d6[3], [r1], r2     @ data_ch2[1]; r1 -> &data_ch2[0]
+  vrev32.16 d0, d6            @ Per channel: d0 = {data[1], stage-0 output}
+  @ In the loop, per channel: d0 = {next input, stage-0 output}, q8 = {state0, state1}.
+FOR_LOOP:
+  vqdmull.s16 q1, d30, d0     @ a = (factor * input) << 1 for both stages
+  vshll.s16 q0, d0, #16       @ input << 16
+  vqadd.s32 q2, q1, q8        @ b = sat(a + state)
+  vshrn.i32 d4, q2, #16       @ d4 = b >> 16; lanes 1 and 3 are finished outputs
+  vmull.s16 q1, d31, d4       @ -factor * (b >> 16)
+  vst1.16 d4[1], [r0], r4     @ Store data_ch1[n]; r0 -> &data_ch1[n + 2]
+  vst1.16 d4[3], [r1], r4     @ Store data_ch2[n]; r1 -> &data_ch2[n + 2]
+  vshl.s32 q1, #1             @ ... << 1
+  vld1.16 d4[1], [r0], r2     @ Load data_ch1[n + 2]; r0 -> &data_ch1[n + 1]
+  vld1.16 d4[3], [r1], r2     @ Load data_ch2[n + 2]; r1 -> &data_ch2[n + 1]
+  vqadd.s32 q8, q1, q0        @ Updated states
+  vrev32.16 d0, d4            @ Per channel: d0 = {data[n + 2], stage-0 output}
+  vqdmull.s16 q1, d30, d0     @ Second half of the unrolled pair, using d6
+  subs r3, #2                 @ Two samples per iteration; flags used by bgt
+  vqadd.s32 q2, q1, q8        @ b = sat(a + state)
+  vshrn.i32 d6, q2, #16       @ d6 = b >> 16; lanes 1 and 3 are finished outputs
+  vmull.s16 q1, d31, d6       @ -factor * (b >> 16)
+  vshll.s16 q0, d0, #16       @ input << 16
+  vst1.16 d6[1], [r0], r4     @ Store data_ch1[n + 1]; r0 -> &data_ch1[n + 3]
+  vst1.16 d6[3], [r1], r4     @ Store data_ch2[n + 1]; r1 -> &data_ch2[n + 3]
+  vshl.s32 q1, #1             @ ... << 1
+  vld1.16 d6[1], [r0], r2     @ Load data_ch1[n + 3]; r0 -> &data_ch1[n + 2]
+  vld1.16 d6[3], [r1], r2     @ Load data_ch2[n + 3]; r1 -> &data_ch2[n + 2]
+  vqadd.s32 q8, q1, q0        @ Updated states
+  vrev32.16 d0, d6            @ Per channel: d0 = {data[n + 3], stage-0 output}
+  bgt FOR_LOOP                @ Repeat while the counter is still > 0
+
+  @ Loop unrolling post-processing.
+  vqdmull.s16 q1, d30, d0     @ Last stage-0 step together with a stage-1 step
+  vshll.s16 q0, d0, #16       @ input << 16
+  vqadd.s32 q2, q1, q8        @ b = sat(a + state)
+  vshrn.i32 d4, q2, #16       @ d4 = b >> 16
+  vmull.s16 q1, d31, d4       @ -factor * (b >> 16)
+  vst1.16 d4[1], [r0]!        @ Store data_ch1[n]
+  vst1.16 d4[3], [r1]!        @ Store data_ch2[n]
+  vshl.s32 q1, #1             @ ... << 1
+  vqadd.s32 q8, q1, q0        @ Lanes 0 and 2 now hold the final stage-0 states
+  vrev32.16 d0, d4            @ Stage-0 output moves to the stage-1 lane
+  vqdmull.s16 q1, d30, d0     @ Final stage-1 step; only odd lanes matter now
+  vshll.s16 q0, d0, #16       @ input << 16
+  vqadd.s32 q2, q1, q8        @ b = sat(a + state)
+  vshrn.i32 d6, q2, #16       @ d6 = b >> 16
+  vmull.s16 q1, d31, d6       @ -factor * (b >> 16)
+  vst1.16 d6[1], [r0]         @ Store data_ch1[n + 1]
+  vst1.16 d6[3], [r1]         @ Store data_ch2[n + 1]
+  vshl.s32 q1, #1             @ ... << 1
+  vst1.32 d16[0], [r6]!       @ Store filter_state_ch1[0]
+  vqadd.s32 q9, q1, q0        @ Lanes 1 and 3 hold the final stage-1 states
+  vst1.32 d17[0], [r5]!       @ Store filter_state_ch2[0]
+  vst1.32 d18[1], [r6]        @ Store filter_state_ch1[1]
+  vst1.32 d19[1], [r5]        @ Store filter_state_ch2[1]
+
+  pop {r4 - r7}               @ Restore the registers saved on entry
+  bx lr                       @ Return to the caller
+
+@void AllpassFilter2FixDec16BothChannels(
+@    int16_t *data_ch1,  // Input and output in channel 1, in Q0
+@    int16_t *data_ch2,  // Input and output in channel 2, in Q0
+@    const int16_t *factor_ch1,  // Scaling factor for channel 1, in Q15
+@    const int16_t *factor_ch2,  // Scaling factor for channel 2, in Q15
+@    const int length,  // Length of the data buffers
+@    int32_t *filter_state_ch1,  // Filter state for channel 1, in Q16
+@    int32_t *filter_state_ch2) {  // Filter state for channel 2, in Q16
+@  int n = 0;
+@  int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
+@  int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
+@  int16_t sample0_ch1 = 0, sample0_ch2 = 0;
+@  int16_t sample1_ch1 = 0, sample1_ch2  = 0;
+@  int32_t a0_ch1 = 0, a0_ch2 = 0;
+@  int32_t b0_ch1 = 0, b0_ch2 = 0;
+@
+@  int32_t a1_ch1 = 0, a1_ch2 = 0;
+@  int32_t b1_ch1 = 0, b1_ch2 = 0;
+@  int32_t b2_ch1  = 0, b2_ch2 = 0;
+@
+@  // Loop unrolling preprocessing.
+@
+@  sample0_ch1 = data_ch1[n];
+@  sample0_ch2 = data_ch2[n];
+@
+@  a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample0_ch1) << 1;
+@  a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample0_ch2) << 1;
+@
+@  b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state0_ch1);
+@  b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state0_ch2); //Q16+Q16=Q16
+@
+@  a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b0_ch1 >> 16));
+@  a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b0_ch2 >> 16));
+@
+@  state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
+@  state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
+@
+@  sample1_ch1 = data_ch1[n + 1];
+@  sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
+@  sample1_ch2  = data_ch2[n + 1];
+@  sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
+@
+@
+@  for (n = 0; n < length - 2; n += 2) {
+@    a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample1_ch1) << 1;
+@    a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample0_ch1) << 1;
+@    a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample1_ch2 ) << 1;
+@    a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample0_ch2) << 1;
+@
+@    b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state0_ch1);
+@    b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state1_ch1); //Q16+Q16=Q16
+@    b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state0_ch2); //Q16+Q16=Q16
+@    b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state1_ch2); //Q16+Q16=Q16
+@
+@    a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b1_ch1 >> 16));
+@    a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b0_ch1 >> 16));
+@    a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b1_ch2 >> 16));
+@    a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b0_ch2 >> 16));
+@
+@    state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
+@    state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
+@    state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
+@    state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
+@
+@    sample0_ch1 = data_ch1[n + 2];
+@    sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
+@    sample0_ch2 = data_ch2[n + 2];
+@    sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
+@
+@    a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample0_ch1) << 1;
+@    a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample1_ch1) << 1;
+@    a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample0_ch2) << 1;
+@    a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample1_ch2 ) << 1;
+@
+@    b2_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state0_ch1);
+@    b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state1_ch1); //Q16+Q16=Q16
+@    b2_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state0_ch2); //Q16+Q16=Q16
+@    b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state1_ch2); //Q16+Q16=Q16
+@
+@    a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b2_ch1 >> 16));
+@    a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b1_ch1 >> 16));
+@    a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b2_ch2 >> 16));
+@    a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b1_ch2 >> 16));
+@
+@    state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
+@    state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
+@    state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
+@    state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
+@
+@
+@    sample1_ch1 = data_ch1[n + 3];
+@    sample0_ch1 = (int16_t) (b2_ch1  >> 16); //Save as Q0
+@    sample1_ch2 = data_ch2[n + 3];
+@    sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
+@
+@    data_ch1[n]     = (int16_t) (b0_ch1 >> 16); //Save as Q0
+@    data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
+@    data_ch2[n]     = (int16_t) (b0_ch2 >> 16);
+@    data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
+@  }
+@
+@  // Loop unrolling post-processing.
+@
+@  a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[0], sample1_ch1) << 1;
+@  a0_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample0_ch1) << 1;
+@  a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[0], sample1_ch2 ) << 1;
+@  a0_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample0_ch2) << 1;
+@
+@  b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state0_ch1);
+@  b0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1, state1_ch1);
+@  b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state0_ch2);
+@  b0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2, state1_ch2);
+@
+@  a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[0], (int16_t) (b1_ch1 >> 16));
+@  a0_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b0_ch1 >> 16));
+@  a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[0], (int16_t) (b1_ch2 >> 16));
+@  a0_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b0_ch2 >> 16));
+@
+@  state0_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
+@  state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
+@  state0_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
+@  state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
+@
+@  data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
+@  data_ch2[n] = (int16_t) (b0_ch2 >> 16);
+@
+@  sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
+@  sample1_ch2  = (int16_t) (b1_ch2 >> 16); //Save as Q0
+@
+@  a1_ch1 = WEBRTC_SPL_MUL_16_16(factor_ch1[1], sample1_ch1) << 1;
+@  a1_ch2 = WEBRTC_SPL_MUL_16_16(factor_ch2[1], sample1_ch2 ) << 1;
+@
+@  b1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1, state1_ch1); //Q16+Q16=Q16
+@  b1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2, state1_ch2); //Q16+Q16=Q16
+@
+@  a1_ch1 = WEBRTC_SPL_MUL_16_16(-factor_ch1[1], (int16_t) (b1_ch1 >> 16));
+@  a1_ch2 = WEBRTC_SPL_MUL_16_16(-factor_ch2[1], (int16_t) (b1_ch2 >> 16));
+@
+@  state1_ch1 = WEBRTC_SPL_ADD_SAT_W32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
+@  state1_ch2 = WEBRTC_SPL_ADD_SAT_W32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
+@
+@  data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
+@  data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
+@
+@  filter_state_ch1[0] = state0_ch1;
+@  filter_state_ch1[1] = state1_ch1;
+@  filter_state_ch2[0] = state0_ch2;
+@  filter_state_ch2[1] = state1_ch2;
+@}
author	kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>	2012-11-09 00:39:45 +0000
committer	kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>	2012-11-09 00:39:45 +0000
commit	d0ea5f0cdd6bee478e94877a564fedefa84cadd7 (patch)
tree	aaec9519863f0a3f3082add5c33811e524a79375 /modules/audio_coding/codecs/isac/fix/source/filterbanks_neon.S
parent	09ea027a5c562d1b9a801553b880923211cd3718 (diff)
download	webrtc-d0ea5f0cdd6bee478e94877a564fedefa84cadd7.tar.gz