diff options
author | kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d> | 2012-11-17 00:22:46 +0000 |
---|---|---|
committer | kma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d> | 2012-11-17 00:22:46 +0000 |
commit | b238acaca55b4b345f0e37b82f8bbd9851c8bb6d (patch) | |
tree | 1525216cdad5fe5c0999b39672b3f53013636e3c /common_audio/signal_processing/min_max_operations_neon.S | |
parent | ece4890fda6f586c89d4ae25281dfba81feb1b0c (diff) | |
download | webrtc-b238acaca55b4b345f0e37b82f8bbd9851c8bb6d.tar.gz |
Porting ARM optimization from Android to ios.
Tested APM and iSAC in Android. Bit-exact with original versions.
Changes include removing or changing some GCC derivatives (e.g. .fnstart, .hword), instruction syntax, etc.
Review URL: https://webrtc-codereview.appspot.com/934009
git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@3124 4adac7df-926f-26a2-2b94-8c16560cd09d
Diffstat (limited to 'common_audio/signal_processing/min_max_operations_neon.S')
-rw-r--r-- | common_audio/signal_processing/min_max_operations_neon.S | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/common_audio/signal_processing/min_max_operations_neon.S b/common_audio/signal_processing/min_max_operations_neon.S new file mode 100644 index 00000000..c84307f5 --- /dev/null +++ b/common_audio/signal_processing/min_max_operations_neon.S @@ -0,0 +1,283 @@ +@ +@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + +@ This file contains some minimum and maximum functions, optimized for +@ ARM Neon platform. The description header can be found in +@ signal_processing_library.h +@ +@ The reference C code is in file min_max_operations.c. Code here is basically +@ a loop unrolling by 8 with Neon instructions. Bit-exact. + +#include "webrtc/system_wrappers/interface/asm_defines.h" + +GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon +GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon +GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon +GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon +GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon +GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon + +.align 2 +@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length); +DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon + mov r2, #-1 @ Initialize the return value. + cmp r0, #0 + beq END_MAX_ABS_VALUE_W16 + cmp r1, #0 + ble END_MAX_ABS_VALUE_W16 + + cmp r1, #8 + blt LOOP_MAX_ABS_VALUE_W16 + + vmov.i16 q12, #0 + sub r1, #8 @ Counter for loops + +LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16: + vld1.16 {q13}, [r0]! + subs r1, #8 + vabs.s16 q13, q13 @ Note vabs doesn't change the value of -32768. + vmax.u16 q12, q13 @ Use u16 so we don't lose the value -32768. + bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16 + + @ Find the maximum value in the Neon registers and move it to r2. + vmax.u16 d24, d25 + vpmax.u16 d24, d24, d24 + vpmax.u16 d24, d24, d24 + adds r1, #8 + vmov.u16 r2, d24[0] + beq END_MAX_ABS_VALUE_W16 + +LOOP_MAX_ABS_VALUE_W16: + ldrsh r3, [r0], #2 + eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value. + sub r12, r12, r3, asr #31 + cmp r2, r12 + movlt r2, r12 + subs r1, #1 + bne LOOP_MAX_ABS_VALUE_W16 + +END_MAX_ABS_VALUE_W16: + cmp r2, #0x8000 @ Guard against the case for -32768. + subeq r2, #1 + mov r0, r2 + bx lr + + + +@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length); +DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon + cmp r0, #0 + moveq r0, #-1 + beq EXIT @ Return -1 for a NULL pointer. + cmp r1, #0 @ length + movle r0, #-1 + ble EXIT @ Return -1 if length <= 0. + + vmov.i32 q11, #0 + vmov.i32 q12, #0 + cmp r1, #8 + blt LOOP_MAX_ABS_VALUE_W32 + + sub r1, #8 @ Counter for loops + +LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32: + vld1.32 {q13, q14}, [r0]! + subs r1, #8 @ Counter for loops + vabs.s32 q13, q13 @ vabs doesn't change the value of 0x80000000. + vabs.s32 q14, q14 + vmax.u32 q11, q13 @ Use u32 so we don't lose the value 0x80000000. + vmax.u32 q12, q14 + bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32 + + @ Find the maximum value in the Neon registers and move it to r2. + vmax.u32 q12, q11 + vmax.u32 d24, d25 + vpmax.u32 d24, d24, d24 + adds r1, #8 + vmov.u32 r2, d24[0] + beq END_MAX_ABS_VALUE_W32 + +LOOP_MAX_ABS_VALUE_W32: + ldr r3, [r0], #4 + eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value. + sub r12, r12, r3, asr #31 + cmp r2, r12 + movcc r2, r12 + subs r1, #1 + bne LOOP_MAX_ABS_VALUE_W32 + +END_MAX_ABS_VALUE_W32: + mvn r0, #0x80000000 @ Guard against the case for 0x80000000. + cmp r2, r0 + movcc r0, r2 + +EXIT: + bx lr + +@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length); +DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon + mov r2, #0x8000 @ Initialize the return value. + cmp r0, #0 + beq END_MAX_VALUE_W16 + cmp r1, #0 + ble END_MAX_VALUE_W16 + + vmov.i16 q12, #0x8000 + cmp r1, #8 + blt LOOP_MAX_VALUE_W16 + + sub r1, #8 @ Counter for loops + +LOOP_UNROLLED_BY_8_MAX_VALUE_W16: + vld1.16 {q13}, [r0]! + subs r1, #8 + vmax.s16 q12, q13 + bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16 + + @ Find the maximum value in the Neon registers and move it to r2. + vmax.s16 d24, d25 + vpmax.s16 d24, d24, d24 + vpmax.s16 d24, d24, d24 + adds r1, #8 + vmov.u16 r2, d24[0] + beq END_MAX_VALUE_W16 + +LOOP_MAX_VALUE_W16: + ldrsh r3, [r0], #2 + cmp r2, r3 + movlt r2, r3 + subs r1, #1 + bne LOOP_MAX_VALUE_W16 + +END_MAX_VALUE_W16: + mov r0, r2 + bx lr + +@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length); +DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon + mov r2, #0x80000000 @ Initialize the return value. + cmp r0, #0 + beq END_MAX_VALUE_W32 + cmp r1, #0 + ble END_MAX_VALUE_W32 + + vmov.i32 q11, #0x80000000 + vmov.i32 q12, #0x80000000 + cmp r1, #8 + blt LOOP_MAX_VALUE_W32 + + sub r1, #8 @ Counter for loops + +LOOP_UNROLLED_BY_8_MAX_VALUE_W32: + vld1.32 {q13, q14}, [r0]! + subs r1, #8 + vmax.s32 q11, q13 + vmax.s32 q12, q14 + bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32 + + @ Find the maximum value in the Neon registers and move it to r2. + vmax.s32 q12, q11 + vpmax.s32 d24, d24, d25 + vpmax.s32 d24, d24, d24 + adds r1, #8 + vmov.s32 r2, d24[0] + beq END_MAX_VALUE_W32 + +LOOP_MAX_VALUE_W32: + ldr r3, [r0], #4 + cmp r2, r3 + movlt r2, r3 + subs r1, #1 + bne LOOP_MAX_VALUE_W32 + +END_MAX_VALUE_W32: + mov r0, r2 + bx lr + +@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length); +DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon + movw r2, #0x7FFF @ Initialize the return value. + cmp r0, #0 + beq END_MIN_VALUE_W16 + cmp r1, #0 + ble END_MIN_VALUE_W16 + + vmov.i16 q12, #0x7FFF + cmp r1, #8 + blt LOOP_MIN_VALUE_W16 + + sub r1, #8 @ Counter for loops + +LOOP_UNROLLED_BY_8_MIN_VALUE_W16: + vld1.16 {q13}, [r0]! + subs r1, #8 + vmin.s16 q12, q13 + bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16 + + @ Find the maximum value in the Neon registers and move it to r2. + vmin.s16 d24, d25 + vpmin.s16 d24, d24, d24 + vpmin.s16 d24, d24, d24 + adds r1, #8 + vmov.s16 r2, d24[0] + sxth r2, r2 + beq END_MIN_VALUE_W16 + +LOOP_MIN_VALUE_W16: + ldrsh r3, [r0], #2 + cmp r2, r3 + movge r2, r3 + subs r1, #1 + bne LOOP_MIN_VALUE_W16 + +END_MIN_VALUE_W16: + mov r0, r2 + bx lr + +@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length); +DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon + mov r2, #0x7FFFFFFF @ Initialize the return value. + cmp r0, #0 + beq END_MIN_VALUE_W32 + cmp r1, #0 + ble END_MIN_VALUE_W32 + + vdup.32 q11, r2 + vdup.32 q12, r2 + cmp r1, #8 + blt LOOP_MIN_VALUE_W32 + + sub r1, #8 @ Counter for loops + +LOOP_UNROLLED_BY_8_MIN_VALUE_W32: + vld1.32 {q13, q14}, [r0]! + subs r1, #8 + vmin.s32 q11, q13 + vmin.s32 q12, q14 + bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32 + + @ Find the maximum value in the Neon registers and move it to r2. + vmin.s32 q12, q11 + vpmin.s32 d24, d24, d25 + vpmin.s32 d24, d24, d24 + adds r1, #8 + vmov.s32 r2, d24[0] + beq END_MIN_VALUE_W32 + +LOOP_MIN_VALUE_W32: + ldr r3, [r0], #4 + cmp r2, r3 + movge r2, r3 + subs r1, #1 + bne LOOP_MIN_VALUE_W32 + +END_MIN_VALUE_W32: + mov r0, r2 + bx lr |