summaryrefslogtreecommitdiff
path: root/common_audio/signal_processing/min_max_operations_neon.S
diff options
context:
space:
mode:
authorkma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>2012-11-17 00:22:46 +0000
committerkma@webrtc.org <kma@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>2012-11-17 00:22:46 +0000
commitb238acaca55b4b345f0e37b82f8bbd9851c8bb6d (patch)
tree1525216cdad5fe5c0999b39672b3f53013636e3c /common_audio/signal_processing/min_max_operations_neon.S
parentece4890fda6f586c89d4ae25281dfba81feb1b0c (diff)
downloadwebrtc-b238acaca55b4b345f0e37b82f8bbd9851c8bb6d.tar.gz
Porting ARM optimization from Android to ios.
Tested APM and iSAC in Android. Bit-exact with original versions. Changes include removing or changing some GCC directives (e.g. .fnstart, .hword), instruction syntax, etc. Review URL: https://webrtc-codereview.appspot.com/934009 git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@3124 4adac7df-926f-26a2-2b94-8c16560cd09d
Diffstat (limited to 'common_audio/signal_processing/min_max_operations_neon.S')
-rw-r--r--common_audio/signal_processing/min_max_operations_neon.S283
1 files changed, 283 insertions, 0 deletions
diff --git a/common_audio/signal_processing/min_max_operations_neon.S b/common_audio/signal_processing/min_max_operations_neon.S
new file mode 100644
index 00000000..c84307f5
--- /dev/null
+++ b/common_audio/signal_processing/min_max_operations_neon.S
@@ -0,0 +1,283 @@
+@
+@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+@
+@ Use of this source code is governed by a BSD-style license
+@ that can be found in the LICENSE file in the root of the source
+@ tree. An additional intellectual property rights grant can be found
+@ in the file PATENTS. All contributing project authors may
+@ be found in the AUTHORS file in the root of the source tree.
+@
+
+@ This file contains some minimum and maximum functions, optimized for
+@ ARM Neon platform. The description header can be found in
+@ signal_processing_library.h
+@
+@ The reference C code is in file min_max_operations.c. Code here is basically
+@ a loop unrolling by 8 with Neon instructions. Bit-exact.
+
+#include "webrtc/system_wrappers/interface/asm_defines.h"
+
+@ Entry points defined in this file. GLOBAL_FUNCTION and DEFINE_FUNCTION are
+@ macros from asm_defines.h (not shown here); presumably GLOBAL_FUNCTION
+@ exports the symbol with the platform's name decoration -- confirm there.
+GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
+GLOBAL_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
+GLOBAL_FUNCTION WebRtcSpl_MaxValueW16Neon
+GLOBAL_FUNCTION WebRtcSpl_MaxValueW32Neon
+GLOBAL_FUNCTION WebRtcSpl_MinValueW16Neon
+GLOBAL_FUNCTION WebRtcSpl_MinValueW32Neon
+
+@ Align the code that follows.
+.align 2
+@ int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, int length);
+@
+@ Returns the largest absolute value found in vector[0..length-1], clamped to
+@ 32767, or -1 when vector is NULL or length <= 0.
+@
+@ Registers: r0 = vector (advanced while reading), r1 = remaining element
+@ count, r2 = running maximum / result, r3 and r12 = scratch,
+@ q12 = per-lane running maximum, q13 = loaded data.
+DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW16Neon
+ mov r2, #-1 @ Initialize the return value.
+ cmp r0, #0
+ beq END_MAX_ABS_VALUE_W16
+ cmp r1, #0
+ ble END_MAX_ABS_VALUE_W16
+
+ @ Fewer than 8 elements: only the scalar loop runs, starting from r2 = -1,
+ @ which is below every absolute value under the signed compare used there.
+ cmp r1, #8
+ blt LOOP_MAX_ABS_VALUE_W16
+
+ vmov.i16 q12, #0
+ sub r1, #8 @ Counter for loops
+
+ @ Each pass consumes 8 elements. The flags written by subs are the ones
+ @ tested by bge; the Neon instructions in between leave them untouched.
+LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16:
+ vld1.16 {q13}, [r0]!
+ subs r1, #8
+ vabs.s16 q13, q13 @ Note vabs doesn't change the value of -32768.
+ vmax.u16 q12, q13 @ Use u16 so we don't lose the value -32768.
+ bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W16
+
+ @ Find the maximum value in the Neon registers and move it to r2.
+ vmax.u16 d24, d25
+ vpmax.u16 d24, d24, d24
+ vpmax.u16 d24, d24, d24
+ adds r1, #8 @ r1 = number of leftover elements (0..7); sets Z for beq.
+ vmov.u16 r2, d24[0] @ Zero-extended, so 0x8000 arrives as +32768.
+ beq END_MAX_ABS_VALUE_W16
+
+ @ Scalar loop over the remaining elements, one per iteration.
+LOOP_MAX_ABS_VALUE_W16:
+ ldrsh r3, [r0], #2
+ eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
+ sub r12, r12, r3, asr #31
+ cmp r2, r12
+ movlt r2, r12
+ subs r1, #1
+ bne LOOP_MAX_ABS_VALUE_W16
+
+END_MAX_ABS_VALUE_W16:
+ cmp r2, #0x8000 @ Guard against the case for -32768.
+ subeq r2, #1 @ |-32768| does not fit in int16_t; return 32767 instead.
+ mov r0, r2
+ bx lr
+
+
+
+@ int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, int length);
+@
+@ Returns the largest absolute value found in vector[0..length-1], clamped to
+@ 0x7FFFFFFF, or -1 when vector is NULL or length <= 0.
+@
+@ Registers: r0 = vector (advanced while reading) / result, r1 = remaining
+@ element count, r2 = running maximum (treated as unsigned), r3 and r12 =
+@ scratch, q11/q12 = per-lane running maxima, q13/q14 = loaded data.
+DEFINE_FUNCTION WebRtcSpl_MaxAbsValueW32Neon
+ cmp r0, #0
+ moveq r0, #-1
+ beq EXIT @ Return -1 for a NULL pointer.
+ cmp r1, #0 @ length
+ movle r0, #-1
+ ble EXIT @ Return -1 if length <= 0.
+
+ @ The scalar loop below can be entered directly when length < 8, so the
+ @ running maximum must hold a defined value before that branch is taken.
+ mov r2, #0 @ Smallest possible absolute value.
+ vmov.i32 q11, #0
+ vmov.i32 q12, #0
+ cmp r1, #8
+ blt LOOP_MAX_ABS_VALUE_W32
+
+ sub r1, #8 @ Counter for loops
+
+ @ Each pass consumes 8 elements. The flags written by subs are the ones
+ @ tested by bge; the Neon instructions in between leave them untouched.
+LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32:
+ vld1.32 {q13, q14}, [r0]!
+ subs r1, #8 @ Counter for loops
+ vabs.s32 q13, q13 @ vabs doesn't change the value of 0x80000000.
+ vabs.s32 q14, q14
+ vmax.u32 q11, q13 @ Use u32 so we don't lose the value 0x80000000.
+ vmax.u32 q12, q14
+ bge LOOP_UNROLLED_BY_8_MAX_ABS_VALUE_W32
+
+ @ Find the maximum value in the Neon registers and move it to r2.
+ vmax.u32 q12, q11
+ vmax.u32 d24, d25
+ vpmax.u32 d24, d24, d24
+ adds r1, #8 @ r1 = number of leftover elements (0..7); sets Z for beq.
+ vmov.u32 r2, d24[0]
+ beq END_MAX_ABS_VALUE_W32
+
+ @ Scalar loop over the remaining elements, one per iteration.
+LOOP_MAX_ABS_VALUE_W32:
+ ldr r3, [r0], #4
+ eor r12, r3, r3, asr #31 @ eor and then sub, to get absolute value.
+ sub r12, r12, r3, asr #31
+ cmp r2, r12
+ movcc r2, r12 @ Unsigned compare so 0x80000000 counts as the largest.
+ subs r1, #1
+ bne LOOP_MAX_ABS_VALUE_W32
+
+END_MAX_ABS_VALUE_W32:
+ mvn r0, #0x80000000 @ Guard against the case for 0x80000000.
+ cmp r2, r0
+ movcc r0, r2 @ Return min(r2, 0x7FFFFFFF), compared as unsigned.
+
+EXIT:
+ bx lr
+
+@ int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, int length);
+@
+@ Returns the largest element of vector[0..length-1], or -32768 when vector
+@ is NULL or length <= 0.
+@
+@ Registers: r0 = vector (advanced while reading) / result, r1 = remaining
+@ element count, r2 = running maximum, r3 = scratch, q12 = per-lane running
+@ maximum, q13 = loaded data.
+@
+@ r2 is kept as a sign-extended 32-bit value throughout: the scalar loop
+@ compares it (signed) against elements loaded with ldrsh, and the value
+@ handed back in r0 must be a correctly sign-extended int16_t.
+DEFINE_FUNCTION WebRtcSpl_MaxValueW16Neon
+ mov r2, #0x8000
+ sxth r2, r2 @ r2 = -32768 (0xFFFF8000), the initial return value.
+ cmp r0, #0
+ beq END_MAX_VALUE_W16
+ cmp r1, #0
+ ble END_MAX_VALUE_W16
+
+ vmov.i16 q12, #0x8000
+ cmp r1, #8
+ blt LOOP_MAX_VALUE_W16
+
+ sub r1, #8 @ Counter for loops
+
+ @ Each pass consumes 8 elements. The flags written by subs are the ones
+ @ tested by bge; the Neon instruction in between leaves them untouched.
+LOOP_UNROLLED_BY_8_MAX_VALUE_W16:
+ vld1.16 {q13}, [r0]!
+ subs r1, #8
+ vmax.s16 q12, q13
+ bge LOOP_UNROLLED_BY_8_MAX_VALUE_W16
+
+ @ Find the maximum value in the Neon registers and move it to r2.
+ vmax.s16 d24, d25
+ vpmax.s16 d24, d24, d24
+ vpmax.s16 d24, d24, d24
+ adds r1, #8 @ r1 = number of leftover elements (0..7); sets Z for beq.
+ vmov.s16 r2, d24[0] @ Sign-extend so a negative maximum stays negative.
+ beq END_MAX_VALUE_W16
+
+ @ Scalar loop over the remaining elements, one per iteration.
+LOOP_MAX_VALUE_W16:
+ ldrsh r3, [r0], #2
+ cmp r2, r3
+ movlt r2, r3
+ subs r1, #1
+ bne LOOP_MAX_VALUE_W16
+
+END_MAX_VALUE_W16:
+ mov r0, r2
+ bx lr
+
+@ int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, int length);
+@
+@ Returns the largest element of vector[0..length-1], or 0x80000000 (the most
+@ negative int32_t) when vector is NULL or length <= 0.
+@
+@ Registers: r0 = vector (advanced while reading) / result, r1 = remaining
+@ element count, r2 = running maximum, r3 = scratch, q11/q12 = per-lane
+@ running maxima, q13/q14 = loaded data.
+DEFINE_FUNCTION WebRtcSpl_MaxValueW32Neon
+ mov r2, #0x80000000 @ Initialize the return value.
+ cmp r0, #0
+ beq END_MAX_VALUE_W32
+ cmp r1, #0
+ ble END_MAX_VALUE_W32
+
+ vmov.i32 q11, #0x80000000
+ vmov.i32 q12, #0x80000000
+ cmp r1, #8
+ blt LOOP_MAX_VALUE_W32
+
+ sub r1, #8 @ Counter for loops
+
+ @ Each pass consumes 8 elements. The flags written by subs are the ones
+ @ tested by bge; the Neon instructions in between leave them untouched.
+LOOP_UNROLLED_BY_8_MAX_VALUE_W32:
+ vld1.32 {q13, q14}, [r0]!
+ subs r1, #8
+ vmax.s32 q11, q13
+ vmax.s32 q12, q14
+ bge LOOP_UNROLLED_BY_8_MAX_VALUE_W32
+
+ @ Find the maximum value in the Neon registers and move it to r2.
+ vmax.s32 q12, q11
+ vpmax.s32 d24, d24, d25
+ vpmax.s32 d24, d24, d24
+ adds r1, #8 @ r1 = number of leftover elements (0..7); sets Z for beq.
+ vmov.s32 r2, d24[0]
+ beq END_MAX_VALUE_W32
+
+ @ Scalar loop over the remaining elements, one per iteration.
+LOOP_MAX_VALUE_W32:
+ ldr r3, [r0], #4
+ cmp r2, r3
+ movlt r2, r3
+ subs r1, #1
+ bne LOOP_MAX_VALUE_W32
+
+END_MAX_VALUE_W32:
+ mov r0, r2
+ bx lr
+
+@ int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, int length);
+@
+@ Returns the smallest element of vector[0..length-1], or 32767 when vector
+@ is NULL or length <= 0.
+@
+@ Registers: r0 = vector (advanced while reading) / result, r1 = remaining
+@ element count, r2 = running minimum, r3 = scratch, q12 = per-lane running
+@ minimum, q13 = loaded data.
+DEFINE_FUNCTION WebRtcSpl_MinValueW16Neon
+ movw r2, #0x7FFF @ Initialize the return value.
+ cmp r0, #0
+ beq END_MIN_VALUE_W16
+ cmp r1, #0
+ ble END_MIN_VALUE_W16
+
+ vmov.i16 q12, #0x7FFF
+ cmp r1, #8
+ blt LOOP_MIN_VALUE_W16
+
+ sub r1, #8 @ Counter for loops
+
+ @ Each pass consumes 8 elements. The flags written by subs are the ones
+ @ tested by bge; the Neon instruction in between leaves them untouched.
+LOOP_UNROLLED_BY_8_MIN_VALUE_W16:
+ vld1.16 {q13}, [r0]!
+ subs r1, #8
+ vmin.s16 q12, q13
+ bge LOOP_UNROLLED_BY_8_MIN_VALUE_W16
+
+ @ Find the minimum value in the Neon registers and move it to r2.
+ vmin.s16 d24, d25
+ vpmin.s16 d24, d24, d24
+ vpmin.s16 d24, d24, d24
+ adds r1, #8 @ r1 = number of leftover elements (0..7); sets Z for beq.
+ vmov.s16 r2, d24[0]
+ sxth r2, r2 @ Keeps r2 sign-extended; does not modify the flags.
+ beq END_MIN_VALUE_W16
+
+ @ Scalar loop over the remaining elements, one per iteration.
+LOOP_MIN_VALUE_W16:
+ ldrsh r3, [r0], #2
+ cmp r2, r3
+ movge r2, r3
+ subs r1, #1
+ bne LOOP_MIN_VALUE_W16
+
+END_MIN_VALUE_W16:
+ mov r0, r2
+ bx lr
+
+@ int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, int length);
+@
+@ Returns the smallest element of vector[0..length-1], or 0x7FFFFFFF when
+@ vector is NULL or length <= 0.
+@
+@ Registers: r0 = vector (advanced while reading) / result, r1 = remaining
+@ element count, r2 = running minimum, r3 = scratch, q11/q12 = per-lane
+@ running minima, q13/q14 = loaded data.
+DEFINE_FUNCTION WebRtcSpl_MinValueW32Neon
+ mov r2, #0x7FFFFFFF @ Initialize the return value.
+ cmp r0, #0
+ beq END_MIN_VALUE_W32
+ cmp r1, #0
+ ble END_MIN_VALUE_W32
+
+ vdup.32 q11, r2 @ Seed every lane with 0x7FFFFFFF.
+ vdup.32 q12, r2
+ cmp r1, #8
+ blt LOOP_MIN_VALUE_W32
+
+ sub r1, #8 @ Counter for loops
+
+ @ Each pass consumes 8 elements. The flags written by subs are the ones
+ @ tested by bge; the Neon instructions in between leave them untouched.
+LOOP_UNROLLED_BY_8_MIN_VALUE_W32:
+ vld1.32 {q13, q14}, [r0]!
+ subs r1, #8
+ vmin.s32 q11, q13
+ vmin.s32 q12, q14
+ bge LOOP_UNROLLED_BY_8_MIN_VALUE_W32
+
+ @ Find the minimum value in the Neon registers and move it to r2.
+ vmin.s32 q12, q11
+ vpmin.s32 d24, d24, d25
+ vpmin.s32 d24, d24, d24
+ adds r1, #8 @ r1 = number of leftover elements (0..7); sets Z for beq.
+ vmov.s32 r2, d24[0]
+ beq END_MIN_VALUE_W32
+
+ @ Scalar loop over the remaining elements, one per iteration.
+LOOP_MIN_VALUE_W32:
+ ldr r3, [r0], #4
+ cmp r2, r3
+ movge r2, r3
+ subs r1, #1
+ bne LOOP_MIN_VALUE_W32
+
+END_MIN_VALUE_W32:
+ mov r0, r2
+ bx lr