@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ vector_scaling_operations_neon.s
@ This file contains the function WebRtcSpl_ScaleAndAddVectorsWithRoundNeon(),
@ optimized for ARM Neon platform. Output is bit-exact with the reference
@ C code in vector_scaling_operations.c.

#include "webrtc/system_wrappers/interface/asm_defines.h"

@ -----------------------------------------------------------------------------
@ WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
@
@ For every k in [0, length) stores, as a 16-bit value:
@   out_vector[k] = (in_vector1[k] * in_vector1_scale
@                    + in_vector2[k] * in_vector2_scale
@                    + round_value) >> right_shifts
@ where round_value = (1 << right_shifts) >> 1.
@ Nothing is read or written when length <= 0.
@
@ Arguments (stack offsets are relative to sp at entry, before the push):
@   r0         in_vector1        pointer to 16-bit elements
@   r1         in_vector1_scale  bottom halfword is used
@   r2         in_vector2        pointer to 16-bit elements
@   r3         in_vector2_scale  bottom halfword is used
@   [sp, 0]    right_shifts      loaded as a signed halfword
@   [sp, 4]    out_vector        pointer to 16-bit elements
@   [sp, 8]    length            element count
@
@ Register roles after the prologue:
@   r4  length, later the number of tail elements still to process
@   r5  out_vector write pointer
@   r6  right_shifts
@   r7  scratch / counter of the unrolled loop
@   r8  scratch
@   r9  round_value of the scalar tail loop
@   q0-q3, q12-q15 are used as Neon scratch (d26/d27 hold the two scales).
@
@ NOTE(review): r0 is never loaded with a result value in this routine;
@ confirm that the C prototype does not expect one.
@ -----------------------------------------------------------------------------

GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon

  push {r4-r9}                @ 6 registers: stack arguments move up by 24.

  ldr r4, [sp, #32]           @ length
  ldr r5, [sp, #28]           @ out_vector
  ldrsh r6, [sp, #24]         @ right_shifts

  cmp r4, #0
  ble END                     @ Return if length <= 0.

  cmp r4, #8
  blt SET_ROUND_VALUE         @ Fewer than 8 elements: scalar loop only.

  vdup.16 d26, r1             @ in_vector1_scale
  vdup.16 d27, r3             @ in_vector2_scale

  @ Neon instructions can only right shift by an immediate value. To shift right
  @ by a register value, we have to do a left shift by the negative value.
  @ VRSHL takes the shift count from the least significant byte of each lane,
  @ so the 16-bit duplicate is sufficient for the 32-bit shifts below.
  rsb r7, r6, #0
  vdup.16 q12, r7             @ -right_shifts

  bic r7, r4, #7              @ Counter for LOOP_UNROLLED_BY_8: length / 8 * 8.

LOOP_UNROLLED_BY_8:
  vld1.16 {d28, d29}, [r0]!   @ in_vector1[]
  vld1.16 {d30, d31}, [r2]!   @ in_vector2[]
  vmull.s16 q0, d28, d26      @ in_vector1[0..3] * in_vector1_scale
  vmull.s16 q1, d29, d26      @ in_vector1[4..7] * in_vector1_scale
  vmull.s16 q2, d30, d27      @ in_vector2[0..3] * in_vector2_scale
  vmull.s16 q3, d31, d27      @ in_vector2[4..7] * in_vector2_scale
  vadd.s32 q0, q2
  vadd.s32 q1, q3
  vrshl.s32 q0, q12           @ Round shift right by right_shifts.
  vrshl.s32 q1, q12
  vmovn.i32 d0, q0            @ Cast to 16 bit values.
  vmovn.i32 d1, q1
  subs r7, #8
  vst1.16 {d0, d1}, [r5]!     @ The Neon store leaves the flags of subs intact.
  bgt LOOP_UNROLLED_BY_8

  @ The unrolled loop consumed length & ~7 elements, so only the low three
  @ bits of length remain. A wider mask would run past the end of the vectors.
  ands r4, #7                 @ Counter for LOOP_NO_UNROLLING: length % 8.
  beq END

SET_ROUND_VALUE:
  mov r9, #1
  lsl r9, r6
  lsr r9, #1                  @ round_value = (1 << right_shifts) >> 1

LOOP_NO_UNROLLING:
  ldrh  r7, [r0], #2          @ in_vector1[k]; smulbb treats it as signed.
  ldrh  r8, [r2], #2          @ in_vector2[k]
  smulbb r7, r7, r1           @ in_vector1[k] * in_vector1_scale
  smulbb r8, r8, r3           @ in_vector2[k] * in_vector2_scale
  subs r4, #1                 @ Flags are consumed by the bne below.
  add r7, r9
  add r7, r8
  asr r7, r6
  strh r7, [r5], #2
  bne LOOP_NO_UNROLLING

END:
  pop {r4-r9}
  bx  lr