@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
@ ARMv7 platform. The description header can be found in
@ signal_processing_library.h
@
@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
@ the reference C code at end of this file.

@ Assumptions:
@ (1) data_length > 0
@ (2) coefficients_length > 1
@ (3) data_length - 1 is tested with signed branches (bgt/blt), so
@     data_length must be representable as a positive 32-bit signed value.

@ Arguments (see the reference C prototype at the end of this file):
@
@ r0:       data_in
@ r1:       data_out; the filter also reads the coefficients_length - 1
@           samples stored immediately before this pointer as history.
@ r2:       coefficients
@ r3:       coefficients_length
@ [sp, #0]: data_length (size_t). After "push {r4-r11}" (8 registers,
@           32 bytes) it is found at [sp, #32].

@ Register usage:
@
@ r0:  &data_in[i]
@ r1:  &data_out[i], for result output
@ r2:  &coefficients[0]
@ r3:  coefficients_length
@ r4:  Iteration counter for the outer loop.
@ r5:  data_out[j] as multiplication inputs
@ r6:  Calculated value for output data_out[]; iteration counter for inner loop
@ r7:  Partial sum of a filtering multiplication results
@ r8:  Partial sum of a filtering multiplication results
@ r9:  &data_out[], for filtering input; data_in[i]
@ r10: coefficients[j]
@ r11: Scratch
@ r12: &coefficients[j]; also briefly holds data_length on entry

@ NOTE(review): several word loads (ldr) fetch two adjacent int16_t values at
@ addresses that are only guaranteed to be halfword-aligned; this presumably
@ relies on ARMv7 unaligned-access support being enabled - confirm for the
@ target configuration.

#include "rtc_base/system/asm_defines.h"

GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align  2
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
  push {r4-r11}

  @ data_length is a size_t in the reference C code, so load the whole
  @ 32-bit stack slot. A halfword load (ldrsh) would silently truncate or
  @ sign-flip any length above 32767.
  ldr r12, [sp, #32]           @ data_length
  subs r4, r12, #1
  beq ODD_LENGTH               @ jump if data_length == 1

@ Main loop: produces two output samples (data_out[i], data_out[i + 1]) per
@ iteration, accumulating their feedback terms in sum1 (r7) and sum2 (r8).
LOOP_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]

  mov r7, #0                   @ sum1
  mov r8, #0                   @ sum2
  subs r6, r3, #3              @ Iteration counter for inner loop.
  beq ODD_A_LENGTH             @ branch if coefficients_length == 3
  blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2

@ Inner loop: consumes two coefficients (one packed word) per iteration,
@ walking the coefficients backwards and the data_out history forwards.
LOOP_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  subs r6, #2
  smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r7, r10, r5, r7       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  ldr r5, [r9], #4             @ data_out[i - j + 2], data_out[i - j + 3]
  smlabb r8, r10, r5, r8       @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
  bgt LOOP_A_LENGTH
  blt POST_LOOP_A_LENGTH       @ even coefficients_length: no coefficients[2] left over

@ Odd coefficients_length: coefficients[2] is left unpaired; handle it here.
ODD_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
  sub r12, #2                  @ &coefficients[0]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
  smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
  ldr r5, [r9, #-2]            @ data_out[i - 1], data_out[i]

POST_LOOP_A_LENGTH:
  ldr r10, [r12]               @ coefficients[0], coefficients[1]
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

  ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1;

  @ Round and saturate: the rounding constant 2048 is added only when
  @ (output1 >> 12) already fits in 16 bits, i.e. when the value is not
  @ going to be clamped by the first saturation.
  sbfx r11, r6, #12, #16       @ bits [27:12] of output1, sign-extended
  ssat r7, #16, r6, asr #12    @ (output1 >> 12) saturated to 16 bits
  cmp r7, r11
  addeq r6, r6, #2048          @ no saturation occurred: apply rounding
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i]

  smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
  smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
  sub r6, r8                   @ output2 -= sum2;

  @ Same rounding/saturation sequence as above, applied to output2.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i + 1]

  subs r4, #2
  bgt LOOP_LENGTH
  blt END                      @ For even data_length, it's done. Jump to END.

@ Process i = data_length - 1, for the case of an odd length.
@ A single output is produced; its feedback sum is accumulated in two
@ partial sums (r7 and r8) that are both subtracted at the end.
ODD_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  mov r7, #0                   @ sum1, first partial sum
  mov r8, #0                   @ sum1, second partial sum
  subs r6, r3, #2              @ inner loop counter
  beq EVEN_A_LENGTH            @ branch if coefficients_length == 2

LOOP2_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  ldr r5, [r9], #4             @ data_out[i - j], data_out[i - j + 1]
  subs r6, #2
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r8, r10, r5, r8       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  bgt LOOP2_A_LENGTH
  addlt r12, #2                @ odd coefficients_length: step back up to &coefficients[0]
  blt POST_LOOP2_A_LENGTH

@ Even coefficients_length: coefficients[1] is left unpaired; handle it here.
EVEN_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
  ldrsh r5, [r9]               @ data_out[i - 1]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

POST_LOOP2_A_LENGTH:
  ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
  ldrsh r9, [r0]               @ data_in[i]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1 (first partial sum);
  sub r6, r8                   @ output1 -= sum1 (second partial sum);
  @ Round and saturate, as in the main loop.
  sbfx r8, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r8
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1]                @ Store the data_out[i]

END:
  pop {r4-r11}
  bx  lr

@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                               int16_t* data_out,
@                               int16_t* __restrict coefficients,
@                               size_t coefficients_length,
@                               size_t data_length) {
@  size_t i = 0;
@  size_t j = 0;
@
@  assert(data_length > 0);
@  assert(coefficients_length > 1);
@
@  for (i = 0; i < data_length - 1; i += 2) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@    int32_t output2 = 0;
@    int32_t sum2 = 0;
@
@    for (j = coefficients_length - 1; j > 2; j -= 2) {
@      sum1 += coefficients[j]      * data_out[i - j];
@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
@      sum2 += coefficients[j]     * data_out[i - j + 1];
@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
@    }
@
@    if (j == 2) {
@      sum1 += coefficients[2] * data_out[i - 2];
@      sum2 += coefficients[2] * data_out[i - 1];
@    }
@
@    sum1 += coefficients[1] * data_out[i - 1];
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@
@    sum2 += coefficients[1] * data_out[i];
@    output2 = coefficients[0] * data_in[i + 1];
@    output2 -= sum2;
@    // Saturate and store the output.
@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
@  }
@
@  if (i == data_length - 1) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@
@    for (j = coefficients_length - 1; j > 1; j -= 2) {
@      sum1 += coefficients[j]      * data_out[i - j];
@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
@    }
@
@    if (j == 1) {
@      sum1 += coefficients[1] * data_out[i - 1];
@    }
@
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@  }
@}