diff options
author | Rakesh Kumar <rakesh.kumar@ittiam.com> | 2017-11-07 22:51:30 +0000 |
---|---|---|
committer | android-build-merger <android-build-merger@google.com> | 2017-11-07 22:51:30 +0000 |
commit | 1d0fe6aaf99c3ef1e2d9c4c15ad49ad9180da5b5 (patch) | |
tree | 1654eb72f94c15fcda13fc52bf7cc8e1b05db214 /common | |
parent | 4c7f3d573692c09ccbb56bb2fd51527686d109f5 (diff) | |
parent | 68f18ba505a4c7fb39ec1ca1f6888e95acc1ff51 (diff) | |
download | libhevc-1d0fe6aaf99c3ef1e2d9c4c15ad49ad9180da5b5.tar.gz |
Add PUSH-POP of D registers in Arm Neon 32 bit functions am: a47cb8865a am: 9525ebc765 am: 0671e4cda7 am: ff7a95abd4 am: 6acf9167da am: 85ae219fca
am: 68f18ba505
Change-Id: I3f172309ba2c249d587987bd94d5b5b0937affd3
Diffstat (limited to 'common')
57 files changed, 835 insertions, 463 deletions
diff --git a/common/arm/ihevc_deblk_chroma_horz.s b/common/arm/ihevc_deblk_chroma_horz.s index 34422ff..b0a79eb 100644 --- a/common/arm/ihevc_deblk_chroma_horz.s +++ b/common/arm/ihevc_deblk_chroma_horz.s @@ -36,6 +36,12 @@ @* @*******************************************************************************/ +.equ qp_offset_u_offset, 40 +.equ qp_offset_v_offset, 44 +.equ tc_offset_div2_offset, 48 +.equ filter_p_offset, 52 +.equ filter_q_offset, 56 + .text .align 4 @@ -62,17 +68,17 @@ ihevc_deblk_chroma_horz_a9q: add r6,r0,r1 add r1,r2,r3 vmovl.u8 q0,d0 - ldr r10,[sp,#0x28] + ldr r10,[sp,#qp_offset_u_offset] vld1.8 {d2},[r12] add r2,r1,#1 - ldr r4,[sp,#0x30] + ldr r4,[sp,#tc_offset_div2_offset] vld1.8 {d4},[r5] - ldr r8,[sp,#0x34] + ldr r8,[sp,#filter_p_offset] vld1.8 {d16},[r6] - ldr r9,[sp,#0x38] + ldr r9,[sp,#filter_q_offset] adds r1,r10,r2,asr #1 vmovl.u8 q1,d2 - ldr r7,[sp,#0x2c] + ldr r7,[sp,#qp_offset_v_offset] ldr r3,gai4_ihevc_qp_table_addr ulbl1: add r3, r3, pc diff --git a/common/arm/ihevc_deblk_chroma_vert.s b/common/arm/ihevc_deblk_chroma_vert.s index 4cb305f..3962b28 100644 --- a/common/arm/ihevc_deblk_chroma_vert.s +++ b/common/arm/ihevc_deblk_chroma_vert.s @@ -37,6 +37,12 @@ @* @*******************************************************************************/ +.equ qp_offset_u_offset, 40 +.equ qp_offset_v_offset, 44 +.equ tc_offset_div2_offset, 48 +.equ filter_p_offset, 52 +.equ filter_q_offset, 56 + .text .align 4 @@ -63,19 +69,19 @@ ihevc_deblk_chroma_vert_a9q: vld1.8 {d5},[r8],r1 add r2,r2,#1 vld1.8 {d17},[r8],r1 - ldr r7,[sp,#0x28] + ldr r7,[sp,#qp_offset_u_offset] vld1.8 {d16},[r8],r1 - ldr r4,[sp,#0x38] + ldr r4,[sp,#filter_q_offset] vld1.8 {d4},[r8] - ldr r5,[sp,#0x30] + ldr r5,[sp,#tc_offset_div2_offset] vtrn.8 d5,d17 adds r3,r7,r2,asr #1 vtrn.8 d16,d4 ldr r7,gai4_ihevc_qp_table_addr ulbl1: add r7,r7,pc - ldr r12,[sp,#0x34] - ldr r6,[sp,#0x2c] + ldr r12,[sp,#filter_p_offset] + ldr r6,[sp,#qp_offset_v_offset] bmi l1.2944 cmp r3,#0x39 ldrle r3,[r7,r3,lsl #2] diff --git a/common/arm/ihevc_deblk_luma_horz.s b/common/arm/ihevc_deblk_luma_horz.s index b12ceb9..76660b3 100644 --- a/common/arm/ihevc_deblk_luma_horz.s +++ b/common/arm/ihevc_deblk_luma_horz.s @@ -36,6 +36,12 @@ @* @*******************************************************************************/ +.equ qp_q_offset, 108 +.equ beta_offset_div2_offset, 112 +.equ tc_offset_div2_offset, 116 +.equ filter_p_offset, 120 +.equ filter_q_offset, 124 + .text .align 4 @@ -57,12 +63,14 @@ gai4_ihevc_beta_table_addr: ihevc_deblk_luma_horz_a9q: stmfd sp!, {r3-r12,lr} - ldr r4,[sp,#0x2c] - ldr r5,[sp,#0x30] + vpush {d8 - d15} + + ldr r4,[sp,#qp_q_offset] + ldr r5,[sp,#beta_offset_div2_offset] add r3,r3,r4 add r3,r3,#1 - ldr r6, [sp,#0x34] + ldr r6, [sp,#tc_offset_div2_offset] asr r3,r3,#1 add r7,r3,r5,lsl #1 add r3,r3,r6,lsl #1 @@ -291,9 +299,9 @@ ulbl1: vmin.u8 d18,d20,d30 mov r2,#2 vqadd.u8 d30,d23,d1 - ldr r4,[sp,#0x38] @ loading the filter_flag_p + ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p vmax.u8 d2,d18,d31 - ldr r5,[sp,#0x3c] @ loading the filter_flag_q + ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q vrshrn.i16 d21,q7,#2 b end_dep_deq_decision_horz @ r2 has the value of de @@ -308,8 +316,8 @@ l1.1840: mov r2,#1 mov r11,r5 - ldr r4,[sp,#0x38] @ loading the filter_flag_p - ldr r5,[sp,#0x3c] @ loading the filter_flag_q + ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p + ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q cmp r6,#1 moveq r9,#0 @@ -397,6 +405,7 @@ strong_filtering_p: vst1.32 d3[0],[r12] l1.2404: + vpop {d8 - d15} ldmfd sp!, {r3-r12,pc} @ r4=flag p @@ -537,6 +546,8 @@ l1.2852: vbsl d19,d26,d13 vst1.32 {d19[0]},[r12],r1 vst1.32 {d18[0]},[r12] + + vpop {d8 - d15} ldmfd sp!, {r3-r12,r15} diff --git a/common/arm/ihevc_deblk_luma_vert.s b/common/arm/ihevc_deblk_luma_vert.s index ee247cc..91662c9 100644 --- a/common/arm/ihevc_deblk_luma_vert.s +++ b/common/arm/ihevc_deblk_luma_vert.s @@ -37,6 +37,12 @@ @* @*******************************************************************************/ +.equ qp_q_offset, 44 +.equ beta_offset_div2_offset, 48 +.equ tc_offset_div2_offset, 52 +.equ filter_p_offset, 56 +.equ filter_q_offset, 60 + .text .align 4 @@ -60,12 +66,12 @@ gai4_ihevc_beta_table_addr: ihevc_deblk_luma_vert_a9q: push {r3-r12,lr} - ldr r4,[sp,#0x2c] - ldr r5,[sp,#0x30] + ldr r4,[sp,#qp_q_offset] + ldr r5,[sp,#beta_offset_div2_offset] add r3,r3,r4 add r3,r3,#1 - ldr r6, [sp,#0x34] + ldr r6, [sp,#tc_offset_div2_offset] asr r3,r3,#1 add r7,r3,r5,lsl #1 add r3,r3,r6,lsl #1 @@ -291,9 +297,9 @@ ulbl1: vqadd.u8 d30,d6,d19 mov r2,#2 - ldr r4,[sp,#0x38] @ loading the filter_flag_p + ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p vqsub.u8 d31,d6,d19 - ldr r5,[sp,#0x3c] @ loading the filter_flag_q + ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q b end_dep_deq_decision @ r2 has the value of de @ r6 has teh value of tc @@ -307,8 +313,8 @@ l1.336: mov r2,#1 l1.424: mov r11,r5 - ldr r4,[sp,#0x38] @ loading the filter_flag_p - ldr r5,[sp,#0x3c] @ loading the filter_flag_q + ldr r4,[sp,#filter_p_offset] @ loading the filter_flag_p + ldr r5,[sp,#filter_q_offset] @ loading the filter_flag_q cmp r6,#1 moveq r9,#0 @@ -532,7 +538,6 @@ l1.1212: vst1.16 {d3[1]},[r12] vst1.8 {d16[3]},[r3] l1.1272: - @ ldr r3,[sp,#0x38] cmp r5,#0 beq l1.964 @ checks for the flag q diff --git a/common/arm/ihevc_inter_pred_chroma_copy.s b/common/arm/ihevc_inter_pred_chroma_copy.s index 0da34cc..1b38dbb 100644 --- a/common/arm/ihevc_inter_pred_chroma_copy.s +++ b/common/arm/ihevc_inter_pred_chroma_copy.s @@ -92,6 +92,9 @@ @ r5 => ht @ r6 => wd +.equ ht_offset, 44 +.equ wd_offset, 48 + .text .align 4 @@ -104,9 +107,9 @@ ihevc_inter_pred_chroma_copy_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r12,[sp,#48] @loads wd + ldr r12,[sp,#wd_offset] @loads wd lsl r12,r12,#1 - ldr r7,[sp,#44] @loads ht + ldr r7,[sp,#ht_offset] @loads ht cmp r7,#0 @checks ht == 0 ble end_loops and r8,r7,#3 @check ht for mul of 2 diff --git a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s index a927fa7..4997b84 100644 --- a/common/arm/ihevc_inter_pred_chroma_copy_w16out.s +++ b/common/arm/ihevc_inter_pred_chroma_copy_w16out.s @@ -92,6 +92,11 @@ @r5 => ht @r6 => wd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + + .text .align 4 @@ -105,9 +110,11 @@ ihevc_inter_pred_chroma_copy_w16out_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r12,[sp,#48] @loads wd + vpush {d8 - d15} + + ldr r12,[sp,#wd_offset] @loads wd lsl r12,r12,#1 @2*wd - ldr r7,[sp,#44] @loads ht + ldr r7,[sp,#ht_offset] @loads ht cmp r7,#0 @ht condition(ht == 0) ble end_loops @loop and r8,r7,#3 @check ht for mul of 2 @@ -162,6 +169,7 @@ end_inner_loop_wd_4: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp @@ -316,6 +324,7 @@ core_loop_wd_8_ht_2: vst1.16 {d2,d3},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp) bgt core_loop_wd_8_ht_2 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_chroma_horz.s b/common/arm/ihevc_inter_pred_chroma_horz.s index 4781d3e..c69b417 100644 --- a/common/arm/ihevc_inter_pred_chroma_horz.s +++ b/common/arm/ihevc_inter_pred_chroma_horz.s @@ -93,6 +93,10 @@ @r2 => src_strd @r3 => dst_strd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -106,10 +110,11 @@ ihevc_inter_pred_chroma_horz_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads pi1_coeff - ldr r7,[sp,#44] @loads ht - ldr r10,[sp,#48] @loads wd + ldr r4,[sp,#coeff_offset] @loads pi1_coeff + ldr r7,[sp,#ht_offset] @loads ht + ldr r10,[sp,#wd_offset] @loads wd vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff) subs r14,r7,#0 @checks for ht == 0 @@ -672,6 +677,7 @@ inner_loop_4: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s index f95937c..9c498e8 100644 --- a/common/arm/ihevc_inter_pred_chroma_horz_w16out.s +++ b/common/arm/ihevc_inter_pred_chroma_horz_w16out.s @@ -90,6 +90,9 @@ @r2 => src_strd @r3 => dst_strd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 .text .align 4 @@ -105,10 +108,11 @@ ihevc_inter_pred_chroma_horz_w16out_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads pi1_coeff - ldr r6,[sp,#44] @loads ht - ldr r10,[sp,#48] @loads wd + ldr r4,[sp,#coeff_offset] @loads pi1_coeff + ldr r6,[sp,#ht_offset] @loads ht + ldr r10,[sp,#wd_offset] @loads wd vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff) subs r14,r6,#0 @checks for ht == 0 @@ -362,7 +366,7 @@ epilog_end: vst1.16 {q10},[r1],r6 @store the result pu1_dst - ldr r6,[sp,#44] @loads ht + ldr r6,[sp,#ht_offset] @loads ht and r7,r6,#1 @@ -710,6 +714,7 @@ loop_residue: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_chroma_vert.s b/common/arm/ihevc_inter_pred_chroma_vert.s index e786497..8b4e48b 100644 --- a/common/arm/ihevc_inter_pred_chroma_vert.s +++ b/common/arm/ihevc_inter_pred_chroma_vert.s @@ -92,6 +92,11 @@ @r1 => *pi2_dst @r2 => src_strd @r3 => dst_strd + +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -105,11 +110,12 @@ ihevc_inter_pred_chroma_vert_a9q: stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#44] @loads ht - ldr r12,[sp,#40] @loads pi1_coeff + ldr r4,[sp,#ht_offset] @loads ht + ldr r12,[sp,#coeff_offset] @loads pi1_coeff cmp r4,#0 @checks ht == 0 - ldr r6,[sp,#48] @loads wd + ldr r6,[sp,#wd_offset] @loads wd sub r0,r0,r2 @pu1_src - src_strd vld1.8 {d0},[r12] @loads pi1_coeff @@ -377,6 +383,7 @@ epilog: vqrshrun.s16 d24,q12,#6 vst1.8 {d24},[r7],r3 @stores the loaded value end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s index ba2ea8e..f9e513a 100644 --- a/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s +++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp.s @@ -92,6 +92,11 @@ @r2 => src_strd @r3 => dst_strd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + + .text .align 4 @@ -105,11 +110,12 @@ ihevc_inter_pred_chroma_vert_w16inp_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4, [sp,#40] @loads pi1_coeff - ldr r6, [sp,#48] @wd + ldr r4, [sp,#coeff_offset] @loads pi1_coeff + ldr r6, [sp,#wd_offset] @wd lsl r2,r2,#1 @src_strd = 2* src_strd - ldr r5,[sp,#44] @loads ht + ldr r5,[sp,#ht_offset] @loads ht vld1.8 {d0},[r4] @loads pi1_coeff sub r4,r0,r2 @pu1_src - src_strd vmovl.s8 q0,d0 @long the value @@ -335,6 +341,7 @@ epilog: vst1.32 {d24[0]},[r9] @stores the loaded value end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s index 00b3011..0c2ffbd 100644 --- a/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s +++ b/common/arm/ihevc_inter_pred_chroma_vert_w16inp_w16out.s @@ -92,6 +92,11 @@ @r1 => *pi2_dst @r2 => src_strd @r3 => dst_strd + +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -105,11 +110,12 @@ ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4, [sp,#40] @loads pi1_coeff - ldr r6, [sp,#48] @wd + ldr r4, [sp,#coeff_offset] @loads pi1_coeff + ldr r6, [sp,#wd_offset] @wd lsl r2,r2,#1 @src_strd = 2* src_strd - ldr r5,[sp,#44] @loads ht + ldr r5,[sp,#ht_offset] @loads ht vld1.8 {d0},[r4] @loads pi1_coeff sub r4,r0,r2 @pu1_src - src_strd vmovl.s8 q0,d0 @long the value @@ -322,6 +328,7 @@ epilog: vst1.32 {d24},[r9] @stores the loaded value end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s index 6e6776c..84b0792 100644 --- a/common/arm/ihevc_inter_pred_chroma_vert_w16out.s +++ b/common/arm/ihevc_inter_pred_chroma_vert_w16out.s @@ -93,6 +93,10 @@ @r2 => src_strd @r3 => dst_strd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -106,11 +110,12 @@ ihevc_inter_pred_chroma_vert_w16out_a9q: stmfd sp!,{r4-r12,r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#44] @loads ht - ldr r12,[sp,#40] @loads pi1_coeff + ldr r4,[sp,#ht_offset] @loads ht + ldr r12,[sp,#coeff_offset] @loads pi1_coeff cmp r4,#0 @checks ht == 0 - ldr r6,[sp,#48] @loads wd + ldr r6,[sp,#wd_offset] @loads wd sub r0,r0,r2 @pu1_src - src_strd vld1.8 {d0},[r12] @loads pi1_coeff @@ -361,6 +366,7 @@ epilog: vst1.8 {q12},[r7],r3 @stores the loaded value end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_filters_luma_horz.s b/common/arm/ihevc_inter_pred_filters_luma_horz.s index 215f8fd..5559aa7 100644 --- a/common/arm/ihevc_inter_pred_filters_luma_horz.s +++ b/common/arm/ihevc_inter_pred_filters_luma_horz.s @@ -103,6 +103,11 @@ @ r5 => ht @ r6 => wd + +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -116,15 +121,15 @@ ihevc_inter_pred_luma_horz_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - @str r1,[sp,#-4] - @ mov r7,#8192 + vpush {d8 - d15} + + start_loop_count: - @ ldr r1,[sp,#-4] - ldr r4,[sp,#40] @loads pi1_coeff - ldr r8,[sp,#44] @loads ht - ldr r10,[sp,#48] @loads wd + ldr r4,[sp,#coeff_offset] @loads pi1_coeff + ldr r8,[sp,#ht_offset] @loads ht + ldr r10,[sp,#wd_offset] @loads wd vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff) mov r11,#1 @@ -262,7 +267,8 @@ end_inner_loop_8: - ldr r10,[sp,#48] @loads wd + ldr r10,[sp,#wd_offset] @loads wd + cmp r10,#12 beq outer_loop4_residual @@ -270,6 +276,7 @@ end_inner_loop_8: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp @@ -417,7 +424,7 @@ epilog_16: ldr r7, [sp], #4 ldr r0, [sp], #4 - ldr r10,[sp,#48] + ldr r10,[sp,#wd_offset] cmp r10,#24 beq outer_loop8_residual @@ -426,6 +433,7 @@ epilog_16: end_loops1: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp @@ -527,6 +535,7 @@ end_inner_loop_4: @subs r7,r7,#1 @ bgt start_loop_count + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert.s b/common/arm/ihevc_inter_pred_filters_luma_vert.s index f51d68c..3d9ab1c 100644 --- a/common/arm/ihevc_inter_pred_filters_luma_vert.s +++ b/common/arm/ihevc_inter_pred_filters_luma_vert.s @@ -103,6 +103,11 @@ @ r12 => *pi1_coeff @ r5 => ht @ r3 => wd + +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 .syntax unified @@ -116,15 +121,16 @@ ihevc_inter_pred_luma_vert_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r12,[sp,#40] @load pi1_coeff + ldr r12,[sp,#coeff_offset] @load pi1_coeff mov r6,r3 - ldr r5,[sp,#48] @load wd + ldr r5,[sp,#wd_offset] @load wd vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff) sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff vabs.s8 d0,d0 @vabs_s8(coeff) add r0,r0,r12 @r0->pu1_src r12->pi1_coeff - ldr r3,[sp,#44] @load ht + ldr r3,[sp,#ht_offset] @load ht subs r7,r3,#0 @r3->ht @ble end_loops @end loop jump vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@ @@ -407,7 +413,8 @@ end_loops: ldr r1, [sp], #4 ldr r0, [sp], #4 - ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp + beq end1 + mov r5, #4 add r0, r0, #8 add r1, r1, #8 @@ -491,6 +498,8 @@ end_inner_loop_wd_4: add r0,r0,r8 bgt outer_loop_wd_4 +end1: + vpop {d8 - d15} ldmfd sp!, {r4-r12, r15} @reload the registers from sp @@ -564,15 +573,16 @@ end_inner_loop_wd_4: ihevc_inter_pred_luma_vert_w16out_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r12,[sp,#40] @load pi1_coeff + ldr r12,[sp,#coeff_offset] @load pi1_coeff mov r6,r3 - ldr r5,[sp,#48] @load wd + ldr r5,[sp,#wd_offset] @load wd vld1.u8 {d0},[r12] @coeff = vld1_s8(pi1_coeff) sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff vabs.s8 d0,d0 @vabs_s8(coeff) add r0,r0,r12 @r0->pu1_src r12->pi1_coeff - ldr r3,[sp,#44] @load ht + ldr r3,[sp,#ht_offset] @load ht subs r7,r3,#0 @r3->ht @ble end_loops_16out @end loop jump vdup.u8 d22,d0[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@ @@ -848,7 +858,8 @@ end_loops_16out: ldr r1, [sp], #4 ldr r0, [sp], #4 - ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp + beq end2 + mov r5, #4 add r0, r0, #8 add r1, r1, #16 @@ -934,7 +945,8 @@ end_inner_loop_wd_4_16out: add r1,r1,r9,lsl #1 add r0,r0,r8 bgt outer_loop_wd_4_16out - +end2: + vpop {d8 - d15} ldmfd sp!, {r4-r12, r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s index 4fbc5d1..9726710 100644 --- a/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s +++ b/common/arm/ihevc_inter_pred_filters_luma_vert_w16inp.s @@ -94,6 +94,10 @@ @ word32 ht, @ word32 wd ) +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -107,16 +111,17 @@ ihevc_inter_pred_luma_vert_w16inp_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r12,[sp,#40] @load pi1_coeff + ldr r12,[sp,#coeff_offset] @load pi1_coeff mov r6,r3 - ldr r5,[sp,#48] @load wd + ldr r5,[sp,#wd_offset] @load wd vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff) mov r2, r2, lsl #1 sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff @vabs.s8 d0,d0 @vabs_s8(coeff) add r0,r0,r12 @r0->pu1_src r12->pi1_coeff - ldr r3,[sp,#44] @load ht + ldr r3,[sp,#ht_offset] @load ht subs r7,r3,#0 @r3->ht @ble end_loops @end loop jump vmovl.s8 q0,d0 @@ -370,6 +375,7 @@ epilog_end: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_luma_copy.s b/common/arm/ihevc_inter_pred_luma_copy.s index 8a61369..e4f5573 100644 --- a/common/arm/ihevc_inter_pred_luma_copy.s +++ b/common/arm/ihevc_inter_pred_luma_copy.s @@ -71,6 +71,10 @@ @ r7 => ht @ r12 => wd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -83,8 +87,9 @@ ihevc_inter_pred_luma_copy_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r12,[sp,#48] @loads wd - ldr r7,[sp,#44] @loads ht + vpush {d8 - d15} + ldr r12,[sp,#wd_offset] @loads wd + ldr r7,[sp,#ht_offset] @loads ht cmp r7,#0 @checks ht == 0 ble end_loops tst r12,#15 @checks wd for multiples for 4 & 8 @@ -121,6 +126,7 @@ end_inner_loop_wd_4: bgt outer_loop_wd_4 end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp @@ -151,6 +157,7 @@ end_inner_loop_wd_8: sub r1,r6,r11 @pu1_dst = pu1_dst_tmp bgt outer_loop_wd_8 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp core_loop_wd_16: @@ -180,6 +187,7 @@ end_inner_loop_wd_16: sub r1,r6,r11 @pu1_dst = pu1_dst_tmp bgt outer_loop_wd_16 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_luma_copy_w16out.s b/common/arm/ihevc_inter_pred_luma_copy_w16out.s index 771bcb3..84dbbad 100644 --- a/common/arm/ihevc_inter_pred_luma_copy_w16out.s +++ b/common/arm/ihevc_inter_pred_luma_copy_w16out.s @@ -72,6 +72,10 @@ @ r7 => ht @ r12 => wd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -85,8 +89,9 @@ ihevc_inter_pred_luma_copy_w16out_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r12,[sp,#48] @loads wd - ldr r7,[sp,#44] @loads ht + vpush {d8 - d15} + ldr r12,[sp,#wd_offset] @loads wd + ldr r7,[sp,#ht_offset] @loads ht cmp r7,#0 @ht condition(ht == 0) ble end_loops @loop tst r12,#7 @conditional check for wd (multiples) @@ -129,6 +134,7 @@ end_inner_loop_wd_4: bgt outer_loop_wd_4 end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp @@ -242,6 +248,7 @@ epilog_end: vst1.16 {d6,d7},[r10],r5 @vst1q_s16(pi2_dst_tmp, tmp) + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_luma_horz_w16out.s b/common/arm/ihevc_inter_pred_luma_horz_w16out.s index e8800e0..a60bb08 100644 --- a/common/arm/ihevc_inter_pred_luma_horz_w16out.s +++ b/common/arm/ihevc_inter_pred_luma_horz_w16out.s @@ -107,6 +107,11 @@ @r11 - #1 @r12 - src_ptr1 @r14 - loop_counter + +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 .syntax unified @@ -122,16 +127,16 @@ ihevc_inter_pred_luma_horz_w16out_a9q: bic r14, #1 @ clearing bit[0], so that it goes back to mode stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r4,[sp,#40] @loads pi1_coeff - ldr r7,[sp,#44] @loads ht + vpush {d8 - d15} + ldr r4,[sp,#coeff_offset] @loads pi1_coeff + ldr r7,[sp,#ht_offset] @loads ht vld1.8 {d0},[r4] @coeff = vld1_s8(pi1_coeff) sub r14,r7,#0 @checks for ht == 0 vabs.s8 d2,d0 @vabs_s8(coeff) mov r11,#1 - @ble end_loops - ldr r10,[sp,#48] @loads wd + ldr r10,[sp,#wd_offset] @loads wd vdup.8 d24,d2[0] @coeffabs_0 = vdup_lane_u8(coeffabs, 0) sub r12,r0,#3 @pu1_src - 3 vdup.8 d25,d2[1] @coeffabs_1 = vdup_lane_u8(coeffabs, 1) @@ -274,11 +279,10 @@ end_inner_loop_4: height_residue_4: - ldr r7,[sp,#44] @loads ht + ldr r7,[sp,#ht_offset] @loads ht and r7,r7,#1 @calculating ht_residue ht_residue = (ht & 1) cmp r7,#0 - @beq end_loops - ldmfdeq sp!,{r4-r12,r15} @reload the registers from sp + beq end_loops outer_loop_height_residue_4: @@ -331,7 +335,7 @@ end_inner_loop_height_residue_4: add r12,r12,r9 @increment the input pointer src_strd-wd add r1,r1,r8 @increment the output pointer dst_strd-wd bgt outer_loop_height_residue_4 - + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp outer_loop8_residual: @@ -427,18 +431,18 @@ end_inner_loop_8: - ldr r10,[sp,#48] @loads wd + ldr r10,[sp,#wd_offset] @loads wd cmp r10,#12 beq outer_loop4_residual - ldr r7,[sp,#44] @loads ht + ldr r7,[sp,#ht_offset] @loads ht and r7,r7,#1 cmp r7,#1 beq height_residue_4 -@end_loops + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp @@ -452,7 +456,6 @@ outer_loop_16: add r4,r12,r2 @pu1_src + src_strd and r0, r12, #31 sub r5,r10,#0 @checks wd - @ble end_loops1 pld [r12, r2, lsl #1] vld1.u32 {q0},[r12],r11 @vector load pu1_src pld [r4, r2, lsl #1] @@ -580,17 +583,17 @@ epilog_16: ldr r7, [sp], #4 ldr r0, [sp], #4 - ldr r10,[sp,#48] + ldr r10,[sp,#wd_offset] cmp r10,#24 beq outer_loop8_residual add r1,r6,r8,lsl #1 - ldr r7,[sp,#44] @loads ht + ldr r7,[sp,#ht_offset] @loads ht and r7,r7,#1 cmp r7,#1 beq height_residue_4 -end_loops1: - +end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s index c6716fe..6e0f1ed 100644 --- a/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s +++ b/common/arm/ihevc_inter_pred_luma_vert_w16inp_w16out.s @@ -102,6 +102,10 @@ @ r5 => ht @ r6 => wd +.equ coeff_offset, 104 +.equ ht_offset, 108 +.equ wd_offset, 112 + .text .align 4 @@ -115,16 +119,17 @@ ihevc_inter_pred_luma_vert_w16inp_w16out_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r12,[sp,#40] @load pi1_coeff + ldr r12,[sp,#coeff_offset] @load pi1_coeff mov r6,r3,lsl #1 - ldr r5,[sp,#48] @load wd + ldr r5,[sp,#wd_offset] @load wd vld1.8 {d0},[r12] @coeff = vld1_s8(pi1_coeff) mov r2, r2, lsl #1 sub r12,r2,r2,lsl #2 @src_ctrd & pi1_coeff @vabs.s8 d0,d0 @vabs_s8(coeff) add r0,r0,r12 @r0->pu1_src r12->pi1_coeff - ldr r3,[sp,#44] @load ht + ldr r3,[sp,#ht_offset] @load ht subs r7,r3,#0 @r3->ht @ble end_loops @end loop jump vmovl.s8 q0,d0 @@ -393,6 +398,7 @@ epilog_end: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_chroma_dc.s b/common/arm/ihevc_intra_pred_chroma_dc.s index 72d9730..6e5900a 100644 --- a/common/arm/ihevc_intra_pred_chroma_dc.s +++ b/common/arm/ihevc_intra_pred_chroma_dc.s @@ -92,6 +92,8 @@ @ mode @ pi1_coeff +.equ nt_offset, 40 + .text .align 4 @@ -106,7 +108,7 @@ ihevc_intra_pred_chroma_dc_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt mov r9, #0 vmov d17, r9, r9 diff --git a/common/arm/ihevc_intra_pred_chroma_horz.s b/common/arm/ihevc_intra_pred_chroma_horz.s index 6089fd8..4512d72 100644 --- a/common/arm/ihevc_intra_pred_chroma_horz.s +++ b/common/arm/ihevc_intra_pred_chroma_horz.s @@ -84,6 +84,8 @@ @r2 => *pu1_dst @r3 => dst_strd +.equ nt_offset, 104 + .text .align 4 @@ -97,8 +99,9 @@ ihevc_intra_pred_chroma_horz_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt lsl r6,r4,#2 @four_nt @@ -187,6 +190,7 @@ core_loop_16: vst1.16 {q4},[r2],r3 vst1.16 {q4},[r9],r3 bgt core_loop_16 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp b endloop @@ -263,6 +267,7 @@ core_loop_8: @vst1.8 {q5},[r2],r3 @vst1.8 {q6},[r2],r3 @vst1.8 {q7},[r2],r3 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp b endloop @@ -308,6 +313,7 @@ core_loop_4: @vst1.8 {d8},[r2],r3 @vst1.8 {d9},[r2],r3 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp b endloop @@ -339,6 +345,7 @@ core_loop_4: vst1.32 {d4[0]},[r2],r3 vst1.32 {d5[0]},[r2],r3 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp endloop: diff --git a/common/arm/ihevc_intra_pred_chroma_mode2.s b/common/arm/ihevc_intra_pred_chroma_mode2.s index cfa2ddb..013700d 100644 --- a/common/arm/ihevc_intra_pred_chroma_mode2.s +++ b/common/arm/ihevc_intra_pred_chroma_mode2.s @@ -87,11 +87,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode @ pi1_coeff +.equ nt_offset, 104 + .text .align 4 @@ -105,8 +107,9 @@ ihevc_intra_pred_chroma_mode2_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt mov r8,#-4 cmp r4,#4 @@ -290,6 +293,7 @@ mode2_4: vst1.8 {d6},[r2],r3 end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s index b0dd1fa..6af6450 100644 --- a/common/arm/ihevc_intra_pred_chroma_mode_18_34.s +++ b/common/arm/ihevc_intra_pred_chroma_mode_18_34.s @@ -87,11 +87,14 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode @ pi1_coeff +.equ nt_offset, 104 +.equ mode_offset, 108 + .text .align 4 @@ -105,10 +108,10 @@ ihevc_intra_pred_chroma_mode_18_34_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - - ldr r4,[sp,#40] - ldr r5,[sp,#44] + ldr r4,[sp,#nt_offset] + ldr r5,[sp,#mode_offset] cmp r4,#4 beq mode2_4 @@ -181,6 +184,7 @@ mode2_4: vst1.32 {d0},[r2],r3 end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s index fb75e96..21b54da 100644 --- a/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s +++ b/common/arm/ihevc_intra_pred_chroma_mode_27_to_33.s @@ -81,6 +81,9 @@ @ word32 nt, @ word32 mode) +.equ nt_offset, 104 +.equ mode_offset, 108 + .text .align 4 @@ -103,9 +106,10 @@ gau1_ihevc_planar_factor_addr: ihevc_intra_pred_chroma_mode_27_to_33_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads nt - ldr r5,[sp,#44] @loads mode + ldr r4,[sp,#nt_offset] @loads nt + ldr r5,[sp,#mode_offset] @loads mode ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35] ulbl1: add r6,r6,pc @@ -535,6 +539,7 @@ core_loop_4: vst1.8 {d22},[r2],r3 end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s index a5eb3ca..b7dcbfb 100644 --- a/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s +++ b/common/arm/ihevc_intra_pred_chroma_mode_3_to_9.s @@ -82,10 +82,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode +.equ nt_offset, 104 +.equ mode_offset, 108 + .text .align 4 @@ -123,13 +126,14 @@ col_for_intra_chroma_addr_3: ihevc_intra_pred_chroma_mode_3_to_9_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt ldr r7, gai4_ihevc_ang_table_addr ulbl1: add r7,r7,pc - ldr r5,[sp,#44] @mode (3 to 9) + ldr r5,[sp,#mode_offset] @mode (3 to 9) ldr r8, gai4_ihevc_inv_ang_table_addr ulbl2: add r8,r8,pc @@ -486,6 +490,7 @@ epil_8_16_32: vst1.8 d18, [r5], r3 @st (row 7) end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_chroma_planar.s b/common/arm/ihevc_intra_pred_chroma_planar.s index 30b3144..7d03d55 100644 --- a/common/arm/ihevc_intra_pred_chroma_planar.s +++ b/common/arm/ihevc_intra_pred_chroma_planar.s @@ -87,11 +87,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode @ pi1_coeff +.equ nt_offset, 104 + .text .align 4 @@ -109,8 +111,9 @@ gau1_ihevc_planar_factor_addr: ihevc_intra_pred_chroma_planar_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs ulbl1: add r11,r11,pc @@ -353,6 +356,7 @@ loop_sz_4: bne loop_sz_4 end_loop: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_chroma_ver.s b/common/arm/ihevc_intra_pred_chroma_ver.s index b68a045..ce2ad73 100644 --- a/common/arm/ihevc_intra_pred_chroma_ver.s +++ b/common/arm/ihevc_intra_pred_chroma_ver.s @@ -87,6 +87,8 @@ @ nt @ mode +.equ nt_offset, 40 + .text .align 4 @@ -101,7 +103,7 @@ ihevc_intra_pred_chroma_ver_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt lsl r5, r4, #2 @4nt diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s index 6c882cf..8644cc8 100644 --- a/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s +++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_11_to_17.s @@ -84,10 +84,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #236 @ nt @ mode +.equ nt_offset, 236 +.equ mode_offset, 240 + .text .align 4 @@ -123,13 +126,15 @@ col_for_intra_chroma_addr_3: ihevc_intra_pred_chroma_mode_11_to_17_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} + sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2] - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads wd ldr r7, gai4_ihevc_ang_table_addr ulbl1: add r7,r7,pc - ldr r5,[sp,#44] @mode (11 to 17) + ldr r5,[sp,#mode_offset] @mode (11 to 17) ldr r8, gai4_ihevc_inv_ang_table_addr ulbl2: add r8,r8,pc @@ -139,7 +144,6 @@ ulbl2: sub r8, r8, #44 ldr r7, [r7] @intra_pred_ang - sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2] ldr r8, [r8] @inv_ang add r6, sp, r4, lsl #1 @ref_temp + 2 * nt @@ -607,6 +611,7 @@ epil_8_16_32: end_func: add sp, sp, #132 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s index 2ede914..a555646 100644 --- a/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s +++ b/common/arm/ihevc_intra_pred_filters_chroma_mode_19_to_25.s @@ -84,10 +84,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #236 @ nt @ mode +.equ nt_offset, 236 +.equ mode_offset, 240 + .text .align 4 @@ -116,13 +119,15 @@ gai4_ihevc_ang_table_addr_2: ihevc_intra_pred_chroma_mode_19_to_25_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} + sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2] - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt ldr r7, gai4_ihevc_ang_table_addr_1 ulbl3: add r7,r7,pc - ldr r5,[sp,#44] @mode (19 to 25) + ldr r5,[sp,#mode_offset] @mode (19 to 25) ldr r8, gai4_ihevc_inv_ang_table_addr ulbl1: add r8,r8,pc @@ -132,7 +137,6 @@ ulbl1: sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12] ldr r7, [r7] @intra_pred_ang - sub sp, sp, #132 @ref_temp[2 * max_cu_size + 2] ldr r8, [r8] @inv_ang add r6, sp, r4 , lsl #1 @ref_temp + 2 * nt @@ -562,6 +566,7 @@ core_loop_4: end_loops: add sp, sp, #132 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s index ec38786..336af06 100644 --- a/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s +++ b/common/arm/ihevc_intra_pred_filters_luma_mode_11_to_17.s @@ -84,10 +84,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #236 @ nt @ mode +.equ nt_offset, 236 +.equ mode_offset, 240 + .text .align 4 @@ -129,13 +132,14 @@ col_for_intra_luma_addr_4: ihevc_intra_pred_luma_mode_11_to_17_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt + vpush {d8 - d15} + sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1] + ldr r4,[sp,#nt_offset] @loads nt ldr r7, gai4_ihevc_ang_table_addr ulbl1: add r7,r7,pc - ldr r5,[sp,#44] @mode (11 to 17) + ldr r5,[sp,#mode_offset] @mode (11 to 17) ldr r8, gai4_ihevc_inv_ang_table_addr ulbl2: add r8,r8,pc @@ -145,7 +149,6 @@ ulbl2: sub r8, r8, #44 ldr r7, [r7] @intra_pred_ang - sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1] ldr r8, [r8] @inv_ang add r6, sp, r4 @ref_temp + nt @@ -684,6 +687,7 @@ ulbl4: end_func: add sp, sp, #132 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s index af342bf..32268a2 100644 --- a/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s +++ b/common/arm/ihevc_intra_pred_filters_luma_mode_19_to_25.s @@ -84,10 +84,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #236 @ nt @ mode +.equ nt_offset, 236 +.equ mode_offset, 240 + .text .align 4 @@ -116,13 +119,15 @@ gai4_ihevc_ang_table_addr_2: ihevc_intra_pred_luma_mode_19_to_25_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} + sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1] - ldr r4,[sp,#40] @loads nt + ldr r4,[sp,#nt_offset] @loads nt ldr r7, gai4_ihevc_ang_table_addr_1 ulbl_1: add r7,r7,pc - ldr r5,[sp,#44] @mode (19 to 25) + ldr r5,[sp,#mode_offset] @mode (19 to 25) ldr r8, gai4_ihevc_inv_ang_table_addr ulbl1: add r8,r8,pc @@ -132,7 +137,6 @@ ulbl1: sub r8, r8, #48 @gai4_ihevc_inv_ang_table[mode - 12] ldr r7, [r7] @intra_pred_ang - sub sp, sp, #132 @ref_temp[2 * max_cu_size + 1] ldr r8, [r8] @inv_ang add r6, sp, r4 @ref_temp + nt @@ -644,6 +648,7 @@ core_loop_4: end_loops: add sp, sp, #132 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_luma_dc.s b/common/arm/ihevc_intra_pred_luma_dc.s index f380d94..7d8cb91 100644 --- a/common/arm/ihevc_intra_pred_luma_dc.s +++ b/common/arm/ihevc_intra_pred_luma_dc.s @@ -87,11 +87,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode @ pi1_coeff +.equ nt_offset, 104 + .text .align 4 @@ -105,8 +107,8 @@ ihevc_intra_pred_luma_dc_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt @********** testing @mov r6, #128 @@ -498,6 +500,7 @@ dc_4: epilogue_end: end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_luma_horz.s b/common/arm/ihevc_intra_pred_luma_horz.s index 581b673..2a44404 100644 --- a/common/arm/ihevc_intra_pred_luma_horz.s +++ b/common/arm/ihevc_intra_pred_luma_horz.s @@ -84,6 +84,8 @@ @r2 => *pu1_dst @r3 => dst_strd +.equ nt_offset, 104 + .text .align 4 @@ -97,9 +99,8 @@ ihevc_intra_pred_luma_horz_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt - @ldr r5,[sp,#44] @loads mode + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt lsl r6,r4,#1 @two_nt @@ -185,6 +186,7 @@ core_loop_32: vst1.8 {q4},[r2],r3 vst1.8 {q4},[r9],r3 bgt core_loop_32 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp b end_func @@ -258,7 +260,7 @@ core_loop_16: vst1.8 {q5},[r2],r3 vst1.8 {q6},[r2],r3 vst1.8 {q7},[r2],r3 - + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp b end_func @@ -301,6 +303,7 @@ core_loop_8: vst1.8 {d8},[r2],r3 vst1.8 {d9},[r2],r3 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp b end_func @@ -331,7 +334,7 @@ core_loop_4: vst1.32 {d3[0]},[r2],r3 vst1.32 {d4[0]},[r2],r3 vst1.32 {d5[0]},[r2],r3 - + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp end_func: diff --git a/common/arm/ihevc_intra_pred_luma_mode2.s b/common/arm/ihevc_intra_pred_luma_mode2.s index cf7999b..935f02d 100644 --- a/common/arm/ihevc_intra_pred_luma_mode2.s +++ b/common/arm/ihevc_intra_pred_luma_mode2.s @@ -87,11 +87,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode @ pi1_coeff +.equ nt_offset, 104 + .text .align 4 @@ -105,8 +107,8 @@ ihevc_intra_pred_luma_mode2_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt mov r8,#-2 cmp r4,#4 @@ -260,6 +262,7 @@ mode2_4: vst1.32 {d7[0]},[r7] end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_luma_mode_18_34.s b/common/arm/ihevc_intra_pred_luma_mode_18_34.s index 438c0f5..9287371 100644 --- a/common/arm/ihevc_intra_pred_luma_mode_18_34.s +++ b/common/arm/ihevc_intra_pred_luma_mode_18_34.s @@ -92,6 +92,9 @@ @ mode @ pi1_coeff +.equ nt_offset, 40 +.equ mode_offset, 44 + .text .align 4 @@ -107,8 +110,8 @@ ihevc_intra_pred_luma_mode_18_34_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r4,[sp,#40] - ldr r5,[sp,#44] + ldr r4,[sp,#nt_offset] + ldr r5,[sp,#mode_offset] cmp r4,#4 beq mode2_4 diff --git a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s index 595d82a..9d95719 100644 --- a/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s +++ b/common/arm/ihevc_intra_pred_luma_mode_27_to_33.s @@ -85,6 +85,9 @@ @r2 => *pu1_dst @r3 => dst_strd +.equ nt_offset, 104 +.equ mode_offset, 108 + .text .align 4 @@ -107,9 +110,9 @@ gau1_ihevc_planar_factor_addr: ihevc_intra_pred_luma_mode_27_to_33_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt - ldr r5,[sp,#44] @loads mode + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt + ldr r5,[sp,#mode_offset] @loads mode ldr r6,gai4_ihevc_ang_table_addr @loads word32 gai4_ihevc_ang_table[35] ulbl1: add r6,r6,pc @@ -534,6 +537,7 @@ core_loop_4: vst1.32 {d22[0]},[r2],r3 end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s index a8e93c8..e9c871c 100644 --- a/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s +++ b/common/arm/ihevc_intra_pred_luma_mode_3_to_9.s @@ -84,10 +84,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode +.equ nt_offset, 104 +.equ mode_offset, 108 + .text .align 4 @@ -126,13 +129,13 @@ col_for_intra_luma_addr_3: ihevc_intra_pred_luma_mode_3_to_9_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt ldr r7, gai4_ihevc_ang_table_addr ulbl1: add r7,r7,pc - ldr r5,[sp,#44] @mode (3 to 9) + ldr r5,[sp,#mode_offset] @mode (3 to 9) ldr r8, gai4_ihevc_inv_ang_table_addr ulbl2: add r8,r8,pc @@ -566,6 +569,7 @@ ulbl3_2: vst1.32 d18[0], [r2], r3 @st (row 3) end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_luma_planar.s b/common/arm/ihevc_intra_pred_luma_planar.s index 666798e..50b6b1b 100644 --- a/common/arm/ihevc_intra_pred_luma_planar.s +++ b/common/arm/ihevc_intra_pred_luma_planar.s @@ -87,11 +87,13 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode @ pi1_coeff +.equ nt_offset, 104 + .text .align 4 @@ -114,8 +116,8 @@ gau1_ihevc_planar_factor_1_addr: ihevc_intra_pred_luma_planar_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt ldr r11, gau1_ihevc_planar_factor_addr @loads table of coeffs ulbl1: add r11,r11,pc @@ -546,6 +548,7 @@ loop_sz_4: bne loop_sz_4 end_loop: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_intra_pred_luma_vert.s b/common/arm/ihevc_intra_pred_luma_vert.s index 5eeaeb3..9610773 100644 --- a/common/arm/ihevc_intra_pred_luma_vert.s +++ b/common/arm/ihevc_intra_pred_luma_vert.s @@ -84,10 +84,12 @@ @r2 => *pu1_dst @r3 => dst_strd -@stack contents from #40 +@stack contents from #104 @ nt @ mode +.equ nt_offset, 104 + .text .align 4 @@ -101,8 +103,8 @@ ihevc_intra_pred_luma_ver_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - - ldr r4,[sp,#40] @loads nt + vpush {d8 - d15} + ldr r4,[sp,#nt_offset] @loads nt lsl r5, r4, #1 @2nt @@ -417,5 +419,6 @@ blk_4: end_func: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_itrans_recon_16x16.s b/common/arm/ihevc_itrans_recon_16x16.s index 82055ad..198fd52 100644 --- a/common/arm/ihevc_itrans_recon_16x16.s +++ b/common/arm/ihevc_itrans_recon_16x16.s @@ -105,6 +105,12 @@ @ r12 @ r11 +.equ src_stride_offset, 104 +.equ pred_stride_offset, 108 +.equ out_stride_offset, 112 +.equ zero_cols_offset, 116 +.equ zero_rows_offset, 120 + .text .align 4 @@ -129,15 +135,10 @@ g_ai2_ihevc_trans_16_transpose_addr: ihevc_itrans_recon_16x16_a9q: stmfd sp!,{r4-r12,lr} -@ add sp,sp,#40 - - - -@ ldr r8,[sp,#4] @ prediction stride -@ ldr r7,[sp,#8] @ destination stride - ldr r6,[sp,#40] @ src stride - ldr r12,[sp,#52] - ldr r11,[sp,#56] + vpush {d8 - d15} + ldr r6,[sp,#src_stride_offset] @ src stride + ldr r12,[sp,#zero_cols_offset] + ldr r11,[sp,#zero_rows_offset] @@ -661,8 +662,8 @@ skip_last12rows_kernel2: mov r6,r7 - ldr r8,[sp,#44] @ prediction stride - ldr r7,[sp,#48] @ destination stride + ldr r8,[sp,#pred_stride_offset] @ prediction stride + ldr r7,[sp,#out_stride_offset] @ destination stride mov r10,#16 @@ -1126,7 +1127,7 @@ skip_last8rows_stage2_kernel2: bne second_stage -@ sub sp,sp,#40 + vpop {d8 - d15} ldmfd sp!,{r4-r12,pc} diff --git a/common/arm/ihevc_itrans_recon_32x32.s b/common/arm/ihevc_itrans_recon_32x32.s index eeb1d66..65b6ffd 100644 --- a/common/arm/ihevc_itrans_recon_32x32.s +++ b/common/arm/ihevc_itrans_recon_32x32.s @@ -124,6 +124,14 @@ @d5[2]= 43 d7[2]=9 @d5[3]= 38 d7[3]=4 +.equ pi2_src_offset, 64 +.equ pi2_tmp_offset, 68 +.equ src_strd_offset, 120 +.equ pred_strd_offset, 124 +.equ dst_strd_offset, 128 +.equ zero_cols_offset, 132 +.equ zero_rows_offset, 136 + .text .align 4 @@ -152,13 +160,11 @@ r9_addr: .word 0xffff0000 ihevc_itrans_recon_32x32_a9q: stmfd sp!,{r0-r12,lr} + vpush {d8 - d15} - -@ldr r8,[sp,#56] @ prediction stride -@ldr r7,[sp,#64] @ destination stride - ldr r6,[sp,#56] @ src stride - ldr r12,[sp,#68] - ldr r11,[sp,#72] + ldr r6,[sp,#src_strd_offset] @ src stride + ldr r12,[sp,#zero_cols_offset] + ldr r11,[sp,#zero_rows_offset] mov r6,r6,lsl #1 @ x sizeof(word16) add r10,r6,r6, lsl #1 @ 3 rows @@ -1493,10 +1499,10 @@ shift4: bne dct_stage1 second_stage_dct: @ mov r0,r1 - ldr r0,[sp] - ldr r1,[sp,#4] - ldr r8,[sp,#60] @ prediction stride - ldr r7,[sp,#64] @ destination stride + ldr r0,[sp,#pi2_src_offset] + ldr r1,[sp,#pi2_tmp_offset] + ldr r8,[sp,#pred_strd_offset] @ prediction stride + ldr r7,[sp,#dst_strd_offset] @ destination stride @ add r4,r2,r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data @ add r5,r8,r8, lsl #1 @ @@ -2855,6 +2861,7 @@ prediction_buffer: subs r14,r14,#1 bne dct_stage2 + vpop {d8 - d15} ldmfd sp!,{r0-r12,pc} diff --git a/common/arm/ihevc_itrans_recon_4x4.s b/common/arm/ihevc_itrans_recon_4x4.s index c955502..fb5796c 100644 --- a/common/arm/ihevc_itrans_recon_4x4.s +++ b/common/arm/ihevc_itrans_recon_4x4.s @@ -100,6 +100,10 @@ @ r6 => dst_strd @ r7 => zero_cols +.equ src_strd_offset, 104 +.equ pred_strd_offset, 108 +.equ dst_strd_offset, 112 +.equ zero_cols_offset, 116 .text .align 4 @@ -122,17 +126,18 @@ g_ai2_ihevc_trans_4_transpose_addr: ihevc_itrans_recon_4x4_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} ldr r8,g_ai2_ihevc_trans_4_transpose_addr ulbl1: add r8,r8,pc - ldr r4,[sp,#40] @loading src_strd - ldr r5,[sp,#44] @loading pred_strd + ldr r4,[sp,#src_strd_offset] @loading src_strd + ldr r5,[sp,#pred_strd_offset] @loading pred_strd add r4,r4,r4 @ src_strd in terms of word16 - ldr r6,[sp,#48] @loading dst_strd - ldr r7,[sp,#52] @loading zero_cols + ldr r6,[sp,#dst_strd_offset] @loading dst_strd + ldr r7,[sp,#zero_cols_offset] @loading zero_cols add r9,r0,r4 @ pi2_src[0] + src_strd @@ -223,7 +228,7 @@ ulbl1: vst1.32 {d1[0]},[r3],r6 vst1.32 {d1[1]},[r3],r6 - + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_itrans_recon_4x4_ttype1.s b/common/arm/ihevc_itrans_recon_4x4_ttype1.s index ab65dae..82ed8a0 100644 --- a/common/arm/ihevc_itrans_recon_4x4_ttype1.s +++ b/common/arm/ihevc_itrans_recon_4x4_ttype1.s @@ -103,6 +103,11 @@ @ r6 => dst_strd @ r7 => zero_cols +.equ src_strd_offset, 104 +.equ pred_strd_offset, 108 +.equ dst_strd_offset, 112 +.equ zero_cols_offset, 116 + .text .align 4 @@ -119,10 +124,12 @@ ihevc_itrans_recon_4x4_ttype1_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r4,[sp,#40] @loading src_strd - ldr r5,[sp,#44] @loading pred_strd - ldr r6,[sp,#48] @loading dst_strd - ldr r7,[sp,#52] @loading zero_cols + vpush {d8 - d15} + + ldr r4,[sp,#src_strd_offset] @loading src_strd + ldr r5,[sp,#pred_strd_offset] @loading pred_strd + ldr r6,[sp,#dst_strd_offset] @loading dst_strd + ldr r7,[sp,#zero_cols_offset] @loading zero_cols add r4,r4,r4 @ src_strd in terms of word16 @@ -224,6 +231,7 @@ ihevc_itrans_recon_4x4_ttype1_a9q: vst1.32 {d1[0]},[r3],r6 vst1.32 {d1[1]},[r3],r6 + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_itrans_recon_8x8.s b/common/arm/ihevc_itrans_recon_8x8.s index e9b53b4..94113d8 100644 --- a/common/arm/ihevc_itrans_recon_8x8.s +++ b/common/arm/ihevc_itrans_recon_8x8.s @@ -104,6 +104,11 @@ @ dst_strd @ zero_cols +.equ src_stride_offset, 104 +.equ pred_stride_offset, 108 +.equ out_stride_offset, 112 +.equ zero_cols_offset, 116 +.equ zero_rows_offset, 120 .text @@ -151,12 +156,13 @@ ihevc_itrans_recon_8x8_a9q: @// copy the input pointer to another register @// step 1 : load all constants stmfd sp!,{r4-r12,lr} + vpush {d8 - d15} - ldr r8,[sp,#44] @ prediction stride - ldr r7,[sp,#48] @ destination stride - ldr r6,[sp, #40] @ src stride - ldr r12,[sp,#52] - ldr r11,[sp,#56] + ldr r8, [sp, #pred_stride_offset] @ prediction stride + ldr r7, [sp, #out_stride_offset] @ destination stride + ldr r6, [sp, #src_stride_offset] @ src stride + ldr r12, [sp, #zero_cols_offset] + ldr r11, [sp, #zero_rows_offset] mov r6,r6,lsl #1 @ x sizeof(word16) add r9,r0,r6, lsl #1 @ 2 rows @@ -925,7 +931,7 @@ pred_buff_addition: - + vpop {d8 - d15} ldmfd sp!,{r4-r12,pc} diff --git a/common/arm/ihevc_sao_band_offset_chroma.s b/common/arm/ihevc_sao_band_offset_chroma.s index 32e149d..a9da725 100644 --- a/common/arm/ihevc_sao_band_offset_chroma.s +++ b/common/arm/ihevc_sao_band_offset_chroma.s @@ -61,6 +61,14 @@ @r9 => wd @r10=> ht +.equ pu1_src_top_left_offset, 104 +.equ sao_band_pos_u_offset, 108 +.equ sao_band_pos_v_offset, 112 +.equ pi1_sao_u_offset, 116 +.equ pi1_sao_v_offset, 120 +.equ wd_offset, 124 +.equ ht_offset, 128 + .text .p2align 2 @@ -76,10 +84,11 @@ gu1_table_band_idx_addr_2: ihevc_sao_band_offset_chroma_a9q: STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments - LDR r4,[sp,#40] @Loads pu1_src_top_left - LDR r10,[sp,#64] @Loads ht + vpush {d8 - d15} + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left + LDR r10,[sp,#ht_offset] @Loads ht - LDR r9,[sp,#60] @Loads wd + LDR r9,[sp,#wd_offset] @Loads wd MOV r11,r10 @Move the ht to r9 for loop counter ADD r12,r0,r9 @pu1_src[row * src_strd + (wd)] @@ -94,7 +103,7 @@ SRC_LEFT_LOOP: STRH r5,[r2],#2 @Store the value in pu1_src_left pointer BNE SRC_LEFT_LOOP - LDR r5,[sp,#44] @Loads sao_band_pos_u + LDR r5,[sp,#sao_band_pos_u_offset] @Loads sao_band_pos_u VLD1.8 D1,[r14]! @band_table_u.val[0] ADD r12,r3,r9 @pu1_src_top[wd] @@ -104,7 +113,7 @@ SRC_LEFT_LOOP: STRH r11,[r4] @store to pu1_src_top_left[0] VLD1.8 D3,[r14]! @band_table_u.val[2] - LDR r7,[sp,#52] @Loads pi1_sao_offset_u + LDR r7,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u SUB r4,r10,#1 @ht-1 VDUP.8 D31,r6 @band_pos_u @@ -147,7 +156,7 @@ ulbl2: VLD1.8 D10,[r14]! @band_table_v.val[1] VADD.I8 D3,D7,D27 @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3])) - LDR r6,[sp,#48] @Loads sao_band_pos_v + LDR r6,[sp,#sao_band_pos_v_offset] @Loads sao_band_pos_v VADD.I8 D4,D8,D26 @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4])) LSL r11,r6,#3 @sao_band_pos_v @@ -198,7 +207,7 @@ SAO_BAND_POS_U_0: SWITCH_BREAK_U: VDUP.8 D30,r11 @band_pos_v - LDR r8,[sp,#56] @Loads pi1_sao_offset_v + LDR r8,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v VLD1.8 D11,[r14]! @band_table_v.val[2] VADD.I8 D13,D9,D30 @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v) @@ -387,6 +396,7 @@ WIDTH_RESIDUE: @If width is not multiple of 16 BNE WIDTH_RESIDUE END_LOOP: + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_band_offset_luma.s b/common/arm/ihevc_sao_band_offset_luma.s index 3875377..66f2968 100644 --- a/common/arm/ihevc_sao_band_offset_luma.s +++ b/common/arm/ihevc_sao_band_offset_luma.s @@ -57,6 +57,12 @@ @r7 => wd @r8 => ht +.equ pu1_src_top_left_offset, 104 +.equ sao_band_pos_offset, 108 +.equ pi1_sao_offset, 112 +.equ wd_offset, 116 +.equ ht_offset, 120 + .text .p2align 2 @@ -69,15 +75,16 @@ gu1_table_band_idx_addr: ihevc_sao_band_offset_luma_a9q: STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - LDR r8,[sp,#56] @Loads ht - LDR r7,[sp,#52] @Loads wd + LDR r8,[sp,#ht_offset] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd MOV r9,r8 @Move the ht to r9 for loop counter - LDR r5,[sp,#44] @Loads sao_band_pos + LDR r5,[sp,#sao_band_pos_offset] @Loads sao_band_pos ADD r10,r0,r7 @pu1_src[row * src_strd + (wd)] - LDR r4,[sp,#40] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left SUB r10,r10,#1 @wd-1 LDR r14, gu1_table_band_idx_addr ulbl1: @@ -91,7 +98,7 @@ SRC_LEFT_LOOP: ADD r9,r3,r7 @pu1_src_top[wd] VLD1.8 D1,[r14]! @band_table.val[0] - LDR r6,[sp,#48] @Loads pi1_sao_offset + LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset LSL r11,r5,#3 VLD1.8 D2,[r14]! @band_table.val[1] @@ -226,6 +233,7 @@ HEIGHT_LOOP: ADD r0,r0,#8 BNE SWITCH_BREAK + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class0.s b/common/arm/ihevc_sao_edge_offset_class0.s index a9fe046..e4bb455 100644 --- a/common/arm/ihevc_sao_edge_offset_class0.s +++ b/common/arm/ihevc_sao_edge_offset_class0.s @@ -59,6 +59,14 @@ @r9 => wd @r10=> ht +.equ pu1_src_top_left_offset, 104 +.equ pu1_src_top_right_offset, 108 +.equ pu1_src_bot_left_offset, 112 +.equ pu1_avail_offset, 116 +.equ pi1_sao_offset, 120 +.equ wd_offset, 124 +.equ ht_offset, 128 + .text .p2align 2 @@ -72,23 +80,25 @@ ihevc_sao_edge_offset_class0_a9q: STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments - LDR r9,[sp,#60] @Loads wd + vpush {d8 - d15} + + LDR r9,[sp,#wd_offset] @Loads wd - LDR r4,[sp,#40] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2) ADD r11,r3,r9 @pu1_src_top[wd] - LDR r10,[sp,#64] @Loads ht + LDR r10,[sp,#ht_offset] @Loads ht VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0) LDRB r12,[r11,#-1] @pu1_src_top[wd - 1] - LDR r7,[sp,#52] @Loads pu1_avail + LDR r7,[sp,#pu1_avail_offset] @Loads pu1_avail VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) LDR r14, gi1_table_edge_idx_addr @table pointer ulbl1: add r14,r14,pc - LDR r8,[sp,#56] @Loads pi1_sao_offset + LDR r8,[sp,#pi1_sao_offset] @Loads pi1_sao_offset VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1) STRB r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1] @@ -337,6 +347,7 @@ PU1_SRC_LOOP_RESIDUE: BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop END_LOOPS: + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class0_chroma.s b/common/arm/ihevc_sao_edge_offset_class0_chroma.s index 1dd56f6..e11cd4f 100644 --- a/common/arm/ihevc_sao_edge_offset_class0_chroma.s +++ b/common/arm/ihevc_sao_edge_offset_class0_chroma.s @@ -60,6 +60,15 @@ @r9 => wd @r10=> ht +.equ pu1_src_top_left_offset, 104 +.equ pu1_src_top_right_offset, 108 +.equ pu1_src_bot_left_offset, 112 +.equ pu1_avail_offset, 116 +.equ pi1_sao_u_offset, 120 +.equ pi1_sao_v_offset, 124 +.equ wd_offset, 128 +.equ ht_offset, 132 + .text .p2align 2 @@ -73,20 +82,22 @@ ihevc_sao_edge_offset_class0_chroma_a9q: STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments - LDR r9,[sp,#64] @Loads wd + vpush {d8 - d15} + + LDR r9,[sp,#wd_offset] @Loads wd - LDR r4,[sp,#40] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left ADD r11,r3,r9 @pu1_src_top[wd] - LDR r10,[sp,#68] @Loads ht + LDR r10,[sp,#ht_offset] @Loads ht VMOV.I8 Q1,#2 @const_2 = vdupq_n_s8(2) LDRH r12,[r11,#-2] @pu1_src_top[wd - 1] - LDR r7,[sp,#52] @Loads pu1_avail + LDR r7,[sp,#pu1_avail_offset] @Loads pu1_avail VMOV.I16 Q2,#0 @const_min_clip = vdupq_n_s16(0) STRH r12,[r4] @*pu1_src_top_left = pu1_src_top[wd - 1] - LDR r8,[sp,#56] @Loads pi1_sao_offset_u + LDR r8,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u VMOV.I16 Q3,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) SUB r4,r10,#1 @(ht - 1) @@ -96,7 +107,7 @@ ulbl1: VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1) MUL r4,r4,r1 @(ht - 1) * src_strd - LDR r5,[sp,#60] @Loads pi1_sao_offset_v + LDR r5,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v VLD1.8 D11,[r8] @offset_tbl = vld1_s8(pi1_sao_offset_u) ADD r4,r4,r0 @pu1_src[(ht - 1) * src_strd] @@ -423,6 +434,7 @@ PU1_SRC_LOOP_RESIDUE: BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to the pu1_src loop END_LOOPS: + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class1.s b/common/arm/ihevc_sao_edge_offset_class1.s index aa1337f..029ac46 100644 --- a/common/arm/ihevc_sao_edge_offset_class1.s +++ b/common/arm/ihevc_sao_edge_offset_class1.s @@ -58,6 +58,14 @@ @r7 => wd @r8 => ht +.equ pu1_src_top_left_offset, 104 +.equ pu1_src_top_right_offset, 108 +.equ pu1_src_bot_left_offset, 112 +.equ pu1_avail_offset, 116 +.equ pi1_sao_offset, 120 +.equ wd_offset, 124 +.equ ht_offset, 128 + .text .p2align 2 @@ -71,11 +79,13 @@ ihevc_sao_edge_offset_class1_a9q: STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments - LDR r7,[sp,#60] @Loads wd - LDR r4,[sp,#40] @Loads pu1_src_top_left - LDR r5,[sp,#52] @Loads pu1_avail - LDR r6,[sp,#56] @Loads pi1_sao_offset - LDR r8,[sp,#64] @Loads ht + vpush {d8 - d15} + + LDR r7,[sp,#wd_offset] @Loads wd + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset + LDR r8,[sp,#ht_offset] @Loads ht SUB r9,r7,#1 @wd - 1 LDRB r10,[r3,r9] @pu1_src_top[wd - 1] @@ -362,6 +372,7 @@ PU1_SRC_LOOP_RESIDUE: VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row) END_LOOPS: + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class1_chroma.s b/common/arm/ihevc_sao_edge_offset_class1_chroma.s index 09d925f..b377220 100644 --- a/common/arm/ihevc_sao_edge_offset_class1_chroma.s +++ b/common/arm/ihevc_sao_edge_offset_class1_chroma.s @@ -60,6 +60,15 @@ @r8 => wd @r9 => ht +.equ pu1_src_top_left_offset, 104 +.equ pu1_src_top_right_offset, 108 +.equ pu1_src_bot_left_offset, 112 +.equ pu1_avail_offset, 116 +.equ pi1_sao_u_offset, 120 +.equ pi1_sao_v_offset, 124 +.equ wd_offset, 128 +.equ ht_offset, 132 + .text .p2align 2 @@ -73,13 +82,13 @@ ihevc_sao_edge_offset_class1_chroma_a9q: STMFD sp!, {r4-r12, r14} @stack stores the values of the arguments - LDR r7,[sp,#60] @Loads wd - LDR r4,[sp,#40] @Loads pu1_src_top_left - LDR r5,[sp,#52] @Loads pu1_avail - LDR r6,[sp,#56] @Loads pi1_sao_offset_u - LDR r7,[sp,#60] @Loads pi1_sao_offset_v - LDR r8,[sp,#64] @Loads wd - LDR r9,[sp,#68] @Loads ht + vpush {d8 - d15} + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u + LDR r7,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v + LDR r8,[sp,#wd_offset] @Loads wd + LDR r9,[sp,#ht_offset] @Loads ht SUB r10,r8,#2 @wd - 2 LDRH r11,[r3,r10] @pu1_src_top[wd - 2] @@ -398,6 +407,7 @@ PU1_SRC_LOOP_RESIDUE: VST1.8 {D30},[r10],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row) END_LOOPS: + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class2.s b/common/arm/ihevc_sao_edge_offset_class2.s index 536f941..15d6efa 100644 --- a/common/arm/ihevc_sao_edge_offset_class2.s +++ b/common/arm/ihevc_sao_edge_offset_class2.s @@ -58,6 +58,14 @@ @r7 => wd @r8=> ht +.equ pu1_src_top_left_offset, 264 +.equ pu1_src_top_right_offset, 268 +.equ pu1_src_bot_left_offset, 272 +.equ pu1_avail_offset, 276 +.equ pi1_sao_offset, 280 +.equ wd_offset, 284 +.equ ht_offset, 288 + .text .syntax unified .p2align 2 @@ -78,28 +86,29 @@ ihevc_sao_edge_offset_class2_a9q: STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments - LDR r7,[sp,#0x3C] @Loads wd + vpush {d8 - d15} + SUB sp,sp,#160 @Decrement the stack pointer to store some temp arr values - LDR r8,[sp,#0x40] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd + LDR r8,[sp,#ht_offset] @Loads ht SUB r9,r7,#1 @wd - 1 - LDR r4,[sp,#0x28] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left LDRB r10,[r3,r9] @pu1_src_top[wd - 1] - STR r0,[sp,#0x2C] @Store pu1_src in sp + STR r0,[sp,#152] @Store pu1_src in sp MOV r9,r7 @Move width to r9 for loop count - STR r2,[sp,#0x30] @Store pu1_src_left in sp - LDR r5,[sp,#0x34] @Loads pu1_avail - LDR r6,[sp,#0x38] @Loads pi1_sao_offset - STR r3,[sp,#0x38] @Store pu1_src_top in sp + STR r2,[sp,#156] @Store pu1_src_left in sp + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset + STR r3,[sp,#148] @Store pu1_src_top in sp - SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1] SUB r10,r8,#1 @ht-1 MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col] - ADD r12,sp,#0x02 @temp array + ADD r12,sp,#2 @temp array AU1_SRC_TOP_LOOP: VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col] @@ -203,7 +212,7 @@ ulbl3: VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1) ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1 - STR r0,[sp,#0x90] @Store pu1_src in sp + STR r0,[sp,#144] @Store pu1_src in sp CMP r7,#16 @Compare wd with 16 BLT WIDTH_RESIDUE @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case @@ -211,9 +220,9 @@ ulbl3: BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP WIDTH_LOOP_16: - LDR r7,[sp,#0xD0] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -232,21 +241,21 @@ SKIP_AU1_MASK_VAL: MOVNE r8,r3 @pu1_src_top_cpy SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1 - LDR r7,[sp,#0xD0] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) SUB r8,#8 ADD r3,r3,#16 - ADD r5,sp,#0x42 @*au1_src_left_tmp + ADD r5,sp,#66 @*au1_src_left_tmp VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src) VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src) SUB r0,#8 - LDR r4,[sp,#0xD4] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht SUB r7,r7,r6 @(wd - col) VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row) - LDR r8,[sp,#0xC0] @Loads *pu1_src + LDR r8,[sp,#152] @Loads *pu1_src ADD r7,r7,#15 @15 + (wd - col) VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row) @@ -263,7 +272,7 @@ AU1_SRC_LEFT_LOOP: ADD r8,r0,r1 @I Iteration *pu1_src + src_strd VMOV.I8 Q9,#0 - LDR r4,[sp,#0xC8] @I Loads pu1_avail + LDR r4,[sp,#pu1_avail_offset] @I Loads pu1_avail MOV r7,r12 @row count, move ht_tmp to r7 VLD1.8 D16,[r8]! @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) @@ -498,11 +507,11 @@ PU1_SRC_LOOP: INNER_LOOP_DONE: - ADD r5,sp,#0x42 @*au1_src_left_tmp + ADD r5,sp,#66 @*au1_src_left_tmp VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row) - LDR r2,[sp,#0xC4] @Loads *pu1_src_left + LDR r2,[sp,#156] @Loads *pu1_src_left - LDR r8,[sp,#0xD4] @Loads ht + LDR r8,[sp,#ht_offset] @Loads ht SUB r5,r5,#1 SUB r2,r2,#1 @@ -515,8 +524,8 @@ SRC_LEFT_LOOP: SUB r6,r6,#16 @Decrement the wd loop count by 16 CMP r6,#8 @Check whether residue remains BLT RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0xD0] @Loads wd - LDR r0,[sp,#0x90] @Loads *pu1_src + LDR r7,[sp,#wd_offset] @Loads wd + LDR r0,[sp,#144] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 BGT WIDTH_LOOP_16 @If not equal jump to width_loop @@ -524,8 +533,8 @@ SRC_LEFT_LOOP: WD_16_HT_4_LOOP: - LDR r7,[sp,#0xD0] @Loads wd - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @au1_mask = vsetq_lane_s8(-1, au1_mask, 0) @@ -544,21 +553,21 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: MOVNE r8,r3 SUB r8,r8,#1 @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1 - LDR r7,[sp,#0xD0] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1) SUB r8,#8 ADD r3,r3,#16 - ADD r5,sp,#0x42 @*au1_src_left_tmp + ADD r5,sp,#66 @*au1_src_left_tmp VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src) VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src) SUB r0,#8 - LDR r4,[sp,#0xD4] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht SUB r7,r7,r6 @(wd - col) VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row) - LDR r8,[sp,#0xC0] @Loads *pu1_src + LDR r8,[sp,#152] @Loads *pu1_src ADD r7,r7,#15 @15 + (wd - col) VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row) @@ -588,7 +597,7 @@ PU1_SRC_LOOP_WD_16_HT_4: CMP r7,r12 BLT SIGN_UP_CHANGE_WD_16_HT_4 - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4 @@ -639,9 +648,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 - LDR r8,[sp,#0xD4] @Loads ht - ADD r5,sp,#0x42 @*au1_src_left_tmp - LDR r2,[sp,#0xC4] @Loads *pu1_src_left + LDR r8,[sp,#ht_offset] @Loads ht + ADD r5,sp,#66 @*au1_src_left_tmp + LDR r2,[sp,#156] @Loads *pu1_src_left SUB r5,r5,#1 SUB r2,r2,#1 @@ -656,8 +665,8 @@ SRC_LEFT_LOOP_WD_16_HT_4: WIDTH_RESIDUE: - LDR r7,[sp,#0xD0] @Loads wd - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @wd_residue == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -679,16 +688,16 @@ PU1_AVAIL_2_RESIDUE: SUB r8,r8,#1 - ADD r5,sp,#0x42 @*au1_src_left_tmp + ADD r5,sp,#66 @*au1_src_left_tmp VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1) VLD1.8 D11,[r8]! @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1) - LDR r7,[sp,#0xD0] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd - LDR r4,[sp,#0xD4] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row) SUB r7,r7,#1 @(wd - 1) - LDR r8,[sp,#0xC0] @Loads *pu1_src + LDR r8,[sp,#152] @Loads *pu1_src VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row) SUB r5,r5,#1 @@ -718,7 +727,7 @@ PU1_SRC_LOOP_RESIDUE: CMP r7,r12 BLT SIGN_UP_CHANGE_RESIDUE - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_RESIDUE @@ -762,10 +771,10 @@ SIGN_UP_CHANGE_DONE_RESIDUE: SUBS r7,r7,#1 BNE PU1_SRC_LOOP_RESIDUE - LDR r8,[sp,#0xD4] @Loads ht - ADD r5,sp,#0x42 @*au1_src_left_tmp + LDR r8,[sp,#ht_offset] @Loads ht + ADD r5,sp,#66 @*au1_src_left_tmp - LDR r2,[sp,#0xC4] @Loads *pu1_src_left + LDR r2,[sp,#156] @Loads *pu1_src_left SUB r5,r5,#1 SUB r2,r2,#1 @@ -778,23 +787,23 @@ SRC_LEFT_LOOP_RESIDUE: RE_ASSINING_LOOP: - LDR r8,[sp,#0xD4] @Loads ht - LDR r7,[sp,#0xD0] @Loads wd + LDR r8,[sp,#ht_offset] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd - LDR r0,[sp,#0xC0] @Loads *pu1_src + LDR r0,[sp,#152] @Loads *pu1_src SUB r8,r8,#1 @ht - 1 MLA r6,r8,r1,r7 @wd - 1 + (ht - 1) * src_strd STRB r9,[r0] @pu1_src_org[0] = u1_pos_0_0_tmp - LDR r4,[sp,#0xBC] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left ADD r6,r0,r6 @pu1_src[wd - 1 + (ht - 1) * src_strd] - ADD r12,sp,#0x02 + ADD r12,sp,#2 STRB r10,[r6,#-1] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp LDRB r11,[sp] @load u1_src_top_left_tmp from stack pointer - LDR r3,[sp,#0xCC] @Loads pu1_src_top + LDR r3,[sp,#148] @Loads pu1_src_top STRB r11,[r4] @*pu1_src_top_left = u1_src_top_left_tmp @@ -805,7 +814,8 @@ SRC_TOP_LOOP: BNE SRC_TOP_LOOP END_LOOPS: - ADD sp,sp,#0x94 + ADD sp,sp,#160 + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class2_chroma.s b/common/arm/ihevc_sao_edge_offset_class2_chroma.s index 6a301cb..f7ab3f8 100644 --- a/common/arm/ihevc_sao_edge_offset_class2_chroma.s +++ b/common/arm/ihevc_sao_edge_offset_class2_chroma.s @@ -60,6 +60,15 @@ @r7 => wd @r8=> ht +.equ pu1_src_top_left_offset, 328 +.equ pu1_src_top_right_offset, 332 +.equ pu1_src_bot_left_offset, 336 +.equ pu1_avail_offset, 340 +.equ pi1_sao_u_offset, 344 +.equ pi1_sao_v_offset, 348 +.equ wd_offset, 352 +.equ ht_offset, 356 + .text .syntax unified .p2align 2 @@ -86,23 +95,24 @@ ihevc_sao_edge_offset_class2_chroma_a9q: STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments + vpush {d8 - d15} + SUB sp,sp,#224 @Decrement the stack pointer to store some temp arr values - LDR r7,[sp,#0x40] @Loads wd - LDR r8,[sp,#0x44] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd + LDR r8,[sp,#ht_offset] @Loads ht SUB r9,r7,#2 @wd - 2 - LDR r4,[sp,#0x28] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left LDRH r10,[r3,r9] @pu1_src_top[wd - 2] - STR r0,[sp,#0x2C] @Store pu1_src in sp + STR r0,[sp,#212] @Store pu1_src in sp MOV r9,r7 @Move width to r9 for loop count - STR r2,[sp,#0x30] @Store pu1_src_left in sp - LDR r5,[sp,#0x34] @Loads pu1_avail - LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u + STR r2,[sp,#216] @Store pu1_src_left in sp + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u - STR r3,[sp,#0x38] @Store pu1_src_top in sp - SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values + STR r3,[sp,#220] @Store pu1_src_top in sp STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2] SUB r10,r8,#1 @ht-1 @@ -178,7 +188,7 @@ ulbl2: LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx] CMP r12,#0 @0 != edge_idx BEQ PU1_AVAIL_7_LOOP_U - LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v + LDR r11,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx] ADD r10,r10,r11 @pu1_src[0] + pi1_sao_offset_v[edge_idx] USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1) @@ -253,7 +263,7 @@ ulbl4: LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx] CMP r12,#0 BEQ PU1_AVAIL_3_LOOP - LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v + LDR r14,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx] ADD r9,r9,r11 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx] USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) @@ -280,7 +290,7 @@ PU1_AVAIL_3_LOOP: VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u) SUBEQ r12,r12,#1 @ht_tmp-- - LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v + LDR r6,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v ADDEQ r14,r14,#2 @pu1_src_left_cpy += 2 STR r0,[sp,#2] @Store pu1_src in sp @@ -298,8 +308,8 @@ ulbl5: BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP WIDTH_LOOP_16: - LDR r5,[sp,#0x108] @Loads pu1_avail - LDR r7,[sp,#0x114] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -321,16 +331,16 @@ SKIP_AU1_MASK_VAL: SUB r0,#8 CMP r9,#0 - LDR r4,[sp,#0x118] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht SUBEQ r8,r0,r1 @pu1_src - src_strd - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd MOVNE r8,r3 @pu1_src_top_cpy SUB r8,r8,#2 @pu1_src - src_strd - 2 ADD r3,r3,#16 - ADD r5,sp,#0x4B @*au1_src_left_tmp + ADD r5,sp,#75 @*au1_src_left_tmp VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) SUB r8,#8 @@ -338,7 +348,7 @@ SKIP_AU1_MASK_VAL: ADD r7,r7,#14 @15 + (wd - col) VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row) - LDR r8,[sp,#0x100] @Loads *pu1_src + LDR r8,[sp,#212] @Loads *pu1_src ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)] VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row) @@ -364,7 +374,7 @@ AU1_SRC_LEFT_LOOP: VMOV.I8 Q9,#0 LDRH r5,[r8] @I pu1_src_cpy[src_strd + 16] - LDR r10,[sp,#0x108] @I Loads pu1_avail + LDR r10,[sp,#pu1_avail_offset] @I Loads pu1_avail VMOV.16 D18[0],r5 @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) LDRB r10,[r10,#2] @I pu1_avail[2] @@ -654,11 +664,11 @@ PU1_SRC_LOOP: INNER_LOOP_DONE: - LDR r8,[sp,#0x118] @Loads ht + LDR r8,[sp,#ht_offset] @Loads ht VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0]) - ADD r5,sp,#0x4B @*au1_src_left_tmp + ADD r5,sp,#75 @*au1_src_left_tmp - LDR r11,[sp,#0x104] @Loads *pu1_src_left + LDR r11,[sp,#216] @Loads *pu1_src_left VMOVN.I16 D21,Q9 @vmovn_s16(pi2_tmp_cur_row.val[1]) @@ -673,8 +683,8 @@ SRC_LEFT_LOOP: CMP r6,#8 @Check whether residue remains BLT RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0x114] @Loads wd - LDR r0,[sp,#0x02] @Loads *pu1_src + LDR r7,[sp,#wd_offset] @Loads wd + LDR r0,[sp,#2] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 BGT WIDTH_LOOP_16 @If not equal jump to width_loop @@ -682,8 +692,8 @@ SRC_LEFT_LOOP: WD_16_HT_4_LOOP: - LDR r5,[sp,#0x108] @Loads pu1_avail - LDR r7,[sp,#0x114] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -709,12 +719,12 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: SUB r8,#8 ADD r3,r3,#16 - ADD r5,sp,#0x4B @*au1_src_left_tmp - LDR r4,[sp,#0x118] @Loads ht - LDR r7,[sp,#0x114] @Loads wd + ADD r5,sp,#75 @*au1_src_left_tmp + LDR r4,[sp,#ht_offset] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd SUB r7,r7,r6 @(wd - col) ADD r7,r7,#14 @15 + (wd - col) - LDR r8,[sp,#0x100] @Loads *pu1_src + LDR r8,[sp,#212] @Loads *pu1_src ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)] AU1_SRC_LEFT_LOOP_WD_16_HT_4: @@ -749,7 +759,7 @@ PU1_SRC_LOOP_WD_16_HT_4: CMP r7,r12 BLT SIGN_UP_CHANGE_WD_16_HT_4 - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4 @@ -815,9 +825,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 - LDR r8,[sp,#0x118] @Loads ht - ADD r5,sp,#0x4B @*au1_src_left_tmp - LDR r11,[sp,#0x104] @Loads *pu1_src_left + LDR r8,[sp,#ht_offset] @Loads ht + ADD r5,sp,#75 @*au1_src_left_tmp + LDR r11,[sp,#216] @Loads *pu1_src_left SRC_LEFT_LOOP_WD_16_HT_4: LDR r7,[r5],#4 @au1_src_left_tmp[row] @@ -829,16 +839,16 @@ SRC_LEFT_LOOP_WD_16_HT_4: SUBS r6,r6,#16 @Decrement the wd loop count by 16 BLE RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0x114] @Loads wd - LDR r0,[sp,#0x02] @Loads *pu1_src + LDR r7,[sp,#wd_offset] @Loads wd + LDR r0,[sp,#2] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 BGT WD_16_HT_4_LOOP WIDTH_RESIDUE: - LDR r7,[sp,#0x114] @Loads wd - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @wd_residue == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -860,10 +870,10 @@ WIDTH_RESIDUE: VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) SUB r8,#8 - ADD r5,sp,#0x4B @*au1_src_left_tmp - LDR r4,[sp,#0x118] @Loads ht - LDR r7,[sp,#0x114] @Loads wd - LDR r8,[sp,#0x100] @Loads *pu1_src + ADD r5,sp,#75 @*au1_src_left_tmp + LDR r4,[sp,#ht_offset] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd + LDR r8,[sp,#212] @Loads *pu1_src SUB r7,r7,#2 @(wd - 2) ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 2)] @@ -897,7 +907,7 @@ PU1_SRC_LOOP_RESIDUE: CMP r7,r12 BLT SIGN_UP_CHANGE_RESIDUE - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_RESIDUE @@ -957,9 +967,9 @@ SIGN_UP_CHANGE_DONE_RESIDUE: SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP - LDR r8,[sp,#0x118] @Loads ht - LDR r11,[sp,#0x104] @Loads *pu1_src_left - ADD r5,sp,#0x4B @*au1_src_left_tmp + LDR r8,[sp,#ht_offset] @Loads ht + LDR r11,[sp,#216] @Loads *pu1_src_left + ADD r5,sp,#75 @*au1_src_left_tmp SRC_LEFT_LOOP_RESIDUE: LDR r7,[r5],#4 @au1_src_left_tmp[row] @@ -970,12 +980,12 @@ SRC_LEFT_LOOP_RESIDUE: RE_ASSINING_LOOP: - LDR r8,[sp,#0x118] @Loads ht + LDR r8,[sp,#ht_offset] @Loads ht - LDR r0,[sp,#0x100] @Loads *pu1_src + LDR r0,[sp,#212] @Loads *pu1_src SUB r8,r8,#1 @ht - 1 - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd LDRH r9,[sp,#6] MLA r6,r8,r1,r7 @wd - 2 + (ht - 1) * src_strd @@ -987,10 +997,10 @@ RE_ASSINING_LOOP: ADD r12,sp,#10 STRH r9,[r6,#-2] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u - LDR r4,[sp,#0xFC] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp - LDR r3,[sp,#0x10C] @Loads pu1_src_top + LDR r3,[sp,#220] @Loads pu1_src_top SRC_TOP_LOOP: VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col] @@ -999,7 +1009,9 @@ SRC_TOP_LOOP: BNE SRC_TOP_LOOP END_LOOPS: - ADD sp,sp,#0xD4 + ADD sp,sp,#224 + + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class3.s b/common/arm/ihevc_sao_edge_offset_class3.s index f3482dc..fb3b05c 100644 --- a/common/arm/ihevc_sao_edge_offset_class3.s +++ b/common/arm/ihevc_sao_edge_offset_class3.s @@ -58,6 +58,14 @@ @r7 => wd @r8=> ht +.equ pu1_src_top_left_offset, 264 +.equ pu1_src_top_right_offset, 268 +.equ pu1_src_bot_left_offset, 272 +.equ pu1_avail_offset, 276 +.equ pi1_sao_offset, 280 +.equ wd_offset, 284 +.equ ht_offset, 288 + .text .syntax unified .p2align 2 @@ -78,26 +86,27 @@ ihevc_sao_edge_offset_class3_a9q: STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments - LDR r7,[sp,#0x3C] @Loads wd + vpush {d8 - d15} + SUB sp,sp,#160 @Decrement the stack pointer to store some temp arr values + LDR r7,[sp,#wd_offset] @Loads wd - LDR r8,[sp,#0x40] @Loads ht + LDR r8,[sp,#ht_offset] @Loads ht SUB r9,r7,#1 @wd - 1 - LDR r4,[sp,#0x28] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left LDRB r10,[r3,r9] @pu1_src_top[wd - 1] MOV r9,r7 @Move width to r9 for loop count - LDR r5,[sp,#0x34] @Loads pu1_avail - LDR r6,[sp,#0x38] @Loads pi1_sao_offset - STR r3,[sp,#0x38] @Store pu1_src_top in sp + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r6,[sp,#pi1_sao_offset] @Loads pi1_sao_offset + STR r3,[sp,#156] @Store pu1_src_top in sp - SUB sp,sp,#0x94 @Decrement the stack pointer to store some temp arr values STRB r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 1] SUB r10,r8,#1 @ht-1 MLA r11,r10,r1,r0 @pu1_src[(ht - 1) * src_strd + col] - ADD r12,sp,#0x02 @temp array + ADD r12,sp,#2 @temp array AU1_SRC_TOP_LOOP: VLD1.8 D0,[r11]! @pu1_src[(ht - 1) * src_strd + col] @@ -112,7 +121,7 @@ PU1_AVAIL_5_LOOP: LDRB r9,[r0,r10] @u1_pos_0_0_tmp = pu1_src[wd - 1] BEQ PU1_AVAIL_6_LOOP - LDR r11,[sp,#0xC0] @Load pu1_src_top_right from sp + LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp SUB r10,r10,#1 @[wd - 1 - 1] LDRB r11,[r11] @pu1_src_top_right[0] @@ -147,13 +156,13 @@ PU1_AVAIL_6_LOOP: SUB r11,r8,#1 @ht - 1 CMP r10,#0 - STR r0,[sp,#0xC0] @Store pu1_src in sp + STR r0,[sp,#148] @Store pu1_src in sp MLA r12,r11,r1,r0 @pu1_src[(ht - 1) * src_strd] LDRB r10,[r12] @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd] BEQ PU1_AVAIL_3_LOOP - LDR r14,[sp,#0xC4] @Load pu1_src_bot_left from sp + LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp SUB r11,r12,r1 @pu1_src[(ht - 1) * src_strd) - src_strd] LDRB r14,[r14] @Load pu1_src_bot_left[0] @@ -186,7 +195,7 @@ ulbl2: USAT r10,#8,r10 @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) PU1_AVAIL_3_LOOP: - STR r2,[sp,#0xC4] @Store pu1_src_left in sp + STR r2,[sp,#152] @Store pu1_src_left in sp MOV r12,r8 @Move ht MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy @@ -211,7 +220,7 @@ ulbl3: VMOV.S8 Q4,#0xFF @au1_mask = vdupq_n_s8(-1) ADDEQ r14,r14,#1 @pu1_src_left_cpy += 1 - STR r0,[sp,#0x90] @Store pu1_src in sp + STR r0,[sp,#144] @Store pu1_src in sp VLD1.8 D6,[r6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx) MOV r6,r7 @move wd to r6 loop_count @@ -221,9 +230,9 @@ ulbl3: BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP WIDTH_LOOP_16: - LDR r7,[sp,#0xD0] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @@ -238,13 +247,13 @@ SKIP_AU1_MASK_VAL: LDRB r8,[r5,#2] @pu1_avail[2] CMP r8,#0 - LDR r4,[sp,#0xD4] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht SUBEQ r8,r0,r1 @pu1_src - src_strd MOVNE r8,r3 - ADD r5,sp,#0x42 @*au1_src_left_tmp + ADD r5,sp,#66 @*au1_src_left_tmp - LDR r7,[sp,#0xD0] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd ADD r8,r8,#1 @pu1_src - src_strd + 1 SUB r7,r7,r6 @(wd - col) @@ -253,7 +262,7 @@ SKIP_AU1_MASK_VAL: SUB r8,#8 ADD r3,r3,#16 - LDR r8,[sp,#0xC0] @Loads *pu1_src + LDR r8,[sp,#148] @Loads *pu1_src VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src) VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src) SUB r0,#8 @@ -285,7 +294,7 @@ AU1_SRC_LEFT_LOOP: ADD r8,r8,#1 @I pu1_src_left_cpy[ht_tmp - row + 1] LDRB r8,[r8] - LDR r5,[sp,#0xC8] @I Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @I Loads pu1_avail VMOV.8 D19[7],r8 @I vsetq_lane_u8 LDRB r5,[r5,#2] @I pu1_avail[2] @@ -375,7 +384,7 @@ PU1_SRC_LOOP: CMP r7,#1 @III BNE NEXT_ROW_ELSE_2 @III - LDR r5,[sp,#0xC8] @III Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @III Loads pu1_avail LDRB r5,[r5,#3] @III pu1_avail[3] CMP r5,#0 @III SUBNE r8,r2,#2 @III pu1_src_cpy[src_strd - 1] @@ -465,7 +474,7 @@ NEXT_ROW_ELSE_2: ADD r8,r0,r1,LSL #1 @*pu1_src + src_strd VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0]) - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#3] @pu1_avail[3] VMOVN.I16 D21,Q11 @III vmovn_s16(pi2_tmp_cur_row.val[1]) @@ -529,13 +538,13 @@ NEXT_ROW_POINTER_ASSIGNED_3: INNER_LOOP_DONE: VMOVN.I16 D20,Q10 @vmovn_s16(pi2_tmp_cur_row.val[0]) - LDR r8,[sp,#0xD4] @Loads ht + LDR r8,[sp,#ht_offset] @Loads ht VMOVN.I16 D21,Q11 @vmovn_s16(pi2_tmp_cur_row.val[1]) - ADD r5,sp,#0x42 @*au1_src_left_tmp + ADD r5,sp,#66 @*au1_src_left_tmp VST1.8 {Q10},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row) - LDR r2,[sp,#0xC4] @Loads *pu1_src_left + LDR r2,[sp,#152] @Loads *pu1_src_left SRC_LEFT_LOOP: LDR r7,[r5],#4 @au1_src_left_tmp[row] SUBS r8,r8,#4 @@ -545,8 +554,8 @@ SRC_LEFT_LOOP: SUBS r6,r6,#16 @Decrement the wd loop count by 16 CMP r6,#8 @Check whether residue remains BLT RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0xD0] @Loads wd - LDR r0,[sp,#0x90] @Loads *pu1_src + LDR r7,[sp,#wd_offset] @Loads wd + LDR r0,[sp,#144] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 BGT WIDTH_LOOP_16 @If not equal jump to width_loop @@ -555,8 +564,8 @@ SRC_LEFT_LOOP: WD_16_HT_4_LOOP: - LDR r5,[sp,#0xC8] @Loads pu1_avail - LDR r7,[sp,#0xD0] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @@ -579,12 +588,12 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: SUB r8,#8 ADD r3,r3,#16 - ADD r5,sp,#0x42 @*au1_src_left_tmp - LDR r4,[sp,#0xD4] @Loads ht - LDR r7,[sp,#0xD0] @Loads wd + ADD r5,sp,#66 @*au1_src_left_tmp + LDR r4,[sp,#ht_offset] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd SUB r7,r7,r6 @(wd - col) ADD r7,r7,#15 @15 + (wd - col) - LDR r8,[sp,#0xC0] @Loads *pu1_src + LDR r8,[sp,#148] @Loads *pu1_src ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)] SUB r5,r5,#1 @@ -609,7 +618,7 @@ PU1_SRC_LOOP_WD_16_HT_4: VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) SUB r8,#8 - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#3] @pu1_avail[3] CMP r5,#0 BEQ NEXT_ROW_ELSE_WD_16_HT_4 @@ -628,7 +637,7 @@ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4: CMP r7,r12 BNE SIGN_UP_CHANGE_WD_16_HT_4 - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4 @@ -680,9 +689,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: SUBS r7,r7,#1 @Decrement the ht_tmp loop count by 1 BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 - LDR r8,[sp,#0xD4] @Loads ht - ADD r5,sp,#0x42 @*au1_src_left_tmp - LDR r2,[sp,#0xC4] @Loads *pu1_src_left + LDR r8,[sp,#ht_offset] @Loads ht + ADD r5,sp,#66 @*au1_src_left_tmp + LDR r2,[sp,#152] @Loads *pu1_src_left SRC_LEFT_LOOP_WD_16_HT_4: LDR r7,[r5],#4 @au1_src_left_tmp[row] STR r7,[r2],#4 @pu1_src_left[row] = au1_src_left_tmp[row] @@ -691,16 +700,16 @@ SRC_LEFT_LOOP_WD_16_HT_4: SUBS r6,r6,#16 @Decrement the wd loop count by 16 BLE RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0xD0] @Loads wd - LDR r0,[sp,#0x90] @Loads *pu1_src + LDR r7,[sp,#wd_offset] @Loads wd + LDR r0,[sp,#144] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 BGT WD_16_HT_4_LOOP @If not equal jump to width_loop WIDTH_RESIDUE: - LDR r7,[sp,#0xD0] @Loads wd - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r7,[sp,#wd_offset] @Loads wd + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @wd_residue == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -722,10 +731,10 @@ PU1_AVAIL_2_RESIDUE: SUB r8,#8 - ADD r5,sp,#0x42 @*au1_src_left_tmp - LDR r4,[sp,#0xD4] @Loads ht - LDR r7,[sp,#0xD0] @Loads wd - LDR r8,[sp,#0xC0] @Loads *pu1_src + ADD r5,sp,#66 @*au1_src_left_tmp + LDR r4,[sp,#ht_offset] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd + LDR r8,[sp,#148] @Loads *pu1_src SUB r7,r7,#1 @(wd - 1) ADD r7,r8,r7 @pu1_src[0 * src_strd + (wd - 1)] SUB r5,r5,#1 @@ -751,7 +760,7 @@ PU1_SRC_LOOP_RESIDUE: VLD1.8 D16,[r8]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) VLD1.8 D17,[r8] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) SUB r8,#8 - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#3] @pu1_avail[3] CMP r5,#0 BEQ NEXT_ROW_ELSE_RESIDUE @@ -770,7 +779,7 @@ NEXT_ROW_POINTER_ASSIGNED_RESIDUE: CMP r7,r12 BNE SIGN_UP_CHANGE_RESIDUE - LDR r5,[sp,#0xC8] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_RESIDUE @@ -814,9 +823,9 @@ SIGN_UP_CHANGE_DONE_RESIDUE: SUBS r7,r7,#1 BNE PU1_SRC_LOOP_RESIDUE - LDR r8,[sp,#0xD4] @Loads ht - LDR r2,[sp,#0xC4] @Loads *pu1_src_left - ADD r5,sp,#0x42 @*au1_src_left_tmp + LDR r8,[sp,#ht_offset] @Loads ht + LDR r2,[sp,#152] @Loads *pu1_src_left + ADD r5,sp,#66 @*au1_src_left_tmp SRC_LEFT_LOOP_RESIDUE: LDR r7,[r5],#4 @au1_src_left_tmp[row] @@ -826,24 +835,24 @@ SRC_LEFT_LOOP_RESIDUE: RE_ASSINING_LOOP: - LDR r7,[sp,#0xD0] @Loads wd - LDR r0,[sp,#0xC0] @Loads *pu1_src + LDR r7,[sp,#wd_offset] @Loads wd + LDR r0,[sp,#148] @Loads *pu1_src - LDR r11,[sp,#0xD4] @Loads ht + LDR r11,[sp,#ht_offset] @Loads ht ADD r8,r0,r7 @pu1_src[wd] - LDR r4,[sp,#0xBC] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left SUB r11,r11,#1 @ht - 1 STRB r9,[r8,#-1] @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp MLA r6,r11,r1,r0 @pu1_src_org[(ht - 1) * src_strd] LDRB r8,[sp] @load u1_src_top_left_tmp from stack pointer - ADD r12,sp,#0x02 + ADD r12,sp,#2 STRB r10,[r6] @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp STRB r8,[r4] @*pu1_src_top_left = u1_src_top_left_tmp - LDR r3,[sp,#0xCC] @Loads pu1_src_top + LDR r3,[sp,#156] @Loads pu1_src_top SRC_TOP_LOOP: VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col] @@ -852,7 +861,8 @@ SRC_TOP_LOOP: BNE SRC_TOP_LOOP END_LOOPS: - ADD sp,sp,#0x94 + ADD sp,sp,#160 + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_sao_edge_offset_class3_chroma.s b/common/arm/ihevc_sao_edge_offset_class3_chroma.s index fe3b459..9f4eb62 100644 --- a/common/arm/ihevc_sao_edge_offset_class3_chroma.s +++ b/common/arm/ihevc_sao_edge_offset_class3_chroma.s @@ -60,6 +60,15 @@ @r7 => wd @r8=> ht +.equ pu1_src_top_left_offset, 328 +.equ pu1_src_top_right_offset, 332 +.equ pu1_src_bot_left_offset, 336 +.equ pu1_avail_offset, 340 +.equ pi1_sao_u_offset, 344 +.equ pi1_sao_v_offset, 348 +.equ wd_offset, 352 +.equ ht_offset, 356 + .text .syntax unified .p2align 2 @@ -86,21 +95,22 @@ ihevc_sao_edge_offset_class3_chroma_a9q: STMFD sp!,{r4-r12,r14} @stack stores the values of the arguments + vpush {d8 - d15} + SUB sp,sp,#224 @Decrement the stack pointer to store some temp arr values - LDR r7,[sp,#0x40] @Loads wd - LDR r8,[sp,#0x44] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd + LDR r8,[sp,#ht_offset] @Loads ht SUB r9,r7,#2 @wd - 2 - LDR r4,[sp,#0x28] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left LDRH r10,[r3,r9] @pu1_src_top[wd - 2] MOV r9,r7 @Move width to r9 for loop count - LDR r5,[sp,#0x34] @Loads pu1_avail - LDR r6,[sp,#0x38] @Loads pi1_sao_offset_u + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail + LDR r6,[sp,#pi1_sao_u_offset] @Loads pi1_sao_offset_u - STR r3,[sp,#0x38] @Store pu1_src_top in sp - SUB sp,sp,#0xD4 @Decrement the stack pointer to store some temp arr values + STR r3,[sp,#220] @Store pu1_src_top in sp STRH r10,[sp] @u1_src_top_left_tmp = pu1_src_top[wd - 2] SUB r10,r8,#1 @ht-1 @@ -122,7 +132,7 @@ PU1_AVAIL_5_LOOP_U: LDRB r10,[r0,r11] @u1_pos_0_0_tmp_v = pu1_src[wd - 1] BEQ PU1_AVAIL_6_LOOP_U - LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp + LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp LDRB r11,[r11] @pu1_src_top_right[0] SUB r12,r9,r11 @pu1_src[wd - 2] - pu1_src_top_right[0] CMP r12,#0 @@ -150,7 +160,7 @@ ulbl1: PU1_AVAIL_5_LOOP_V: - LDR r11,[sp,#0x100] @Load pu1_src_top_right from sp + LDR r11,[sp,#pu1_src_top_right_offset] @Load pu1_src_top_right from sp LDRB r11,[r11,#1] @pu1_src_top_right[1] SUB r12,r10,r11 @pu1_src[wd - 1] - pu1_src_top_right[1] CMP r12,#0 @@ -172,7 +182,7 @@ ulbl2: LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx] CMP r12,#0 @0 != edge_idx BEQ PU1_AVAIL_6_LOOP_U - LDR r11,[sp,#0x110] @Loads pi1_sao_offset_v + LDR r11,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v LDRSB r11,[r11,r12] @pi1_sao_offset_v[edge_idx] ADD r10,r10,r11 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx] USAT r10,#8,r10 @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1) @@ -180,7 +190,7 @@ ulbl2: PU1_AVAIL_6_LOOP_U: STRB r9,[sp,#6] STRB r10,[sp,#7] - STR r0,[sp,#0x100] @Store pu1_src in sp + STR r0,[sp,#212] @Store pu1_src in sp LDRB r10,[r5,#6] @pu1_avail[6] CMP r10,#0 @@ -198,7 +208,7 @@ PU1_AVAIL_6_LOOP_U: MVNLT r11,#0 MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) - LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp + LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp LDRB r14,[r14] @Load pu1_src_bot_left[0] SUB r14,r10,r14 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0] CMP r14,#0 @@ -228,7 +238,7 @@ PU1_AVAIL_6_LOOP_V: MVNLT r11,#0 MOVGT r11,#1 @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) - LDR r14,[sp,#0x104] @Load pu1_src_bot_left from sp + LDR r14,[sp,#pu1_src_bot_left_offset] @Load pu1_src_bot_left from sp LDRB r14,[r14,#1] @Load pu1_src_bot_left[1] SUB r14,r9,r14 @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1] CMP r14,#0 @@ -244,7 +254,7 @@ ulbl4: LDRSB r12,[r14,r11] @edge_idx = gi1_table_edge_idx[edge_idx] CMP r12,#0 BEQ PU1_AVAIL_3_LOOP - LDR r14,[sp,#0x110] @Loads pi1_sao_offset_v + LDR r14,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v LDRSB r11,[r14,r12] @pi1_sao_offset_v[edge_idx] ADD r9,r9,r11 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx] USAT r9,#8,r9 @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) @@ -252,7 +262,7 @@ ulbl4: PU1_AVAIL_3_LOOP: STRB r10,[sp,#8] STRB r9,[sp,#9] - STR r2,[sp,#0x104] @Store pu1_src_left in sp + STR r2,[sp,#216] @Store pu1_src_left in sp MOV r12,r8 @Move ht MOV r14,r2 @Move pu1_src_left to pu1_src_left_cpy @@ -276,7 +286,7 @@ PU1_AVAIL_2_LOOP_END: VMOV.I16 Q1,#0 @const_min_clip = vdupq_n_s16(0) VMOV.I16 Q2,#255 @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) VLD1.8 D6,[r6] @offset_tbl_u = vld1_s8(pi1_sao_offset_u) - LDR r6,[sp,#0x110] @Loads pi1_sao_offset_v + LDR r6,[sp,#pi1_sao_v_offset] @Loads pi1_sao_offset_v VLD1.8 D7,[r6] @offset_tbl_v = vld1_s8(pi1_sao_offset_v) LDR r2, gi1_table_edge_idx_addr_5 @table pointer ulbl5: @@ -291,9 +301,9 @@ ulbl5: BLE WD_16_HT_4_LOOP @If jump to WD_16_HT_4_LOOP WIDTH_LOOP_16: - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd CMP r6,r7 @col == wd - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRBEQ r8,[r5] @pu1_avail[0] MOVNE r8,#-1 @@ -314,7 +324,7 @@ SKIP_AU1_MASK_VAL: VLD1.8 D12,[r0]! @pu1_cur_row = vld1q_u8(pu1_src) VLD1.8 D13,[r0] @pu1_cur_row = vld1q_u8(pu1_src) SUB r0,#8 - ADD r5,sp,#0x4B @*au1_src_left_tmp + ADD r5,sp,#75 @*au1_src_left_tmp SUBEQ r8,r0,r1 @pu1_src - src_strd VMOV.I8 Q9,#0 @@ -326,15 +336,15 @@ SKIP_AU1_MASK_VAL: SUB r8,#8 ADD r3,r3,#16 - LDR r4,[sp,#0x118] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row) - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd SUB r7,r7,r6 @(wd - col) VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row) ADD r7,r7,#14 @15 + (wd - col) - LDR r8,[sp,#0x100] @Loads *pu1_src + LDR r8,[sp,#212] @Loads *pu1_src VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)] @@ -359,7 +369,7 @@ AU1_SRC_LEFT_LOOP: LDRH r5,[r8,#2] @I VMOV.16 D19[3],r5 @I vsetq_lane_u8 - LDR r11,[sp,#0x108] @I Loads pu1_avail + LDR r11,[sp,#pu1_avail_offset] @I Loads pu1_avail LDRB r11,[r11,#2] @I pu1_avail[2] VEXT.8 Q9,Q9,Q8,#14 @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) @@ -477,7 +487,7 @@ PU1_SRC_LOOP: VCGT.U8 Q11,Q6,Q14 @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) BNE NEXT_ROW_POINTER_ASSIGNED_2 @III - LDR r5,[sp,#0x108] @III Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @III Loads pu1_avail LDRB r5,[r5,#3] @III pu1_avail[3] CMP r5,#0 @III SUBNE r11,r4,#4 @III pu1_src[src_strd - 2] @@ -597,7 +607,7 @@ NEXT_ROW_POINTER_ASSIGNED_2: LDRB r9,[r0,#17] @load the value pu1_src_cpy[17 - src_strd] BNE NEXT_ROW_POINTER_ASSIGNED_3 - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#3] @pu1_avail[3] CMP r5,#0 SUBNE r8,r11,#4 @pu1_src[src_strd - 2] @@ -657,13 +667,13 @@ NEXT_ROW_POINTER_ASSIGNED_3: INNER_LOOP_DONE: - LDR r8,[sp,#0x118] @Loads ht + LDR r8,[sp,#ht_offset] @Loads ht VMOVN.I16 D20,Q10 @III vmovn_s16(pi2_tmp_cur_row.val[0]) - ADD r5,sp,#0x4B @*au1_src_left_tmp + ADD r5,sp,#75 @*au1_src_left_tmp LSL r8,r8,#1 VMOVN.I16 D21,Q9 @III vmovn_s16(pi2_tmp_cur_row.val[1]) - LDR r11,[sp,#0x104] @Loads *pu1_src_left + LDR r11,[sp,#216] @Loads *pu1_src_left SRC_LEFT_LOOP: LDR r7,[r5],#4 @au1_src_left_tmp[row] @@ -676,7 +686,7 @@ SRC_LEFT_LOOP: CMP r6,#8 @Check whether residue remains BLT RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd LDR r0,[sp,#0x02] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 @@ -684,9 +694,9 @@ SRC_LEFT_LOOP: BEQ WIDTH_RESIDUE @If residue remains jump to residue loop WD_16_HT_4_LOOP: - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @col == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -716,17 +726,17 @@ SKIP_AU1_MASK_VAL_WD_16_HT_4: VLD1.8 D10,[r8]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) VLD1.8 D11,[r8] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) SUB r8,#8 - ADD r5,sp,#0x4B @*au1_src_left_tmp + ADD r5,sp,#75 @*au1_src_left_tmp - LDR r4,[sp,#0x118] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht VCGT.U8 Q7,Q6,Q5 @vcgtq_u8(pu1_cur_row, pu1_top_row) - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd SUB r7,r7,r6 @(wd - col) VCLT.U8 Q8,Q6,Q5 @vcltq_u8(pu1_cur_row, pu1_top_row) ADD r7,r7,#14 @15 + (wd - col) - LDR r8,[sp,#0x100] @Loads *pu1_src + LDR r8,[sp,#212] @Loads *pu1_src VSUB.U8 Q7,Q8,Q7 @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) ADD r7,r8,r7 @pu1_src[0 * src_strd + 15 + (wd - col)] @@ -744,7 +754,7 @@ AU1_SRC_LEFT_LOOP_WD_16_HT_4: PU1_SRC_LOOP_WD_16_HT_4: ADD r9,r0,r1 @*pu1_src + src_strd - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) SUB r9,#8 @@ -766,7 +776,7 @@ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4: CMP r7,r12 BLT SIGN_UP_CHANGE_WD_16_HT_4 - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4 @@ -839,9 +849,9 @@ SIGN_UP_CHANGE_DONE_WD_16_HT_4: VST1.8 {Q14},[r0],r1 @vst1q_u8(pu1_src_cpy, pu1_cur_row) BNE PU1_SRC_LOOP_WD_16_HT_4 @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 - LDR r8,[sp,#0x118] @Loads ht - ADD r5,sp,#0x4B @*au1_src_left_tmp - LDR r11,[sp,#0x104] @Loads *pu1_src_left + LDR r8,[sp,#ht_offset] @Loads ht + ADD r5,sp,#75 @*au1_src_left_tmp + LDR r11,[sp,#216] @Loads *pu1_src_left SRC_LEFT_LOOP_WD_16_HT_4: LDR r7,[r5],#4 @au1_src_left_tmp[row] @@ -851,16 +861,16 @@ SRC_LEFT_LOOP_WD_16_HT_4: SUBS r6,r6,#16 @Decrement the wd loop count by 16 BLE RE_ASSINING_LOOP @Jump to re-assigning loop - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd LDR r0,[sp,#0x02] @Loads *pu1_src SUB r7,r7,r6 ADD r0,r0,r7 BGT WD_16_HT_4_LOOP @If not equal jump to width_loop WIDTH_RESIDUE: - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail CMP r6,r7 @wd_residue == wd LDRBEQ r8,[r5] @pu1_avail[0] @@ -878,13 +888,13 @@ WIDTH_RESIDUE: ADD r10,r10,#2 @pu1_src - src_strd + 2 VMOV.8 d8[6],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - ADD r5,sp,#0x4B @*au1_src_left_tmp + ADD r5,sp,#75 @*au1_src_left_tmp - LDR r4,[sp,#0x118] @Loads ht + LDR r4,[sp,#ht_offset] @Loads ht VMOV.8 d8[7],r11 @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - LDR r7,[sp,#0x114] @Loads wd + LDR r7,[sp,#wd_offset] @Loads wd - LDR r8,[sp,#0x100] @Loads *pu1_src + LDR r8,[sp,#212] @Loads *pu1_src VLD1.8 D10,[r10]! @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) VLD1.8 D11,[r10] @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) SUB r10,#8 @@ -917,7 +927,7 @@ PU1_SRC_LOOP_RESIDUE: VLD1.8 D16,[r9]! @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) VLD1.8 D17,[r9] @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) SUB r9,#8 - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#3] @pu1_avail[3] ADD r8,r14,r11,LSL #1 @pu1_src_left_cpy[(ht_tmp - row) * 2] @@ -940,7 +950,7 @@ NEXT_ROW_POINTER_ASSIGNED_RESIDUE: VEXT.8 Q9,Q9,Q8,#14 @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) BLT SIGN_UP_CHANGE_RESIDUE - LDR r5,[sp,#0x108] @Loads pu1_avail + LDR r5,[sp,#pu1_avail_offset] @Loads pu1_avail LDRB r5,[r5,#2] @pu1_avail[2] CMP r5,#0 BNE SIGN_UP_CHANGE_DONE_RESIDUE @@ -1007,10 +1017,10 @@ SIGN_UP_CHANGE_DONE_RESIDUE: BNE PU1_SRC_LOOP_RESIDUE @If not equal jump to PU1_SRC_LOOP - LDR r8,[sp,#0x118] @Loads ht - ADD r5,sp,#0x4B @*au1_src_left_tmp + LDR r8,[sp,#ht_offset] @Loads ht + ADD r5,sp,#75 @*au1_src_left_tmp - LDR r11,[sp,#0x104] @Loads *pu1_src_left + LDR r11,[sp,#216] @Loads *pu1_src_left SRC_LEFT_LOOP_RESIDUE: LDR r7,[r5],#4 @au1_src_left_tmp[row] @@ -1020,10 +1030,10 @@ SRC_LEFT_LOOP_RESIDUE: RE_ASSINING_LOOP: - LDR r7,[sp,#0x114] @Loads wd - LDR r8,[sp,#0x118] @Loads ht + LDR r7,[sp,#wd_offset] @Loads wd + LDR r8,[sp,#ht_offset] @Loads ht - LDR r0,[sp,#0x100] @Loads *pu1_src + LDR r0,[sp,#212] @Loads *pu1_src SUB r10,r7,#2 @wd - 2 LDRH r9,[sp,#6] @@ -1032,7 +1042,7 @@ RE_ASSINING_LOOP: STRH r9,[r0,r10] @pu1_src_org[0] = u1_pos_0_0_tmp MLA r6,r8,r1,r0 @pu1_src[(ht - 1) * src_strd] - LDR r4,[sp,#0xFC] @Loads pu1_src_top_left + LDR r4,[sp,#pu1_src_top_left_offset] @Loads pu1_src_top_left LDRH r9,[sp,#8] ADD r12,sp,#10 @@ -1041,7 +1051,7 @@ RE_ASSINING_LOOP: LDRH r10,[sp] @load u1_src_top_left_tmp from stack pointer STRH r10,[r4] @*pu1_src_top_left = u1_src_top_left_tmp - LDR r3,[sp,#0x10C] @Loads pu1_src_top + LDR r3,[sp,#220] @Loads pu1_src_top SRC_TOP_LOOP: VLD1.8 D0,[r12]! @pu1_src_top[col] = au1_src_top_tmp[col] @@ -1050,7 +1060,8 @@ SRC_TOP_LOOP: BNE SRC_TOP_LOOP END_LOOPS: - ADD sp,sp,#0xD4 + ADD sp,sp,#224 + vpop {d8 - d15} LDMFD sp!,{r4-r12,r15} @Reload the registers from SP diff --git a/common/arm/ihevc_weighted_pred_bi.s b/common/arm/ihevc_weighted_pred_bi.s index 5308423..8845b8b 100644 --- a/common/arm/ihevc_weighted_pred_bi.s +++ b/common/arm/ihevc_weighted_pred_bi.s @@ -134,6 +134,18 @@ @ r14 => ht @ r7 => wd +.equ src_strd2_offset, 104 +.equ dst_strd_offset, 108 +.equ wgt0_offset, 112 +.equ off0_offset, 116 +.equ wgt1_offset, 120 +.equ off1_offset, 124 +.equ shift_offset, 128 +.equ lvl_shift1_offset, 132 +.equ lvl_shift2_offset, 136 +.equ ht_offset, 140 +.equ wd_offset, 144 + .text .align 4 @@ -147,32 +159,33 @@ ihevc_weighted_pred_bi_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r6,[sp,#48] @load wgt0 - ldr r11,[sp,#68] @load lvl_shift1 - ldr r12,[sp,#72] @load lvl_shift2 + ldr r6,[sp,#wgt0_offset] @load wgt0 + ldr r11,[sp,#lvl_shift1_offset] @load lvl_shift1 + ldr r12,[sp,#lvl_shift2_offset] @load lvl_shift2 vmov.s16 d7[0],r6 @moved for scalar multiplication mul r4,r11,r6 @lvl_shift1 * wgt0 - ldr r8,[sp,#56] @load wgt1 - ldr r7,[sp,#52] @load off0 + ldr r8,[sp,#wgt1_offset] @load wgt1 + ldr r7,[sp,#off0_offset] @load off0 vmov.s16 d7[1],r8 @moved for scalar multiplication mla r4,r12,r8,r4 @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1) - ldr r9,[sp,#60] @load off1 + ldr r9,[sp,#off1_offset] @load off1 add r5,r7,r9 @off0 + off1 - ldr r10,[sp,#64] @load shift + ldr r10,[sp,#shift_offset] @load shift add r5,r5,#1 @off0 + off1 + 1 sub r14,r10,#1 @shift - 1 - ldr r7,[sp,#80] @load wd + ldr r7,[sp,#wd_offset] @load wd lsl r5,r5,r14 @((off0 + off1 + 1) << (shift - 1)) vdup.u32 q14,r10 @vmovq_n_s32(0-shift) add r4,r4,r5 @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1)) vdup.u32 q15,r4 @vmovq_n_s32(tmp_lvl_shift) vneg.s32 q14,q14 - ldr r4,[sp,#40] @load src_strd2 + ldr r4,[sp,#src_strd2_offset] @load src_strd2 lsl r9,r7,#1 - ldr r5,[sp,#44] @load dst_strd + ldr r5,[sp,#dst_strd_offset] @load dst_strd lsl r3,r3,#1 - ldr r14,[sp,#76] @load ht + ldr r14,[sp,#ht_offset] @load ht lsl r4,r4,#1 cmp r14,#0 @check ht == 0 @@ -260,6 +273,7 @@ end_core_loop: bgt core_loop @if ht is greater than 0 goto outer_loop end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_weighted_pred_bi_default.s b/common/arm/ihevc_weighted_pred_bi_default.s index 6bdb8cc..5b369be 100644 --- a/common/arm/ihevc_weighted_pred_bi_default.s +++ b/common/arm/ihevc_weighted_pred_bi_default.s @@ -107,6 +107,14 @@ @ r7 => lvl_shift2 @ r8 => ht @ r9 => wd + +.equ src_strd2_offset, 104 +.equ dst_strd_offset, 108 +.equ lvl_shift1_offset, 112 +.equ lvl_shift2_offset, 116 +.equ ht_offset, 120 +.equ wd_offset, 124 + .text .syntax unified .align 4 @@ -121,14 +129,15 @@ ihevc_weighted_pred_bi_default_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments - ldr r4,[sp,#40] @load src_strd2 + vpush {d8 - d15} + ldr r4,[sp,#src_strd2_offset] @load src_strd2 lsl r3,r3,#1 - ldr r5,[sp,#44] @load dst_strd - ldr r6,[sp,#48] @load lvl_shift1 + ldr r5,[sp,#dst_strd_offset] @load dst_strd + ldr r6,[sp,#lvl_shift1_offset] @load lvl_shift1 lsl r4,r4,#1 - ldr r7,[sp,#52] @load lvl_shift2 - ldr r8,[sp,#56] @load ht - ldr r9,[sp,#60] @load wd + ldr r7,[sp,#lvl_shift2_offset] @load lvl_shift2 + ldr r8,[sp,#ht_offset] @load ht + ldr r9,[sp,#wd_offset] @load wd vdup.16 q2,r6 @lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1) vdup.16 q3,r7 @lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2) vmov.i16 q0,#0x40 @tmp_lvl_shift = 1 << (shift - 1) @@ -488,6 +497,7 @@ end_core_loop_16: end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp diff --git a/common/arm/ihevc_weighted_pred_uni.s b/common/arm/ihevc_weighted_pred_uni.s index e9b69c1..1f99ff8 100644 --- a/common/arm/ihevc_weighted_pred_uni.s +++ b/common/arm/ihevc_weighted_pred_uni.s @@ -112,6 +112,13 @@ @ r8 => ht @ r9 => wd +.equ wgt0_offset, 104 +.equ off0_offset, 108 +.equ shift_offset, 112 +.equ lvl_shift_offset, 116 +.equ ht_offset, 120 +.equ wd_offset, 124 + .text .align 4 @@ -125,16 +132,17 @@ ihevc_weighted_pred_uni_a9q: stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments + vpush {d8 - d15} - ldr r4,[sp,#40] @load wgt0 - ldr r7,[sp,#52] @load lvl_shift + ldr r4,[sp,#wgt0_offset] @load wgt0 + ldr r7,[sp,#lvl_shift_offset] @load lvl_shift mov r11,#1 - ldr r5,[sp,#44] @load off0 + ldr r5,[sp,#off0_offset] @load off0 mul r10,r7,r4 @lvl_shift * wgt0 - ldr r6,[sp,#48] @load shift - ldr r8,[sp,#56] @load ht + ldr r6,[sp,#shift_offset] @load shift + ldr r8,[sp,#ht_offset] @load ht add r10,r10,r5,lsl r6 @lvl_shift * wgt0 + (off0 << shift) - ldr r9,[sp,#60] @load wt + ldr r9,[sp,#wd_offset] @load wt sub r12,r6,#1 vmov.s16 d0[0],r4 @moved for scalar multiplication lsl r2,r2,#1 @@ -214,6 +222,7 @@ end_core_loop: bgt core_loop @if ht is greater than 0 goto outer_loop end_loops: + vpop {d8 - d15} ldmfd sp!,{r4-r12,r15} @reload the registers from sp |