diff options
Diffstat (limited to 'common')
30 files changed, 569 insertions, 620 deletions
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s index 6681a7c..e2b8c99 100644 --- a/common/arm/ih264_inter_pred_chroma_a9q.s +++ b/common/arm/ih264_inter_pred_chroma_a9q.s @@ -91,8 +91,8 @@ @ UWORD8 *pu1_dst, @ WORD32 src_strd, @ WORD32 dst_strd, -@ UWORD8 u1_dx, -@ UWORD8 u1_dy, +@ WORD32 u1_dx, +@ WORD32 u1_dy, @ WORD32 ht, @ WORD32 wd) @**************Variables Vs Registers***************************************** diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s index 0dd82f3..7597444 100644 --- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s +++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s @@ -413,7 +413,7 @@ scrlbl1: add r7, r0, r4, lsl #3 sub r0, r7, r4, lsl #1 - rsb lr, r4, #0x0 + neg lr, r4 vpadd.s16 d0, d0, d1 diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s index 39ad9b3..b9595d7 100644 --- a/common/arm/ih264_mem_fns_neon.s +++ b/common/arm/ih264_mem_fns_neon.s @@ -68,7 +68,7 @@ @* @void ih264_memcpy_mul_8(UWORD8 *pu1_dst, @ UWORD8 *pu1_src, -@ UWORD8 num_bytes) +@ UWORD32 num_bytes) @**************Variables Vs Registers************************* @ r0 => *pu1_dst @ r1 => *pu1_src @@ -97,7 +97,7 @@ loop_neon_memcpy_mul_8: @* @void ih264_memcpy(UWORD8 *pu1_dst, @ UWORD8 *pu1_src, -@ UWORD8 num_bytes) +@ UWORD32 num_bytes) @**************Variables Vs Registers************************* @ r0 => *pu1_dst @ r1 => *pu1_src @@ -135,7 +135,7 @@ loop_memcpy: @void ih264_memset_mul_8(UWORD8 *pu1_dst, @ UWORD8 value, -@ UWORD8 num_bytes) +@ UWORD32 num_bytes) @**************Variables Vs Registers************************* @ r0 => *pu1_dst @ r1 => value @@ -202,7 +202,7 @@ loop_memset: @void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, @ UWORD16 value, -@ UWORD8 num_words) +@ UWORD32 num_words) @**************Variables Vs Registers************************* @ r0 => *pu2_dst @ r1 => value @@ -234,7 +234,7 @@ loop_memset_16bit_mul_8: @void ih264_memset_16bit(UWORD16 *pu2_dst, @ UWORD16 value, -@ UWORD8 num_words) +@ UWORD32 num_words) @**************Variables Vs Registers************************* @ r0 => *pu2_dst @ r1 => value diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s index e7a1f91..819b0b3 100644 --- a/common/arm/ih264_padding_neon.s +++ b/common/arm/ih264_padding_neon.s @@ -88,7 +88,7 @@ ih264_pad_top_a9q: stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments sub r5, r0, r1 - rsb r6, r1, #0 + neg r6, r1 loop_neon_memcpy_mul_16: @ Load 16 bytes diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s index 33859e6..304bd8a 100644 --- a/common/arm/ih264_weighted_bi_pred_a9q.s +++ b/common/arm/ih264_weighted_bi_pred_a9q.s @@ -144,7 +144,7 @@ ih264_weighted_bi_pred_luma_a9q: ldr r4, [sp, #40] @Load src_strd2 in r4 ldr r5, [sp, #44] @Load dst_strd in r5 sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit - rsb r10, r6, #0 @r13 = -(log_wd + 1) + neg r10, r6 @r10 = -(log_wd + 1) ldr r11, [sp, #68] @Load ht in r11 ldr r12, [sp, #72] @Load wd in r12 vdup.16 q0, r10 @Q0 = -(log_wd + 1) (32-bit) @@ -456,7 +456,7 @@ ih264_weighted_bi_pred_chroma_a9q: ldr r9, [sp, #60] @Load ofst1 in r9 ldr r10, [sp, #64] @Load ofst2 in r10 - rsb r12, r6, #0 @r12 = -(log_wd + 1) + neg r12, r6 @r12 = -(log_wd + 1) ldr r4, [sp, #40] @Load src_strd2 in r4 ldr r5, [sp, #44] @Load dst_strd in r5 vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit) diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s index 81d26d4..80c2c6d 100644 --- a/common/arm/ih264_weighted_pred_a9q.s +++ b/common/arm/ih264_weighted_pred_a9q.s @@ -122,7 +122,7 @@ ih264_weighted_pred_luma_a9q: vpush {d8-d15} vdup.16 d2, r5 @D2 = wt (16-bit) - rsb r9, r4, #0 @r9 = -log_wd + neg r9, r4 @r9 = -log_wd vdup.8 d3, r6 @D3 = ofst (8-bit) cmp r8, #16 @check if wd is 16 vdup.16 q0, r9 @Q0 = -log_wd (16-bit) @@ -349,7 +349,7 @@ ih264_weighted_pred_chroma_a9q: ldr r6, [sp, #36] @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)} ldr r8, [sp, #44] @Load wd - rsb r9, r4, #0 @r9 = -log_wd + neg r9, r4 @r9 = -log_wd vdup.32 q1, r5 @Q1 = {wt_u (16-bit), wt_v (16-bit)} ldr r7, [sp, #40] @Load ht vpush {d8-d15} diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s index a4dbd23..b7f2d58 100644 --- a/common/armv8/ih264_deblk_chroma_av8.s +++ b/common/armv8/ih264_deblk_chroma_av8.s @@ -56,19 +56,19 @@ //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* //* @returns @@ -87,6 +87,7 @@ ih264_deblk_chroma_horz_bs4_av8: // STMFD sp!,{x4-x6,x14} // push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 mov x6, x5 mov x5, x4 sub x0, x0, x1, lsl #1 //x0 = uc_edgePixel pointing to p1 of chroma @@ -155,19 +156,19 @@ ih264_deblk_chroma_horz_bs4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* //* @returns @@ -186,12 +187,13 @@ ih264_deblk_chroma_vert_bs4_av8: // STMFD sp!,{x4,x5,x12,x14} push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 sub x0, x0, #4 //point x0 to p1u of row0. mov x12, x0 //keep a back up of x0 for buffer write - add x2, x2, x4, lsl #8 //x2 = (alpha_cr,alpha_cb) - add x3, x3, x5, lsl #8 //x3 = (beta_cr,beta_cb) + add w2, w2, w4, lsl #8 //w2 = (alpha_cr,alpha_cb) + add w3, w3, w5, lsl #8 //w3 = (beta_cr,beta_cb) ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1 @@ -292,28 +294,28 @@ ih264_deblk_chroma_vert_bs4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* -//* @param[in] sp(8) - u4_bs +//* @param[in] w6 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(12) - pu1_cliptab_cb +//* @param[in] x7 - pu1_cliptab_cb //* tc0_table for U //* -//* @param[in] sp(16) - pu1_cliptab_cr +//* @param[in] sp(0) - pu1_cliptab_cr //* tc0_table for V //* //* @returns @@ -332,14 +334,13 @@ ih264_deblk_chroma_horz_bslt4_av8: // STMFD sp!,{x4-x9,x14} // push_v_regs stp x19, x20, [sp, #-16]! - mov x8, x7 - mov x7, x6 - ldr x9, [sp, #80] + sxtw x1, w1 + ldr x8, [sp, #80] sub x0, x0, x1, lsl #1 //x0 = uc_edgePixelU pointing to p1 of chroma U - rev w7, w7 // - mov v12.s[0], w7 //D12[0] = ui_Bs - ld1 {v16.s}[0], [x8] //D16[0] contains cliptab_cb - ld1 {v17.s}[0], [x9] //D17[0] contains cliptab_cr + rev w6, w6 // + mov v12.s[0], w6 //D12[0] = ui_Bs + ld1 {v16.s}[0], [x7] //D16[0] contains cliptab_cb + ld1 {v17.s}[0], [x8] //D17[0] contains cliptab_cr ld2 {v6.8b, v7.8b}, [x0], x1 //Q3=p1 tbl v14.8b, {v16.16b}, v12.8b //Retreiving cliptab values for U tbl v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V @@ -428,28 +429,28 @@ ih264_deblk_chroma_horz_bslt4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* -//* @param[in] sp(8) - u4_bs +//* @param[in] w6 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(12) - pu1_cliptab_cb +//* @param[in] x7 - pu1_cliptab_cb //* tc0_table for U //* -//* @param[in] sp(16) - pu1_cliptab_cr +//* @param[in] sp(0) - pu1_cliptab_cr //* tc0_table for V //* //* @returns @@ -468,11 +469,12 @@ ih264_deblk_chroma_vert_bslt4_av8: // STMFD sp!,{x4-x7,x10-x12,x14} push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 mov x10, x7 - ldr x11, [sp, #80] //x6 = u4_bs + ldr x11, [sp, #80] //x11 = u4_bs sub x0, x0, #4 //point x0 to p1u of row0. - add x2, x2, x4, lsl #8 - add x3, x3, x5, lsl #8 + add w2, w2, w4, lsl #8 + add w3, w3, w5, lsl #8 mov x12, x0 //keep a back up of x0 for buffer write ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1 diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s index 1b3950d..7705df2 100644 --- a/common/armv8/ih264_deblk_luma_av8.s +++ b/common/armv8/ih264_deblk_luma_av8.s @@ -60,19 +60,19 @@ //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* -//* @param[in] sp(0) - u4_bs +//* @param[in] w4 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(4) - pu1_cliptab +//* @param[in] x5 - pu1_cliptab //* tc0_table //* //* @returns @@ -90,6 +90,7 @@ ih264_deblk_luma_horz_bslt4_av8: // STMFD sp!,{x4-x7,x14} push_v_regs + sxtw x1, w1 stp x19, x20, [sp, #-16]! //LDRD x4,x5,[SP,#0x14] //x4 = ui_Bs , x5 = *puc_ClpTab @@ -214,13 +215,13 @@ ih264_deblk_luma_horz_bslt4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* //* @returns @@ -240,6 +241,7 @@ ih264_deblk_luma_horz_bs4_av8: // STMFD sp!,{x12,x14} push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 // Init dup v0.16b, w2 //duplicate alpha @@ -401,19 +403,19 @@ ih264_deblk_luma_horz_bs4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* -//* @param[in] sp(0) - u4_bs +//* @param[in] w4 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(4) - pu1_cliptab +//* @param[in] x5 - pu1_cliptab //* tc0_table //* //* @returns @@ -432,6 +434,7 @@ ih264_deblk_luma_vert_bslt4_av8: // STMFD sp!,{x12,x14} push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 sub x0, x0, #4 //pointer uc_edgePixel-4 mov x12, x4 @@ -743,13 +746,13 @@ ih264_deblk_luma_vert_bslt4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* //* @returns diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s index 6823015..d10047e 100644 --- a/common/armv8/ih264_default_weighted_pred_av8.s +++ b/common/armv8/ih264_default_weighted_pred_av8.s @@ -88,18 +88,18 @@ // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => ht (x6) -// [sp+12] => wd (x7) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => ht +// w7 => wd // .text .p2align 2 @@ -113,6 +113,9 @@ ih264_default_weighted_pred_luma_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 cmp w7, #16 beq loop_16 //branch if wd is 16 cmp w7, #8 @@ -263,18 +266,18 @@ end_loops: // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => ht (x6) -// [sp+12] => wd (x7) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => ht +// w7 => wd // @@ -286,6 +289,9 @@ ih264_default_weighted_pred_chroma_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 cmp w7, #8 beq loop_8_uv //branch if wd is 8 cmp w7, #4 diff --git a/common/armv8/ih264_inter_pred_chroma_av8.s b/common/armv8/ih264_inter_pred_chroma_av8.s index 714e271..f6aef40 100644 --- a/common/armv8/ih264_inter_pred_chroma_av8.s +++ b/common/armv8/ih264_inter_pred_chroma_av8.s @@ -91,19 +91,19 @@ // UWORD8 *pu1_dst, // WORD32 src_strd, // WORD32 dst_strd, -// UWORD8 u1_dx, -// UWORD8 u1_dy, +// WORD32 u1_dx, +// WORD32 u1_dy, // WORD32 ht, // WORD32 wd) //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => u1_dx -// x5 => u1_dy -// x6 => height -// x7 => width +// w2 => src_strd +// w3 => dst_strd +// w4 => u1_dx +// w5 => u1_dy +// w6 => height +// w7 => width // .text .p2align 2 @@ -120,6 +120,12 @@ ih264_inter_pred_chroma_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + sxtw x6, w6 + sxtw x7, w7 diff --git a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s index 6ad463a..e7c9f86 100644 --- a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s +++ b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s @@ -89,10 +89,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text .p2align 2 @@ -111,6 +111,10 @@ ih264_inter_pred_luma_horz_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, #2 //pu1_src-2 sub x14, x4, #16 movi v0.8b, #5 //filter coeff diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s index 9564f99..711d73e 100644 --- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s +++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s @@ -89,10 +89,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text .p2align 2 @@ -108,6 +108,10 @@ ih264_inter_pred_luma_vert_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s index 1a76c1c..007df30 100644 --- a/common/armv8/ih264_inter_pred_luma_copy_av8.s +++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s @@ -65,10 +65,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x7 => ht -// x12 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text .p2align 2 @@ -82,6 +82,10 @@ ih264_inter_pred_luma_copy_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 mov x12, x5 mov x7, x4 @@ -228,14 +232,16 @@ end_inner_loop_wd_16: // Register Usage // x0 : pi2_src // x1 : pu1_out -// x2 : src_strd -// x3 : out_strd +// w2 : src_strd +// w3 : out_strd // Neon registers d0-d7, d16-d30 are used // No need for pushing arm and neon registers .global ih264_interleave_copy_av8 ih264_interleave_copy_av8: push_v_regs + sxtw x2, w2 + sxtw x3, w3 ld1 {v2.8b}, [x0], x2 //load src plane 1 => d2 &pred palne 2 => d3 ld1 {v3.8b}, [x0], x2 mov v2.d[1], v3.d[0] diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s index d2897b6..dd4383e 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s @@ -52,10 +52,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text @@ -71,6 +71,10 @@ ih264_inter_pred_luma_horz_hpel_vert_hpel_av8: //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd sub x0, x0, #2 //pu1_src-2 diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s index 546c807..3563ac0 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s @@ -105,12 +105,12 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x7 => dydx -// x9 => *pu1_tmp +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// x6 => *pu1_tmp +// w7 => dydx .text .p2align 2 @@ -126,6 +126,10 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8: // store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 @@ -134,7 +138,8 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8: mov x9, x6 - lsr x7, x7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit + // by writing to w7 here, we clear the upper half of x7 + lsr w7, w7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit add x7, x7, #2 mov x6, #48 diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s index 39e3253..38268c7 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s @@ -94,11 +94,11 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x7 => dydx +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// w7 => dydx .text .p2align 2 @@ -114,6 +114,10 @@ ih264_inter_pred_luma_horz_qpel_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 and x7, x7, #3 //Finds x-offset diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s index 3f3e297..6ccf11f 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s @@ -105,12 +105,12 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x6 => dydx -// x9 => *pu1_tmp +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// x6 => *pu1_tmp +// w7 => dydx .text .p2align 2 @@ -125,11 +125,15 @@ ih264_inter_pred_luma_horz_qpel_vert_hpel_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd sub x0, x0, #2 //pu1_src-2 mov x9, x6 - mov x6, x7 + mov w6, w7 and x6, x6, #2 // dydx & 0x3 followed by dydx>>1 and dydx<<1 diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s index ab663d0..a9dfbd1 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s @@ -104,11 +104,11 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x6 => dydx +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// w7 => dydx .text .p2align 2 @@ -122,7 +122,11 @@ ih264_inter_pred_luma_horz_qpel_vert_qpel_av8: push_v_regs stp x19, x20, [sp, #-16]! - mov x6, x7 + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + mov w6, w7 and x7, x6, #3 add x7, x0, x7, lsr #1 //pu1_pred_vert = pu1_src + (x_offset>>1) diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s index 9d19a2d..014faca 100644 --- a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s @@ -94,11 +94,11 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x7 => dydx +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// w7 => dydx .text .p2align 2 @@ -112,6 +112,10 @@ ih264_inter_pred_luma_vert_qpel_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 and x7, x7, #12 //Finds y-offset diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s index 8f0f282..39c0256 100644 --- a/common/armv8/ih264_intra_pred_chroma_av8.s +++ b/common/armv8/ih264_intra_pred_chroma_av8.s @@ -100,9 +100,9 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability @@ -113,13 +113,14 @@ ih264_intra_pred_chroma_8x8_mode_dc_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 - mov x19, #5 - ands x6, x4, x19 + mov w19, #5 + ands w6, w4, w19 beq none_available - cmp x6, #1 + cmp w6, #1 beq left_only_available - cmp x6, #4 + cmp w6, #4 beq top_only_available all_available: @@ -251,9 +252,9 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_chroma_8x8_mode_horz_av8 @@ -263,6 +264,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: push_v_regs + sxtw x3, w3 ld1 {v0.8h}, [x0] dup v10.8h, v0.h[7] @@ -332,9 +334,9 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_chroma_8x8_mode_vert_av8 @@ -342,6 +344,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: ih264_intra_pred_chroma_8x8_mode_vert_av8: push_v_regs + sxtw x3, w3 add x0, x0, #18 ld1 {v0.8b, v1.8b}, [x0] @@ -405,15 +408,16 @@ ih264_intra_pred_chroma_8x8_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_chroma_8x8_mode_plane_av8 ih264_intra_pred_chroma_8x8_mode_plane_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.2s}, [x0] add x10, x0, #10 @@ -457,18 +461,14 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: rshrn v13.4h, v26.4s, #6 rshrn v14.4h, v28.4s, #6 ldrb w6, [x0], #1 - sxtw x6, w6 add x10, x0, #31 ldrb w8, [x0], #1 - sxtw x8, w8 ldrb w7, [x10], #1 - sxtw x7, w7 ldrb w9, [x10], #1 - sxtw x9, w9 - add x6, x6, x7 - add x8, x8, x9 - lsl x6, x6, #4 - lsl x8, x8, #4 + add w6, w6, w7 + add w8, w8, w9 + lsl w6, w6, #4 + lsl w8, w8, #4 dup v0.8h, w6 dup v2.8h, w8 dup v4.8h, v12.h[0] diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s index c1847b5..fa19c12 100644 --- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s +++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s @@ -98,9 +98,9 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_vert_av8 @@ -108,6 +108,7 @@ ih264_intra_pred_luma_16x16_mode_vert_av8: push_v_regs + sxtw x3, w3 add x0, x0, #17 @@ -181,9 +182,9 @@ ih264_intra_pred_luma_16x16_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_horz_av8 @@ -192,6 +193,7 @@ ih264_intra_pred_luma_16x16_mode_horz_av8: push_v_regs + sxtw x3, w3 ld1 {v0.16b}, [x0] @@ -283,9 +285,9 @@ ih264_intra_pred_luma_16x16_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_dc_av8 @@ -295,18 +297,19 @@ ih264_intra_pred_luma_16x16_mode_dc_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 sub v0.16b, v0.16b, v0.16b sub v1.16b, v1.16b, v1.16b mov w10, #0 mov w11 , #3 - ands x6, x4, #0x01 + ands w6, w4, #0x01 beq top_available //LEFT NOT AVAILABLE ld1 {v0.16b}, [x0] add w10, w10, #8 add w11, w11, #1 top_available: - ands x6, x4, #0x04 + ands w6, w4, #0x04 beq none_available add x6, x0, #17 ld1 {v1.16b}, [x6] @@ -314,7 +317,7 @@ top_available: add w11, w11, #1 b summation none_available: - cmp x4, #0 + cmp w4, #0 bne summation mov w15, #128 dup v20.16b, w15 @@ -410,15 +413,16 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_plane_av8 ih264_intra_pred_luma_16x16_mode_plane_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 mov x2, x1 add x1, x0, #17 add x0, x0, #15 @@ -440,76 +444,58 @@ ih264_intra_pred_luma_16x16_mode_plane_av8: uxtl v18.8h, v7.8b add x7, x0, x4, lsl #3 sub x0, x7, x4, lsl #1 - sub x20, x4, #0x0 - neg x14, x20 + neg x14, x4 addp v0.8h, v0.8h, v1.8h ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 saddlp v0.2s, v0.4h - sub x12, x8, x9 + sub w12, w8, w9 ldrb w8, [x7], #-1 - sxtw x8, w8 saddlp v0.1d, v0.2s ldrb w9, [x0], #1 - sxtw x9, w9 - sub x8, x8, x9 + sub w8, w8, w9 shl v2.2s, v0.2s, #2 - add x12, x12, x8, lsl #1 + add w12, w12, w8, lsl #1 add v0.2s, v0.2s , v2.2s ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 srshr v0.2s, v0.2s, #6 // i_b = D0[0] - sub x8, x8, x9 + sub w8, w8, w9 ldrb w5, [x7], #-1 - sxtw x5, w5 - add x8, x8, x8, lsl #1 + add w8, w8, w8, lsl #1 dup v4.8h, v0.h[0] - add x12, x12, x8 + add w12, w12, w8 ldrb w9, [x0], #1 - sxtw x9, w9 mul v0.8h, v4.8h , v16.8h - sub x5, x5, x9 + sub w5, w5, w9 mul v2.8h, v4.8h , v18.8h - add x12, x12, x5, lsl #2 + add w12, w12, w5, lsl #2 ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 - sub x8, x8, x9 + sub w8, w8, w9 ldrb w5, [x7], #-1 - sxtw x5, w5 - add x8, x8, x8, lsl #2 + add w8, w8, w8, lsl #2 ldrb w6, [x0], #1 - sxtw x6, w6 - add x12, x12, x8 + add w12, w12, w8 ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 - sub x5, x5, x6 - sub x8, x8, x9 - add x5, x5, x5, lsl #1 - sub x20, x8, x8, lsl #3 - neg x8, x20 - add x12, x12, x5, lsl #1 + sub w5, w5, w6 + sub w8, w8, w9 + add w5, w5, w5, lsl #1 + sub w20, w8, w8, lsl #3 + neg w8, w20 + add w12, w12, w5, lsl #1 ldrb w5, [x7], #-1 - sxtw x5, w5 ldrb w6, [x10] //top_left - sxtw x6, w6 - add x12, x12, x8 - sub x9, x5, x6 + add w12, w12, w8 + sub w9, w5, w6 ldrb w6, [x1, #7] - sxtw x6, w6 - add x12, x12, x9, lsl #3 // i_c = x12 - add x8, x5, x6 - add x12, x12, x12, lsl #2 - lsl x8, x8, #4 // i_a = x8 - add x12, x12, #0x20 - lsr x12, x12, #6 + add w12, w12, w9, lsl #3 // i_c = w12 + add w8, w5, w6 + add w12, w12, w12, lsl #2 + lsl w8, w8, #4 // i_a = w8 + add w12, w12, #0x20 + lsr w12, w12, #6 shl v28.8h, v4.8h, #3 dup v6.8h, w12 dup v30.8h, w8 diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s index 62e8cee..1f95131 100644 --- a/common/armv8/ih264_intra_pred_luma_4x4_av8.s +++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s @@ -102,15 +102,16 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_vert_av8 ih264_intra_pred_luma_4x4_mode_vert_av8: push_v_regs + sxtw x3, w3 add x0, x0, #5 @@ -171,9 +172,9 @@ ih264_intra_pred_luma_4x4_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability @@ -182,6 +183,7 @@ ih264_intra_pred_luma_4x4_mode_vert_av8: ih264_intra_pred_luma_4x4_mode_horz_av8: push_v_regs + sxtw x3, w3 ld1 {v1.s}[0], [x0] dup v0.8b, v1.b[3] @@ -246,9 +248,9 @@ ih264_intra_pred_luma_4x4_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability @@ -261,41 +263,34 @@ ih264_intra_pred_luma_4x4_mode_dc_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 - ands x5, x4, #0x01 + ands w5, w4, #0x01 beq top_available //LEFT NOT AVAILABLE add x10, x0, #3 mov x2, #-1 ldrb w5, [x10], #-1 - sxtw x5, w5 ldrb w6, [x10], #-1 - sxtw x6, w6 ldrb w7, [x10], #-1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], #-1 - sxtw x8, w8 - add x5, x5, x7 - ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE - add x5, x5, x8 + add w5, w5, w7 + ands w11, w4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + add w5, w5, w8 beq left_available add x10, x0, #5 // BOTH LEFT AND TOP AVAILABLE ldrb w6, [x10], #1 - sxtw x6, w6 ldrb w7, [x10], #1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], #1 - sxtw x8, w8 - add x5, x5, x7 + add w5, w5, w7 ldrb w9, [x10], #1 - sxtw x9, w9 - add x5, x5, x8 - add x5, x5, x9 - add x5, x5, #4 - lsr x5, x5, #3 + add w5, w5, w8 + add w5, w5, w9 + add w5, w5, #4 + lsr w5, w5, #3 dup v0.8b, w5 st1 {v0.s}[0], [x1], x3 st1 {v0.s}[0], [x1], x3 @@ -304,23 +299,19 @@ ih264_intra_pred_luma_4x4_mode_dc_av8: b end_func top_available: // ONLT TOP AVAILABLE - ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + ands w11, w4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE beq none_available add x10, x0, #5 ldrb w6, [x10], #1 - sxtw x6, w6 ldrb w7, [x10], #1 - sxtw x7, w7 ldrb w8, [x10], #1 - sxtw x8, w8 - add x5, x6, x7 + add w5, w6, w7 ldrb w9, [x10], #1 - sxtw x9, w9 - add x5, x5, x8 - add x5, x5, x9 - add x5, x5, #2 - lsr x5, x5, #2 + add w5, w5, w8 + add w5, w5, w9 + add w5, w5, #2 + lsr w5, w5, #2 dup v0.8b, w5 st1 {v0.s}[0], [x1], x3 st1 {v0.s}[0], [x1], x3 @@ -401,9 +392,9 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_diag_dl_av8 @@ -413,6 +404,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dl_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #5 sub x5, x3, #2 @@ -488,9 +480,9 @@ end_func_diag_dl: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_diag_dr_av8 @@ -499,6 +491,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dr_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.8b}, [x0] @@ -571,9 +564,9 @@ end_func_diag_dr: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_vert_r_av8 @@ -582,6 +575,7 @@ ih264_intra_pred_luma_4x4_mode_vert_r_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.8b}, [x0] @@ -656,9 +650,9 @@ end_func_vert_r: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_horz_d_av8 @@ -667,6 +661,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.8b}, [x0] add x0, x0, #1 @@ -743,9 +738,9 @@ end_func_horz_d: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_vert_l_av8 @@ -754,6 +749,7 @@ ih264_intra_pred_luma_4x4_mode_vert_l_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #4 ld1 {v0.8b}, [x0] add x0, x0, #1 @@ -825,9 +821,9 @@ end_func_vert_l: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_horz_u_av8 @@ -835,11 +831,11 @@ end_func_vert_l: ih264_intra_pred_luma_4x4_mode_horz_u_av8: push_v_regs + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x10, x0 ld1 {v0.8b}, [x0] ldrb w9, [x0], #1 - sxtw x9, w9 ext v1.8b, v0.8b , v0.8b , #1 ld1 {v0.b}[7], [x10] ext v2.8b, v1.8b , v1.8b , #1 diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s index bf9a4c1..273aa81 100644 --- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s +++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s @@ -102,9 +102,9 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_vert_av8 @@ -114,6 +114,7 @@ ih264_intra_pred_luma_8x8_mode_vert_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs //stp x19, x20,[sp,#-16]! + sxtw x3, w3 add x0, x0, #9 ld1 {v0.8b}, [x0] @@ -180,9 +181,9 @@ ih264_intra_pred_luma_8x8_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_horz_av8 @@ -194,38 +195,30 @@ ih264_intra_pred_luma_8x8_mode_horz_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #7 - mov x2 , #-1 ldrb w5, [x0], #-1 - sxtw x5, w5 ldrb w6, [x0], #-1 - sxtw x6, w6 dup v0.8b, w5 st1 {v0.8b}, [x1], x3 ldrb w7, [x0], #-1 - sxtw x7, w7 dup v1.8b, w6 st1 {v1.8b}, [x1], x3 dup v2.8b, w7 ldrb w8, [x0], #-1 - sxtw x8, w8 dup v3.8b, w8 st1 {v2.8b}, [x1], x3 ldrb w5, [x0], #-1 - sxtw x5, w5 st1 {v3.8b}, [x1], x3 dup v0.8b, w5 ldrb w6, [x0], #-1 - sxtw x6, w6 st1 {v0.8b}, [x1], x3 ldrb w7, [x0], #-1 - sxtw x7, w7 dup v1.8b, w6 dup v2.8b, w7 st1 {v1.8b}, [x1], x3 ldrb w8, [x0], #-1 - sxtw x8, w8 dup v3.8b, w8 st1 {v2.8b}, [x1], x3 st1 {v3.8b}, [x1], x3 @@ -285,9 +278,9 @@ ih264_intra_pred_luma_8x8_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_dc_av8 @@ -298,37 +291,30 @@ ih264_intra_pred_luma_8x8_mode_dc_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs + sxtw x3, w3 stp x19, x20, [sp, #-16]! - ands x6, x4, #0x01 + ands w6, w4, #0x01 beq top_available //LEFT NOT AVAILABLE add x10, x0, #7 mov x2, #-1 ldrb w5, [x10], -1 - sxtw x5, w5 ldrb w6, [x10], -1 - sxtw x6, w6 ldrb w7, [x10], -1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], -1 - sxtw x8, w8 - add x5, x5, x7 + add w5, w5, w7 ldrb w6, [x10], -1 - sxtw x6, w6 - add x5, x5, x8 + add w5, w5, w8 ldrb w7, [x10], -1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], -1 - sxtw x8, w8 - add x5, x5, x7 - ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE - add x5, x5, x8 + add w5, w5, w7 + ands w11, w4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + add w5, w5, w8 ldrb w6, [x10], -1 - sxtw x6, w6 - add x5, x5, x6 + add w5, w5, w6 beq left_available add x10, x0, #9 // BOTH LEFT AND TOP AVAILABLE @@ -351,7 +337,7 @@ ih264_intra_pred_luma_8x8_mode_dc_av8: b end_func top_available: // ONLT TOP AVAILABLE - ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + ands w11, w4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE beq none_available add x10, x0, #9 @@ -452,9 +438,9 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8 @@ -463,6 +449,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dl_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #9 sub x5, x3, #4 @@ -554,9 +541,9 @@ end_func_diag_dl: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8 @@ -566,6 +553,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dr_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 { v0.16b}, [x0] @@ -654,9 +642,9 @@ end_func_diag_dr: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_vert_r_av8 @@ -666,6 +654,7 @@ ih264_intra_pred_luma_8x8_mode_vert_r_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 { v0.16b}, [x0] mov v1.d[0], v0.d[1] @@ -780,9 +769,9 @@ end_func_vert_r: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_horz_d_av8 @@ -791,6 +780,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 { v0.16b}, [x0] mov v1.d[0], v0.d[1] @@ -910,9 +900,9 @@ end_func_horz_d: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_vert_l_av8 @@ -922,6 +912,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_av8: // STMFD sp!, {x4-x12, x14} //Restoring registers from stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #9 ld1 { v0.16b}, [x0] mov v1.d[0], v0.d[1] @@ -1018,9 +1009,9 @@ end_func_vert_l: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_horz_u_av8 @@ -1029,6 +1020,7 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.8b}, [x0] ld1 {v1.b}[7], [x0] diff --git a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s index 4c83036..003ee74 100644 --- a/common/armv8/ih264_iquant_itrans_recon_av8.s +++ b/common/armv8/ih264_iquant_itrans_recon_av8.s @@ -103,11 +103,11 @@ //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 // => pi4_tmp // => iq_start_idx // => pi2_dc_ld_addr @@ -119,6 +119,8 @@ ih264_iquant_itrans_recon_4x4_av8: push_v_regs + sxtw x3, w3 + sxtw x4, w4 dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 @@ -292,11 +294,11 @@ skip_loading_luma_dc_src: //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 //sp => pi4_tmp //sp#8 => *pi2_dc_src @@ -315,6 +317,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8: //reduce sp by 64 push_v_regs + sxtw x3, w3 + sxtw x4, w4 dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 @@ -512,11 +516,11 @@ ih264_iquant_itrans_recon_chroma_4x4_av8: //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 //NOT USED => pi4_tmp //NOT USED => iq_start_idx //NOT USED => pi2_dc_ld_addr @@ -525,6 +529,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8: ih264_iquant_itrans_recon_8x8_av8: push_v_regs + sxtw x3, w3 + sxtw x4, w4 ld1 {v8.8h -v11.8h}, [x5], #64 ld1 {v12.8h-v15.8h}, [x5] diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s index 8bb9c32..13061ec 100644 --- a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s +++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s @@ -104,11 +104,11 @@ //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 // => pi4_tmp // => iq_start_idx // => pi2_dc_ld_addr @@ -119,6 +119,8 @@ .global ih264_iquant_itrans_recon_4x4_dc_av8 ih264_iquant_itrans_recon_4x4_dc_av8: + sxtw x3, w3 + sxtw x4, w4 ldr w8, [sp, #8] //Loads iq_start_idx subs w8, w8, #1 // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set @@ -209,11 +211,11 @@ donot_use_pi2_src_luma_dc: // x0 : pi2_src // x1 : pu1_pred // x2 : pu1_out -// x3 : pred_strd -// x4 : out_strd +// w3 : pred_strd +// w4 : out_strd // x5 : pu2_iscal_mat // x6 : pu2_weigh_mat -// x7 : u4_qp_div_6 +// w7 : u4_qp_div_6 // : pi2_tmp // : pi2_dc_src // Neon registers d0-d7, d16-d30 are used @@ -223,6 +225,8 @@ donot_use_pi2_src_luma_dc: .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8 ih264_iquant_itrans_recon_chroma_4x4_dc_av8: + sxtw x3, w3 + sxtw x4, w4 ldr x0, [sp, #8] push_v_regs ld1 {v0.h}[0], [x0] @@ -327,11 +331,11 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8: //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 //NOT USED => pi4_tmp //NOT USED => iq_start_idx //NOT USED => pi2_dc_ld_addr @@ -340,6 +344,8 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8: ih264_iquant_itrans_recon_8x8_dc_av8: push_v_regs + sxtw x3, w3 + sxtw x4, w4 ld1 {v1.h}[0], [x5] ld1 {v2.h}[0], [x6] diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s index 4e9020d..802550d 100644 --- a/common/armv8/ih264_mem_fns_neon_av8.s +++ b/common/armv8/ih264_mem_fns_neon_av8.s @@ -70,11 +70,11 @@ //*/ //void ih264_memcpy_mul_8(UWORD8 *pu1_dst, // UWORD8 *pu1_src, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst // x1 => *pu1_src -// x2 => num_bytes +// w2 => num_bytes @@ -89,7 +89,7 @@ loop_neon_memcpy_mul_8: ld1 {v0.8b}, [x1], #8 st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bne loop_neon_memcpy_mul_8 ret @@ -99,38 +99,36 @@ loop_neon_memcpy_mul_8: //*/ //void ih264_memcpy(UWORD8 *pu1_dst, // UWORD8 *pu1_src, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst // x1 => *pu1_src -// x2 => num_bytes +// w2 => num_bytes .global ih264_memcpy_av8 ih264_memcpy_av8: - subs x2, x2, #8 + subs w2, w2, #8 blt arm_memcpy loop_neon_memcpy: // Memcpy 8 bytes ld1 {v0.8b}, [x1], #8 st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bge loop_neon_memcpy - cmn x2, #8 + cmn w2, #8 beq end_func1 arm_memcpy: - add x2, x2, #8 + add w2, w2, #8 loop_arm_memcpy: ldrb w3, [x1], #1 - sxtw x3, w3 strb w3, [x0], #1 - sxtw x3, w3 - subs x2, x2, #1 + subs w2, w2, #1 bne loop_arm_memcpy ret end_func1: @@ -139,7 +137,7 @@ end_func1: //void ih264_memset_mul_8(UWORD8 *pu1_dst, // UWORD8 value, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst // x1 => value @@ -156,7 +154,7 @@ loop_memset_mul_8: // Memset 8 bytes st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bne loop_memset_mul_8 ret @@ -164,36 +162,35 @@ loop_memset_mul_8: //void ih264_memset(UWORD8 *pu1_dst, // UWORD8 value, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst -// x1 => value -// x2 => num_bytes +// w1 => value +// w2 => num_bytes .global ih264_memset_av8 ih264_memset_av8: - subs x2, x2, #8 + subs w2, w2, #8 blt arm_memset dup v0.8b, w1 loop_neon_memset: // Memcpy 8 bytes st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bge loop_neon_memset - cmn x2, #8 + cmn w2, #8 beq end_func2 arm_memset: - add x2, x2, #8 + add w2, w2, #8 loop_arm_memset: strb w1, [x0], #1 - sxtw x1, w1 - subs x2, x2, #1 + subs w2, w2, #1 bne loop_arm_memset ret end_func2: @@ -205,11 +202,11 @@ end_func2: //void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, // UWORD16 value, -// UWORD8 num_words) +// UWORD32 num_words) //**************Variables Vs Registers************************* // x0 => *pu2_dst -// x1 => value -// x2 => num_words +// w1 => value +// w2 => num_words .global ih264_memset_16bit_mul_8_av8 @@ -224,7 +221,7 @@ loop_memset_16bit_mul_8: st1 {v0.4h}, [x0], #8 st1 {v0.4h}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bne loop_memset_16bit_mul_8 ret @@ -233,18 +230,18 @@ loop_memset_16bit_mul_8: //void ih264_memset_16bit(UWORD16 *pu2_dst, // UWORD16 value, -// UWORD8 num_words) +// UWORD32 num_words) //**************Variables Vs Registers************************* // x0 => *pu2_dst -// x1 => value -// x2 => num_words +// w1 => value +// w2 => num_words .global ih264_memset_16bit_av8 ih264_memset_16bit_av8: - subs x2, x2, #8 + subs w2, w2, #8 blt arm_memset_16bit dup v0.4h, w1 loop_neon_memset_16bit: @@ -252,18 +249,17 @@ loop_neon_memset_16bit: st1 {v0.4h}, [x0], #8 st1 {v0.4h}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bge loop_neon_memset_16bit - cmn x2, #8 + cmn w2, #8 beq end_func3 arm_memset_16bit: - add x2, x2, #8 + add w2, w2, #8 loop_arm_memset_16bit: strh w1, [x0], #2 - sxtw x1, w1 - subs x2, x2, #1 + subs w2, w2, #1 bne loop_arm_memset_16bit ret diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s index 35d9c8a..e03fe2f 100644 --- a/common/armv8/ih264_padding_neon_av8.s +++ b/common/armv8/ih264_padding_neon_av8.s @@ -76,9 +76,9 @@ // WORD32 pad_size) //**************Variables Vs Registers************************* // x0 => *pu1_src -// x1 => src_strd -// x2 => wd -// x3 => pad_size +// w1 => src_strd +// w2 => wd +// w3 => pad_size .global ih264_pad_top_av8 @@ -86,25 +86,25 @@ ih264_pad_top_av8: // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 stp x19, x20, [sp, #-16]! sub x5, x0, x1 - sub x20, x1, #0 - neg x6, x20 + neg x6, x1 loop_neon_memcpy_mul_16: // Load 16 bytes ld1 {v0.8b, v1.8b}, [x0], #16 mov x4, x5 - mov x7, x3 + mov w7, w3 add x5, x5, #16 loop_neon_pad_top: st1 {v0.8b, v1.8b}, [x4], x6 - subs x7, x7, #1 + subs w7, w7, #1 bne loop_neon_pad_top - subs x2, x2, #16 + subs w2, w2, #16 bne loop_neon_memcpy_mul_16 // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP @@ -160,9 +160,9 @@ loop_neon_pad_top: // WORD32 pad_size) //**************Variables Vs Registers************************* // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -172,6 +172,8 @@ ih264_pad_left_luma_av8: // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! @@ -182,43 +184,35 @@ ih264_pad_left_luma_av8: loop_16: // /*hard coded for width=16 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 st1 {v2.16b}, [x4], x1 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 dup v4.16b, w10 dup v6.16b, w11 st1 {v4.16b}, [x4], x1 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], x1 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], x1 // 16 bytes store dup v4.16b, w10 dup v6.16b, w11 - subs x2, x2, #8 + subs w2, w2, #8 st1 {v4.16b}, [x4], x1 // 16 bytes store st1 {v6.16b}, [x4], x1 // 16 bytes store bne loop_16 @@ -227,14 +221,11 @@ loop_16: // /*hard coded for width=16 ,height = loop_32: // /*hard coded for width=32 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 @@ -243,35 +234,30 @@ loop_32: // /*hard coded for width=32 ,height =8 st1 {v2.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], #16 // 16 bytes store dup v0.16b, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 st1 {v6.16b}, [x4], x6 // 16 bytes store ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.16b, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #8 + subs w2, w2, #8 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store bne loop_32 @@ -333,9 +319,9 @@ end_func: // WORD32 pad_size) //{ // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! sub x4, x0, x3 @@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8: loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -383,27 +367,23 @@ loop_32_l_c: // /*hard coded for width=32 ,height = ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -412,20 +392,16 @@ loop_32_l_c: // /*hard coded for width=32 ,height = ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store @@ -500,9 +476,9 @@ end_func_l_c: //} // // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -512,6 +488,8 @@ ih264_pad_right_luma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x4, x0 @@ -522,43 +500,35 @@ ih264_pad_right_luma_av8: loop_16_r: // /*hard coded for width=16 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 st1 {v2.16b}, [x4], x1 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 dup v4.16b, w10 dup v6.16b, w11 st1 {v4.16b}, [x4], x1 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], x1 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], x1 // 16 bytes store dup v4.16b, w10 dup v6.16b, w11 - subs x2, x2, #8 + subs w2, w2, #8 st1 {v4.16b}, [x4], x1 // 16 bytes store st1 {v6.16b}, [x4], x1 // 16 bytes store bne loop_16_r @@ -567,14 +537,11 @@ loop_16_r: // /*hard coded for width=16 ,height =8,16*/ loop_32_r: // /*hard coded for width=32 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 @@ -583,35 +550,30 @@ loop_32_r: // /*hard coded for width=32 ,height = st1 {v2.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], #16 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 st1 {v6.16b}, [x4], x6 // 16 bytes store ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.16b, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #8 + subs w2, w2, #8 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store bne loop_32_r @@ -672,9 +634,9 @@ end_func_r: // WORD32 ht, // WORD32 pad_size) // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x4, x0 @@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8: loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store @@ -720,27 +680,23 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 dup v0.8h, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -748,20 +704,16 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ bne loop_32_r_c ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 dup v0.8h, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s index 316c220..d2ba3cf 100644 --- a/common/armv8/ih264_resi_trans_quant_av8.s +++ b/common/armv8/ih264_resi_trans_quant_av8.s @@ -45,18 +45,6 @@ //* function name : ih264_resi_trans_quant_4x4 //* description : this function does cf4 of h264 //* -//* arguments : x0 :pointer to src buffer -// x1 :pointer to pred buffer -// x2 :pointer to dst buffer -// x3 :source stride -// x4 :pred stride, -// x5 :dst stride, -// x6 :pointer to scaling matrix, -// x7 :pointer to threshold matrix, -// stack qbits, -// rounding factor, -// pointer to store nnz -// pointer to store non quantized dc value // values returned : none // // register usage : @@ -77,34 +65,24 @@ .global ih264_resi_trans_quant_4x4_av8 ih264_resi_trans_quant_4x4_av8: - //x0 :pointer to src buffer - //x1 :pointer to pred buffer - //x2 :pointer to dst buffer - //x3 :source stride - //x4 :pred stride - //x5 :dst stride, - //x6 :scale matirx, - //x7 :threshold matrix - // :qbits - // :round factor - // :nnz - // :pointer to store non quantized dc value push_v_regs //x0 :pointer to src buffer //x1 :pointer to pred buffer //x2 :pointer to dst buffer - //x3 :source stride - //x4 :pred stride - //x5 :scale matirx, + //w3 :source stride + //w4 :pred stride + //w5 :scale matirx, //x6 :threshold matrix - //x7 :qbits - //x8 :round factor + //w7 :qbits + //w8 :round factor //x9 :nnz //x10 :pointer to store non quantized dc value + sxtw x3, w3 + sxtw x4, w4 ldr w8, [sp, #64] //load round factor ldr x10, [sp, #80] //load addres for non quant val - neg x7, x7 //negate the qbit value for usiing lsl + neg w7, w7 //negate the qbit value for usiing lsl ldr x9, [sp, #72] //------------fucntion loading done----------------; @@ -259,18 +237,6 @@ ih264_resi_trans_quant_4x4_av8: //* description : this function does residue calculation, forward transform //* and quantization for 4x4 chroma block. //* -//* arguments : x0 :pointer to src buffer -// x1 :pointer to pred buffer -// x2 :pointer to dst buffer -// x3 :source stride -// x4 :pred stride, -// x5 :dst stride, -// x6 :pointer to scaling matrix, -// x7 :pointer to threshold matrix, -// stack qbits, -// rounding factor, -// pointer to store nnz -// pointer to store unquantized dc values // values returned : none // // register usage : @@ -290,33 +256,24 @@ ih264_resi_trans_quant_4x4_av8: .global ih264_resi_trans_quant_chroma_4x4_av8 ih264_resi_trans_quant_chroma_4x4_av8: - //x0 :pointer to src buffer - //x1 :pointer to pred buffer - //x2 :pointer to dst buffer - //x3 :source stride - //stack :pred stride - // :scale matirx, - // :threshold matrix - // :qbits - // :round factor - // :nnz - // :pu1_dc_alt_addr push_v_regs //x0 :pointer to src buffer //x1 :pointer to pred buffer //x2 :pointer to dst buffer - //x3 :source stride - //x4 :pred stride + //w3 :source stride + //w4 :pred stride //x5 :scale matirx, //x6 :threshold matrix - //x7 :qbits - //x8 :round factor + //w7 :qbits + //w8 :round factor //x9 :nnz //x10 :pointer to store non quantized dc value + sxtw x3, w3 + sxtw x4, w4 ldr w8, [sp, #64] //load round factor ldr x10, [sp, #80] //load addres for non quant val - neg x7, x7 //negate the qbit value for usiing lsl + neg w7, w7 //negate the qbit value for usiing lsl ldr x9, [sp, #72] //------------fucntion loading done----------------; @@ -485,10 +442,10 @@ ih264_resi_trans_quant_chroma_4x4_av8: //* arguments : x0 :pointer to src buffer // x1 :pointer to dst buffer // x2 :pu2_scale_matrix -// x2 :pu2_threshold_matrix -// x3 :u4_qbits -// x4 :u4_round_factor -// x5 :pu1_nnz +// x3 :pu2_threshold_matrix +// w4 :u4_qbits +// w5 :u4_round_factor +// x6 :pu1_nnz // values returned : none // // register usage : @@ -516,8 +473,8 @@ ih264_hadamard_quant_4x4_av8: //x1 :pointer to dst buffer //x2 :pu2_scale_matrix //x3 :pu2_threshold_matrix -//x4 :u4_qbits -//x5 :u4_round_factor +//w4 :u4_qbits +//w5 :u4_round_factor //x6 :pu1_nnz push_v_regs @@ -632,10 +589,10 @@ ih264_hadamard_quant_4x4_av8: //* arguments : x0 :pointer to src buffer // x1 :pointer to dst buffer // x2 :pu2_scale_matrix -// x2 :pu2_threshold_matrix -// x3 :u4_qbits -// x4 :u4_round_factor -// x5 :pu1_nnz +// x3 :pu2_threshold_matrix +// w4 :u4_qbits +// w5 :u4_round_factor +// x6 :pu1_nnz // values returned : none // // register usage : diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s index b039fba..475f690 100644 --- a/common/armv8/ih264_weighted_bi_pred_av8.s +++ b/common/armv8/ih264_weighted_bi_pred_av8.s @@ -103,28 +103,28 @@ // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD16 log_WD, -// UWORD32 wt1, -// UWORD32 wt2, -// UWORD16 ofst1, -// UWORD16 ofst2, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt1, +// WORD32 wt2, +// WORD16 ofst1, +// WORD16 ofst2, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => log_WD (x6) -// [sp+12] => wt1 (x7) -// [sp+16] => wt2 (x8) -// [sp+20] => ofst1 (x9) -// [sp+24] => ofst2 (x10) -// [sp+28] => ht (x11) -// [sp+32] => wd (x12) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => log_WD +// w7 => wt1 +// [sp] => wt2 (w8) +// [sp+8] => ofst1 (w9) +// [sp+16] => ofst2 (w10) +// [sp+24] => ht (w11) +// [sp+32] => wd (w12) // .text .p2align 2 @@ -138,21 +138,23 @@ ih264_weighted_bi_pred_luma_av8: // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments push_v_regs + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 stp x19, x20, [sp, #-16]! - ldr x8, [sp, #80] //Load wt2 in x8 - ldr x9, [sp, #88] //Load ofst1 in x9 - add x6, x6, #1 //x6 = log_WD + 1 - sub x20, x6, #0 //x13 = -(log_WD + 1) - neg x10, x20 + ldr w8, [sp, #80] //Load wt2 in w8 + ldr w9, [sp, #88] //Load ofst1 in w9 + add w6, w6, #1 //w6 = log_WD + 1 + neg w10, w6 //w10 = -(log_WD + 1) dup v0.8h, w10 //Q0 = -(log_WD + 1) (32-bit) - ldr x10, [sp, #96] //Load ofst2 in x10 - ldr x11, [sp, #104] //Load ht in x11 - ldr x12, [sp, #112] //Load wd in x12 - add x9, x9, #1 //x9 = ofst1 + 1 - add x9, x9, x10 //x9 = ofst1 + ofst2 + 1 + ldr w10, [sp, #96] //Load ofst2 in w10 + ldr w11, [sp, #104] //Load ht in w11 + ldr w12, [sp, #112] //Load wd in w12 + add w9, w9, #1 //w9 = ofst1 + 1 + add w9, w9, w10 //w9 = ofst1 + ofst2 + 1 mov v2.s[0], w7 mov v2.s[1], w8 //D2 = {wt1(32-bit), wt2(32-bit)} - asr x9, x9, #1 //x9 = ofst = (ofst1 + ofst2 + 1) >> 1 + asr w9, w9, #1 //w9 = ofst = (ofst1 + ofst2 + 1) >> 1 dup v3.8b, w9 //D3 = ofst (8-bit) cmp w12, #16 beq loop_16 //branch if wd is 16 @@ -383,28 +385,28 @@ end_loops: // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD16 log_WD, -// UWORD32 wt1, -// UWORD32 wt2, -// UWORD16 ofst1, -// UWORD16 ofst2, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt1, +// WORD32 wt2, +// WORD32 ofst1, +// WORD32 ofst2, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => log_WD (x6) -// [sp+12] => wt1 (x7) -// [sp+16] => wt2 (x8) -// [sp+20] => ofst1 (x9) -// [sp+24] => ofst2 (x10) -// [sp+28] => ht (x11) -// [sp+32] => wd (x12) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => log_WD +// w7 => wt1 +// [sp] => wt2 (w8) +// [sp+8] => ofst1 (w9) +// [sp+16] => ofst2 (w10) +// [sp+24] => ht (w11) +// [sp+32] => wd (w12) // @@ -417,24 +419,22 @@ ih264_weighted_bi_pred_chroma_av8: // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments push_v_regs + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 stp x19, x20, [sp, #-16]! - ldr x8, [sp, #80] //Load wt2 in x8 + ldr w8, [sp, #80] //Load wt2 in w8 dup v4.4s, w8 //Q2 = (wt2_u, wt2_v) (32-bit) dup v2.4s, w7 //Q1 = (wt1_u, wt1_v) (32-bit) - add x6, x6, #1 //x6 = log_WD + 1 - ldr w9, [sp, #88] //Load ofst1 in x9 - sxtw x9, w9 - ldr w10, [sp, #96] //Load ofst2 in x10 - sxtw x10, w10 - sub x20, x6, #0 //x12 = -(log_WD + 1) - neg x20, x20 + add w6, w6, #1 //w6 = log_WD + 1 + ldr w9, [sp, #88] //Load ofst1 in w9 + ldr w10, [sp, #96] //Load ofst2 in w10 + neg w20, w6 //w20 = -(log_WD + 1) dup v0.8h, w20 //Q0 = -(log_WD + 1) (16-bit) ldr w11, [sp, #104] //Load ht in x11 ldr w12, [sp, #112] //Load wd in x12 - sxtw x11, w11 - sxtw x12, w12 dup v20.8h, w9 //0ffset1 dup v21.8h, w10 //0ffset2 srhadd v6.8b, v20.8b, v21.8b diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s index 69ed3b0..f145217 100644 --- a/common/armv8/ih264_weighted_pred_av8.s +++ b/common/armv8/ih264_weighted_pred_av8.s @@ -89,22 +89,22 @@ // UWORD8 *puc_dst, // WORD32 src_strd, // WORD32 dst_strd, -// UWORD8 log_WD, -// UWORD32 wt, -// UWORD16 ofst, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt, +// WORD32 ofst, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src // x1 => puc_dst -// x2 => src_strd -// x3 => dst_strd -// [sp] => log_WD (x4) -// [sp+4] => wt (x5) -// [sp+8] => ofst (x6) -// [sp+12] => ht (x7) -// [sp+16] => wd (x8) +// w2 => src_strd +// w3 => dst_strd +// w4 => log_WD +// w5 => wt +// w6 => ofst +// w7 => ht +// [sp] => wd (w8) // .text .p2align 2 @@ -118,13 +118,14 @@ ih264_weighted_pred_luma_av8: // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments push_v_regs + sxtw x2, w2 + sxtw x3, w3 stp x19, x20, [sp, #-16]! ldr w8, [sp, #80] //Load wd sxtw x8, w8 dup v2.4h, w5 //D2 = wt (16-bit) - sub x20, x4, #0 //x9 = -log_WD - neg x9, x20 + neg w9, w4 //w9 = -log_WD dup v3.8b, w6 //D3 = ofst (8-bit) cmp w8, #16 //check if wd is 16 dup v0.8h, w9 //Q0 = -log_WD (16-bit) @@ -318,22 +319,22 @@ end_loops: // UWORD8 *puc_dst, // WORD32 src_strd, // WORD32 dst_strd, -// UWORD8 log_WD, -// UWORD32 wt, -// UWORD16 ofst, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt, +// WORD32 ofst, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src // x1 => puc_dst -// x2 => src_strd -// x3 => dst_strd -// [sp] => log_WD (x4) -// [sp+4] => wt (x5) -// [sp+8] => ofst (x6) -// [sp+12] => ht (x7) -// [sp+16] => wd (x8) +// w2 => src_strd +// w3 => dst_strd +// w4 => log_WD +// w5 => wt +// w6 => ofst +// w7 => ht +// [sp] => wd (w8) // @@ -345,13 +346,14 @@ ih264_weighted_pred_chroma_av8: // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments push_v_regs + sxtw x2, w2 + sxtw x3, w3 stp x19, x20, [sp, #-16]! ldr w8, [sp, #80] //Load wd sxtw x8, w8 - sub x20, x4, #0 //x9 = -log_WD - neg x9, x20 + neg w9, w4 //w9 = -log_WD dup v2.4s, w5 //Q1 = {wt_u (16-bit), wt_v (16-bit)} |