diff options
55 files changed, 1127 insertions, 902 deletions
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s index 6681a7c..e2b8c99 100644 --- a/common/arm/ih264_inter_pred_chroma_a9q.s +++ b/common/arm/ih264_inter_pred_chroma_a9q.s @@ -91,8 +91,8 @@ @ UWORD8 *pu1_dst, @ WORD32 src_strd, @ WORD32 dst_strd, -@ UWORD8 u1_dx, -@ UWORD8 u1_dy, +@ WORD32 u1_dx, +@ WORD32 u1_dy, @ WORD32 ht, @ WORD32 wd) @**************Variables Vs Registers***************************************** diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s index 0dd82f3..7597444 100644 --- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s +++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s @@ -413,7 +413,7 @@ scrlbl1: add r7, r0, r4, lsl #3 sub r0, r7, r4, lsl #1 - rsb lr, r4, #0x0 + neg lr, r4 vpadd.s16 d0, d0, d1 diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s index 39ad9b3..b9595d7 100644 --- a/common/arm/ih264_mem_fns_neon.s +++ b/common/arm/ih264_mem_fns_neon.s @@ -68,7 +68,7 @@ @* @void ih264_memcpy_mul_8(UWORD8 *pu1_dst, @ UWORD8 *pu1_src, -@ UWORD8 num_bytes) +@ UWORD32 num_bytes) @**************Variables Vs Registers************************* @ r0 => *pu1_dst @ r1 => *pu1_src @@ -97,7 +97,7 @@ loop_neon_memcpy_mul_8: @* @void ih264_memcpy(UWORD8 *pu1_dst, @ UWORD8 *pu1_src, -@ UWORD8 num_bytes) +@ UWORD32 num_bytes) @**************Variables Vs Registers************************* @ r0 => *pu1_dst @ r1 => *pu1_src @@ -135,7 +135,7 @@ loop_memcpy: @void ih264_memset_mul_8(UWORD8 *pu1_dst, @ UWORD8 value, -@ UWORD8 num_bytes) +@ UWORD32 num_bytes) @**************Variables Vs Registers************************* @ r0 => *pu1_dst @ r1 => value @@ -202,7 +202,7 @@ loop_memset: @void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, @ UWORD16 value, -@ UWORD8 num_words) +@ UWORD32 num_words) @**************Variables Vs Registers************************* @ r0 => *pu2_dst @ r1 => value @@ -234,7 +234,7 @@ loop_memset_16bit_mul_8: @void 
ih264_memset_16bit(UWORD16 *pu2_dst, @ UWORD16 value, -@ UWORD8 num_words) +@ UWORD32 num_words) @**************Variables Vs Registers************************* @ r0 => *pu2_dst @ r1 => value diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s index e7a1f91..819b0b3 100644 --- a/common/arm/ih264_padding_neon.s +++ b/common/arm/ih264_padding_neon.s @@ -88,7 +88,7 @@ ih264_pad_top_a9q: stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments sub r5, r0, r1 - rsb r6, r1, #0 + neg r6, r1 loop_neon_memcpy_mul_16: @ Load 16 bytes diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s index 33859e6..304bd8a 100644 --- a/common/arm/ih264_weighted_bi_pred_a9q.s +++ b/common/arm/ih264_weighted_bi_pred_a9q.s @@ -144,7 +144,7 @@ ih264_weighted_bi_pred_luma_a9q: ldr r4, [sp, #40] @Load src_strd2 in r4 ldr r5, [sp, #44] @Load dst_strd in r5 sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit - rsb r10, r6, #0 @r13 = -(log_wd + 1) + neg r10, r6 @r10 = -(log_wd + 1) ldr r11, [sp, #68] @Load ht in r11 ldr r12, [sp, #72] @Load wd in r12 vdup.16 q0, r10 @Q0 = -(log_wd + 1) (32-bit) @@ -456,7 +456,7 @@ ih264_weighted_bi_pred_chroma_a9q: ldr r9, [sp, #60] @Load ofst1 in r9 ldr r10, [sp, #64] @Load ofst2 in r10 - rsb r12, r6, #0 @r12 = -(log_wd + 1) + neg r12, r6 @r12 = -(log_wd + 1) ldr r4, [sp, #40] @Load src_strd2 in r4 ldr r5, [sp, #44] @Load dst_strd in r5 vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit) diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s index 81d26d4..80c2c6d 100644 --- a/common/arm/ih264_weighted_pred_a9q.s +++ b/common/arm/ih264_weighted_pred_a9q.s @@ -122,7 +122,7 @@ ih264_weighted_pred_luma_a9q: vpush {d8-d15} vdup.16 d2, r5 @D2 = wt (16-bit) - rsb r9, r4, #0 @r9 = -log_wd + neg r9, r4 @r9 = -log_wd vdup.8 d3, r6 @D3 = ofst (8-bit) cmp r8, #16 @check if wd is 16 vdup.16 q0, r9 @Q0 = -log_wd (16-bit) @@ -349,7 +349,7 @@ ih264_weighted_pred_chroma_a9q: ldr r6, 
[sp, #36] @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)} ldr r8, [sp, #44] @Load wd - rsb r9, r4, #0 @r9 = -log_wd + neg r9, r4 @r9 = -log_wd vdup.32 q1, r5 @Q1 = {wt_u (16-bit), wt_v (16-bit)} ldr r7, [sp, #40] @Load ht vpush {d8-d15} diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s index a4dbd23..b7f2d58 100644 --- a/common/armv8/ih264_deblk_chroma_av8.s +++ b/common/armv8/ih264_deblk_chroma_av8.s @@ -56,19 +56,19 @@ //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* //* @returns @@ -87,6 +87,7 @@ ih264_deblk_chroma_horz_bs4_av8: // STMFD sp!,{x4-x6,x14} // push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 mov x6, x5 mov x5, x4 sub x0, x0, x1, lsl #1 //x0 = uc_edgePixel pointing to p1 of chroma @@ -155,19 +156,19 @@ ih264_deblk_chroma_horz_bs4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* //* @returns @@ -186,12 +187,13 @@ ih264_deblk_chroma_vert_bs4_av8: // STMFD sp!,{x4,x5,x12,x14} push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x1, w1 sub x0, x0, #4 //point x0 to p1u of row0. mov x12, x0 //keep a back up of x0 for buffer write - add x2, x2, x4, lsl #8 //x2 = (alpha_cr,alpha_cb) - add x3, x3, x5, lsl #8 //x3 = (beta_cr,beta_cb) + add w2, w2, w4, lsl #8 //w2 = (alpha_cr,alpha_cb) + add w3, w3, w5, lsl #8 //w3 = (beta_cr,beta_cb) ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1 @@ -292,28 +294,28 @@ ih264_deblk_chroma_vert_bs4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* -//* @param[in] sp(8) - u4_bs +//* @param[in] w6 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(12) - pu1_cliptab_cb +//* @param[in] x7 - pu1_cliptab_cb //* tc0_table for U //* -//* @param[in] sp(16) - pu1_cliptab_cr +//* @param[in] sp(0) - pu1_cliptab_cr //* tc0_table for V //* //* @returns @@ -332,14 +334,13 @@ ih264_deblk_chroma_horz_bslt4_av8: // STMFD sp!,{x4-x9,x14} // push_v_regs stp x19, x20, [sp, #-16]! 
- mov x8, x7 - mov x7, x6 - ldr x9, [sp, #80] + sxtw x1, w1 + ldr x8, [sp, #80] sub x0, x0, x1, lsl #1 //x0 = uc_edgePixelU pointing to p1 of chroma U - rev w7, w7 // - mov v12.s[0], w7 //D12[0] = ui_Bs - ld1 {v16.s}[0], [x8] //D16[0] contains cliptab_cb - ld1 {v17.s}[0], [x9] //D17[0] contains cliptab_cr + rev w6, w6 // + mov v12.s[0], w6 //D12[0] = ui_Bs + ld1 {v16.s}[0], [x7] //D16[0] contains cliptab_cb + ld1 {v17.s}[0], [x8] //D17[0] contains cliptab_cr ld2 {v6.8b, v7.8b}, [x0], x1 //Q3=p1 tbl v14.8b, {v16.16b}, v12.8b //Retreiving cliptab values for U tbl v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V @@ -428,28 +429,28 @@ ih264_deblk_chroma_horz_bslt4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha_cb +//* @param[in] w2 - alpha_cb //* Alpha Value for the boundary in U //* -//* @param[in] x3 - beta_cb +//* @param[in] w3 - beta_cb //* Beta Value for the boundary in U //* -//* @param[in] sp(0) - alpha_cr +//* @param[in] w4 - alpha_cr //* Alpha Value for the boundary in V //* -//* @param[in] sp(4) - beta_cr +//* @param[in] w5 - beta_cr //* Beta Value for the boundary in V //* -//* @param[in] sp(8) - u4_bs +//* @param[in] w6 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(12) - pu1_cliptab_cb +//* @param[in] x7 - pu1_cliptab_cb //* tc0_table for U //* -//* @param[in] sp(16) - pu1_cliptab_cr +//* @param[in] sp(0) - pu1_cliptab_cr //* tc0_table for V //* //* @returns @@ -468,11 +469,12 @@ ih264_deblk_chroma_vert_bslt4_av8: // STMFD sp!,{x4-x7,x10-x12,x14} push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 mov x10, x7 - ldr x11, [sp, #80] //x6 = u4_bs + ldr x11, [sp, #80] //x11 = u4_bs sub x0, x0, #4 //point x0 to p1u of row0. 
- add x2, x2, x4, lsl #8 - add x3, x3, x5, lsl #8 + add w2, w2, w4, lsl #8 + add w3, w3, w5, lsl #8 mov x12, x0 //keep a back up of x0 for buffer write ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1 ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1 diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s index 1b3950d..7705df2 100644 --- a/common/armv8/ih264_deblk_luma_av8.s +++ b/common/armv8/ih264_deblk_luma_av8.s @@ -60,19 +60,19 @@ //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* -//* @param[in] sp(0) - u4_bs +//* @param[in] w4 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(4) - pu1_cliptab +//* @param[in] x5 - pu1_cliptab //* tc0_table //* //* @returns @@ -90,6 +90,7 @@ ih264_deblk_luma_horz_bslt4_av8: // STMFD sp!,{x4-x7,x14} push_v_regs + sxtw x1, w1 stp x19, x20, [sp, #-16]! //LDRD x4,x5,[SP,#0x14] //x4 = ui_Bs , x5 = *puc_ClpTab @@ -214,13 +215,13 @@ ih264_deblk_luma_horz_bslt4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* //* @returns @@ -240,6 +241,7 @@ ih264_deblk_luma_horz_bs4_av8: // STMFD sp!,{x12,x14} push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x1, w1 // Init dup v0.16b, w2 //duplicate alpha @@ -401,19 +403,19 @@ ih264_deblk_luma_horz_bs4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* -//* @param[in] sp(0) - u4_bs +//* @param[in] w4 - u4_bs //* Packed Boundary strength array //* -//* @param[in] sp(4) - pu1_cliptab +//* @param[in] x5 - pu1_cliptab //* tc0_table //* //* @returns @@ -432,6 +434,7 @@ ih264_deblk_luma_vert_bslt4_av8: // STMFD sp!,{x12,x14} push_v_regs stp x19, x20, [sp, #-16]! + sxtw x1, w1 sub x0, x0, #4 //pointer uc_edgePixel-4 mov x12, x4 @@ -743,13 +746,13 @@ ih264_deblk_luma_vert_bslt4_av8: //* @param[in] x0 - pu1_src //* Pointer to the src sample q0 //* -//* @param[in] x1 - src_strd +//* @param[in] w1 - src_strd //* Source stride //* -//* @param[in] x2 - alpha +//* @param[in] w2 - alpha //* Alpha Value for the boundary //* -//* @param[in] x3 - beta +//* @param[in] w3 - beta //* Beta Value for the boundary //* //* @returns diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s index 6823015..d10047e 100644 --- a/common/armv8/ih264_default_weighted_pred_av8.s +++ b/common/armv8/ih264_default_weighted_pred_av8.s @@ -88,18 +88,18 @@ // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => ht (x6) -// [sp+12] => wd (x7) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => ht +// w7 => wd // .text .p2align 2 @@ -113,6 +113,9 @@ 
ih264_default_weighted_pred_luma_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 cmp w7, #16 beq loop_16 //branch if wd is 16 cmp w7, #8 @@ -263,18 +266,18 @@ end_loops: // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => ht (x6) -// [sp+12] => wd (x7) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => ht +// w7 => wd // @@ -286,6 +289,9 @@ ih264_default_weighted_pred_chroma_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 cmp w7, #8 beq loop_8_uv //branch if wd is 8 cmp w7, #4 diff --git a/common/armv8/ih264_inter_pred_chroma_av8.s b/common/armv8/ih264_inter_pred_chroma_av8.s index 714e271..f6aef40 100644 --- a/common/armv8/ih264_inter_pred_chroma_av8.s +++ b/common/armv8/ih264_inter_pred_chroma_av8.s @@ -91,19 +91,19 @@ // UWORD8 *pu1_dst, // WORD32 src_strd, // WORD32 dst_strd, -// UWORD8 u1_dx, -// UWORD8 u1_dy, +// WORD32 u1_dx, +// WORD32 u1_dy, // WORD32 ht, // WORD32 wd) //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => u1_dx -// x5 => u1_dy -// x6 => height -// x7 => width +// w2 => src_strd +// w3 => dst_strd +// w4 => u1_dx +// w5 => u1_dy +// w6 => height +// w7 => width // .text .p2align 2 @@ -120,6 +120,12 @@ ih264_inter_pred_chroma_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + sxtw x6, w6 + sxtw x7, w7 diff --git a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s index 6ad463a..e7c9f86 100644 --- a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s +++ b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s @@ -89,10 +89,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text .p2align 2 @@ -111,6 +111,10 @@ ih264_inter_pred_luma_horz_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, #2 //pu1_src-2 sub x14, x4, #16 movi v0.8b, #5 //filter coeff diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s index 9564f99..711d73e 100644 --- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s +++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s @@ -89,10 +89,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text .p2align 2 @@ -108,6 +108,10 @@ ih264_inter_pred_luma_vert_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s index 1a76c1c..007df30 100644 --- a/common/armv8/ih264_inter_pred_luma_copy_av8.s +++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s @@ -65,10 +65,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x7 => ht -// x12 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text .p2align 2 @@ -82,6 +82,10 @@ ih264_inter_pred_luma_copy_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 mov x12, x5 mov x7, x4 @@ -228,14 +232,16 @@ end_inner_loop_wd_16: // Register Usage // x0 : pi2_src // x1 : pu1_out -// x2 : src_strd -// x3 : out_strd +// w2 : src_strd +// w3 : out_strd // Neon registers d0-d7, d16-d30 are used // No need for pushing arm and neon registers .global ih264_interleave_copy_av8 ih264_interleave_copy_av8: push_v_regs + sxtw x2, w2 + sxtw x3, w3 ld1 {v2.8b}, [x0], x2 //load src plane 1 => d2 &pred palne 2 => d3 ld1 {v3.8b}, [x0], x2 mov v2.d[1], v3.d[0] diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s index d2897b6..dd4383e 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s @@ -52,10 +52,10 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd .text @@ -71,6 +71,10 @@ ih264_inter_pred_luma_horz_hpel_vert_hpel_av8: //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd sub x0, x0, #2 //pu1_src-2 diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s index 546c807..3563ac0 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s @@ -105,12 +105,12 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x7 => dydx -// x9 => *pu1_tmp +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// x6 => *pu1_tmp +// w7 => dydx .text .p2align 2 @@ -126,6 +126,10 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8: // store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 @@ -134,7 +138,8 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8: mov x9, x6 - lsr x7, x7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit + // by writing to w7 here, we clear the upper half of x7 + lsr w7, w7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit add x7, x7, #2 mov x6, #48 diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s index 39e3253..38268c7 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s @@ -94,11 +94,11 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x7 => dydx +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// w7 => dydx .text .p2align 2 @@ -114,6 +114,10 @@ ih264_inter_pred_luma_horz_qpel_av8: push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 and x7, x7, #3 //Finds x-offset diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s index 3f3e297..6ccf11f 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s @@ -105,12 +105,12 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x6 => dydx -// x9 => *pu1_tmp +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// x6 => *pu1_tmp +// w7 => dydx .text .p2align 2 @@ -125,11 +125,15 @@ ih264_inter_pred_luma_horz_qpel_vert_hpel_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd sub x0, x0, #2 //pu1_src-2 mov x9, x6 - mov x6, x7 + mov w6, w7 and x6, x6, #2 // dydx & 0x3 followed by dydx>>1 and dydx<<1 diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s index ab663d0..a9dfbd1 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s @@ -104,11 +104,11 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x6 => dydx +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// w7 => dydx .text .p2align 2 @@ -122,7 +122,11 @@ ih264_inter_pred_luma_horz_qpel_vert_qpel_av8: push_v_regs stp x19, x20, [sp, #-16]! 
- mov x6, x7 + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 + mov w6, w7 and x7, x6, #3 add x7, x0, x7, lsr #1 //pu1_pred_vert = pu1_src + (x_offset>>1) diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s index 9d19a2d..014faca 100644 --- a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s @@ -94,11 +94,11 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ht -// x5 => wd -// x7 => dydx +// w2 => src_strd +// w3 => dst_strd +// w4 => ht +// w5 => wd +// w7 => dydx .text .p2align 2 @@ -112,6 +112,10 @@ ih264_inter_pred_luma_vert_qpel_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x2, w2 + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 and x7, x7, #12 //Finds y-offset diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s index 8f0f282..39c0256 100644 --- a/common/armv8/ih264_intra_pred_chroma_av8.s +++ b/common/armv8/ih264_intra_pred_chroma_av8.s @@ -100,9 +100,9 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability @@ -113,13 +113,14 @@ ih264_intra_pred_chroma_8x8_mode_dc_av8: push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 - mov x19, #5 - ands x6, x4, x19 + mov w19, #5 + ands w6, w4, w19 beq none_available - cmp x6, #1 + cmp w6, #1 beq left_only_available - cmp x6, #4 + cmp w6, #4 beq top_only_available all_available: @@ -251,9 +252,9 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_chroma_8x8_mode_horz_av8 @@ -263,6 +264,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: push_v_regs + sxtw x3, w3 ld1 {v0.8h}, [x0] dup v10.8h, v0.h[7] @@ -332,9 +334,9 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_chroma_8x8_mode_vert_av8 @@ -342,6 +344,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: ih264_intra_pred_chroma_8x8_mode_vert_av8: push_v_regs + sxtw x3, w3 add x0, x0, #18 ld1 {v0.8b, v1.8b}, [x0] @@ -405,15 +408,16 @@ ih264_intra_pred_chroma_8x8_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_chroma_8x8_mode_plane_av8 ih264_intra_pred_chroma_8x8_mode_plane_av8: push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 ld1 {v0.2s}, [x0] add x10, x0, #10 @@ -457,18 +461,14 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: rshrn v13.4h, v26.4s, #6 rshrn v14.4h, v28.4s, #6 ldrb w6, [x0], #1 - sxtw x6, w6 add x10, x0, #31 ldrb w8, [x0], #1 - sxtw x8, w8 ldrb w7, [x10], #1 - sxtw x7, w7 ldrb w9, [x10], #1 - sxtw x9, w9 - add x6, x6, x7 - add x8, x8, x9 - lsl x6, x6, #4 - lsl x8, x8, #4 + add w6, w6, w7 + add w8, w8, w9 + lsl w6, w6, #4 + lsl w8, w8, #4 dup v0.8h, w6 dup v2.8h, w8 dup v4.8h, v12.h[0] diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s index c1847b5..fa19c12 100644 --- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s +++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s @@ -98,9 +98,9 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_vert_av8 @@ -108,6 +108,7 @@ ih264_intra_pred_luma_16x16_mode_vert_av8: push_v_regs + sxtw x3, w3 add x0, x0, #17 @@ -181,9 +182,9 @@ ih264_intra_pred_luma_16x16_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_horz_av8 @@ -192,6 +193,7 @@ ih264_intra_pred_luma_16x16_mode_horz_av8: push_v_regs + sxtw x3, w3 ld1 {v0.16b}, [x0] @@ -283,9 +285,9 @@ ih264_intra_pred_luma_16x16_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global 
ih264_intra_pred_luma_16x16_mode_dc_av8 @@ -295,18 +297,19 @@ ih264_intra_pred_luma_16x16_mode_dc_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 sub v0.16b, v0.16b, v0.16b sub v1.16b, v1.16b, v1.16b mov w10, #0 mov w11 , #3 - ands x6, x4, #0x01 + ands w6, w4, #0x01 beq top_available //LEFT NOT AVAILABLE ld1 {v0.16b}, [x0] add w10, w10, #8 add w11, w11, #1 top_available: - ands x6, x4, #0x04 + ands w6, w4, #0x04 beq none_available add x6, x0, #17 ld1 {v1.16b}, [x6] @@ -314,7 +317,7 @@ top_available: add w11, w11, #1 b summation none_available: - cmp x4, #0 + cmp w4, #0 bne summation mov w15, #128 dup v20.16b, w15 @@ -410,15 +413,16 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_16x16_mode_plane_av8 ih264_intra_pred_luma_16x16_mode_plane_av8: push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 mov x2, x1 add x1, x0, #17 add x0, x0, #15 @@ -440,76 +444,58 @@ ih264_intra_pred_luma_16x16_mode_plane_av8: uxtl v18.8h, v7.8b add x7, x0, x4, lsl #3 sub x0, x7, x4, lsl #1 - sub x20, x4, #0x0 - neg x14, x20 + neg x14, x4 addp v0.8h, v0.8h, v1.8h ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 saddlp v0.2s, v0.4h - sub x12, x8, x9 + sub w12, w8, w9 ldrb w8, [x7], #-1 - sxtw x8, w8 saddlp v0.1d, v0.2s ldrb w9, [x0], #1 - sxtw x9, w9 - sub x8, x8, x9 + sub w8, w8, w9 shl v2.2s, v0.2s, #2 - add x12, x12, x8, lsl #1 + add w12, w12, w8, lsl #1 add v0.2s, v0.2s , v2.2s ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 srshr v0.2s, v0.2s, #6 // i_b = D0[0] - sub x8, x8, x9 + sub w8, w8, w9 ldrb w5, [x7], #-1 - sxtw x5, w5 - add x8, x8, x8, lsl #1 + add w8, w8, w8, lsl #1 dup v4.8h, v0.h[0] - add x12, x12, x8 + add w12, w12, w8 ldrb w9, [x0], #1 - sxtw x9, w9 mul v0.8h, v4.8h , v16.8h - sub x5, x5, x9 + sub w5, w5, w9 mul v2.8h, v4.8h , v18.8h - add x12, x12, x5, lsl #2 + add w12, w12, w5, lsl #2 ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 - sub x8, x8, x9 + sub w8, w8, w9 ldrb w5, [x7], #-1 - sxtw x5, w5 - add x8, x8, x8, lsl #2 + add w8, w8, w8, lsl #2 ldrb w6, [x0], #1 - sxtw x6, w6 - add x12, x12, x8 + add w12, w12, w8 ldrb w8, [x7], #-1 - sxtw x8, w8 ldrb w9, [x0], #1 - sxtw x9, w9 - sub x5, x5, x6 - sub x8, x8, x9 - add x5, x5, x5, lsl #1 - sub x20, x8, x8, lsl #3 - neg x8, x20 - add x12, x12, x5, lsl #1 + sub w5, w5, w6 + sub w8, w8, w9 + add w5, w5, w5, lsl #1 + sub w20, w8, w8, lsl #3 + neg w8, w20 + add w12, w12, w5, lsl #1 ldrb w5, [x7], #-1 - sxtw x5, w5 ldrb w6, [x10] //top_left - sxtw x6, w6 - add x12, x12, x8 - sub x9, x5, x6 + add w12, w12, w8 + sub w9, w5, w6 ldrb w6, [x1, #7] - sxtw x6, w6 - add x12, x12, x9, lsl #3 // i_c = x12 - add x8, x5, x6 - add x12, x12, x12, lsl #2 - lsl x8, x8, #4 // i_a = x8 - add x12, x12, #0x20 - lsr x12, x12, #6 + add w12, w12, w9, lsl #3 // i_c = w12 + add 
w8, w5, w6 + add w12, w12, w12, lsl #2 + lsl w8, w8, #4 // i_a = w8 + add w12, w12, #0x20 + lsr w12, w12, #6 shl v28.8h, v4.8h, #3 dup v6.8h, w12 dup v30.8h, w8 diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s index 62e8cee..1f95131 100644 --- a/common/armv8/ih264_intra_pred_luma_4x4_av8.s +++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s @@ -102,15 +102,16 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_vert_av8 ih264_intra_pred_luma_4x4_mode_vert_av8: push_v_regs + sxtw x3, w3 add x0, x0, #5 @@ -171,9 +172,9 @@ ih264_intra_pred_luma_4x4_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability @@ -182,6 +183,7 @@ ih264_intra_pred_luma_4x4_mode_vert_av8: ih264_intra_pred_luma_4x4_mode_horz_av8: push_v_regs + sxtw x3, w3 ld1 {v1.s}[0], [x0] dup v0.8b, v1.b[3] @@ -246,9 +248,9 @@ ih264_intra_pred_luma_4x4_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability @@ -261,41 +263,34 @@ ih264_intra_pred_luma_4x4_mode_dc_av8: push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 - ands x5, x4, #0x01 + ands w5, w4, #0x01 beq top_available //LEFT NOT AVAILABLE add x10, x0, #3 mov x2, #-1 ldrb w5, [x10], #-1 - sxtw x5, w5 ldrb w6, [x10], #-1 - sxtw x6, w6 ldrb w7, [x10], #-1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], #-1 - sxtw x8, w8 - add x5, x5, x7 - ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE - add x5, x5, x8 + add w5, w5, w7 + ands w11, w4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + add w5, w5, w8 beq left_available add x10, x0, #5 // BOTH LEFT AND TOP AVAILABLE ldrb w6, [x10], #1 - sxtw x6, w6 ldrb w7, [x10], #1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], #1 - sxtw x8, w8 - add x5, x5, x7 + add w5, w5, w7 ldrb w9, [x10], #1 - sxtw x9, w9 - add x5, x5, x8 - add x5, x5, x9 - add x5, x5, #4 - lsr x5, x5, #3 + add w5, w5, w8 + add w5, w5, w9 + add w5, w5, #4 + lsr w5, w5, #3 dup v0.8b, w5 st1 {v0.s}[0], [x1], x3 st1 {v0.s}[0], [x1], x3 @@ -304,23 +299,19 @@ ih264_intra_pred_luma_4x4_mode_dc_av8: b end_func top_available: // ONLT TOP AVAILABLE - ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + ands w11, w4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE beq none_available add x10, x0, #5 ldrb w6, [x10], #1 - sxtw x6, w6 ldrb w7, [x10], #1 - sxtw x7, w7 ldrb w8, [x10], #1 - sxtw x8, w8 - add x5, x6, x7 + add w5, w6, w7 ldrb w9, [x10], #1 - sxtw x9, w9 - add x5, x5, x8 - add x5, x5, x9 - add x5, x5, #2 - lsr x5, x5, #2 + add w5, w5, w8 + add w5, w5, w9 + add w5, w5, #2 + lsr w5, w5, #2 dup v0.8b, w5 st1 {v0.s}[0], [x1], x3 st1 {v0.s}[0], [x1], x3 @@ -401,9 +392,9 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global 
ih264_intra_pred_luma_4x4_mode_diag_dl_av8 @@ -413,6 +404,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dl_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #5 sub x5, x3, #2 @@ -488,9 +480,9 @@ end_func_diag_dl: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_diag_dr_av8 @@ -499,6 +491,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dr_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.8b}, [x0] @@ -571,9 +564,9 @@ end_func_diag_dr: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_vert_r_av8 @@ -582,6 +575,7 @@ ih264_intra_pred_luma_4x4_mode_vert_r_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 {v0.8b}, [x0] @@ -656,9 +650,9 @@ end_func_vert_r: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_horz_d_av8 @@ -667,6 +661,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_av8: push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 ld1 {v0.8b}, [x0] add x0, x0, #1 @@ -743,9 +738,9 @@ end_func_horz_d: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_vert_l_av8 @@ -754,6 +749,7 @@ ih264_intra_pred_luma_4x4_mode_vert_l_av8: push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #4 ld1 {v0.8b}, [x0] add x0, x0, #1 @@ -825,9 +821,9 @@ end_func_vert_l: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_4x4_mode_horz_u_av8 @@ -835,11 +831,11 @@ end_func_vert_l: ih264_intra_pred_luma_4x4_mode_horz_u_av8: push_v_regs + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x10, x0 ld1 {v0.8b}, [x0] ldrb w9, [x0], #1 - sxtw x9, w9 ext v1.8b, v0.8b , v0.8b , #1 ld1 {v0.b}[7], [x10] ext v2.8b, v1.8b , v1.8b , #1 diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s index bf9a4c1..273aa81 100644 --- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s +++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s @@ -102,9 +102,9 @@ //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_vert_av8 @@ -114,6 +114,7 @@ ih264_intra_pred_luma_8x8_mode_vert_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs //stp x19, x20,[sp,#-16]! 
+ sxtw x3, w3 add x0, x0, #9 ld1 {v0.8b}, [x0] @@ -180,9 +181,9 @@ ih264_intra_pred_luma_8x8_mode_vert_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_horz_av8 @@ -194,38 +195,30 @@ ih264_intra_pred_luma_8x8_mode_horz_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #7 - mov x2 , #-1 ldrb w5, [x0], #-1 - sxtw x5, w5 ldrb w6, [x0], #-1 - sxtw x6, w6 dup v0.8b, w5 st1 {v0.8b}, [x1], x3 ldrb w7, [x0], #-1 - sxtw x7, w7 dup v1.8b, w6 st1 {v1.8b}, [x1], x3 dup v2.8b, w7 ldrb w8, [x0], #-1 - sxtw x8, w8 dup v3.8b, w8 st1 {v2.8b}, [x1], x3 ldrb w5, [x0], #-1 - sxtw x5, w5 st1 {v3.8b}, [x1], x3 dup v0.8b, w5 ldrb w6, [x0], #-1 - sxtw x6, w6 st1 {v0.8b}, [x1], x3 ldrb w7, [x0], #-1 - sxtw x7, w7 dup v1.8b, w6 dup v2.8b, w7 st1 {v1.8b}, [x1], x3 ldrb w8, [x0], #-1 - sxtw x8, w8 dup v3.8b, w8 st1 {v2.8b}, [x1], x3 st1 {v3.8b}, [x1], x3 @@ -285,9 +278,9 @@ ih264_intra_pred_luma_8x8_mode_horz_av8: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_dc_av8 @@ -298,37 +291,30 @@ ih264_intra_pred_luma_8x8_mode_dc_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs + sxtw x3, w3 stp x19, x20, [sp, #-16]! 
- ands x6, x4, #0x01 + ands w6, w4, #0x01 beq top_available //LEFT NOT AVAILABLE add x10, x0, #7 mov x2, #-1 ldrb w5, [x10], -1 - sxtw x5, w5 ldrb w6, [x10], -1 - sxtw x6, w6 ldrb w7, [x10], -1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], -1 - sxtw x8, w8 - add x5, x5, x7 + add w5, w5, w7 ldrb w6, [x10], -1 - sxtw x6, w6 - add x5, x5, x8 + add w5, w5, w8 ldrb w7, [x10], -1 - sxtw x7, w7 - add x5, x5, x6 + add w5, w5, w6 ldrb w8, [x10], -1 - sxtw x8, w8 - add x5, x5, x7 - ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE - add x5, x5, x8 + add w5, w5, w7 + ands w11, w4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE + add w5, w5, w8 ldrb w6, [x10], -1 - sxtw x6, w6 - add x5, x5, x6 + add w5, w5, w6 beq left_available add x10, x0, #9 // BOTH LEFT AND TOP AVAILABLE @@ -351,7 +337,7 @@ ih264_intra_pred_luma_8x8_mode_dc_av8: b end_func top_available: // ONLT TOP AVAILABLE - ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE + ands w11, w4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE beq none_available add x10, x0, #9 @@ -452,9 +438,9 @@ end_func: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8 @@ -463,6 +449,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dl_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 add x0, x0, #9 sub x5, x3, #4 @@ -554,9 +541,9 @@ end_func_diag_dl: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8 @@ -566,6 +553,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dr_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 { v0.16b}, [x0] @@ -654,9 +642,9 @@ end_func_diag_dr: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_vert_r_av8 @@ -666,6 +654,7 @@ ih264_intra_pred_luma_8x8_mode_vert_r_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 ld1 { v0.16b}, [x0] mov v1.d[0], v0.d[1] @@ -780,9 +769,9 @@ end_func_vert_r: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_horz_d_av8 @@ -791,6 +780,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 ld1 { v0.16b}, [x0] mov v1.d[0], v0.d[1] @@ -910,9 +900,9 @@ end_func_horz_d: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_vert_l_av8 @@ -922,6 +912,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_av8: // STMFD sp!, {x4-x12, x14} //Restoring registers from stack push_v_regs stp x19, x20, [sp, #-16]! + sxtw x3, w3 add x0, x0, #9 ld1 { v0.16b}, [x0] mov v1.d[0], v0.d[1] @@ -1018,9 +1009,9 @@ end_func_vert_l: //**************Variables Vs Registers***************************************** // x0 => *pu1_src // x1 => *pu1_dst -// x2 => src_strd -// x3 => dst_strd -// x4 => ui_neighboravailability +// w2 => src_strd +// w3 => dst_strd +// w4 => ui_neighboravailability .global ih264_intra_pred_luma_8x8_mode_horz_u_av8 @@ -1029,6 +1020,7 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! 
+ sxtw x3, w3 ld1 {v0.8b}, [x0] ld1 {v1.b}[7], [x0] diff --git a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s index 4c83036..003ee74 100644 --- a/common/armv8/ih264_iquant_itrans_recon_av8.s +++ b/common/armv8/ih264_iquant_itrans_recon_av8.s @@ -103,11 +103,11 @@ //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 // => pi4_tmp // => iq_start_idx // => pi2_dc_ld_addr @@ -119,6 +119,8 @@ ih264_iquant_itrans_recon_4x4_av8: push_v_regs + sxtw x3, w3 + sxtw x4, w4 dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 @@ -292,11 +294,11 @@ skip_loading_luma_dc_src: //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 //sp => pi4_tmp //sp#8 => *pi2_dc_src @@ -315,6 +317,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8: //reduce sp by 64 push_v_regs + sxtw x3, w3 + sxtw x4, w4 dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15 @@ -512,11 +516,11 @@ ih264_iquant_itrans_recon_chroma_4x4_av8: //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 //NOT USED => pi4_tmp //NOT USED => iq_start_idx //NOT USED => pi2_dc_ld_addr @@ -525,6 +529,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8: ih264_iquant_itrans_recon_8x8_av8: push_v_regs + sxtw x3, w3 + sxtw x4, w4 ld1 {v8.8h -v11.8h}, [x5], #64 ld1 {v12.8h-v15.8h}, [x5] diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s index 8bb9c32..13061ec 100644 --- a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s +++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s @@ 
-104,11 +104,11 @@ //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 // => pi4_tmp // => iq_start_idx // => pi2_dc_ld_addr @@ -119,6 +119,8 @@ .global ih264_iquant_itrans_recon_4x4_dc_av8 ih264_iquant_itrans_recon_4x4_dc_av8: + sxtw x3, w3 + sxtw x4, w4 ldr w8, [sp, #8] //Loads iq_start_idx subs w8, w8, #1 // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set @@ -209,11 +211,11 @@ donot_use_pi2_src_luma_dc: // x0 : pi2_src // x1 : pu1_pred // x2 : pu1_out -// x3 : pred_strd -// x4 : out_strd +// w3 : pred_strd +// w4 : out_strd // x5 : pu2_iscal_mat // x6 : pu2_weigh_mat -// x7 : u4_qp_div_6 +// w7 : u4_qp_div_6 // : pi2_tmp // : pi2_dc_src // Neon registers d0-d7, d16-d30 are used @@ -223,6 +225,8 @@ donot_use_pi2_src_luma_dc: .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8 ih264_iquant_itrans_recon_chroma_4x4_dc_av8: + sxtw x3, w3 + sxtw x4, w4 ldr x0, [sp, #8] push_v_regs ld1 {v0.h}[0], [x0] @@ -327,11 +331,11 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8: //x0 => *pi2_src //x1 => *pu1_pred //x2 => *pu1_out -//x3 => pred_strd -//x4 => out_strd +//w3 => pred_strd +//w4 => out_strd //x5 => *pu2_iscal_mat //x6 => *pu2_weigh_mat -//x7 => u4_qp_div_6 +//w7 => u4_qp_div_6 //NOT USED => pi4_tmp //NOT USED => iq_start_idx //NOT USED => pi2_dc_ld_addr @@ -340,6 +344,8 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8: ih264_iquant_itrans_recon_8x8_dc_av8: push_v_regs + sxtw x3, w3 + sxtw x4, w4 ld1 {v1.h}[0], [x5] ld1 {v2.h}[0], [x6] diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s index 4e9020d..802550d 100644 --- a/common/armv8/ih264_mem_fns_neon_av8.s +++ b/common/armv8/ih264_mem_fns_neon_av8.s @@ -70,11 +70,11 @@ //*/ //void ih264_memcpy_mul_8(UWORD8 *pu1_dst, // UWORD8 *pu1_src, -// UWORD8 num_bytes) +// UWORD32 num_bytes) 
//**************Variables Vs Registers************************* // x0 => *pu1_dst // x1 => *pu1_src -// x2 => num_bytes +// w2 => num_bytes @@ -89,7 +89,7 @@ loop_neon_memcpy_mul_8: ld1 {v0.8b}, [x1], #8 st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bne loop_neon_memcpy_mul_8 ret @@ -99,38 +99,36 @@ loop_neon_memcpy_mul_8: //*/ //void ih264_memcpy(UWORD8 *pu1_dst, // UWORD8 *pu1_src, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst // x1 => *pu1_src -// x2 => num_bytes +// w2 => num_bytes .global ih264_memcpy_av8 ih264_memcpy_av8: - subs x2, x2, #8 + subs w2, w2, #8 blt arm_memcpy loop_neon_memcpy: // Memcpy 8 bytes ld1 {v0.8b}, [x1], #8 st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bge loop_neon_memcpy - cmn x2, #8 + cmn w2, #8 beq end_func1 arm_memcpy: - add x2, x2, #8 + add w2, w2, #8 loop_arm_memcpy: ldrb w3, [x1], #1 - sxtw x3, w3 strb w3, [x0], #1 - sxtw x3, w3 - subs x2, x2, #1 + subs w2, w2, #1 bne loop_arm_memcpy ret end_func1: @@ -139,7 +137,7 @@ end_func1: //void ih264_memset_mul_8(UWORD8 *pu1_dst, // UWORD8 value, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst // x1 => value @@ -156,7 +154,7 @@ loop_memset_mul_8: // Memset 8 bytes st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bne loop_memset_mul_8 ret @@ -164,36 +162,35 @@ loop_memset_mul_8: //void ih264_memset(UWORD8 *pu1_dst, // UWORD8 value, -// UWORD8 num_bytes) +// UWORD32 num_bytes) //**************Variables Vs Registers************************* // x0 => *pu1_dst -// x1 => value -// x2 => num_bytes +// w1 => value +// w2 => num_bytes .global ih264_memset_av8 ih264_memset_av8: - subs x2, x2, #8 + subs w2, w2, #8 blt arm_memset dup v0.8b, w1 loop_neon_memset: // Memcpy 8 bytes st1 {v0.8b}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bge loop_neon_memset - cmn x2, #8 + cmn w2, #8 beq end_func2 arm_memset: - 
add x2, x2, #8 + add w2, w2, #8 loop_arm_memset: strb w1, [x0], #1 - sxtw x1, w1 - subs x2, x2, #1 + subs w2, w2, #1 bne loop_arm_memset ret end_func2: @@ -205,11 +202,11 @@ end_func2: //void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst, // UWORD16 value, -// UWORD8 num_words) +// UWORD32 num_words) //**************Variables Vs Registers************************* // x0 => *pu2_dst -// x1 => value -// x2 => num_words +// w1 => value +// w2 => num_words .global ih264_memset_16bit_mul_8_av8 @@ -224,7 +221,7 @@ loop_memset_16bit_mul_8: st1 {v0.4h}, [x0], #8 st1 {v0.4h}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bne loop_memset_16bit_mul_8 ret @@ -233,18 +230,18 @@ loop_memset_16bit_mul_8: //void ih264_memset_16bit(UWORD16 *pu2_dst, // UWORD16 value, -// UWORD8 num_words) +// UWORD32 num_words) //**************Variables Vs Registers************************* // x0 => *pu2_dst -// x1 => value -// x2 => num_words +// w1 => value +// w2 => num_words .global ih264_memset_16bit_av8 ih264_memset_16bit_av8: - subs x2, x2, #8 + subs w2, w2, #8 blt arm_memset_16bit dup v0.4h, w1 loop_neon_memset_16bit: @@ -252,18 +249,17 @@ loop_neon_memset_16bit: st1 {v0.4h}, [x0], #8 st1 {v0.4h}, [x0], #8 - subs x2, x2, #8 + subs w2, w2, #8 bge loop_neon_memset_16bit - cmn x2, #8 + cmn w2, #8 beq end_func3 arm_memset_16bit: - add x2, x2, #8 + add w2, w2, #8 loop_arm_memset_16bit: strh w1, [x0], #2 - sxtw x1, w1 - subs x2, x2, #1 + subs w2, w2, #1 bne loop_arm_memset_16bit ret diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s index 35d9c8a..e03fe2f 100644 --- a/common/armv8/ih264_padding_neon_av8.s +++ b/common/armv8/ih264_padding_neon_av8.s @@ -76,9 +76,9 @@ // WORD32 pad_size) //**************Variables Vs Registers************************* // x0 => *pu1_src -// x1 => src_strd -// x2 => wd -// x3 => pad_size +// w1 => src_strd +// w2 => wd +// w3 => pad_size .global ih264_pad_top_av8 @@ -86,25 +86,25 @@ ih264_pad_top_av8: // STMFD sp!, {x4-x11,x14} 
//stack stores the values of the arguments push_v_regs + sxtw x1, w1 stp x19, x20, [sp, #-16]! sub x5, x0, x1 - sub x20, x1, #0 - neg x6, x20 + neg x6, x1 loop_neon_memcpy_mul_16: // Load 16 bytes ld1 {v0.8b, v1.8b}, [x0], #16 mov x4, x5 - mov x7, x3 + mov w7, w3 add x5, x5, #16 loop_neon_pad_top: st1 {v0.8b, v1.8b}, [x4], x6 - subs x7, x7, #1 + subs w7, w7, #1 bne loop_neon_pad_top - subs x2, x2, #16 + subs w2, w2, #16 bne loop_neon_memcpy_mul_16 // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP @@ -160,9 +160,9 @@ loop_neon_pad_top: // WORD32 pad_size) //**************Variables Vs Registers************************* // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -172,6 +172,8 @@ ih264_pad_left_luma_av8: // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! @@ -182,43 +184,35 @@ ih264_pad_left_luma_av8: loop_16: // /*hard coded for width=16 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 st1 {v2.16b}, [x4], x1 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 dup v4.16b, w10 dup v6.16b, w11 st1 {v4.16b}, [x4], x1 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], x1 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], x1 // 16 bytes store dup v4.16b, w10 dup v6.16b, w11 - subs x2, x2, #8 + subs w2, w2, #8 st1 {v4.16b}, [x4], x1 // 16 bytes store st1 {v6.16b}, [x4], x1 // 16 bytes store bne loop_16 @@ -227,14 +221,11 @@ loop_16: // /*hard coded for width=16 ,height = loop_32: // /*hard coded for width=32 
,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 @@ -243,35 +234,30 @@ loop_32: // /*hard coded for width=32 ,height =8 st1 {v2.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], #16 // 16 bytes store dup v0.16b, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 st1 {v6.16b}, [x4], x6 // 16 bytes store ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.16b, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #8 + subs w2, w2, #8 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store bne loop_32 @@ -333,9 +319,9 @@ end_func: // WORD32 pad_size) //{ // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! 
sub x4, x0, x3 @@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8: loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -383,27 +367,23 @@ loop_32_l_c: // /*hard coded for width=32 ,height = ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -412,20 +392,16 @@ loop_32_l_c: // /*hard coded for width=32 ,height = ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store @@ -500,9 +476,9 @@ end_func_l_c: //} // // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ 
-512,6 +488,8 @@ ih264_pad_right_luma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x4, x0 @@ -522,43 +500,35 @@ ih264_pad_right_luma_av8: loop_16_r: // /*hard coded for width=16 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 st1 {v2.16b}, [x4], x1 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 dup v4.16b, w10 dup v6.16b, w11 st1 {v4.16b}, [x4], x1 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], x1 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], x1 // 16 bytes store dup v4.16b, w10 dup v6.16b, w11 - subs x2, x2, #8 + subs w2, w2, #8 st1 {v4.16b}, [x4], x1 // 16 bytes store st1 {v6.16b}, [x4], x1 // 16 bytes store bne loop_16_r @@ -567,14 +537,11 @@ loop_16_r: // /*hard coded for width=16 ,height =8,16*/ loop_32_r: // /*hard coded for width=32 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 @@ -583,35 +550,30 @@ loop_32_r: // /*hard coded for width=32 ,height = st1 {v2.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], #16 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 st1 {v6.16b}, [x4], x6 // 16 bytes store ldrb w10, [x0] add x0, x0, x1 - sxtw x10, 
w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.16b, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #8 + subs w2, w2, #8 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store bne loop_32_r @@ -672,9 +634,9 @@ end_func_r: // WORD32 ht, // WORD32 pad_size) // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x4, x0 @@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8: loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store @@ -720,27 +680,23 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 dup v0.8h, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.8h, w11 st1 
{v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -748,20 +704,16 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ bne loop_32_r_c ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 dup v0.8h, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s index 316c220..d2ba3cf 100644 --- a/common/armv8/ih264_resi_trans_quant_av8.s +++ b/common/armv8/ih264_resi_trans_quant_av8.s @@ -45,18 +45,6 @@ //* function name : ih264_resi_trans_quant_4x4 //* description : this function does cf4 of h264 //* -//* arguments : x0 :pointer to src buffer -// x1 :pointer to pred buffer -// x2 :pointer to dst buffer -// x3 :source stride -// x4 :pred stride, -// x5 :dst stride, -// x6 :pointer to scaling matrix, -// x7 :pointer to threshold matrix, -// stack qbits, -// rounding factor, -// pointer to store nnz -// pointer to store non quantized dc value // values returned : none // // register usage : @@ -77,34 +65,24 @@ .global ih264_resi_trans_quant_4x4_av8 ih264_resi_trans_quant_4x4_av8: - //x0 :pointer to src buffer - //x1 :pointer to pred buffer - //x2 :pointer to dst buffer - //x3 :source stride - //x4 :pred stride - //x5 :dst stride, - //x6 :scale matirx, - //x7 :threshold matrix - // :qbits - // :round factor - // :nnz - // :pointer to store non quantized dc value push_v_regs //x0 :pointer to src buffer //x1 :pointer to pred buffer //x2 :pointer to dst buffer - //x3 :source stride - //x4 :pred stride - //x5 :scale matirx, + //w3 :source stride + //w4 :pred stride + //w5 :scale matirx, //x6 :threshold 
matrix - //x7 :qbits - //x8 :round factor + //w7 :qbits + //w8 :round factor //x9 :nnz //x10 :pointer to store non quantized dc value + sxtw x3, w3 + sxtw x4, w4 ldr w8, [sp, #64] //load round factor ldr x10, [sp, #80] //load addres for non quant val - neg x7, x7 //negate the qbit value for usiing lsl + neg w7, w7 //negate the qbit value for usiing lsl ldr x9, [sp, #72] //------------fucntion loading done----------------; @@ -259,18 +237,6 @@ ih264_resi_trans_quant_4x4_av8: //* description : this function does residue calculation, forward transform //* and quantization for 4x4 chroma block. //* -//* arguments : x0 :pointer to src buffer -// x1 :pointer to pred buffer -// x2 :pointer to dst buffer -// x3 :source stride -// x4 :pred stride, -// x5 :dst stride, -// x6 :pointer to scaling matrix, -// x7 :pointer to threshold matrix, -// stack qbits, -// rounding factor, -// pointer to store nnz -// pointer to store unquantized dc values // values returned : none // // register usage : @@ -290,33 +256,24 @@ ih264_resi_trans_quant_4x4_av8: .global ih264_resi_trans_quant_chroma_4x4_av8 ih264_resi_trans_quant_chroma_4x4_av8: - //x0 :pointer to src buffer - //x1 :pointer to pred buffer - //x2 :pointer to dst buffer - //x3 :source stride - //stack :pred stride - // :scale matirx, - // :threshold matrix - // :qbits - // :round factor - // :nnz - // :pu1_dc_alt_addr push_v_regs //x0 :pointer to src buffer //x1 :pointer to pred buffer //x2 :pointer to dst buffer - //x3 :source stride - //x4 :pred stride + //w3 :source stride + //w4 :pred stride //x5 :scale matirx, //x6 :threshold matrix - //x7 :qbits - //x8 :round factor + //w7 :qbits + //w8 :round factor //x9 :nnz //x10 :pointer to store non quantized dc value + sxtw x3, w3 + sxtw x4, w4 ldr w8, [sp, #64] //load round factor ldr x10, [sp, #80] //load addres for non quant val - neg x7, x7 //negate the qbit value for usiing lsl + neg w7, w7 //negate the qbit value for usiing lsl ldr x9, [sp, #72] //------------fucntion loading 
done----------------; @@ -485,10 +442,10 @@ ih264_resi_trans_quant_chroma_4x4_av8: //* arguments : x0 :pointer to src buffer // x1 :pointer to dst buffer // x2 :pu2_scale_matrix -// x2 :pu2_threshold_matrix -// x3 :u4_qbits -// x4 :u4_round_factor -// x5 :pu1_nnz +// x3 :pu2_threshold_matrix +// w4 :u4_qbits +// w5 :u4_round_factor +// x6 :pu1_nnz // values returned : none // // register usage : @@ -516,8 +473,8 @@ ih264_hadamard_quant_4x4_av8: //x1 :pointer to dst buffer //x2 :pu2_scale_matrix //x3 :pu2_threshold_matrix -//x4 :u4_qbits -//x5 :u4_round_factor +//w4 :u4_qbits +//w5 :u4_round_factor //x6 :pu1_nnz push_v_regs @@ -632,10 +589,10 @@ ih264_hadamard_quant_4x4_av8: //* arguments : x0 :pointer to src buffer // x1 :pointer to dst buffer // x2 :pu2_scale_matrix -// x2 :pu2_threshold_matrix -// x3 :u4_qbits -// x4 :u4_round_factor -// x5 :pu1_nnz +// x3 :pu2_threshold_matrix +// w4 :u4_qbits +// w5 :u4_round_factor +// x6 :pu1_nnz // values returned : none // // register usage : diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s index b039fba..475f690 100644 --- a/common/armv8/ih264_weighted_bi_pred_av8.s +++ b/common/armv8/ih264_weighted_bi_pred_av8.s @@ -103,28 +103,28 @@ // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD16 log_WD, -// UWORD32 wt1, -// UWORD32 wt2, -// UWORD16 ofst1, -// UWORD16 ofst2, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt1, +// WORD32 wt2, +// WORD16 ofst1, +// WORD16 ofst2, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => log_WD (x6) -// [sp+12] => wt1 (x7) -// [sp+16] => wt2 (x8) -// [sp+20] => ofst1 (x9) -// [sp+24] => ofst2 (x10) -// [sp+28] => ht (x11) -// [sp+32] => wd (x12) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 
=> log_WD +// w7 => wt1 +// [sp] => wt2 (w8) +// [sp+8] => ofst1 (w9) +// [sp+16] => ofst2 (w10) +// [sp+24] => ht (w11) +// [sp+32] => wd (w12) // .text .p2align 2 @@ -138,21 +138,23 @@ ih264_weighted_bi_pred_luma_av8: // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments push_v_regs + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 stp x19, x20, [sp, #-16]! - ldr x8, [sp, #80] //Load wt2 in x8 - ldr x9, [sp, #88] //Load ofst1 in x9 - add x6, x6, #1 //x6 = log_WD + 1 - sub x20, x6, #0 //x13 = -(log_WD + 1) - neg x10, x20 + ldr w8, [sp, #80] //Load wt2 in w8 + ldr w9, [sp, #88] //Load ofst1 in w9 + add w6, w6, #1 //w6 = log_WD + 1 + neg w10, w6 //w10 = -(log_WD + 1) dup v0.8h, w10 //Q0 = -(log_WD + 1) (32-bit) - ldr x10, [sp, #96] //Load ofst2 in x10 - ldr x11, [sp, #104] //Load ht in x11 - ldr x12, [sp, #112] //Load wd in x12 - add x9, x9, #1 //x9 = ofst1 + 1 - add x9, x9, x10 //x9 = ofst1 + ofst2 + 1 + ldr w10, [sp, #96] //Load ofst2 in w10 + ldr w11, [sp, #104] //Load ht in w11 + ldr w12, [sp, #112] //Load wd in w12 + add w9, w9, #1 //w9 = ofst1 + 1 + add w9, w9, w10 //w9 = ofst1 + ofst2 + 1 mov v2.s[0], w7 mov v2.s[1], w8 //D2 = {wt1(32-bit), wt2(32-bit)} - asr x9, x9, #1 //x9 = ofst = (ofst1 + ofst2 + 1) >> 1 + asr w9, w9, #1 //w9 = ofst = (ofst1 + ofst2 + 1) >> 1 dup v3.8b, w9 //D3 = ofst (8-bit) cmp w12, #16 beq loop_16 //branch if wd is 16 @@ -383,28 +385,28 @@ end_loops: // WORD32 src_strd1, // WORD32 src_strd2, // WORD32 dst_strd, -// UWORD16 log_WD, -// UWORD32 wt1, -// UWORD32 wt2, -// UWORD16 ofst1, -// UWORD16 ofst2, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt1, +// WORD32 wt2, +// WORD32 ofst1, +// WORD32 ofst2, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src1 // x1 => puc_src2 // x2 => puc_dst -// x3 => src_strd1 -// [sp] => src_strd2 (x4) -// [sp+4] => dst_strd (x5) -// [sp+8] => log_WD (x6) -// [sp+12] => wt1 (x7) -// [sp+16] => wt2 (x8) 
-// [sp+20] => ofst1 (x9) -// [sp+24] => ofst2 (x10) -// [sp+28] => ht (x11) -// [sp+32] => wd (x12) +// w3 => src_strd1 +// w4 => src_strd2 +// w5 => dst_strd +// w6 => log_WD +// w7 => wt1 +// [sp] => wt2 (w8) +// [sp+8] => ofst1 (w9) +// [sp+16] => ofst2 (w10) +// [sp+24] => ht (w11) +// [sp+32] => wd (w12) // @@ -417,24 +419,22 @@ ih264_weighted_bi_pred_chroma_av8: // STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments push_v_regs + sxtw x3, w3 + sxtw x4, w4 + sxtw x5, w5 stp x19, x20, [sp, #-16]! - ldr x8, [sp, #80] //Load wt2 in x8 + ldr w8, [sp, #80] //Load wt2 in w8 dup v4.4s, w8 //Q2 = (wt2_u, wt2_v) (32-bit) dup v2.4s, w7 //Q1 = (wt1_u, wt1_v) (32-bit) - add x6, x6, #1 //x6 = log_WD + 1 - ldr w9, [sp, #88] //Load ofst1 in x9 - sxtw x9, w9 - ldr w10, [sp, #96] //Load ofst2 in x10 - sxtw x10, w10 - sub x20, x6, #0 //x12 = -(log_WD + 1) - neg x20, x20 + add w6, w6, #1 //w6 = log_WD + 1 + ldr w9, [sp, #88] //Load ofst1 in w9 + ldr w10, [sp, #96] //Load ofst2 in w10 + neg w20, w6 //w20 = -(log_WD + 1) dup v0.8h, w20 //Q0 = -(log_WD + 1) (16-bit) ldr w11, [sp, #104] //Load ht in x11 ldr w12, [sp, #112] //Load wd in x12 - sxtw x11, w11 - sxtw x12, w12 dup v20.8h, w9 //0ffset1 dup v21.8h, w10 //0ffset2 srhadd v6.8b, v20.8b, v21.8b diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s index 69ed3b0..f145217 100644 --- a/common/armv8/ih264_weighted_pred_av8.s +++ b/common/armv8/ih264_weighted_pred_av8.s @@ -89,22 +89,22 @@ // UWORD8 *puc_dst, // WORD32 src_strd, // WORD32 dst_strd, -// UWORD8 log_WD, -// UWORD32 wt, -// UWORD16 ofst, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt, +// WORD32 ofst, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src // x1 => puc_dst -// x2 => src_strd -// x3 => dst_strd -// [sp] => log_WD (x4) -// [sp+4] => wt (x5) -// [sp+8] => ofst (x6) -// [sp+12] => ht (x7) -// [sp+16] => wd (x8) +// 
w2 => src_strd +// w3 => dst_strd +// w4 => log_WD +// w5 => wt +// w6 => ofst +// w7 => ht +// [sp] => wd (w8) // .text .p2align 2 @@ -118,13 +118,14 @@ ih264_weighted_pred_luma_av8: // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments push_v_regs + sxtw x2, w2 + sxtw x3, w3 stp x19, x20, [sp, #-16]! ldr w8, [sp, #80] //Load wd sxtw x8, w8 dup v2.4h, w5 //D2 = wt (16-bit) - sub x20, x4, #0 //x9 = -log_WD - neg x9, x20 + neg w9, w4 //w9 = -log_WD dup v3.8b, w6 //D3 = ofst (8-bit) cmp w8, #16 //check if wd is 16 dup v0.8h, w9 //Q0 = -log_WD (16-bit) @@ -318,22 +319,22 @@ end_loops: // UWORD8 *puc_dst, // WORD32 src_strd, // WORD32 dst_strd, -// UWORD8 log_WD, -// UWORD32 wt, -// UWORD16 ofst, -// UWORD8 ht, -// UWORD8 wd) +// WORD32 log_WD, +// WORD32 wt, +// WORD32 ofst, +// WORD32 ht, +// WORD32 wd) // //**************Variables Vs Registers***************************************** // x0 => puc_src // x1 => puc_dst -// x2 => src_strd -// x3 => dst_strd -// [sp] => log_WD (x4) -// [sp+4] => wt (x5) -// [sp+8] => ofst (x6) -// [sp+12] => ht (x7) -// [sp+16] => wd (x8) +// w2 => src_strd +// w3 => dst_strd +// w4 => log_WD +// w5 => wt +// w6 => ofst +// w7 => ht +// [sp] => wd (w8) // @@ -345,13 +346,14 @@ ih264_weighted_pred_chroma_av8: // STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments push_v_regs + sxtw x2, w2 + sxtw x3, w3 stp x19, x20, [sp, #-16]! 
ldr w8, [sp, #80] //Load wd sxtw x8, w8 - sub x20, x4, #0 //x9 = -log_WD - neg x9, x20 + neg w9, w4 //w9 = -log_WD dup v2.4s, w5 //Q1 = {wt_u (16-bit), wt_v (16-bit)} diff --git a/decoder.arm64.mk b/decoder.arm64.mk index 2140b94..5ccf70f 100644 --- a/decoder.arm64.mk +++ b/decoder.arm64.mk @@ -6,7 +6,6 @@ libavcd_inc_dir_arm64 += $(LOCAL_PATH)/common/armv8 libavcd_srcs_c_arm64 += decoder/arm/ih264d_function_selector.c -ifeq ($(ARCH_ARM_HAVE_NEON),true) libavcd_srcs_c_arm64 += decoder/arm/ih264d_function_selector_av8.c libavcd_srcs_asm_arm64 += common/armv8/ih264_intra_pred_chroma_av8.s @@ -34,11 +33,6 @@ libavcd_srcs_asm_arm64 += common/armv8/ih264_ihadamard_scaling_av8.s libavcd_srcs_asm_arm64 += common/armv8/ih264_intra_pred_luma_8x8_av8.s libavcd_cflags_arm64 += -DDEFAULT_ARCH=D_ARCH_ARMV8_GENERIC -else -libavcd_cflags_arm64 += -DDISABLE_NEON -DDEFAULT_ARCH=D_ARCH_ARM_NONEON -endif - - LOCAL_SRC_FILES_arm64 += $(libavcd_srcs_c_arm64) $(libavcd_srcs_asm_arm64) diff --git a/decoder/ih264d_api.c b/decoder/ih264d_api.c index 2cde456..01deff0 100644 --- a/decoder/ih264d_api.c +++ b/decoder/ih264d_api.c @@ -1858,8 +1858,16 @@ WORD32 ih264d_video_decode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op) } - if(ps_dec->u1_flushfrm && ps_dec->u1_init_dec_flag) + if(ps_dec->u1_flushfrm) { + if(ps_dec->u1_init_dec_flag == 0) + { + /*Come out of flush mode and return*/ + ps_dec->u1_flushfrm = 0; + return (IV_FAIL); + } + + ih264d_get_next_display_field(ps_dec, ps_dec->ps_out_buffer, &(ps_dec->s_disp_op)); @@ -2634,6 +2642,9 @@ WORD32 ih264d_set_flush_mode(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op ps_ctl_op->u4_error_code = 0; + /* Ignore dangling fields during flush */ + ps_dec->u1_top_bottom_decoded = 0; + return IV_SUCCESS; } @@ -3031,40 +3042,30 @@ WORD32 ih264d_set_params(iv_obj_t *dec_hdl, void *pv_api_ip, void *pv_api_op) } } - if((0 != ps_dec->u4_app_disp_width) - && (ps_ctl_ip->u4_disp_wd - != ps_dec->u4_app_disp_width)) + if(ps_ctl_ip->u4_disp_wd >= 
ps_dec->u2_pic_wd) { - ps_ctl_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM); - ps_ctl_op->u4_error_code |= ERROR_DISP_WIDTH_INVALID; - ret = IV_FAIL; + ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd; + } + else if(0 == ps_dec->i4_header_decoded) + { + ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd; + } + else if(ps_ctl_ip->u4_disp_wd == 0) + { + ps_dec->u4_app_disp_width = 0; } else { - if(ps_ctl_ip->u4_disp_wd >= ps_dec->u2_pic_wd) - { - ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd; - } - else if(0 == ps_dec->i4_header_decoded) - { - ps_dec->u4_app_disp_width = ps_ctl_ip->u4_disp_wd; - } - else if(ps_ctl_ip->u4_disp_wd == 0) - { - ps_dec->u4_app_disp_width = 0; - } - else - { - /* - * Set the display width to zero. This will ensure that the wrong value we had stored (0xFFFFFFFF) - * does not propogate. - */ - ps_dec->u4_app_disp_width = 0; - ps_ctl_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM); - ps_ctl_op->u4_error_code |= ERROR_DISP_WIDTH_INVALID; - ret = IV_FAIL; - } + /* + * Set the display width to zero. This will ensure that the wrong value we had stored (0xFFFFFFFF) + * does not propogate. + */ + ps_dec->u4_app_disp_width = 0; + ps_ctl_op->u4_error_code |= (1 << IVD_UNSUPPORTEDPARAM); + ps_ctl_op->u4_error_code |= ERROR_DISP_WIDTH_INVALID; + ret = IV_FAIL; } + if(ps_ctl_ip->e_vid_dec_mode == IVD_DECODE_FRAME) ps_dec->i4_decode_header = 0; else if(ps_ctl_ip->e_vid_dec_mode == IVD_DECODE_HEADER) diff --git a/decoder/ih264d_dpb_mgr.c b/decoder/ih264d_dpb_mgr.c index e02cc90..453dcab 100644 --- a/decoder/ih264d_dpb_mgr.c +++ b/decoder/ih264d_dpb_mgr.c @@ -17,9 +17,10 @@ ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore */ +#ifdef __ANDROID__ #include "log/log.h" #include <cutils/log.h> - +#endif #include "ih264_typedefs.h" #include "ih264_macros.h" #include "ih264_platform_macros.h" @@ -888,8 +889,10 @@ WORD32 ih264d_read_mmco_commands(struct _DecStruct * ps_dec) { if (j >= MAX_REF_BUFS) { +#ifdef __ANDROID__ ALOGE("b/25818142"); android_errorWriteLog(0x534e4554, "25818142"); +#endif ps_dpb_cmds->u1_num_of_commands = 0; return -1; } diff --git a/decoder/ih264d_parse_headers.c b/decoder/ih264d_parse_headers.c index 2694114..d8c37a6 100644 --- a/decoder/ih264d_parse_headers.c +++ b/decoder/ih264d_parse_headers.c @@ -484,7 +484,7 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm) UWORD32 *pu4_bitstrm_buf = ps_bitstrm->pu4_buffer; UWORD32 *pu4_bitstrm_ofst = &ps_bitstrm->u4_ofst; UWORD8 u1_frm, uc_constraint_set0_flag, uc_constraint_set1_flag; - + WORD32 i4_cropped_ht, i4_cropped_wd; UWORD32 u4_temp; WORD32 pic_height_in_map_units_minus1 = 0; UWORD32 u2_pic_wd = 0; @@ -564,10 +564,11 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm) /*--------------------------------------------------------------------*/ ps_seq = ps_dec->pv_scratch_sps_pps; - *ps_seq = ps_dec->ps_sps[u1_seq_parameter_set_id]; + if(ps_dec->i4_header_decoded & 1) + { + *ps_seq = *ps_dec->ps_cur_sps; + } - if(NULL == ps_dec->ps_cur_sps) - ps_dec->ps_cur_sps = ps_seq; if((ps_dec->i4_header_decoded & 1) && (ps_seq->u1_profile_idc != u1_profile_idc)) { @@ -863,7 +864,6 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm) UWORD16 u2_btm_ofst = 0; UWORD8 u1_frm_mbs_flag; UWORD8 u1_vert_mult_factor; - WORD32 i4_cropped_ht, i4_cropped_wd; if(u1_frame_cropping_flag) { @@ -925,10 +925,6 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm) return IVD_STREAM_WIDTH_HEIGHT_NOT_SUPPORTED; } - ps_dec->u2_disp_height = i4_cropped_ht; - - ps_dec->u2_disp_width = i4_cropped_wd; - } /* Backup u4_num_reorder_frames if 
header is already decoded */ @@ -960,6 +956,19 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm) return IVD_RES_CHANGED; } + /* In case bitstream read has exceeded the filled size, then + return an error */ + if (ps_bitstrm->u4_ofst > ps_bitstrm->u4_max_ofst) + { + return ERROR_INV_SPS_PPS_T; + } + + /*--------------------------------------------------------------------*/ + /* All initializations to ps_dec are beyond this point */ + /*--------------------------------------------------------------------*/ + ps_dec->u2_disp_height = i4_cropped_ht; + ps_dec->u2_disp_width = i4_cropped_wd; + ps_dec->u2_pic_wd = u2_pic_wd; ps_dec->u2_pic_ht = u2_pic_ht; @@ -978,14 +987,9 @@ WORD32 ih264d_parse_sps(dec_struct_t *ps_dec, dec_bit_stream_t *ps_bitstrm) ps_dec->u2_crop_offset_y = u2_crop_offset_y; ps_dec->u2_crop_offset_uv = u2_crop_offset_uv; - /* In case bitstream read has exceeded the filled size, then - return an error */ - if(ps_bitstrm->u4_ofst > ps_bitstrm->u4_max_ofst) - { - return ERROR_INV_SPS_PPS_T; - } ps_seq->u1_is_valid = TRUE; ps_dec->ps_sps[u1_seq_parameter_set_id] = *ps_seq; + ps_dec->ps_cur_sps = &ps_dec->ps_sps[u1_seq_parameter_set_id]; return OK; } diff --git a/decoder/ih264d_parse_islice.c b/decoder/ih264d_parse_islice.c index 504b775..46a87d1 100644 --- a/decoder/ih264d_parse_islice.c +++ b/decoder/ih264d_parse_islice.c @@ -509,9 +509,7 @@ WORD32 ih264d_parse_imb_cabac(dec_struct_t * ps_dec, MEMSET_16BYTES(&ps_dec->pu1_left_mv_ctxt_inc[0][0], 0); *((UWORD32 *)ps_dec->pi1_left_ref_idx_ctxt_inc) = 0; MEMSET_16BYTES(p_curr_ctxt->u1_mv, 0); - pi1_buf = p_curr_ctxt->i1_ref_idx; - pi4_buf = (WORD32 *)pi1_buf; - *pi4_buf = 0; + memset(p_curr_ctxt->i1_ref_idx, 0, 4); } if(u1_mb_type == I_4x4_MB) diff --git a/decoder/ih264d_parse_mb_header.c b/decoder/ih264d_parse_mb_header.c index f30ad67..9a6a1f9 100644 --- a/decoder/ih264d_parse_mb_header.c +++ b/decoder/ih264d_parse_mb_header.c @@ -1172,7 +1172,6 @@ void 
ih264d_get_mvd_cabac(UWORD8 u1_sub_mb, /***************************************************************/ /* Store abs_mvd_values cabac contexts */ /***************************************************************/ -#ifndef ARM { UWORD8 u1_i; for(u1_i = 0; u1_i < u1_part_wd; u1_i++, pu1_top_mv_ctxt += 4) @@ -1187,46 +1186,6 @@ void ih264d_get_mvd_cabac(UWORD8 u1_sub_mb, pu1_lft_mv_ctxt[1] = u1_abs_mvd_y; } } -#else - /* Optimising the loop, with Little-Endian Assumption */ - { - UWORD16 *pu2_top_cxt = (UWORD16 *)pu1_top_mv_ctxt; - UWORD16 *pu2_lft_cxt = (UWORD16 *)pu1_lft_mv_ctxt; - UWORD16 u2_pack_mvd = (UWORD16)((u1_abs_mvd_y << 8) | u1_abs_mvd_x); - UWORD8 u1_wd = u1_part_wd, u1_ht = u1_part_ht; - - u1_wd--; - *pu2_top_cxt = u2_pack_mvd; - pu2_top_cxt += 2; - if(u1_wd) - { - u1_wd--; - *pu2_top_cxt = u2_pack_mvd; - pu2_top_cxt += 2; - } - if(u1_wd) - { - *pu2_top_cxt = u2_pack_mvd; - pu2_top_cxt += 2; - *pu2_top_cxt = u2_pack_mvd; - } - u1_ht--; - *pu2_lft_cxt = u2_pack_mvd; - pu2_lft_cxt += 2; - if(u1_ht) - { - u1_ht--; - *pu2_lft_cxt = u2_pack_mvd; - pu2_lft_cxt += 2; - } - if(u1_ht) - { - *pu2_lft_cxt = u2_pack_mvd; - pu2_lft_cxt += 2; - *pu2_lft_cxt = u2_pack_mvd; - } - } -#endif } /*****************************************************************************/ diff --git a/decoder/ih264d_structs.h b/decoder/ih264d_structs.h index c83c34e..6958a0c 100644 --- a/decoder/ih264d_structs.h +++ b/decoder/ih264d_structs.h @@ -1055,7 +1055,6 @@ typedef struct _DecStruct prev_seq_params_t s_prev_seq_params; UWORD8 u1_cur_mb_fld_dec_flag; /* current Mb fld or Frm */ - WORD8 pi1_left_pred_mode[8]; UWORD8 u1_topleft_mb_fld; UWORD8 u1_topleft_mbtype; UWORD8 u1_topleft_mb_fld_bot; @@ -1065,6 +1064,9 @@ typedef struct _DecStruct UWORD16 u2_top_left_mask; UWORD16 u2_top_right_mask; dec_err_status_t * ps_dec_err_status; + /* Ensure pi1_left_pred_mode is aligned to 4 byte boundary, + by declaring this after a pointer or an integer */ + WORD8 pi1_left_pred_mode[8]; UWORD8 
u1_mb_idx_mv; UWORD16 u2_mv_2mb[2]; diff --git a/decoder/ih264d_utils.c b/decoder/ih264d_utils.c index 4f6deca..4437832 100644 --- a/decoder/ih264d_utils.c +++ b/decoder/ih264d_utils.c @@ -1893,6 +1893,10 @@ WORD16 ih264d_allocate_dynamic_bufs(dec_struct_t * ps_dec) RETURN_IF((NULL == pv_buf), IV_FAIL); ps_dec->p_ctxt_inc_mb_map = pv_buf; + /* 0th entry of CtxtIncMbMap will be always be containing default values + for CABAC context representing MB not available */ + ps_dec->p_ctxt_inc_mb_map += 1; + size = (sizeof(mv_pred_t) * ps_dec->u1_recon_mb_grp * 16); pv_buf = ps_dec->pf_aligned_alloc(pv_mem_ctxt, 128, size); @@ -2073,9 +2077,6 @@ WORD16 ih264d_allocate_dynamic_bufs(dec_struct_t * ps_dec) RETURN_IF((NULL == pv_buf), IV_FAIL); ps_dec->pu1_pic_buf_base = pv_buf; - /* 0th entry of CtxtIncMbMap will be always be containing default values - for CABAC context representing MB not available */ - ps_dec->p_ctxt_inc_mb_map += 1; /* Post allocation Increment Actions */ /***************************************************************************/ diff --git a/encoder.arm64.mk b/encoder.arm64.mk index f95a29f..73cce1b 100644 --- a/encoder.arm64.mk +++ b/encoder.arm64.mk @@ -7,7 +7,6 @@ libavce_inc_dir_arm64 += $(LOCAL_PATH)/common/armv8 libavce_srcs_c_arm64 += encoder/arm/ih264e_function_selector.c -ifeq ($(ARCH_ARM_HAVE_NEON),true) libavce_srcs_c_arm64 += encoder/arm/ih264e_function_selector_av8.c libavce_srcs_asm_arm64 += common/armv8/ih264_resi_trans_quant_av8.s @@ -35,12 +34,6 @@ libavce_srcs_asm_arm64 += encoder/armv8/ih264e_half_pel_av8.s #ME libavce_srcs_asm_arm64 += encoder/armv8/ime_distortion_metrics_av8.s -else -libavce_cflags_arm64 += -DDISABLE_NEON -endif - - - LOCAL_SRC_FILES_arm64 += $(libavce_srcs_c_arm64) $(libavce_srcs_asm_arm64) LOCAL_C_INCLUDES_arm64 += $(libavce_inc_dir_arm64) diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s index df06d41..c23a6ea 100644 --- 
a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s +++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s @@ -82,9 +82,9 @@ ih264e_evaluate_intra16x16_modes_av8: //x0 = pu1_src, //x1 = pu1_ngbr_pels_i16, //x2 = pu1_dst, -//x3 = src_strd, -//x4 = dst_strd, -//x5 = u4_n_avblty, +//w3 = src_strd, +//w4 = dst_strd, +//w5 = u4_n_avblty, //x6 = u4_intra_mode, //x7 = pu4_sadmin @@ -92,9 +92,11 @@ ih264e_evaluate_intra16x16_modes_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs + sxtw x3, w3 + sxtw x4, w4 stp x19, x20, [sp, #-16]! - ldr x16, [sp, #80] + ldr w16, [sp, #80] mov x17, x4 mov x14, x6 mov x15, x7 @@ -105,13 +107,13 @@ ih264e_evaluate_intra16x16_modes_av8: mov w10, #0 mov w11 , #3 - ands x6, x5, #0x01 + ands w6, w5, #0x01 beq top_available //LEFT NOT AVAILABLE ld1 {v0.16b}, [x1] add w10, w10, #8 add w11, w11, #1 top_available: - ands x6, x5, #0x04 + ands w6, w5, #0x04 beq none_available add x6, x1, #17 ld1 {v1.16b}, [x6] @@ -119,7 +121,7 @@ top_available: add w11, w11, #1 b summation none_available: - cmp x5, #0 + cmp w5, #0 bne summation mov w6, #128 dup v30.16b, w6 @@ -469,16 +471,16 @@ sad_comp: mov x11, #1 lsl x11, x11, #30 - mov x0, x16 + mov w0, w16 //-------------------------------------------- - ands x7, x0, #01 // vert mode valid???????????? + ands w7, w0, #01 // vert mode valid???????????? csel x8, x11, x8, eq - ands x6, x0, #02 // horz mode valid???????????? + ands w6, w0, #02 // horz mode valid???????????? csel x9, x11, x9, eq - ands x6, x0, #04 // dc mode valid???????????? + ands w6, w0, #04 // dc mode valid???????????? 
csel x10, x11, x10, eq diff --git a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s index bb2526d..4014c4f 100644 --- a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s +++ b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s @@ -82,9 +82,9 @@ ih264e_evaluate_intra_chroma_modes_av8: //x0 = pu1_src, //x1 = pu1_ngbr_pels_i16, //x2 = pu1_dst, -//x3 = src_strd, -//x4 = dst_strd, -//x5 = u4_n_avblty, +//w3 = src_strd, +//w4 = dst_strd, +//w5 = u4_n_avblty, //x6 = u4_intra_mode, //x7 = pu4_sadmin @@ -92,20 +92,22 @@ ih264e_evaluate_intra_chroma_modes_av8: // STMFD sp!, {x4-x12, x14} //store register values to stack push_v_regs + sxtw x3, w3 + sxtw x4, w4 stp x19, x20, [sp, #-16]! //----------------------- - ldr x16, [sp, #80] + ldr w16, [sp, #80] mov x17, x4 - mov x18, x5 + mov w18, w5 mov x14, x6 mov x15, x7 - mov x19, #5 - ands x6, x5, x19 + mov w19, #5 + ands w6, w5, w19 beq none_available - cmp x6, #1 + cmp w6, #1 beq left_only_available - cmp x6, #4 + cmp w6, #4 beq top_only_available all_available: @@ -368,20 +370,20 @@ sad_comp: mov x11, #1 //----------------------- - mov x0, x16 // u4_valid_intra_modes + mov w0, w16 // u4_valid_intra_modes //-------------------------------------------- lsl x11, x11, #30 - ands x7, x0, #04 // vert mode valid???????????? + ands w7, w0, #04 // vert mode valid???????????? csel x8, x11, x8, eq - ands x6, x0, #02 // horz mode valid???????????? + ands w6, w0, #02 // horz mode valid???????????? csel x9, x11, x9, eq - ands x6, x0, #01 // dc mode valid???????????? + ands w6, w0, #01 // dc mode valid???????????? csel x10, x11, x10, eq diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s index 8f27104..cdac8da 100644 --- a/encoder/armv8/ih264e_half_pel_av8.s +++ b/encoder/armv8/ih264e_half_pel_av8.s @@ -86,6 +86,8 @@ ih264e_sixtapfilter_horz_av8: // STMFD sp!,{x14} push_v_regs + sxtw x2, w2 + sxtw x3, w3 stp x19, x20, [sp, #-16]! 
movi v0.8b, #5 @@ -263,6 +265,8 @@ filter_horz_loop: ih264e_sixtap_filter_2dvh_vert_av8: // STMFD sp!,{x10,x11,x12,x14} push_v_regs + sxtw x3, w3 + sxtw x4, w4 stp x19, x20, [sp, #-16]! ////x0 - pu1_ref diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s index 47c3425..00d11c0 100644 --- a/encoder/armv8/ime_distortion_metrics_av8.s +++ b/encoder/armv8/ime_distortion_metrics_av8.s @@ -95,6 +95,8 @@ .global ime_compute_sad_16x16_fast_av8 ime_compute_sad_16x16_fast_av8: push_v_regs + sxtw x2, w2 + sxtw x3, w3 lsl x2, x2, #1 lsl x3, x3, #1 @@ -179,6 +181,8 @@ ime_compute_sad_16x8_av8: //chheck what stride incremtn to use //earlier code did not have this lsl push_v_regs + sxtw x2, w2 + sxtw x3, w3 mov x6, #2 movi v30.8h, #0 @@ -255,6 +259,8 @@ core_loop_ime_compute_sad_16x8_av8: ime_compute_sad_16x16_ea8_av8: push_v_regs + sxtw x2, w2 + sxtw x3, w3 movi v30.8h, #0 add x7, x0, x2 @@ -381,9 +387,12 @@ ime_calculate_sad2_prog_av8: // x0 = ref1 <UWORD8 *> // x1 = ref2 <UWORD8 *> // x2 = src <UWORD8 *> - // x3 = RefBufferWidth <UWORD32> - // stack = CurBufferWidth <UWORD32>, psad <UWORD32 *> + // w3 = RefBufferWidth <UWORD32> + // w4 = CurBufferWidth <UWORD32> + // x5 = psad <UWORD32 *> push_v_regs + sxtw x3, w3 + sxtw x4, w4 mov x6, #8 movi v30.8h, #0 movi v31.8h, #0 @@ -459,16 +468,15 @@ ime_calculate_sad3_prog_av8: // x1 = ref2 <UWORD8 *> // x2 = ref3 <UWORD8 *> // x3 = src <UWORD8 *> - // stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *> + // w4 = RefBufferWidth <UWORD32> + // w5 = CurBufferWidth <UWORD32> + // x6 = psad <UWORD32 *> - // x0 = ref1 <UWORD8 *> - // x1 = ref2 <UWORD8 *> - // x2 = src <UWORD8 *> - // x3 = RefBufferWidth <UWORD32> - // stack = CurBufferWidth <UWORD32>, psad <UWORD32 *> push_v_regs - mov x6, #16 + sxtw x4, w4 + sxtw x5, w5 + mov x7, #16 movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 @@ -499,15 +507,15 @@ core_loop_ime_calculate_sad3_prog_av8: uabal v31.8h, v6.8b, 
v7.8b uabal2 v31.8h, v6.16b, v7.16b - subs x6, x6, #1 - bne core_loop_ime_calculate_sad2_prog_av8 + subs x7, x7, #1 + bne core_loop_ime_calculate_sad3_prog_av8 addp v30.8h, v30.8h, v31.8h uaddlp v30.4s, v30.8h addp v30.2s, v30.2s, v30.2s shl v30.2s, v30.2s, #1 - st1 {v30.2s}, [x5] + st1 {v30.2s}, [x6] pop_v_regs ret @@ -544,6 +552,8 @@ core_loop_ime_calculate_sad3_prog_av8: .global ime_sub_pel_compute_sad_16x16_av8 ime_sub_pel_compute_sad_16x16_av8: push_v_regs + sxtw x4, w4 + sxtw x5, w5 sub x7, x1, #1 //x left sub x8, x2, x5 //y top sub x9, x3, #1 //xy left @@ -647,6 +657,8 @@ core_loop_ime_sub_pel_compute_sad_16x16_av8: .global ime_compute_sad_16x16_av8 ime_compute_sad_16x16_av8: push_v_regs + sxtw x2, w2 + sxtw x3, w3 mov x6, #4 movi v30.8h, #0 @@ -702,6 +714,8 @@ core_loop_ime_compute_sad_16x16_av8: .global ime_calculate_sad4_prog_av8 ime_calculate_sad4_prog_av8: push_v_regs + sxtw x2, w2 + sxtw x3, w3 sub x5, x0, #1 //left add x6, x0, #1 //right sub x7, x0, x2 //top @@ -777,13 +791,15 @@ core_loop_ime_calculate_sad4_prog_av8: ime_compute_satqd_16x16_lumainter_av8: //x0 :pointer to src buffer //x1 :pointer to est buffer - //x2 :Source stride - //x3 :Pred stride + //w2 :Source stride + //w3 :Pred stride //x4 :Threshold pointer //x5 :Distortion,ie SAD //x6 :is nonzero //x7 :loop counter push_v_regs + sxtw x2, w2 + sxtw x3, w3 stp d8, d9, [sp, #-16]! stp d10, d11, [sp, #-16]! stp d12, d13, [sp, #-16]! 
diff --git a/encoder/ih264e_api.c b/encoder/ih264e_api.c index e0c9f83..2ecfdf5 100644 --- a/encoder/ih264e_api.c +++ b/encoder/ih264e_api.c @@ -3823,7 +3823,7 @@ static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj, UWORD8 *pu1_buf = ps_mem_rec->pv_base; /* size of header data of 1 mb */ - size = 40; + size = sizeof(mb_hdr_t); /* size for 1 row of mbs */ size = size * max_mb_cols; diff --git a/encoder/ih264e_cabac_encode.c b/encoder/ih264e_cabac_encode.c index ecc30f5..e49ab58 100644 --- a/encoder/ih264e_cabac_encode.c +++ b/encoder/ih264e_cabac_encode.c @@ -339,7 +339,7 @@ static void ih264e_cabac_enc_4x4mb_modes(cabac_ctxt_t *ps_cabac_ctxt, for (i = 0; i < 16; i += 2) { /* sub blk idx 1 */ - byte = *pu1_intra_4x4_modes++; + byte = pu1_intra_4x4_modes[i >> 1]; if (byte & 0x1) { ih264e_cabac_encode_bin(ps_cabac_ctxt, @@ -1540,14 +1540,14 @@ static void ih264e_cabac_enc_mvds_b16x16(cabac_ctxt_t *ps_cabac_ctxt, u2_abs_mvd_y_b = (UWORD16) pu1_top_mv_ctxt[1]; u2_abs_mvd_x_a = (UWORD16) pu1_lft_mv_ctxt[0]; u2_abs_mvd_y_a = (UWORD16) pu1_lft_mv_ctxt[1]; - u2_mv = *(pi2_mv_ptr++); + u2_mv = pi2_mv_ptr[0]; ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_X, (UWORD16) (u2_abs_mvd_x_a + u2_abs_mvd_x_b), ps_cabac_ctxt); u1_abs_mvd_x = CLIP3(0, 127, ABS(u2_mv)); - u2_mv = *(pi2_mv_ptr++); + u2_mv = pi2_mv_ptr[1]; ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_Y, (UWORD16) (u2_abs_mvd_y_a + u2_abs_mvd_y_b), @@ -1555,6 +1555,7 @@ static void ih264e_cabac_enc_mvds_b16x16(cabac_ctxt_t *ps_cabac_ctxt, u1_abs_mvd_y = CLIP3(0, 127, ABS(u2_mv)); } + /***************************************************************/ /* Store abs_mvd_values cabac contexts */ /***************************************************************/ @@ -1571,14 +1572,14 @@ static void ih264e_cabac_enc_mvds_b16x16(cabac_ctxt_t *ps_cabac_ctxt, u2_abs_mvd_y_b = (UWORD16) pu1_top_mv_ctxt[3]; u2_abs_mvd_x_a = (UWORD16) pu1_lft_mv_ctxt[2]; u2_abs_mvd_y_a = (UWORD16) pu1_lft_mv_ctxt[3]; - u2_mv = *(pi2_mv_ptr++); + u2_mv = pi2_mv_ptr[2]; 
ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_X, (UWORD16) (u2_abs_mvd_x_a + u2_abs_mvd_x_b), ps_cabac_ctxt); u1_abs_mvd_x = CLIP3(0, 127, ABS(u2_mv)); - u2_mv = *(pi2_mv_ptr++); + u2_mv = pi2_mv_ptr[3]; ih264e_cabac_enc_ctx_mvd(u2_mv, MVD_Y, (UWORD16) (u2_abs_mvd_y_a + u2_abs_mvd_y_b), @@ -1624,11 +1625,11 @@ IH264E_ERROR_T ih264e_write_islice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) cabac_ctxt_t *ps_cabac_ctxt = ps_ent_ctxt->ps_cabac; /* packed header data */ UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data; mb_info_ctxt_t *ps_curr_ctxt; WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode; WORD8 mb_qp_delta; UWORD32 u4_cbp_l, u4_cbp_c; - WORD32 byte_count = 0; WORD32 bitstream_start_offset, bitstream_end_offset; if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB) @@ -1638,12 +1639,10 @@ IH264E_ERROR_T ih264e_write_islice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) return (IH264E_BITSTREAM_BUFFER_OVERFLOW); } /* mb header info */ - mb_tpm = *pu1_byte++; - byte_count++; - cbp = *pu1_byte++; - byte_count++; - mb_qp_delta = *pu1_byte++; - byte_count++; + mb_tpm = ps_mb_hdr->u1_mb_type_mode; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; + /* mb type */ mb_type = mb_tpm & 0xF; @@ -1671,9 +1670,10 @@ IH264E_ERROR_T ih264e_write_islice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) MB_TYPE_I_SLICE); if (mb_type == I4x4) - { /* Encode 4x4 MB modes */ - ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte); - byte_count += 8; + { + /* Encode 4x4 MB modes */ + mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data; + ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, ps_mb_hdr_i4x4->au1_sub_blk_modes); } /* Encode chroma mode */ ih264e_cabac_enc_chroma_predmode(chroma_intra_mode, ps_cabac_ctxt); @@ -1731,17 +1731,18 @@ IH264E_ERROR_T ih264e_write_islice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) memset(ps_curr_ctxt->u1_mv, 0, 16); 
memset(ps_cabac_ctxt->pu1_left_mv_ctxt_inc, 0, 16); ps_cabac_ctxt->ps_curr_ctxt_mb_info->u1_cbp = cbp; - ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count; + if (mb_type == I16x16) { ps_curr_ctxt->u1_mb_type = CAB_I16x16; - + pu1_byte += sizeof(mb_hdr_i16x16_t); } else { ps_curr_ctxt->u1_mb_type = CAB_I4x4; - + pu1_byte += sizeof(mb_hdr_i4x4_t); } + ps_ent_ctxt->pv_mb_header_data = pu1_byte; return IH264E_SUCCESS; } @@ -1778,8 +1779,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode; WORD8 mb_qp_delta; UWORD32 u4_cbp_l, u4_cbp_c; - WORD32 byte_count = 0; UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data; if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB) >= ps_bitstream->u4_max_strm_size) @@ -1788,8 +1789,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) return (IH264E_BITSTREAM_BUFFER_OVERFLOW); } /* mb header info */ - mb_tpm = *pu1_byte++; - byte_count++; + mb_tpm = ps_mb_hdr->u1_mb_type_mode; /* mb type */ mb_type = mb_tpm & 0xF; @@ -1800,10 +1800,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) /* if Intra MB */ if (mb_type == I16x16 || mb_type == I4x4) { - cbp = *pu1_byte++; - byte_count++; - mb_qp_delta = *pu1_byte++; - byte_count++; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* Starting bitstream offset for header in bits */ bitstream_start_offset = GET_NUM_BITS(ps_bitstream); @@ -1833,9 +1831,10 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) } if (mb_type == I4x4) - { /* Intra 4x4 modes */ - ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte); - byte_count += 8; + { + /* Intra 4x4 modes */ + mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data; + ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, 
ps_mb_hdr_i4x4->au1_sub_blk_modes); } chroma_intra_mode = (mb_tpm >> 6); @@ -1901,13 +1900,15 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) if (mb_type == I16x16) { ps_curr_ctxt->u1_mb_type = CAB_I16x16; + pu1_byte += sizeof(mb_hdr_i16x16_t); } else { ps_curr_ctxt->u1_mb_type = CAB_I4x4; + pu1_byte += sizeof(mb_hdr_i4x4_t); } - ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count; + ps_ent_ctxt->pv_mb_header_data = pu1_byte; return IH264E_SUCCESS; } @@ -1918,10 +1919,9 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) /* Encoding P16x16 */ if (mb_type != PSKIP) { - cbp = *pu1_byte++; - byte_count++; - mb_qp_delta = *pu1_byte++; - byte_count++; + mb_hdr_p16x16_t *ps_mb_hdr_p16x16 = (mb_hdr_p16x16_t *)ps_ent_ctxt->pv_mb_header_data; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* Encoding mb_skip */ ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_P_SLICE); @@ -1937,8 +1937,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) } ps_curr_ctxt->u1_mb_type = CAB_P; { - WORD16 *pi2_mv_ptr = (WORD16 *) pu1_byte; - byte_count += 4; + WORD16 *pi2_mv_ptr = (WORD16 *) ps_mb_hdr_p16x16->ai2_mv; + ps_curr_ctxt->u1_mb_type = (ps_curr_ctxt->u1_mb_type | CAB_NON_BD16x16); /* Encoding motion vector for P16x16 */ @@ -1960,6 +1960,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) /* Starting bitstream offset for residue */ bitstream_start_offset = bitstream_end_offset; + pu1_byte += sizeof(mb_hdr_p16x16_t); + } else/* MB = PSKIP */ { @@ -1978,6 +1980,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) - bitstream_start_offset; /* Starting bitstream offset for residue */ + pu1_byte += sizeof(mb_hdr_pskip_t); } if (cbp > 0) @@ -2002,7 +2005,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) } ps_curr_ctxt->u1_intrapred_chroma_mode = 0; ps_curr_ctxt->u1_cbp = 
cbp; - ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count; + ps_ent_ctxt->pv_mb_header_data = pu1_byte; + return IH264E_SUCCESS; } } @@ -2066,8 +2070,8 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode; WORD8 mb_qp_delta; UWORD32 u4_cbp_l, u4_cbp_c; - WORD32 byte_count = 0; UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data; if ((ps_bitstream->u4_strm_buf_offset + MIN_STREAM_SIZE_MB) >= ps_bitstream->u4_max_strm_size) @@ -2076,8 +2080,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) return (IH264E_BITSTREAM_BUFFER_OVERFLOW); } /* mb header info */ - mb_tpm = *pu1_byte++; - byte_count++; + mb_tpm = ps_mb_hdr->u1_mb_type_mode; /* mb type */ mb_type = mb_tpm & 0xF; @@ -2088,10 +2091,8 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) /* if Intra MB */ if (mb_type == I16x16 || mb_type == I4x4) { - cbp = *pu1_byte++; - byte_count++; - mb_qp_delta = *pu1_byte++; - byte_count++; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* Starting bitstream offset for header in bits */ bitstream_start_offset = GET_NUM_BITS(ps_bitstream); @@ -2138,9 +2139,10 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) } if (mb_type == I4x4) - { /* Intra 4x4 modes */ - ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, pu1_byte); - byte_count += 8; + { + /* Intra 4x4 modes */ + mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data; + ih264e_cabac_enc_4x4mb_modes(ps_cabac_ctxt, ps_mb_hdr_i4x4->au1_sub_blk_modes); } chroma_intra_mode = (mb_tpm >> 6); @@ -2206,13 +2208,15 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) if (mb_type == I16x16) { ps_curr_ctxt->u1_mb_type = CAB_I16x16; + pu1_byte += sizeof(mb_hdr_i16x16_t); } else { 
ps_curr_ctxt->u1_mb_type = CAB_I4x4; + pu1_byte += sizeof(mb_hdr_i4x4_t); } - ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count; + ps_ent_ctxt->pv_mb_header_data = pu1_byte; return IH264E_SUCCESS; } @@ -2224,10 +2228,9 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) /* Encoding B_Direct_16x16 */ if (mb_type == BDIRECT) { - cbp = *pu1_byte++; - byte_count++; - mb_qp_delta = *pu1_byte++; - byte_count++; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; + /* Encoding mb_skip */ ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE); @@ -2275,6 +2278,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) bitstream_start_offset = bitstream_end_offset; /* Starting bitstream offset for residue */ + pu1_byte += sizeof(mb_hdr_bdirect_t); } else if (mb_type == BSKIP)/* MB = BSKIP */ @@ -2293,17 +2297,18 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) - bitstream_start_offset; /* Starting bitstream offset for residue */ + pu1_byte += sizeof(mb_hdr_bskip_t); } else /* mbype is B_L0_16x16, B_L1_16x16 or B_Bi_16x16 */ { + mb_hdr_b16x16_t *ps_mb_hdr_b16x16 = (mb_hdr_b16x16_t *)ps_ent_ctxt->pv_mb_header_data; + WORD32 i4_mb_part_pred_mode = (mb_tpm >> 4); UWORD32 u4_mb_type = mb_type - B16x16 + B_L0_16x16 + i4_mb_part_pred_mode; - cbp = *pu1_byte++; - byte_count++; - mb_qp_delta = *pu1_byte++; - byte_count++; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* Encoding mb_skip */ ih264e_cabac_enc_mb_skip(0, ps_cabac_ctxt, MB_SKIP_FLAG_B_SLICE); @@ -2338,11 +2343,9 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) ps_curr_ctxt->u1_mb_type = CAB_NON_BD16x16; { - WORD16 *pi2_mv_ptr = (WORD16 *) pu1_byte; - /* Get the pred modes */ - - byte_count += 4 * (1 + (i4_mb_part_pred_mode == PRED_BI)); + WORD16 *pi2_mv_ptr = (WORD16 *) ps_mb_hdr_b16x16->ai2_mv; + /* Get the pred modes */ 
ps_curr_ctxt->u1_mb_type = (ps_curr_ctxt->u1_mb_type | CAB_NON_BD16x16); /* Encoding motion vector for B16x16 */ @@ -2364,6 +2367,8 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) - bitstream_start_offset; /* Starting bitstream offset for residue */ bitstream_start_offset = bitstream_end_offset; + + pu1_byte += sizeof(mb_hdr_b16x16_t); } if (cbp > 0) @@ -2388,7 +2393,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cabac(entropy_ctxt_t *ps_ent_ctxt) } ps_curr_ctxt->u1_intrapred_chroma_mode = 0; ps_curr_ctxt->u1_cbp = cbp; - ps_ent_ctxt->pv_mb_header_data = ((WORD8 *)ps_ent_ctxt->pv_mb_header_data) + byte_count; + ps_ent_ctxt->pv_mb_header_data = pu1_byte; return IH264E_SUCCESS; } } diff --git a/encoder/ih264e_cabac_init.c b/encoder/ih264e_cabac_init.c index 347842c..7407dcc 100644 --- a/encoder/ih264e_cabac_init.c +++ b/encoder/ih264e_cabac_init.c @@ -160,17 +160,13 @@ void ih264e_init_cabac_table(entropy_ctxt_t *ps_ent_ctxt) /* 0th entry of mb_map_ctxt_inc will be always be containing default values */ /* for CABAC context representing MB not available */ mb_info_ctxt_t *ps_def_ctxt = ps_cabac_ctxt->ps_mb_map_ctxt_inc - 1; - UWORD32 *pu4_temp; - WORD8 i; ps_def_ctxt->u1_mb_type = CAB_SKIP; ps_def_ctxt->u1_cbp = 0x0f; ps_def_ctxt->u1_intrapred_chroma_mode = 0; - pu4_temp = (UWORD32 *)ps_def_ctxt->i1_ref_idx; - pu4_temp[0] = 0; - pu4_temp = (UWORD32 *)ps_def_ctxt->u1_mv; - for (i = 0; i < 4; i++, pu4_temp++) - (*pu4_temp) = 0; + + memset(ps_def_ctxt->i1_ref_idx, 0, sizeof(ps_def_ctxt->i1_ref_idx)); + memset(ps_def_ctxt->u1_mv, 0, sizeof(ps_def_ctxt->u1_mv)); ps_cabac_ctxt->ps_def_ctxt_mb_info = ps_def_ctxt; } } diff --git a/encoder/ih264e_cavlc.c b/encoder/ih264e_cavlc.c index 7491480..ed34a43 100644 --- a/encoder/ih264e_cavlc.c +++ b/encoder/ih264e_cavlc.c @@ -959,6 +959,7 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* packed header data */ UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + mb_hdr_common_t 
*ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data; /* mb header info */ /* @@ -986,9 +987,9 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /********************************************************************/ /* mb header info */ - mb_tpm = *pu1_byte++; - cbp = *pu1_byte++; - mb_qp_delta = *pu1_byte++; + mb_tpm = ps_mb_hdr->u1_mb_type_mode; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* mb type */ mb_type = mb_tpm & 0xF; @@ -1009,9 +1010,13 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + + pu1_byte += sizeof(mb_hdr_i16x16_t); } else if (mb_type == I4x4) { + mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data; + /* mb sub blk modes */ WORD32 intra_pred_mode_flag, rem_intra_mode; WORD32 byte; @@ -1024,7 +1029,7 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) for (i = 0; i < 16; i += 2) { /* sub blk idx 1 */ - byte = *pu1_byte++; + byte = ps_mb_hdr_i4x4->au1_sub_blk_modes[i >> 1]; intra_pred_mode_flag = byte & 0x1; @@ -1056,11 +1061,14 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + + pu1_byte += sizeof(mb_hdr_i4x4_t); } else if (mb_type == I8x8) { /* transform 8x8 flag */ UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag; + mb_hdr_i8x8_t *ps_mb_hdr_i8x8 = (mb_hdr_i8x8_t *)ps_ent_ctxt->pv_mb_header_data; /* mb sub blk modes */ WORD32 intra_pred_mode_flag, rem_intra_mode; @@ -1080,7 +1088,7 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) for (i = 0; i < 4; i++) { /* sub blk idx 1 */ - byte = *pu1_byte++; + byte = ps_mb_hdr_i8x8->au1_sub_blk_modes[i >> 1]; intra_pred_mode_flag = byte & 0x1; @@ -1112,6 
+1120,8 @@ IH264E_ERROR_T ih264e_write_islice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + + pu1_byte += sizeof(mb_hdr_i8x8_t); } else { @@ -1181,6 +1191,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* packed header data */ UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + mb_hdr_common_t *ps_mb_hdr = (mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data; /* mb header info */ /* @@ -1211,7 +1222,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /********************************************************************/ /* mb header info */ - mb_tpm = *pu1_byte++; + mb_tpm = ps_mb_hdr->u1_mb_type_mode; /* mb type */ mb_type = mb_tpm & 0xF; @@ -1227,6 +1238,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) (*ps_ent_ctxt->pi4_mb_skip_run)++; /* store the index of the next mb syntax layer */ + pu1_byte += sizeof(mb_hdr_pskip_t); ps_ent_ctxt->pv_mb_header_data = pu1_byte; /* set nnz to zero */ @@ -1248,8 +1260,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) } /* remaining mb header info */ - cbp = *pu1_byte++; - mb_qp_delta = *pu1_byte++; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* mb skip run */ PUT_BITS_UEV(ps_bitstream, *ps_ent_ctxt->pi4_mb_skip_run, error_status, "mb skip run"); @@ -1278,9 +1290,12 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + pu1_byte += sizeof(mb_hdr_i16x16_t); } else if (mb_type == I4x4) { + mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data; + /* mb sub blk modes */ WORD32 intra_pred_mode_flag, rem_intra_mode; WORD32 byte; @@ -1296,7 +1311,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) for (i 
= 0; i < 16; i += 2) { /* sub blk idx 1 */ - byte = *pu1_byte++; + byte = ps_mb_hdr_i4x4->au1_sub_blk_modes[i >> 1]; intra_pred_mode_flag = byte & 0x1; @@ -1328,9 +1343,13 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + + pu1_byte += sizeof(mb_hdr_i4x4_t); } else if (mb_type == I8x8) { + mb_hdr_i8x8_t *ps_mb_hdr_i8x8 = (mb_hdr_i8x8_t *)ps_ent_ctxt->pv_mb_header_data; + /* transform 8x8 flag */ UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag; @@ -1355,7 +1374,7 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) for (i = 0; i < 4; i++) { /* sub blk idx 1 */ - byte = *pu1_byte++; + byte = ps_mb_hdr_i8x8->au1_sub_blk_modes[i >> 1]; intra_pred_mode_flag = byte & 0x1; @@ -1387,14 +1406,18 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + + pu1_byte += sizeof(mb_hdr_i8x8_t); } else { + mb_hdr_p16x16_t *ps_mb_hdr_p16x16 = (mb_hdr_p16x16_t *)ps_ent_ctxt->pv_mb_header_data; + /* inter macro block partition cnt */ const UWORD8 au1_part_cnt[] = { 1, 2, 2, 4 }; /* mv ptr */ - WORD16 *pi2_mv_ptr = (WORD16 *)pu1_byte; + WORD16 *pi2_mv_ptr = (WORD16 *)ps_mb_hdr_p16x16->ai2_mv; /* number of partitions for the current mb */ UWORD32 u4_part_cnt = au1_part_cnt[mb_type - 3]; @@ -1410,7 +1433,8 @@ IH264E_ERROR_T ih264e_write_pslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) PUT_BITS_SEV(ps_bitstream, *pi2_mv_ptr++, error_status, "mv y"); } - pu1_byte = (UWORD8 *)pi2_mv_ptr; + pu1_byte += sizeof(mb_hdr_p16x16_t); + } /* coded_block_pattern */ @@ -1479,6 +1503,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* packed header data */ UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + mb_hdr_common_t *ps_mb_hdr = 
(mb_hdr_common_t *)ps_ent_ctxt->pv_mb_header_data; /* mb header info */ /* @@ -1508,7 +1533,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* BEGIN HEADER GENERATION */ /********************************************************************/ - mb_tpm = *pu1_byte++; + mb_tpm = ps_mb_hdr->u1_mb_type_mode; /* mb type */ mb_type = mb_tpm & 0xF; @@ -1524,6 +1549,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) (*ps_ent_ctxt->pi4_mb_skip_run)++; /* store the index of the next mb syntax layer */ + pu1_byte += sizeof(mb_hdr_bskip_t); ps_ent_ctxt->pv_mb_header_data = pu1_byte; /* set nnz to zero */ @@ -1547,8 +1573,8 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* remaining mb header info */ - cbp = *pu1_byte++; - mb_qp_delta = *pu1_byte++; + cbp = ps_mb_hdr->u1_cbp; + mb_qp_delta = ps_mb_hdr->u1_mb_qp_delta; /* mb skip run */ PUT_BITS_UEV(ps_bitstream, *ps_ent_ctxt->pi4_mb_skip_run, error_status, "mb skip run"); @@ -1577,9 +1603,13 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + pu1_byte += sizeof(mb_hdr_i16x16_t); + } else if (mb_type == I4x4) { + mb_hdr_i4x4_t *ps_mb_hdr_i4x4 = (mb_hdr_i4x4_t *)ps_ent_ctxt->pv_mb_header_data; + /* mb sub blk modes */ WORD32 intra_pred_mode_flag, rem_intra_mode; WORD32 byte; @@ -1595,7 +1625,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) for (i = 0; i < 16; i += 2) { /* sub blk idx 1 */ - byte = *pu1_byte++; + byte = ps_mb_hdr_i4x4->au1_sub_blk_modes[i >> 1]; intra_pred_mode_flag = byte & 0x1; @@ -1627,9 +1657,13 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + pu1_byte += sizeof(mb_hdr_i4x4_t); + } else if (mb_type == I8x8) { + 
mb_hdr_i8x8_t *ps_mb_hdr_i8x8 = (mb_hdr_i8x8_t *)ps_ent_ctxt->pv_mb_header_data; + /* transform 8x8 flag */ UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag; @@ -1654,7 +1688,7 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) for (i = 0; i < 4; i++) { /* sub blk idx 1 */ - byte = *pu1_byte++; + byte = ps_mb_hdr_i8x8->au1_sub_blk_modes[i >> 1]; intra_pred_mode_flag = byte & 0x1; @@ -1686,21 +1720,24 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) /* intra_chroma_pred_mode */ PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + pu1_byte += sizeof(mb_hdr_i8x8_t); + } else if(mb_type == BDIRECT) { is_inter = 1; /* write mb type */ PUT_BITS_UEV(ps_bitstream, B_DIRECT_16x16, error_status, "mb type"); + pu1_byte += sizeof(mb_hdr_bdirect_t); + } else /* if mb_type == B16x16 */ { + mb_hdr_b16x16_t *ps_mb_hdr_b16x16 = (mb_hdr_b16x16_t *)ps_ent_ctxt->pv_mb_header_data; + /* inter macro block partition cnt for 16x16 16x8 8x16 8x8 */ const UWORD8 au1_part_cnt[] = { 1, 2, 2, 4 }; - /* mv ptr */ - WORD16 *pi2_mvd_ptr = (WORD16 *)pu1_byte; - /* number of partitions for the current mb */ UWORD32 u4_part_cnt = au1_part_cnt[mb_type - B16x16]; @@ -1718,21 +1755,17 @@ IH264E_ERROR_T ih264e_write_bslice_mb_cavlc(entropy_ctxt_t *ps_ent_ctxt) { if (i4_mb_part_pred_mode != PRED_L1)/* || PRED_BI */ { - PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l0 x"); - pi2_mvd_ptr++; - PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l0 y"); - pi2_mvd_ptr++; + PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[0][0], error_status, "mv l0 x"); + PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[0][1], error_status, "mv l0 y"); } if (i4_mb_part_pred_mode != PRED_L0)/* || PRED_BI */ { - PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l1 x"); - pi2_mvd_ptr++; - PUT_BITS_SEV(ps_bitstream, *pi2_mvd_ptr, error_status, "mv l1 y"); - pi2_mvd_ptr++; + 
PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[1][0], error_status, "mv l1 x"); + PUT_BITS_SEV(ps_bitstream, ps_mb_hdr_b16x16->ai2_mv[1][1], error_status, "mv l1 y"); } } - pu1_byte = (UWORD8 *)pi2_mvd_ptr; + pu1_byte += sizeof(mb_hdr_b16x16_t); } /* coded_block_pattern */ diff --git a/encoder/ih264e_defs.h b/encoder/ih264e_defs.h index aee270e..c4e7885 100644 --- a/encoder/ih264e_defs.h +++ b/encoder/ih264e_defs.h @@ -66,7 +66,8 @@ * Maximum width supported by codec */ -#define MAX_WD 1920 +/* changed by haining@ to support Nexus 6P screen size, was previously 1920 */ +#define MAX_WD 2560 /** * Minimum height supported by codec @@ -77,7 +78,8 @@ * Maximum height supported by codec */ -#define MAX_HT 1920 +/* changed by haining@ to support Nexus 6P screen size, was previously 1920 */ +#define MAX_HT 2560 /*****************************************************************************/ /* Padding sizes */ diff --git a/encoder/ih264e_encode_header.c b/encoder/ih264e_encode_header.c index 04bdc14..3626a63 100644 --- a/encoder/ih264e_encode_header.c +++ b/encoder/ih264e_encode_header.c @@ -1129,7 +1129,14 @@ IH264E_ERROR_T ih264e_populate_sps(codec_t *ps_codec, sps_t *ps_sps) } /* direct_8x8_inference_flag */ - ps_sps->i1_direct_8x8_inference_flag = 0; + if (ps_sps->u1_level_idc < IH264_LEVEL_30) + { + ps_sps->i1_direct_8x8_inference_flag = 0; + } + else + { + ps_sps->i1_direct_8x8_inference_flag = 1; + } /* cropping params */ /*NOTE : Cropping values depend on the chroma format diff --git a/encoder/ih264e_process.c b/encoder/ih264e_process.c index 796c983..5fb0b88 100644 --- a/encoder/ih264e_process.c +++ b/encoder/ih264e_process.c @@ -652,18 +652,19 @@ IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc) { /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + mb_hdr_i4x4_t *ps_mb_hdr = (mb_hdr_i4x4_t *)ps_proc->pv_mb_header_data; /* temp var */ WORD32 i4, byte; /* mb type plus mode */ - *pu1_ptr++ = 
(ps_proc->u1_c_i8_mode << 6) + u4_mb_type; + ps_mb_hdr->common.u1_mb_type_mode = (ps_proc->u1_c_i8_mode << 6) + u4_mb_type; /* cbp */ - *pu1_ptr++ = ps_proc->u4_cbp; + ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp; /* mb qp delta */ - *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; /* sub mb modes */ for (i4 = 0; i4 < 16; i4 ++) @@ -710,63 +711,66 @@ IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc) } } - *pu1_ptr++ = byte; + ps_mb_hdr->au1_sub_blk_modes[i4 >> 1] = byte; } /* end of mb layer */ + pu1_ptr += sizeof(mb_hdr_i4x4_t); ps_proc->pv_mb_header_data = pu1_ptr; } else if (u4_mb_type == I16x16) { /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + mb_hdr_i16x16_t *ps_mb_hdr = (mb_hdr_i16x16_t *)ps_proc->pv_mb_header_data; /* mb type plus mode */ - *pu1_ptr++ = (ps_proc->u1_c_i8_mode << 6) + (ps_proc->u1_l_i16_mode << 4) + u4_mb_type; + ps_mb_hdr->common.u1_mb_type_mode = (ps_proc->u1_c_i8_mode << 6) + (ps_proc->u1_l_i16_mode << 4) + u4_mb_type; /* cbp */ - *pu1_ptr++ = ps_proc->u4_cbp; + ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp; /* mb qp delta */ - *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; /* end of mb layer */ + pu1_ptr += sizeof(mb_hdr_i16x16_t); ps_proc->pv_mb_header_data = pu1_ptr; } else if (u4_mb_type == P16x16) { /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + mb_hdr_p16x16_t *ps_mb_hdr = (mb_hdr_p16x16_t *)ps_proc->pv_mb_header_data; - WORD16 *i2_mv_ptr; - - /* mb type plus mode */ - *pu1_ptr++ = u4_mb_type; + /* mb type */ + ps_mb_hdr->common.u1_mb_type_mode = u4_mb_type; /* cbp */ - *pu1_ptr++ = ps_proc->u4_cbp; + ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp; /* mb qp delta */ - *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; - - i2_mv_ptr = (WORD16 *)pu1_ptr; + 
ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; - *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx - ps_proc->ps_pred_mv[0].s_mv.i2_mvx; + ps_mb_hdr->ai2_mv[0] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx - ps_proc->ps_pred_mv[0].s_mv.i2_mvx; - *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy - ps_proc->ps_pred_mv[0].s_mv.i2_mvy; + ps_mb_hdr->ai2_mv[1] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy - ps_proc->ps_pred_mv[0].s_mv.i2_mvy; /* end of mb layer */ - ps_proc->pv_mb_header_data = i2_mv_ptr; + pu1_ptr += sizeof(mb_hdr_p16x16_t); + ps_proc->pv_mb_header_data = pu1_ptr; } else if (u4_mb_type == PSKIP) { /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + mb_hdr_pskip_t *ps_mb_hdr = (mb_hdr_pskip_t *)ps_proc->pv_mb_header_data; - /* mb type plus mode */ - *pu1_ptr++ = u4_mb_type; + /* mb type */ + ps_mb_hdr->common.u1_mb_type_mode = u4_mb_type; /* end of mb layer */ + pu1_ptr += sizeof(mb_hdr_pskip_t); ps_proc->pv_mb_header_data = pu1_ptr; } else if(u4_mb_type == B16x16) @@ -774,58 +778,59 @@ IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc) /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; - - WORD16 *i2_mv_ptr; + mb_hdr_b16x16_t *ps_mb_hdr = (mb_hdr_b16x16_t *)ps_proc->pv_mb_header_data; UWORD32 u4_pred_mode = ps_proc->ps_pu->b2_pred_mode; /* mb type plus mode */ - *pu1_ptr++ = (u4_pred_mode << 4) + u4_mb_type; + ps_mb_hdr->common.u1_mb_type_mode = (u4_pred_mode << 4) + u4_mb_type; /* cbp */ - *pu1_ptr++ = ps_proc->u4_cbp; + ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp; /* mb qp delta */ - *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; /* l0 & l1 me data */ - i2_mv_ptr = (WORD16 *)pu1_ptr; - if (u4_pred_mode != PRED_L1) { - *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx + ps_mb_hdr->ai2_mv[0][0] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvx 
- ps_proc->ps_pred_mv[0].s_mv.i2_mvx; - *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy + ps_mb_hdr->ai2_mv[0][1] = ps_proc->ps_pu->s_me_info[0].s_mv.i2_mvy - ps_proc->ps_pred_mv[0].s_mv.i2_mvy; } if (u4_pred_mode != PRED_L0) { - *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx + ps_mb_hdr->ai2_mv[1][0] = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvx - ps_proc->ps_pred_mv[1].s_mv.i2_mvx; - *i2_mv_ptr++ = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy + ps_mb_hdr->ai2_mv[1][1] = ps_proc->ps_pu->s_me_info[1].s_mv.i2_mvy - ps_proc->ps_pred_mv[1].s_mv.i2_mvy; } /* end of mb layer */ - ps_proc->pv_mb_header_data = i2_mv_ptr; + pu1_ptr += sizeof(mb_hdr_b16x16_t); + ps_proc->pv_mb_header_data = pu1_ptr; } else if(u4_mb_type == BDIRECT) { /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + mb_hdr_bdirect_t *ps_mb_hdr = (mb_hdr_bdirect_t *)ps_proc->pv_mb_header_data; /* mb type plus mode */ - *pu1_ptr++ = u4_mb_type; + ps_mb_hdr->common.u1_mb_type_mode = u4_mb_type; /* cbp */ - *pu1_ptr++ = ps_proc->u4_cbp; + ps_mb_hdr->common.u1_cbp = ps_proc->u4_cbp; /* mb qp delta */ - *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + ps_mb_hdr->common.u1_mb_qp_delta = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + /* end of mb layer */ + pu1_ptr += sizeof(mb_hdr_bdirect_t); ps_proc->pv_mb_header_data = pu1_ptr; } @@ -835,11 +840,13 @@ IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc) /* pointer to mb header storage space */ UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + mb_hdr_bskip_t *ps_mb_hdr = (mb_hdr_bskip_t *)ps_proc->pv_mb_header_data; /* mb type plus mode */ - *pu1_ptr++ = (u4_pred_mode << 4) + u4_mb_type; + ps_mb_hdr->common.u1_mb_type_mode = (u4_pred_mode << 4) + u4_mb_type; /* end of mb layer */ + pu1_ptr += sizeof(mb_hdr_bskip_t); ps_proc->pv_mb_header_data = pu1_ptr; } diff --git a/encoder/ih264e_structs.h b/encoder/ih264e_structs.h index 6cbce7c..125db84 100644 --- a/encoder/ih264e_structs.h +++ 
b/encoder/ih264e_structs.h @@ -1151,6 +1151,184 @@ typedef struct /** ****************************************************************************** +* @brief mb_hdr structures to access first few common elements of above +* structures +****************************************************************************** +*/ + +typedef struct +{ + /** + * mb type and mode + */ + UWORD8 u1_mb_type_mode; + + /** + * CBP + */ + UWORD8 u1_cbp; + + /** + * MB qp delta + */ + UWORD8 u1_mb_qp_delta; + + /** + * Element to align structure to 2 byte boundary + */ + UWORD8 u1_pad; +}mb_hdr_common_t; + +/** +****************************************************************************** +* @brief macro block info for I4x4 MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + + /** + * Sub block modes, 2 modes per byte + */ + UWORD8 au1_sub_blk_modes[8]; +}mb_hdr_i4x4_t; + +/** +****************************************************************************** +* @brief macro block info for I8x8 MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + + + /** + * Sub block modes, 2 modes per byte + */ + UWORD8 au1_sub_blk_modes[2]; +}mb_hdr_i8x8_t; + +/** +****************************************************************************** +* @brief macro block info for I16x16 MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + +}mb_hdr_i16x16_t; + +/** +****************************************************************************** +* @brief macro block info for P16x16 MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + + /** 
+ * MV + */ + WORD16 ai2_mv[2]; +}mb_hdr_p16x16_t; + +/** +****************************************************************************** +* @brief macro block info for PSKIP MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + +}mb_hdr_pskip_t; + +/** +****************************************************************************** +* @brief macro block info for B16x16 MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + + + /** + * MV + */ + WORD16 ai2_mv[2][2]; +}mb_hdr_b16x16_t; + +/** +****************************************************************************** +* @brief macro block info for BDIRECT MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + +}mb_hdr_bdirect_t; + +/** +****************************************************************************** +* @brief macro block info for BSKIP MB +****************************************************************************** +*/ +typedef struct +{ + /** + * Common MB header params + */ + mb_hdr_common_t common; + +}mb_hdr_bskip_t; + +/** +****************************************************************************** +* @brief Union of mb_hdr structures for size calculation +* and to access first few common elements +****************************************************************************** +*/ + +typedef union +{ + mb_hdr_i4x4_t mb_hdr_i4x4; + mb_hdr_i8x8_t mb_hdr_i8x8; + mb_hdr_i16x16_t mb_hdr_i16x16; + mb_hdr_p16x16_t mb_hdr_p16x16; + mb_hdr_pskip_t mb_hdr_pskip; + mb_hdr_b16x16_t mb_hdr_b16x16; + mb_hdr_bdirect_t mb_hdr_bdirect; + mb_hdr_bskip_t mb_hdr_bskip; +}mb_hdr_t; +/** 
+****************************************************************************** * @brief structure presenting the neighbor availability of a mb * or subblk or any other partition ****************************************************************************** diff --git a/encoder/irc_rate_control_api.c b/encoder/irc_rate_control_api.c index 95befce..4a64645 100644 --- a/encoder/irc_rate_control_api.c +++ b/encoder/irc_rate_control_api.c @@ -756,6 +756,16 @@ void irc_update_frame_level_info(rate_control_api_t *ps_rate_control_api, { u1_is_scd = 0; } + /* For frames that contain plane areas that differ from reference frames, encoder + * might generate more INTRA MBs because of lower SAD compared with INTER MBs. + * Such cases should not be treated as scene change. + * For such frames bits consumed will be lesser than the allocated bits. + */ + if(i4_total_frame_bits < ps_rate_control_api->i4_prev_frm_est_bits) + { + u1_is_scd = 0; + } + trace_printf((const WORD8*)"i4_total_frame_bits %d\n", i4_total_frame_bits); if(!i4_is_it_a_skip && !i4_is_pic_handling_done) diff --git a/test/decoder.mk b/test/decoder.mk index 1a49a92..0dda948 100644 --- a/test/decoder.mk +++ b/test/decoder.mk @@ -9,5 +9,5 @@ LOCAL_CFLAGS := -DPROFILE_ENABLE -DARM -DMD5_DISABLE -fPIC LOCAL_C_INCLUDES += $(LOCAL_PATH)/../decoder $(LOCAL_PATH)/../common $(LOCAL_PATH)/decoder/ LOCAL_SRC_FILES := decoder/main.c LOCAL_STATIC_LIBRARIES := libavcdec - +LOCAL_SHARED_LIBRARIES := liblog include $(BUILD_EXECUTABLE) diff --git a/test/decoder/dec.cfg b/test/decoder/dec.cfg new file mode 100644 index 0000000..f452ea1 --- /dev/null +++ b/test/decoder/dec.cfg @@ -0,0 +1,12 @@ +--input input.h264 +--save_output 0 +--num_frames -1 +--output out.yuv +--chroma_format YUV_420P +--share_display_buf 0 +--num_cores 3 +--loopback 0 +--display 0 +--fps 59.94 +--arch ARM_A9Q +--soc GENERIC diff --git a/test/encoder/enc.cfg b/test/encoder/enc.cfg new file mode 100644 index 0000000..ba62199 --- /dev/null +++ 
b/test/encoder/enc.cfg @@ -0,0 +1,47 @@ +--input input_qvga.yuv +--output output.264 +--recon recon.yuv +--chksum chksum.md5 +--chksum_enable 0 +--recon_enable 0 +--input_chroma_format YUV_420P +--recon_chroma_format YUV_420P +--qp_i 24 +--qp_p 27 +--qp_b 29 +--qp_i_min 4 +--qp_i_max 49 +--qp_p_min 4 +--qp_p_max 49 +--qp_b_min 4 +--qp_b_max 49 +--max_wd 1920 +--max_ht 1080 +--psnr 0 +--slice 0 +--slice_param 0 +--num_frames -1 +--search_range_x 16 +--search_range_y 16 +--width 320 +--height 240 +--src_framerate 30 +--tgt_framerate 30 +--num_cores 4 +--rc 2 +--bitrate 256000 +--vbv_delay 1000 +--disable_deblock_level 0 +--intra_4x4_enable 1 +--i_interval 1000 +--me_speed 100 +--hpel 1 +--fast_sad 0 +--speed NORMAL +--max_level 41 +--idr_interval 1000 +--entropy 0 +--bframes 0 +--adaptive_intra_refresh 0 +--air_refresh_period 30 + |