diff options
Diffstat (limited to 'common/armv8/ih264_padding_neon_av8.s')
-rw-r--r-- | common/armv8/ih264_padding_neon_av8.s | 120 |
1 files changed, 36 insertions, 84 deletions
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s index 35d9c8a..e03fe2f 100644 --- a/common/armv8/ih264_padding_neon_av8.s +++ b/common/armv8/ih264_padding_neon_av8.s @@ -76,9 +76,9 @@ // WORD32 pad_size) //**************Variables Vs Registers************************* // x0 => *pu1_src -// x1 => src_strd -// x2 => wd -// x3 => pad_size +// w1 => src_strd +// w2 => wd +// w3 => pad_size .global ih264_pad_top_av8 @@ -86,25 +86,25 @@ ih264_pad_top_av8: // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 stp x19, x20, [sp, #-16]! sub x5, x0, x1 - sub x20, x1, #0 - neg x6, x20 + neg x6, x1 loop_neon_memcpy_mul_16: // Load 16 bytes ld1 {v0.8b, v1.8b}, [x0], #16 mov x4, x5 - mov x7, x3 + mov w7, w3 add x5, x5, #16 loop_neon_pad_top: st1 {v0.8b, v1.8b}, [x4], x6 - subs x7, x7, #1 + subs w7, w7, #1 bne loop_neon_pad_top - subs x2, x2, #16 + subs w2, w2, #16 bne loop_neon_memcpy_mul_16 // LDMFD sp!,{x4-x11,pc} //Reload the registers from SP @@ -160,9 +160,9 @@ loop_neon_pad_top: // WORD32 pad_size) //**************Variables Vs Registers************************* // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -172,6 +172,8 @@ ih264_pad_left_luma_av8: // STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! @@ -182,43 +184,35 @@ ih264_pad_left_luma_av8: loop_16: // /*hard coded for width=16 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 st1 {v2.16b}, [x4], x1 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 dup v4.16b, w10 dup v6.16b, w11 st1 {v4.16b}, [x4], x1 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], x1 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], x1 // 16 bytes store dup v4.16b, w10 dup v6.16b, w11 - subs x2, x2, #8 + subs w2, w2, #8 st1 {v4.16b}, [x4], x1 // 16 bytes store st1 {v6.16b}, [x4], x1 // 16 bytes store bne loop_16 @@ -227,14 +221,11 @@ loop_16: // /*hard coded for width=16 ,height = loop_32: // /*hard coded for width=32 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 @@ -243,35 +234,30 @@ loop_32: // /*hard coded for width=32 ,height =8 st1 {v2.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], #16 // 16 bytes store dup v0.16b, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 st1 {v6.16b}, [x4], x6 // 16 bytes store ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.16b, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #8 + subs w2, w2, #8 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store bne loop_32 @@ -333,9 +319,9 @@ end_func: // WORD32 pad_size) //{ // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! sub x4, x0, x3 @@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8: loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -383,27 +367,23 @@ loop_32_l_c: // /*hard coded for width=32 ,height = ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -412,20 +392,16 @@ loop_32_l_c: // /*hard coded for width=32 ,height = ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store @@ -500,9 +476,9 @@ end_func_l_c: //} // // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -512,6 +488,8 @@ ih264_pad_right_luma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x4, x0 @@ -522,43 +500,35 @@ ih264_pad_right_luma_av8: loop_16_r: // /*hard coded for width=16 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 st1 {v2.16b}, [x4], x1 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 dup v4.16b, w10 dup v6.16b, w11 st1 {v4.16b}, [x4], x1 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], x1 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], x1 // 16 bytes store dup v2.16b, w9 ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], x1 // 16 bytes store dup v4.16b, w10 dup v6.16b, w11 - subs x2, x2, #8 + subs w2, w2, #8 st1 {v4.16b}, [x4], x1 // 16 bytes store st1 {v6.16b}, [x4], x1 // 16 bytes store bne loop_16_r @@ -567,14 +537,11 @@ loop_16_r: // /*hard coded for width=16 ,height =8,16*/ loop_32_r: // /*hard coded for width=32 ,height =8,16*/ ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 @@ -583,35 +550,30 @@ loop_32_r: // /*hard coded for width=32 ,height = st1 {v2.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store ldrb w8, [x0] add x0, x0, x1 - sxtw x8, w8 st1 {v6.16b}, [x4], #16 // 16 bytes store ldrb w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.16b, w8 st1 {v6.16b}, [x4], x6 // 16 bytes store ldrb w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.16b, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrb w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.16b, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.16b, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #8 + subs w2, w2, #8 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store bne loop_32_r @@ -672,9 +634,9 @@ end_func_r: // WORD32 ht, // WORD32 pad_size) // x0 => *pu1_src -// x1 => src_strd -// x2 => ht -// x3 => pad_size +// w1 => src_strd +// w2 => ht +// w3 => pad_size @@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8: // STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments push_v_regs + sxtw x1, w1 + sxtw x3, w3 stp x19, x20, [sp, #-16]! mov x4, x0 @@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8: loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 dup v0.8h, w8 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store @@ -720,27 +680,23 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 dup v0.8h, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store st1 {v4.16b}, [x4], #16 // 16 bytes store dup v6.8h, w11 st1 {v4.16b}, [x4], x6 // 16 bytes store - subs x2, x2, #4 + subs w2, w2, #4 st1 {v6.16b}, [x4], #16 // 16 bytes store st1 {v6.16b}, [x4], x6 // 16 bytes store @@ -748,20 +704,16 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/ bne loop_32_r_c ldrh w8, [x0] add x0, x0, x1 - sxtw x8, w8 dup v0.8h, w8 ldrh w9, [x0] add x0, x0, x1 - sxtw x9, w9 ldrh w10, [x0] add x0, x0, x1 - sxtw x10, w10 st1 {v0.16b}, [x4], #16 // 16 bytes store dup v2.8h, w9 st1 {v0.16b}, [x4], x6 // 16 bytes store ldrh w11, [x0] add x0, x0, x1 - sxtw x11, w11 st1 {v2.16b}, [x4], #16 // 16 bytes store dup v4.8h, w10 st1 {v2.16b}, [x4], x6 // 16 bytes store |