1 file changed, 36 insertions, 84 deletions
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s
index 35d9c8a..e03fe2f 100644
--- a/common/armv8/ih264_padding_neon_av8.s
+++ b/common/armv8/ih264_padding_neon_av8.s
@@ -76,9 +76,9 @@
 //                   WORD32 pad_size)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => wd
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => wd
+//    w3 => pad_size
 
     .global ih264_pad_top_av8
 
@@ -86,25 +86,25 @@ ih264_pad_top_av8:
 
     // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
     stp       x19, x20, [sp, #-16]!
 
     sub       x5, x0, x1
-    sub       x20, x1, #0
-    neg       x6, x20
+    neg       x6, x1
 
 loop_neon_memcpy_mul_16:
     // Load 16 bytes
     ld1       {v0.8b, v1.8b}, [x0], #16
     mov       x4, x5
-    mov       x7, x3
+    mov       w7, w3
     add       x5, x5, #16
 
 loop_neon_pad_top:
     st1       {v0.8b, v1.8b}, [x4], x6
-    subs      x7, x7, #1
+    subs      w7, w7, #1
     bne       loop_neon_pad_top
 
-    subs      x2, x2, #16
+    subs      w2, w2, #16
     bne       loop_neon_memcpy_mul_16
 
     // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
@@ -160,9 +160,9 @@ loop_neon_pad_top:
 //                        WORD32 pad_size)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -172,6 +172,8 @@ ih264_pad_left_luma_av8:
 
     // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
 
@@ -182,43 +184,35 @@ ih264_pad_left_luma_av8:
 loop_16:                                //  /*hard coded for width=16  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     dup       v4.16b, w10
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     dup       v4.16b, w10
     dup       v6.16b, w11
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     bne       loop_16
@@ -227,14 +221,11 @@ loop_16:                                //  /*hard coded for width=16  ,height =
 loop_32:                                //  /*hard coded for width=32 ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6
@@ -243,35 +234,30 @@ loop_32:                                //  /*hard coded for width=32 ,height =8
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     dup       v0.16b, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.16b, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     bne       loop_32
@@ -333,9 +319,9 @@ end_func:
 //                            WORD32 pad_size)
 //{
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8:
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     sub       x4, x0, x3
@@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8:
 loop_32_l_c:                            //  /*hard coded for width=32  ,height =4,8,12*/
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -383,27 +367,23 @@ loop_32_l_c:                            //  /*hard coded for width=32  ,height =
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -412,20 +392,16 @@ loop_32_l_c:                            //  /*hard coded for width=32  ,height =
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
@@ -500,9 +476,9 @@ end_func_l_c:
 //}
 //
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -512,6 +488,8 @@ ih264_pad_right_luma_av8:
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     mov       x4, x0
@@ -522,43 +500,35 @@ ih264_pad_right_luma_av8:
 loop_16_r: //  /*hard coded for width=16  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     dup       v4.16b, w10
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     dup       v4.16b, w10
     dup       v6.16b, w11
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     bne       loop_16_r
@@ -567,14 +537,11 @@ loop_16_r: //  /*hard coded for width=16  ,height =8,16*/
 loop_32_r:                              //  /*hard coded for width=32  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6
@@ -583,35 +550,30 @@ loop_32_r:                              //  /*hard coded for width=32  ,height =
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.16b, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     bne       loop_32_r
@@ -672,9 +634,9 @@ end_func_r:
 //                        WORD32 ht,
 //                        WORD32 pad_size)
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8:
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     mov       x4, x0
@@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8:
 loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
@@ -720,27 +680,23 @@ loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     dup       v0.8h, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -748,20 +704,16 @@ loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
     bne       loop_32_r_c
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     dup       v0.8h, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store