30 files changed, 569 insertions, 620 deletions
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
index 6681a7c..e2b8c99 100644
--- a/common/arm/ih264_inter_pred_chroma_a9q.s
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -91,8 +91,8 @@
 @                             UWORD8 *pu1_dst,
 @                             WORD32 src_strd,
 @                             WORD32 dst_strd,
-@                             UWORD8 u1_dx,
-@                             UWORD8 u1_dy,
+@                             WORD32 u1_dx,
+@                             WORD32 u1_dy,
 @                             WORD32 ht,
 @                             WORD32 wd)
 @**************Variables Vs Registers*****************************************
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
index 0dd82f3..7597444 100644
--- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -413,7 +413,7 @@ scrlbl1:
 
     add           r7, r0, r4, lsl #3
     sub           r0, r7, r4, lsl #1
-    rsb           lr, r4, #0x0
+    neg           lr, r4
 
     vpadd.s16     d0, d0, d1
 
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
index 39ad9b3..b9595d7 100644
--- a/common/arm/ih264_mem_fns_neon.s
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -68,7 +68,7 @@
 @*
 @void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
 @                    UWORD8 *pu1_src,
-@                   UWORD8 num_bytes)
+@                   UWORD32 num_bytes)
 @**************Variables Vs Registers*************************
 @   r0 => *pu1_dst
 @   r1 => *pu1_src
@@ -97,7 +97,7 @@ loop_neon_memcpy_mul_8:
 @*
 @void ih264_memcpy(UWORD8 *pu1_dst,
 @                  UWORD8 *pu1_src,
-@                  UWORD8 num_bytes)
+@                  UWORD32 num_bytes)
 @**************Variables Vs Registers*************************
 @   r0 => *pu1_dst
 @   r1 => *pu1_src
@@ -135,7 +135,7 @@ loop_memcpy:
 
 @void ih264_memset_mul_8(UWORD8 *pu1_dst,
 @                       UWORD8 value,
-@                       UWORD8 num_bytes)
+@                       UWORD32 num_bytes)
 @**************Variables Vs Registers*************************
 @   r0 => *pu1_dst
 @   r1 => value
@@ -202,7 +202,7 @@ loop_memset:
 
 @void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
 @                                   UWORD16 value,
-@                                   UWORD8 num_words)
+@                                   UWORD32 num_words)
 @**************Variables Vs Registers*************************
 @   r0 => *pu2_dst
 @   r1 => value
@@ -234,7 +234,7 @@ loop_memset_16bit_mul_8:
 
 @void ih264_memset_16bit(UWORD16 *pu2_dst,
 @                       UWORD16 value,
-@                       UWORD8 num_words)
+@                       UWORD32 num_words)
 @**************Variables Vs Registers*************************
 @   r0 => *pu2_dst
 @   r1 => value
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
index e7a1f91..819b0b3 100644
--- a/common/arm/ih264_padding_neon.s
+++ b/common/arm/ih264_padding_neon.s
@@ -88,7 +88,7 @@ ih264_pad_top_a9q:
     stmfd         sp!, {r4-r11, lr}     @stack stores the values of the arguments
 
     sub           r5, r0, r1
-    rsb           r6, r1, #0
+    neg           r6, r1
 
 loop_neon_memcpy_mul_16:
     @ Load 16 bytes
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
index 33859e6..304bd8a 100644
--- a/common/arm/ih264_weighted_bi_pred_a9q.s
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -144,7 +144,7 @@ ih264_weighted_bi_pred_luma_a9q:
     ldr           r4, [sp, #40]         @Load src_strd2 in r4
     ldr           r5, [sp, #44]         @Load dst_strd in r5
     sxtb          r9, r9                @sign-extend 8-bit ofst1 to 32-bit
-    rsb           r10, r6, #0           @r13 = -(log_wd + 1)
+    neg           r10, r6               @r10 = -(log_wd + 1)
     ldr           r11, [sp, #68]        @Load ht in r11
     ldr           r12, [sp, #72]        @Load wd in r12
     vdup.16       q0, r10               @Q0  = -(log_wd + 1) (32-bit)
@@ -456,7 +456,7 @@ ih264_weighted_bi_pred_chroma_a9q:
     ldr           r9, [sp, #60]         @Load ofst1 in r9
     ldr           r10, [sp, #64]        @Load ofst2 in r10
 
-    rsb           r12, r6, #0           @r12 = -(log_wd + 1)
+    neg           r12, r6               @r12 = -(log_wd + 1)
     ldr           r4, [sp, #40]         @Load src_strd2 in r4
     ldr           r5, [sp, #44]         @Load dst_strd in r5
     vdup.16       q0, r12               @Q0  = -(log_wd + 1) (16-bit)
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
index 81d26d4..80c2c6d 100644
--- a/common/arm/ih264_weighted_pred_a9q.s
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -122,7 +122,7 @@ ih264_weighted_pred_luma_a9q:
     vpush         {d8-d15}
 
     vdup.16       d2, r5                @D2 = wt (16-bit)
-    rsb           r9, r4, #0            @r9 = -log_wd
+    neg           r9, r4                @r9 = -log_wd
     vdup.8        d3, r6                @D3 = ofst (8-bit)
     cmp           r8, #16               @check if wd is 16
     vdup.16       q0, r9                @Q0 = -log_wd (16-bit)
@@ -349,7 +349,7 @@ ih264_weighted_pred_chroma_a9q:
     ldr           r6, [sp, #36]         @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
     ldr           r8, [sp, #44]         @Load wd
 
-    rsb           r9, r4, #0            @r9 = -log_wd
+    neg           r9, r4                @r9 = -log_wd
     vdup.32       q1, r5                @Q1 = {wt_u (16-bit), wt_v (16-bit)}
     ldr           r7, [sp, #40]         @Load ht
     vpush         {d8-d15}
diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s
index a4dbd23..b7f2d58 100644
--- a/common/armv8/ih264_deblk_chroma_av8.s
+++ b/common/armv8/ih264_deblk_chroma_av8.s
@@ -56,19 +56,19 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
 //* @returns
@@ -87,6 +87,7 @@ ih264_deblk_chroma_horz_bs4_av8:
     // STMFD sp!,{x4-x6,x14}            //
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
     mov       x6, x5
     mov       x5, x4
     sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixel pointing to p1 of chroma
@@ -155,19 +156,19 @@ ih264_deblk_chroma_horz_bs4_av8:
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
 //* @returns
@@ -186,12 +187,13 @@ ih264_deblk_chroma_vert_bs4_av8:
     // STMFD sp!,{x4,x5,x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
 
     sub       x0, x0, #4                //point x0 to p1u of row0.
     mov       x12, x0                   //keep a back up of x0 for buffer write
 
-    add       x2, x2, x4, lsl #8        //x2 = (alpha_cr,alpha_cb)
-    add       x3, x3, x5, lsl #8        //x3 = (beta_cr,beta_cb)
+    add       w2, w2, w4, lsl #8        //w2 = (alpha_cr,alpha_cb)
+    add       w3, w3, w5, lsl #8        //w3 = (beta_cr,beta_cb)
 
     ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
     ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
@@ -292,28 +294,28 @@ ih264_deblk_chroma_vert_bs4_av8:
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
-//* @param[in] sp(8) - u4_bs
+//* @param[in] w6 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(12) - pu1_cliptab_cb
+//* @param[in] x7 - pu1_cliptab_cb
 //*    tc0_table for U
 //*
-//* @param[in] sp(16) - pu1_cliptab_cr
+//* @param[in] sp(0) - pu1_cliptab_cr
 //*    tc0_table for V
 //*
 //* @returns
@@ -332,14 +334,13 @@ ih264_deblk_chroma_horz_bslt4_av8:
     // STMFD sp!,{x4-x9,x14}        //
     push_v_regs
     stp       x19, x20, [sp, #-16]!
-    mov       x8, x7
-    mov       x7, x6
-    ldr       x9, [sp, #80]
+    sxtw      x1, w1
+    ldr       x8, [sp, #80]
     sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixelU pointing to p1 of chroma U
-    rev       w7, w7                    //
-    mov       v12.s[0], w7              //D12[0] = ui_Bs
-    ld1       {v16.s}[0], [x8]          //D16[0] contains cliptab_cb
-    ld1       {v17.s}[0], [x9]          //D17[0] contains cliptab_cr
+    rev       w6, w6                    //
+    mov       v12.s[0], w6              //D12[0] = ui_Bs
+    ld1       {v16.s}[0], [x7]          //D16[0] contains cliptab_cb
+    ld1       {v17.s}[0], [x8]          //D17[0] contains cliptab_cr
     ld2       {v6.8b, v7.8b}, [x0], x1  //Q3=p1
     tbl       v14.8b, {v16.16b}, v12.8b //Retreiving cliptab values for U
     tbl       v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V
@@ -428,28 +429,28 @@ ih264_deblk_chroma_horz_bslt4_av8:
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
 //*  Alpha Value for the boundary in U
 //*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
 //*  Beta Value for the boundary in U
 //*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
 //*    Alpha Value for the boundary in V
 //*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
 //*    Beta Value for the boundary in V
 //*
-//* @param[in] sp(8) - u4_bs
+//* @param[in] w6 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(12) - pu1_cliptab_cb
+//* @param[in] x7 - pu1_cliptab_cb
 //*    tc0_table for U
 //*
-//* @param[in] sp(16) - pu1_cliptab_cr
+//* @param[in] sp(0) - pu1_cliptab_cr
 //*    tc0_table for V
 //*
 //* @returns
@@ -468,11 +469,12 @@ ih264_deblk_chroma_vert_bslt4_av8:
     // STMFD sp!,{x4-x7,x10-x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
     mov       x10, x7
-    ldr       x11, [sp, #80]            //x6 = u4_bs
+    ldr       x11, [sp, #80]            //x11 = pu1_cliptab_cr
     sub       x0, x0, #4                //point x0 to p1u of row0.
-    add       x2, x2, x4, lsl #8
-    add       x3, x3, x5, lsl #8
+    add       w2, w2, w4, lsl #8
+    add       w3, w3, w5, lsl #8
     mov       x12, x0                   //keep a back up of x0 for buffer write
     ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
     ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s
index 1b3950d..7705df2 100644
--- a/common/armv8/ih264_deblk_luma_av8.s
+++ b/common/armv8/ih264_deblk_luma_av8.s
@@ -60,19 +60,19 @@
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
-//* @param[in] sp(0) - u4_bs
+//* @param[in] w4 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(4) - pu1_cliptab
+//* @param[in] x5 - pu1_cliptab
 //*    tc0_table
 //*
 //* @returns
@@ -90,6 +90,7 @@ ih264_deblk_luma_horz_bslt4_av8:
 
     // STMFD sp!,{x4-x7,x14}
     push_v_regs
+    sxtw      x1, w1
     stp       x19, x20, [sp, #-16]!
 
     //LDRD            x4,x5,[SP,#0x14]        //x4 = ui_Bs , x5 = *puc_ClpTab
@@ -214,13 +215,13 @@ ih264_deblk_luma_horz_bslt4_av8:
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
 //* @returns
@@ -240,6 +241,7 @@ ih264_deblk_luma_horz_bs4_av8:
     // STMFD sp!,{x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
 
     // Init
     dup       v0.16b, w2                //duplicate alpha
@@ -401,19 +403,19 @@ ih264_deblk_luma_horz_bs4_av8:
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
-//* @param[in] sp(0) - u4_bs
+//* @param[in] w4 - u4_bs
 //*    Packed Boundary strength array
 //*
-//* @param[in] sp(4) - pu1_cliptab
+//* @param[in] x5 - pu1_cliptab
 //*    tc0_table
 //*
 //* @returns
@@ -432,6 +434,7 @@ ih264_deblk_luma_vert_bslt4_av8:
     // STMFD sp!,{x12,x14}
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x1, w1
 
     sub       x0, x0, #4                //pointer uc_edgePixel-4
     mov       x12, x4
@@ -743,13 +746,13 @@ ih264_deblk_luma_vert_bslt4_av8:
 //* @param[in] x0 - pu1_src
 //*  Pointer to the src sample q0
 //*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
 //*  Source stride
 //*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
 //*  Alpha Value for the boundary
 //*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
 //*  Beta Value for the boundary
 //*
 //* @returns
diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s
index 6823015..d10047e 100644
--- a/common/armv8/ih264_default_weighted_pred_av8.s
+++ b/common/armv8/ih264_default_weighted_pred_av8.s
@@ -88,18 +88,18 @@
 //                                          WORD32 src_strd1,
 //                                          WORD32 src_strd2,
 //                                          WORD32 dst_strd,
-//                                          UWORD8 ht,
-//                                          UWORD8 wd)
+//                                          WORD32 ht,
+//                                          WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => ht        (x6)
-//    [sp+12] => wd        (x7)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => ht
+//    w7      => wd
 //
 .text
 .p2align 2
@@ -113,6 +113,9 @@ ih264_default_weighted_pred_luma_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     cmp       w7, #16
     beq       loop_16                   //branch if wd is 16
     cmp       w7, #8
@@ -263,18 +266,18 @@ end_loops:
 //                                            WORD32 src_strd1,
 //                                            WORD32 src_strd2,
 //                                            WORD32 dst_strd,
-//                                            UWORD8 ht,
-//                                            UWORD8 wd)
+//                                            WORD32 ht,
+//                                            WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => ht        (x6)
-//    [sp+12] => wd        (x7)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => ht
+//    w7      => wd
 //
 
 
@@ -286,6 +289,9 @@ ih264_default_weighted_pred_chroma_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     cmp       w7, #8
     beq       loop_8_uv                 //branch if wd is 8
     cmp       w7, #4
diff --git a/common/armv8/ih264_inter_pred_chroma_av8.s b/common/armv8/ih264_inter_pred_chroma_av8.s
index 714e271..f6aef40 100644
--- a/common/armv8/ih264_inter_pred_chroma_av8.s
+++ b/common/armv8/ih264_inter_pred_chroma_av8.s
@@ -91,19 +91,19 @@
 //                             UWORD8 *pu1_dst,
 //                             WORD32 src_strd,
 //                             WORD32 dst_strd,
-//                             UWORD8 u1_dx,
-//                             UWORD8 u1_dy,
+//                             WORD32 u1_dx,
+//                             WORD32 u1_dy,
 //                             WORD32 ht,
 //                             WORD32 wd)
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  u1_dx
-//   x5 =>  u1_dy
-//    x6 =>  height
-//    x7 => width
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  u1_dx
+//    w5 =>  u1_dy
+//    w6 =>  height
+//    w7 =>  width
 //
 .text
 .p2align 2
@@ -120,6 +120,12 @@ ih264_inter_pred_chroma_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
+    sxtw      x6, w6
+    sxtw      x7, w7
 
 
 
diff --git a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
index 6ad463a..e7c9f86 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
@@ -89,10 +89,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 .text
 .p2align 2
@@ -111,6 +111,10 @@ ih264_inter_pred_luma_horz_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     sub       x0, x0, #2                //pu1_src-2
     sub       x14, x4, #16
     movi      v0.8b, #5                 //filter coeff
diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
index 9564f99..711d73e 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
@@ -89,10 +89,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 .text
 .p2align 2
@@ -108,6 +108,10 @@ ih264_inter_pred_luma_vert_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
 
diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s
index 1a76c1c..007df30 100644
--- a/common/armv8/ih264_inter_pred_luma_copy_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s
@@ -65,10 +65,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x7 =>  ht
-//    x12 => wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 .text
 .p2align 2
@@ -82,6 +82,10 @@ ih264_inter_pred_luma_copy_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     mov       x12, x5
     mov       x7, x4
@@ -228,14 +232,16 @@ end_inner_loop_wd_16:
 // Register Usage
 // x0 : pi2_src
 // x1 : pu1_out
-// x2 : src_strd
-// x3 : out_strd
+// w2 : src_strd
+// w3 : out_strd
 // Neon registers d0-d7, d16-d30 are used
 // No need for pushing  arm and neon registers
 
     .global ih264_interleave_copy_av8
 ih264_interleave_copy_av8:
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     ld1       {v2.8b}, [x0], x2         //load src plane 1 => d2 &pred palne 2 => d3
     ld1       {v3.8b}, [x0], x2
     mov       v2.d[1], v3.d[0]
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
index d2897b6..dd4383e 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -52,10 +52,10 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
 
 
 .text
@@ -71,6 +71,10 @@ ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
     //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
     sub       x0, x0, #2                //pu1_src-2
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
index 546c807..3563ac0 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -105,12 +105,12 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//    x7 =>  dydx
-//    x9 => *pu1_tmp
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    x6 => *pu1_tmp
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -126,6 +126,10 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
     // store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
 
 
@@ -134,7 +138,8 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
 
     mov       x9, x6
 
-    lsr       x7, x7, #3                // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
+                                        // by writing to w7 here, we clear the upper half of x7
+    lsr       w7, w7, #3                // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
 
     add       x7, x7, #2
     mov       x6, #48
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
index 39e3253..38268c7 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
@@ -94,11 +94,11 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//   x7 =>  dydx
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -114,6 +114,10 @@ ih264_inter_pred_luma_horz_qpel_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
 
     and       x7, x7, #3                //Finds x-offset
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
index 3f3e297..6ccf11f 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
@@ -105,12 +105,12 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//    x6 =>  dydx
-//    x9 => *pu1_tmp
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    x6 => *pu1_tmp
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -125,11 +125,15 @@ ih264_inter_pred_luma_horz_qpel_vert_hpel_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
     sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
     sub       x0, x0, #2                //pu1_src-2
     mov       x9, x6
-    mov       x6, x7
+    mov       w6, w7
 
     and       x6, x6, #2                // dydx & 0x3 followed by dydx>>1 and dydx<<1
 
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
index ab663d0..a9dfbd1 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
@@ -104,11 +104,11 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//    x6 =>  dydx
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -122,7 +122,11 @@ ih264_inter_pred_luma_horz_qpel_vert_qpel_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
-    mov       x6, x7
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
+    mov       w6, w7
     and       x7, x6, #3
     add       x7, x0, x7, lsr #1        //pu1_pred_vert = pu1_src + (x_offset>>1)
 
diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
index 9d19a2d..014faca 100644
--- a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
@@ -94,11 +94,11 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//    x4 =>  ht
-//    x5 =>  wd
-//   x7 =>  dydx
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ht
+//    w5 =>  wd
+//    w7 =>  dydx
 
 .text
 .p2align 2
@@ -112,6 +112,10 @@ ih264_inter_pred_luma_vert_qpel_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x2, w2
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
 
 
     and       x7, x7, #12               //Finds y-offset
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
index 8f0f282..39c0256 100644
--- a/common/armv8/ih264_intra_pred_chroma_av8.s
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -100,9 +100,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
 
@@ -113,13 +113,14 @@ ih264_intra_pred_chroma_8x8_mode_dc_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
-    mov       x19, #5
-    ands      x6, x4, x19
+    mov       w19, #5
+    ands      w6, w4, w19
     beq       none_available
-    cmp       x6, #1
+    cmp       w6, #1
     beq       left_only_available
-    cmp       x6, #4
+    cmp       w6, #4
     beq       top_only_available
 
 all_available:
@@ -251,9 +252,9 @@ end_func:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_chroma_8x8_mode_horz_av8
@@ -263,6 +264,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
 
 
     push_v_regs
+    sxtw      x3, w3
     ld1       {v0.8h}, [x0]
 
     dup       v10.8h, v0.h[7]
@@ -332,9 +334,9 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_chroma_8x8_mode_vert_av8
@@ -342,6 +344,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
 ih264_intra_pred_chroma_8x8_mode_vert_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
     add       x0, x0, #18
     ld1       {v0.8b, v1.8b}, [x0]
@@ -405,15 +408,16 @@ ih264_intra_pred_chroma_8x8_mode_vert_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_chroma_8x8_mode_plane_av8
 ih264_intra_pred_chroma_8x8_mode_plane_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       {v0.2s}, [x0]
     add       x10, x0, #10
@@ -457,18 +461,14 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8:
     rshrn     v13.4h, v26.4s, #6
     rshrn     v14.4h, v28.4s, #6
     ldrb      w6, [x0], #1
-    sxtw      x6, w6
     add       x10, x0, #31
     ldrb      w8, [x0], #1
-    sxtw      x8, w8
     ldrb      w7, [x10], #1
-    sxtw      x7, w7
     ldrb      w9, [x10], #1
-    sxtw      x9, w9
-    add       x6, x6, x7
-    add       x8, x8, x9
-    lsl       x6, x6, #4
-    lsl       x8, x8, #4
+    add       w6, w6, w7
+    add       w8, w8, w9
+    lsl       w6, w6, #4
+    lsl       w8, w8, #4
     dup       v0.8h, w6
     dup       v2.8h, w8
     dup       v4.8h, v12.h[0]
diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
index c1847b5..fa19c12 100644
--- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
@@ -98,9 +98,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_16x16_mode_vert_av8
@@ -108,6 +108,7 @@
 ih264_intra_pred_luma_16x16_mode_vert_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
 
     add       x0, x0, #17
@@ -181,9 +182,9 @@ ih264_intra_pred_luma_16x16_mode_vert_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_16x16_mode_horz_av8
 
@@ -192,6 +193,7 @@ ih264_intra_pred_luma_16x16_mode_horz_av8:
 
 
     push_v_regs
+    sxtw      x3, w3
 
     ld1       {v0.16b}, [x0]
 
@@ -283,9 +285,9 @@ ih264_intra_pred_luma_16x16_mode_horz_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_16x16_mode_dc_av8
 
@@ -295,18 +297,19 @@ ih264_intra_pred_luma_16x16_mode_dc_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     sub       v0.16b, v0.16b, v0.16b
     sub       v1.16b, v1.16b, v1.16b
     mov       w10, #0
     mov       w11 , #3
-    ands      x6, x4, #0x01
+    ands      w6, w4, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
     ld1       {v0.16b}, [x0]
     add       w10, w10, #8
     add       w11, w11, #1
 top_available:
-    ands      x6, x4, #0x04
+    ands      w6, w4, #0x04
     beq       none_available
     add       x6, x0, #17
     ld1       {v1.16b}, [x6]
@@ -314,7 +317,7 @@ top_available:
     add       w11, w11, #1
     b         summation
 none_available:
-    cmp       x4, #0
+    cmp       w4, #0
     bne       summation
     mov       w15, #128
     dup       v20.16b, w15
@@ -410,15 +413,16 @@ end_func:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_16x16_mode_plane_av8
 ih264_intra_pred_luma_16x16_mode_plane_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     mov       x2, x1
     add       x1, x0, #17
     add       x0, x0, #15
@@ -440,76 +444,58 @@ ih264_intra_pred_luma_16x16_mode_plane_av8:
     uxtl      v18.8h, v7.8b
     add       x7, x0, x4, lsl #3
     sub       x0, x7, x4, lsl #1
-    sub       x20, x4, #0x0
-    neg       x14, x20
+    neg       x14, x4
     addp      v0.8h, v0.8h, v1.8h
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     saddlp    v0.2s, v0.4h
-    sub       x12, x8, x9
+    sub       w12, w8, w9
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     saddlp    v0.1d, v0.2s
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
-    sub       x8, x8, x9
+    sub       w8, w8, w9
     shl       v2.2s, v0.2s, #2
-    add       x12, x12, x8, lsl #1
+    add       w12, w12, w8, lsl #1
     add       v0.2s, v0.2s , v2.2s
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     srshr     v0.2s, v0.2s, #6          // i_b = D0[0]
-    sub       x8, x8, x9
+    sub       w8, w8, w9
     ldrb      w5, [x7], #-1
-    sxtw      x5, w5
-    add       x8, x8, x8, lsl #1
+    add       w8, w8, w8, lsl #1
     dup       v4.8h, v0.h[0]
-    add       x12, x12, x8
+    add       w12, w12, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     mul       v0.8h, v4.8h , v16.8h
-    sub       x5, x5, x9
+    sub       w5, w5, w9
     mul       v2.8h, v4.8h , v18.8h
-    add       x12, x12, x5, lsl #2
+    add       w12, w12, w5, lsl #2
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
-    sub       x8, x8, x9
+    sub       w8, w8, w9
     ldrb      w5, [x7], #-1
-    sxtw      x5, w5
-    add       x8, x8, x8, lsl #2
+    add       w8, w8, w8, lsl #2
     ldrb      w6, [x0], #1
-    sxtw      x6, w6
-    add       x12, x12, x8
+    add       w12, w12, w8
     ldrb      w8, [x7], #-1
-    sxtw      x8, w8
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
-    sub       x5, x5, x6
-    sub       x8, x8, x9
-    add       x5, x5, x5, lsl #1
-    sub       x20, x8, x8, lsl #3
-    neg       x8, x20
-    add       x12, x12, x5, lsl #1
+    sub       w5, w5, w6
+    sub       w8, w8, w9
+    add       w5, w5, w5, lsl #1
+    sub       w20, w8, w8, lsl #3
+    neg       w8, w20
+    add       w12, w12, w5, lsl #1
     ldrb      w5, [x7], #-1
-    sxtw      x5, w5
     ldrb      w6, [x10]                 //top_left
-    sxtw      x6, w6
-    add       x12, x12, x8
-    sub       x9, x5, x6
+    add       w12, w12, w8
+    sub       w9, w5, w6
     ldrb      w6, [x1, #7]
-    sxtw      x6, w6
-    add       x12, x12, x9, lsl #3      // i_c = x12
-    add       x8, x5, x6
-    add       x12, x12, x12, lsl #2
-    lsl       x8, x8, #4                // i_a = x8
-    add       x12, x12, #0x20
-    lsr       x12, x12, #6
+    add       w12, w12, w9, lsl #3      // i_c = w12
+    add       w8, w5, w6
+    add       w12, w12, w12, lsl #2
+    lsl       w8, w8, #4                // i_a = w8
+    add       w12, w12, #0x20
+    lsr       w12, w12, #6
     shl       v28.8h, v4.8h, #3
     dup       v6.8h, w12
     dup       v30.8h, w8
diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
index 62e8cee..1f95131 100644
--- a/common/armv8/ih264_intra_pred_luma_4x4_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
@@ -102,15 +102,16 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_4x4_mode_vert_av8
 
 ih264_intra_pred_luma_4x4_mode_vert_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
     add       x0, x0, #5
 
@@ -171,9 +172,9 @@ ih264_intra_pred_luma_4x4_mode_vert_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
 
@@ -182,6 +183,7 @@ ih264_intra_pred_luma_4x4_mode_vert_av8:
 ih264_intra_pred_luma_4x4_mode_horz_av8:
 
     push_v_regs
+    sxtw      x3, w3
 
     ld1       {v1.s}[0], [x0]
     dup       v0.8b, v1.b[3]
@@ -246,9 +248,9 @@ ih264_intra_pred_luma_4x4_mode_horz_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
 
@@ -261,41 +263,34 @@ ih264_intra_pred_luma_4x4_mode_dc_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
-    ands      x5, x4, #0x01
+    ands      w5, w4, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
 
     add       x10, x0, #3
     mov       x2, #-1
     ldrb      w5, [x10], #-1
-    sxtw      x5, w5
     ldrb      w6, [x10], #-1
-    sxtw      x6, w6
     ldrb      w7, [x10], #-1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], #-1
-    sxtw      x8, w8
-    add       x5, x5, x7
-    ands      x11, x4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
-    add       x5, x5, x8
+    add       w5, w5, w7
+    ands      w11, w4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add       w5, w5, w8
     beq       left_available
     add       x10, x0, #5
     //    BOTH LEFT AND TOP AVAILABLE
     ldrb      w6, [x10], #1
-    sxtw      x6, w6
     ldrb      w7, [x10], #1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], #1
-    sxtw      x8, w8
-    add       x5, x5, x7
+    add       w5, w5, w7
     ldrb      w9, [x10], #1
-    sxtw      x9, w9
-    add       x5, x5, x8
-    add       x5, x5, x9
-    add       x5, x5, #4
-    lsr       x5, x5, #3
+    add       w5, w5, w8
+    add       w5, w5, w9
+    add       w5, w5, #4
+    lsr       w5, w5, #3
     dup       v0.8b, w5
     st1       {v0.s}[0], [x1], x3
     st1       {v0.s}[0], [x1], x3
@@ -304,23 +299,19 @@ ih264_intra_pred_luma_4x4_mode_dc_av8:
     b         end_func
 
 top_available: // ONLT TOP AVAILABLE
-    ands      x11, x4, #0x04            // CHECKING TOP AVAILABILTY  OR ELSE BRANCH TO NONE AVAILABLE
+    ands      w11, w4, #0x04            // CHECKING TOP AVAILABILITY  OR ELSE BRANCH TO NONE AVAILABLE
     beq       none_available
 
     add       x10, x0, #5
     ldrb      w6, [x10], #1
-    sxtw      x6, w6
     ldrb      w7, [x10], #1
-    sxtw      x7, w7
     ldrb      w8, [x10], #1
-    sxtw      x8, w8
-    add       x5, x6, x7
+    add       w5, w6, w7
     ldrb      w9, [x10], #1
-    sxtw      x9, w9
-    add       x5, x5, x8
-    add       x5, x5, x9
-    add       x5, x5, #2
-    lsr       x5, x5, #2
+    add       w5, w5, w8
+    add       w5, w5, w9
+    add       w5, w5, #2
+    lsr       w5, w5, #2
     dup       v0.8b, w5
     st1       {v0.s}[0], [x1], x3
     st1       {v0.s}[0], [x1], x3
@@ -401,9 +392,9 @@ end_func:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_diag_dl_av8
@@ -413,6 +404,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dl_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     add       x0, x0, #5
     sub       x5, x3, #2
@@ -488,9 +480,9 @@ end_func_diag_dl:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_diag_dr_av8
@@ -499,6 +491,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dr_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
 
     ld1       {v0.8b}, [x0]
@@ -571,9 +564,9 @@ end_func_diag_dr:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_vert_r_av8
@@ -582,6 +575,7 @@ ih264_intra_pred_luma_4x4_mode_vert_r_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
 
     ld1       {v0.8b}, [x0]
@@ -656,9 +650,9 @@ end_func_vert_r:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_horz_d_av8
@@ -667,6 +661,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       {v0.8b}, [x0]
     add       x0, x0, #1
@@ -743,9 +738,9 @@ end_func_horz_d:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_vert_l_av8
@@ -754,6 +749,7 @@ ih264_intra_pred_luma_4x4_mode_vert_l_av8:
 
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     add       x0, x0, #4
     ld1       {v0.8b}, [x0]
     add       x0, x0, #1
@@ -825,9 +821,9 @@ end_func_vert_l:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_4x4_mode_horz_u_av8
@@ -835,11 +831,11 @@ end_func_vert_l:
 ih264_intra_pred_luma_4x4_mode_horz_u_av8:
 
     push_v_regs
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
     mov       x10, x0
     ld1       {v0.8b}, [x0]
     ldrb      w9, [x0], #1
-    sxtw      x9, w9
     ext       v1.8b, v0.8b , v0.8b , #1
     ld1       {v0.b}[7], [x10]
     ext       v2.8b, v1.8b , v1.8b , #1
diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
index bf9a4c1..273aa81 100644
--- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
@@ -102,9 +102,9 @@
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_vert_av8
@@ -114,6 +114,7 @@ ih264_intra_pred_luma_8x8_mode_vert_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     //stp x19, x20,[sp,#-16]!
+    sxtw      x3, w3
 
     add       x0, x0, #9
     ld1       {v0.8b}, [x0]
@@ -180,9 +181,9 @@ ih264_intra_pred_luma_8x8_mode_vert_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_horz_av8
@@ -194,38 +195,30 @@ ih264_intra_pred_luma_8x8_mode_horz_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     add       x0, x0, #7
-    mov       x2 , #-1
 
     ldrb      w5, [x0], #-1
-    sxtw      x5, w5
     ldrb      w6, [x0], #-1
-    sxtw      x6, w6
     dup       v0.8b, w5
     st1       {v0.8b}, [x1], x3
     ldrb      w7, [x0], #-1
-    sxtw      x7, w7
     dup       v1.8b, w6
     st1       {v1.8b}, [x1], x3
     dup       v2.8b, w7
     ldrb      w8, [x0], #-1
-    sxtw      x8, w8
     dup       v3.8b, w8
     st1       {v2.8b}, [x1], x3
     ldrb      w5, [x0], #-1
-    sxtw      x5, w5
     st1       {v3.8b}, [x1], x3
     dup       v0.8b, w5
     ldrb      w6, [x0], #-1
-    sxtw      x6, w6
     st1       {v0.8b}, [x1], x3
     ldrb      w7, [x0], #-1
-    sxtw      x7, w7
     dup       v1.8b, w6
     dup       v2.8b, w7
     st1       {v1.8b}, [x1], x3
     ldrb      w8, [x0], #-1
-    sxtw      x8, w8
     dup       v3.8b, w8
     st1       {v2.8b}, [x1], x3
     st1       {v3.8b}, [x1], x3
@@ -285,9 +278,9 @@ ih264_intra_pred_luma_8x8_mode_horz_av8:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_dc_av8
@@ -298,37 +291,30 @@ ih264_intra_pred_luma_8x8_mode_dc_av8:
 
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
-    ands      x6, x4, #0x01
+    ands      w6, w4, #0x01
     beq       top_available             //LEFT NOT AVAILABLE
 
     add       x10, x0, #7
     mov       x2, #-1
     ldrb      w5, [x10], -1
-    sxtw      x5, w5
     ldrb      w6, [x10], -1
-    sxtw      x6, w6
     ldrb      w7, [x10], -1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], -1
-    sxtw      x8, w8
-    add       x5, x5, x7
+    add       w5, w5, w7
     ldrb      w6, [x10], -1
-    sxtw      x6, w6
-    add       x5, x5, x8
+    add       w5, w5, w8
     ldrb      w7, [x10], -1
-    sxtw      x7, w7
-    add       x5, x5, x6
+    add       w5, w5, w6
     ldrb      w8, [x10], -1
-    sxtw      x8, w8
-    add       x5, x5, x7
-    ands      x11, x4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
-    add       x5, x5, x8
+    add       w5, w5, w7
+    ands      w11, w4, #0x04            // CHECKING IF TOP_AVAILABLE  ELSE BRANCHING TO ONLY LEFT AVAILABLE
+    add       w5, w5, w8
     ldrb      w6, [x10], -1
-    sxtw      x6, w6
-    add       x5, x5, x6
+    add       w5, w5, w6
     beq       left_available
     add       x10, x0, #9
     //    BOTH LEFT AND TOP AVAILABLE
@@ -351,7 +337,7 @@ ih264_intra_pred_luma_8x8_mode_dc_av8:
     b         end_func
 
 top_available: // ONLT TOP AVAILABLE
-    ands      x11, x4, #0x04            // CHECKING TOP AVAILABILTY  OR ELSE BRANCH TO NONE AVAILABLE
+    ands      w11, w4, #0x04            // CHECKING TOP AVAILABILITY  OR ELSE BRANCH TO NONE AVAILABLE
     beq       none_available
 
     add       x10, x0, #9
@@ -452,9 +438,9 @@ end_func:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_8x8_mode_diag_dl_av8
 
@@ -463,6 +449,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dl_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     add       x0, x0, #9
     sub       x5, x3, #4
@@ -554,9 +541,9 @@ end_func_diag_dl:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_diag_dr_av8
@@ -566,6 +553,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dr_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
 
     ld1       { v0.16b}, [x0]
@@ -654,9 +642,9 @@ end_func_diag_dr:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_vert_r_av8
@@ -666,6 +654,7 @@ ih264_intra_pred_luma_8x8_mode_vert_r_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       { v0.16b}, [x0]
     mov       v1.d[0], v0.d[1]
@@ -780,9 +769,9 @@ end_func_vert_r:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_8x8_mode_horz_d_av8
 
@@ -791,6 +780,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       { v0.16b}, [x0]
     mov       v1.d[0], v0.d[1]
@@ -910,9 +900,9 @@ end_func_horz_d:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
 
     .global ih264_intra_pred_luma_8x8_mode_vert_l_av8
@@ -922,6 +912,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_av8:
     // STMFD sp!, {x4-x12, x14}         //Restoring registers from stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
     add       x0, x0, #9
     ld1       { v0.16b}, [x0]
     mov       v1.d[0], v0.d[1]
@@ -1018,9 +1009,9 @@ end_func_vert_l:
 //**************Variables Vs Registers*****************************************
 //    x0 => *pu1_src
 //    x1 => *pu1_dst
-//    x2 =>  src_strd
-//    x3 =>  dst_strd
-//   x4 =>  ui_neighboravailability
+//    w2 =>  src_strd
+//    w3 =>  dst_strd
+//    w4 =>  ui_neighboravailability
 
     .global ih264_intra_pred_luma_8x8_mode_horz_u_av8
 
@@ -1029,6 +1020,7 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8:
     // STMFD sp!, {x4-x12, x14}          //store register values to stack
     push_v_regs
     stp       x19, x20, [sp, #-16]!
+    sxtw      x3, w3
 
     ld1       {v0.8b}, [x0]
     ld1       {v1.b}[7], [x0]
diff --git a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s
index 4c83036..003ee74 100644
--- a/common/armv8/ih264_iquant_itrans_recon_av8.s
+++ b/common/armv8/ih264_iquant_itrans_recon_av8.s
@@ -103,11 +103,11 @@
 //x0 => *pi2_src
 //x1 => *pu1_pred
 //x2 => *pu1_out
-//x3 =>  pred_strd
-//x4 =>  out_strd
+//w3 =>  pred_strd
+//w4 =>  out_strd
 //x5 => *pu2_iscal_mat
 //x6 => *pu2_weigh_mat
-//x7 =>  u4_qp_div_6
+//w7 =>  u4_qp_div_6
 //   =>  pi4_tmp
 //   =>  iq_start_idx
 //   =>  pi2_dc_ld_addr
@@ -119,6 +119,8 @@
 ih264_iquant_itrans_recon_4x4_av8:
 
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
 
@@ -292,11 +294,11 @@ skip_loading_luma_dc_src:
 //x0 => *pi2_src
 //x1 => *pu1_pred
 //x2 => *pu1_out
-//x3 =>  pred_strd
-//x4 =>  out_strd
+//w3 =>  pred_strd
+//w4 =>  out_strd
 //x5 => *pu2_iscal_mat
 //x6 => *pu2_weigh_mat
-//x7 =>  u4_qp_div_6
+//w7 =>  u4_qp_div_6
 //sp =>  pi4_tmp
 //sp#8 => *pi2_dc_src
 
@@ -315,6 +317,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8:
 
     //reduce sp by 64
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
 
@@ -512,11 +516,11 @@ ih264_iquant_itrans_recon_chroma_4x4_av8:
 //x0       => *pi2_src
 //x1       => *pu1_pred
 //x2       => *pu1_out
-//x3       =>  pred_strd
-//x4       =>  out_strd
+//w3       =>  pred_strd
+//w4       =>  out_strd
 //x5       =>  *pu2_iscal_mat
 //x6       =>  *pu2_weigh_mat
-//x7       =>  u4_qp_div_6
+//w7       =>  u4_qp_div_6
 //NOT USED =>  pi4_tmp
 //NOT USED =>  iq_start_idx
 //NOT USED =>  pi2_dc_ld_addr
@@ -525,6 +529,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8:
 ih264_iquant_itrans_recon_8x8_av8:
 
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     ld1       {v8.8h -v11.8h}, [x5], #64
     ld1       {v12.8h-v15.8h}, [x5]
diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
index 8bb9c32..13061ec 100644
--- a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
+++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
@@ -104,11 +104,11 @@
 //x0 => *pi2_src
 //x1 => *pu1_pred
 //x2 => *pu1_out
-//x3 =>  pred_strd
-//x4 =>  out_strd
+//w3 =>  pred_strd
+//w4 =>  out_strd
 //x5 => *pu2_iscal_mat
 //x6 => *pu2_weigh_mat
-//x7 =>  u4_qp_div_6
+//w7 =>  u4_qp_div_6
 //   =>  pi4_tmp
 //   =>  iq_start_idx
 //   =>  pi2_dc_ld_addr
@@ -119,6 +119,8 @@
     .global ih264_iquant_itrans_recon_4x4_dc_av8
 ih264_iquant_itrans_recon_4x4_dc_av8:
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       w8, [sp, #8]              //Loads iq_start_idx
     subs      w8, w8, #1                // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
 
@@ -209,11 +211,11 @@ donot_use_pi2_src_luma_dc:
 // x0 : pi2_src
 // x1 : pu1_pred
 // x2 : pu1_out
-// x3 : pred_strd
-// x4 : out_strd
+// w3 : pred_strd
+// w4 : out_strd
 // x5 : pu2_iscal_mat
 // x6 : pu2_weigh_mat
-// x7 : u4_qp_div_6
+// w7 : u4_qp_div_6
 //    : pi2_tmp
 //    : pi2_dc_src
 // Neon registers d0-d7, d16-d30 are used
@@ -223,6 +225,8 @@ donot_use_pi2_src_luma_dc:
     .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
 ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       x0, [sp, #8]
     push_v_regs
     ld1       {v0.h}[0], [x0]
@@ -327,11 +331,11 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
 //x0       => *pi2_src
 //x1       => *pu1_pred
 //x2       => *pu1_out
-//x3       =>  pred_strd
-//x4       =>  out_strd
+//w3       =>  pred_strd
+//w4       =>  out_strd
 //x5       =>  *pu2_iscal_mat
 //x6       =>  *pu2_weigh_mat
-//x7       =>  u4_qp_div_6
+//w7       =>  u4_qp_div_6
 //NOT USED =>  pi4_tmp
 //NOT USED =>  iq_start_idx
 //NOT USED =>  pi2_dc_ld_addr
@@ -340,6 +344,8 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
 ih264_iquant_itrans_recon_8x8_dc_av8:
 
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
 
     ld1       {v1.h}[0], [x5]
     ld1       {v2.h}[0], [x6]
diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s
index 4e9020d..802550d 100644
--- a/common/armv8/ih264_mem_fns_neon_av8.s
+++ b/common/armv8/ih264_mem_fns_neon_av8.s
@@ -70,11 +70,11 @@
 //*/
 //void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
 //                      UWORD8 *pu1_src,
-//                      UWORD8 num_bytes)
+//                      UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
 //    x1 => *pu1_src
-//    x2 => num_bytes
+//    w2 => num_bytes
 
 
 
@@ -89,7 +89,7 @@ loop_neon_memcpy_mul_8:
     ld1       {v0.8b}, [x1], #8
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bne       loop_neon_memcpy_mul_8
     ret
 
@@ -99,38 +99,36 @@ loop_neon_memcpy_mul_8:
 //*/
 //void ih264_memcpy(UWORD8 *pu1_dst,
 //                  UWORD8 *pu1_src,
-//                  UWORD8 num_bytes)
+//                  UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
 //    x1 => *pu1_src
-//    x2 => num_bytes
+//    w2 => num_bytes
 
 
 
     .global ih264_memcpy_av8
 
 ih264_memcpy_av8:
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     blt       arm_memcpy
 loop_neon_memcpy:
     // Memcpy 8 bytes
     ld1       {v0.8b}, [x1], #8
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bge       loop_neon_memcpy
-    cmn       x2, #8
+    cmn       w2, #8
     beq       end_func1
 
 arm_memcpy:
-    add       x2, x2, #8
+    add       w2, w2, #8
 
 loop_arm_memcpy:
     ldrb      w3, [x1], #1
-    sxtw      x3, w3
     strb      w3, [x0], #1
-    sxtw      x3, w3
-    subs      x2, x2, #1
+    subs      w2, w2, #1
     bne       loop_arm_memcpy
     ret
 end_func1:
@@ -139,7 +137,7 @@ end_func1:
 
 //void ih264_memset_mul_8(UWORD8 *pu1_dst,
 //                       UWORD8 value,
-//                       UWORD8 num_bytes)
+//                       UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
 //    x1 => value
@@ -156,7 +154,7 @@ loop_memset_mul_8:
     // Memset 8 bytes
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bne       loop_memset_mul_8
 
     ret
@@ -164,36 +162,35 @@ loop_memset_mul_8:
 
 //void ih264_memset(UWORD8 *pu1_dst,
 //                       UWORD8 value,
-//                       UWORD8 num_bytes)
+//                       UWORD32 num_bytes)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_dst
-//    x1 => value
-//    x2 => num_bytes
+//    w1 => value
+//    w2 => num_bytes
 
 
 
     .global ih264_memset_av8
 
 ih264_memset_av8:
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     blt       arm_memset
     dup       v0.8b, w1
 loop_neon_memset:
     // Memcpy 8 bytes
     st1       {v0.8b}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bge       loop_neon_memset
-    cmn       x2, #8
+    cmn       w2, #8
     beq       end_func2
 
 arm_memset:
-    add       x2, x2, #8
+    add       w2, w2, #8
 
 loop_arm_memset:
     strb      w1, [x0], #1
-    sxtw      x1, w1
-    subs      x2, x2, #1
+    subs      w2, w2, #1
     bne       loop_arm_memset
     ret
 end_func2:
@@ -205,11 +202,11 @@ end_func2:
 
 //void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
 //                                      UWORD16 value,
-//                                      UWORD8 num_words)
+//                                      UWORD32 num_words)
 //**************Variables Vs Registers*************************
 //    x0 => *pu2_dst
-//    x1 => value
-//    x2 => num_words
+//    w1 => value
+//    w2 => num_words
 
 
     .global ih264_memset_16bit_mul_8_av8
@@ -224,7 +221,7 @@ loop_memset_16bit_mul_8:
     st1       {v0.4h}, [x0], #8
     st1       {v0.4h}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bne       loop_memset_16bit_mul_8
 
     ret
@@ -233,18 +230,18 @@ loop_memset_16bit_mul_8:
 
 //void ih264_memset_16bit(UWORD16 *pu2_dst,
 //                       UWORD16 value,
-//                       UWORD8 num_words)
+//                       UWORD32 num_words)
 //**************Variables Vs Registers*************************
 //    x0 => *pu2_dst
-//    x1 => value
-//    x2 => num_words
+//    w1 => value
+//    w2 => num_words
 
 
 
     .global ih264_memset_16bit_av8
 
 ih264_memset_16bit_av8:
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     blt       arm_memset_16bit
     dup       v0.4h, w1
 loop_neon_memset_16bit:
@@ -252,18 +249,17 @@ loop_neon_memset_16bit:
     st1       {v0.4h}, [x0], #8
     st1       {v0.4h}, [x0], #8
 
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     bge       loop_neon_memset_16bit
-    cmn       x2, #8
+    cmn       w2, #8
     beq       end_func3
 
 arm_memset_16bit:
-    add       x2, x2, #8
+    add       w2, w2, #8
 
 loop_arm_memset_16bit:
     strh      w1, [x0], #2
-    sxtw      x1, w1
-    subs      x2, x2, #1
+    subs      w2, w2, #1
     bne       loop_arm_memset_16bit
     ret
 
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s
index 35d9c8a..e03fe2f 100644
--- a/common/armv8/ih264_padding_neon_av8.s
+++ b/common/armv8/ih264_padding_neon_av8.s
@@ -76,9 +76,9 @@
 //                   WORD32 pad_size)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => wd
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => wd
+//    w3 => pad_size
 
     .global ih264_pad_top_av8
 
@@ -86,25 +86,25 @@ ih264_pad_top_av8:
 
     // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
     stp       x19, x20, [sp, #-16]!
 
     sub       x5, x0, x1
-    sub       x20, x1, #0
-    neg       x6, x20
+    neg       x6, x1
 
 loop_neon_memcpy_mul_16:
     // Load 16 bytes
     ld1       {v0.8b, v1.8b}, [x0], #16
     mov       x4, x5
-    mov       x7, x3
+    mov       w7, w3
     add       x5, x5, #16
 
 loop_neon_pad_top:
     st1       {v0.8b, v1.8b}, [x4], x6
-    subs      x7, x7, #1
+    subs      w7, w7, #1
     bne       loop_neon_pad_top
 
-    subs      x2, x2, #16
+    subs      w2, w2, #16
     bne       loop_neon_memcpy_mul_16
 
     // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
@@ -160,9 +160,9 @@ loop_neon_pad_top:
 //                        WORD32 pad_size)
 //**************Variables Vs Registers*************************
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -172,6 +172,8 @@ ih264_pad_left_luma_av8:
 
     // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
 
@@ -182,43 +184,35 @@ ih264_pad_left_luma_av8:
 loop_16:                                //  /*hard coded for width=16  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     dup       v4.16b, w10
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     dup       v4.16b, w10
     dup       v6.16b, w11
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     bne       loop_16
@@ -227,14 +221,11 @@ loop_16:                                //  /*hard coded for width=16  ,height =
 loop_32:                                //  /*hard coded for width=32 ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6
@@ -243,35 +234,30 @@ loop_32:                                //  /*hard coded for width=32 ,height =8
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     dup       v0.16b, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.16b, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     bne       loop_32
@@ -333,9 +319,9 @@ end_func:
 //                            WORD32 pad_size)
 //{
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8:
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     sub       x4, x0, x3
@@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8:
 loop_32_l_c:                            //  /*hard coded for width=32  ,height =4,8,12*/
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -383,27 +367,23 @@ loop_32_l_c:                            //  /*hard coded for width=32  ,height =
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -412,20 +392,16 @@ loop_32_l_c:                            //  /*hard coded for width=32  ,height =
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
@@ -500,9 +476,9 @@ end_func_l_c:
 //}
 //
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -512,6 +488,8 @@ ih264_pad_right_luma_av8:
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     mov       x4, x0
@@ -522,43 +500,35 @@ ih264_pad_right_luma_av8:
 loop_16_r: //  /*hard coded for width=16  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     dup       v4.16b, w10
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], x1        // 16 bytes store
     dup       v2.16b, w9
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], x1        // 16 bytes store
     dup       v4.16b, w10
     dup       v6.16b, w11
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v4.16b}, [x4], x1        // 16 bytes store
     st1       {v6.16b}, [x4], x1        // 16 bytes store
     bne       loop_16_r
@@ -567,14 +537,11 @@ loop_16_r: //  /*hard coded for width=16  ,height =8,16*/
 loop_32_r:                              //  /*hard coded for width=32  ,height =8,16*/
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6
@@ -583,35 +550,30 @@ loop_32_r:                              //  /*hard coded for width=32  ,height =
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
     ldrb      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     ldrb      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.16b, w8
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     ldrb      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.16b, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrb      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.16b, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.16b, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #8
+    subs      w2, w2, #8
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
     bne       loop_32_r
@@ -672,9 +634,9 @@ end_func_r:
 //                        WORD32 ht,
 //                        WORD32 pad_size)
 //    x0 => *pu1_src
-//    x1 => src_strd
-//    x2 => ht
-//    x3 => pad_size
+//    w1 => src_strd
+//    w2 => ht
+//    w3 => pad_size
 
 
 
@@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8:
 
     // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x1, w1
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     mov       x4, x0
@@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8:
 loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     dup       v0.8h, w8
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
@@ -720,27 +680,23 @@ loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
 
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     dup       v0.8h, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
     st1       {v4.16b}, [x4], #16       // 16 bytes store
     dup       v6.8h, w11
     st1       {v4.16b}, [x4], x6        // 16 bytes store
-    subs      x2, x2, #4
+    subs      w2, w2, #4
     st1       {v6.16b}, [x4], #16       // 16 bytes store
     st1       {v6.16b}, [x4], x6        // 16 bytes store
 
@@ -748,20 +704,16 @@ loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
     bne       loop_32_r_c
     ldrh      w8, [x0]
     add       x0, x0, x1
-    sxtw      x8, w8
     dup       v0.8h, w8
     ldrh      w9, [x0]
     add       x0, x0, x1
-    sxtw      x9, w9
     ldrh      w10, [x0]
     add       x0, x0, x1
-    sxtw      x10, w10
     st1       {v0.16b}, [x4], #16       // 16 bytes store
     dup       v2.8h, w9
     st1       {v0.16b}, [x4], x6        // 16 bytes store
     ldrh      w11, [x0]
     add       x0, x0, x1
-    sxtw      x11, w11
     st1       {v2.16b}, [x4], #16       // 16 bytes store
     dup       v4.8h, w10
     st1       {v2.16b}, [x4], x6        // 16 bytes store
diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s
index 316c220..d2ba3cf 100644
--- a/common/armv8/ih264_resi_trans_quant_av8.s
+++ b/common/armv8/ih264_resi_trans_quant_av8.s
@@ -45,18 +45,6 @@
 //* function name     : ih264_resi_trans_quant_4x4
 //* description       : this function does cf4 of h264
 //*
-//* arguments         :   x0 :pointer to src buffer
-//                        x1 :pointer to pred buffer
-//                        x2 :pointer to dst buffer
-//                        x3 :source stride
-//                        x4 :pred stride,
-//                        x5 :dst stride,
-//                        x6 :pointer to scaling matrix,
-//                        x7 :pointer to threshold matrix,
-//                        stack   qbits,
-//                                rounding factor,
-//                                pointer to store nnz
-//                                pointer to store non quantized dc value
 // values returned   : none
 //
 // register usage    :
@@ -77,34 +65,24 @@
     .global ih264_resi_trans_quant_4x4_av8
 ih264_resi_trans_quant_4x4_av8:
 
-    //x0     :pointer to src buffer
-    //x1     :pointer to pred buffer
-    //x2     :pointer to dst buffer
-    //x3     :source stride
-    //x4     :pred stride
-    //x5     :dst stride,
-    //x6     :scale matirx,
-    //x7     :threshold matrix
-    //       :qbits
-    //       :round factor
-    //       :nnz
-    //       :pointer to store non quantized dc value
     push_v_regs
     //x0     :pointer to src buffer
     //x1     :pointer to pred buffer
     //x2     :pointer to dst buffer
-    //x3     :source stride
-    //x4     :pred stride
-    //x5     :scale matirx,
+    //w3     :source stride
+    //w4     :pred stride
+    //x5     :scale matrix,
     //x6     :threshold matrix
-    //x7     :qbits
-    //x8        :round factor
+    //w7     :qbits
+    //w8        :round factor
     //x9        :nnz
     //x10       :pointer to store non quantized dc value
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       w8, [sp, #64]             //load round factor
     ldr       x10, [sp, #80]            //load addres for non quant val
-    neg       x7, x7                    //negate the qbit value for usiing lsl
+    neg       w7, w7                    //negate the qbit value for using lsl
     ldr       x9, [sp, #72]
 
     //------------fucntion loading done----------------;
@@ -259,18 +237,6 @@ ih264_resi_trans_quant_4x4_av8:
 //* description       : this function does residue calculation, forward transform
 //*                        and quantization for 4x4 chroma block.
 //*
-//* arguments         :   x0 :pointer to src buffer
-//                        x1 :pointer to pred buffer
-//                        x2 :pointer to dst buffer
-//                        x3 :source stride
-//                        x4 :pred stride,
-//                        x5 :dst stride,
-//                        x6 :pointer to scaling matrix,
-//                        x7 :pointer to threshold matrix,
-//                        stack     qbits,
-//                                  rounding factor,
-//                                  pointer to store nnz
-//                                  pointer to store unquantized dc values
 // values returned   : none
 //
 // register usage    :
@@ -290,33 +256,24 @@ ih264_resi_trans_quant_4x4_av8:
     .global ih264_resi_trans_quant_chroma_4x4_av8
 ih264_resi_trans_quant_chroma_4x4_av8:
 
-    //x0     :pointer to src buffer
-    //x1     :pointer to pred buffer
-    //x2     :pointer to dst buffer
-    //x3     :source stride
-    //stack     :pred stride
-    //          :scale matirx,
-    //          :threshold matrix
-    //          :qbits
-    //          :round factor
-    //          :nnz
-    //          :pu1_dc_alt_addr
     push_v_regs
     //x0     :pointer to src buffer
     //x1     :pointer to pred buffer
     //x2     :pointer to dst buffer
-    //x3     :source stride
-    //x4     :pred stride
+    //w3     :source stride
+    //w4     :pred stride
     //x5     :scale matirx,
     //x6     :threshold matrix
-    //x7     :qbits
-    //x8        :round factor
+    //w7     :qbits
+    //w8        :round factor
     //x9        :nnz
     //x10       :pointer to store non quantized dc value
 
+    sxtw      x3, w3
+    sxtw      x4, w4
     ldr       w8, [sp, #64]             //load round factor
     ldr       x10, [sp, #80]            //load addres for non quant val
-    neg       x7, x7                    //negate the qbit value for usiing lsl
+    neg       w7, w7                    //negate the qbit value for using lsl
     ldr       x9, [sp, #72]
     //------------fucntion loading done----------------;
 
@@ -485,10 +442,10 @@ ih264_resi_trans_quant_chroma_4x4_av8:
 //* arguments         :  x0 :pointer to src buffer
 //                       x1 :pointer to dst buffer
 //                       x2 :pu2_scale_matrix
-//                       x2 :pu2_threshold_matrix
-//                       x3 :u4_qbits
-//                       x4 :u4_round_factor
-//                       x5 :pu1_nnz
+//                       x3 :pu2_threshold_matrix
+//                       w4 :u4_qbits
+//                       w5 :u4_round_factor
+//                       x6 :pu1_nnz
 // values returned   : none
 //
 // register usage    :
@@ -516,8 +473,8 @@ ih264_hadamard_quant_4x4_av8:
 //x1 :pointer to dst buffer
 //x2 :pu2_scale_matrix
 //x3 :pu2_threshold_matrix
-//x4 :u4_qbits
-//x5 :u4_round_factor
+//w4 :u4_qbits
+//w5 :u4_round_factor
 //x6 :pu1_nnz
 
     push_v_regs
@@ -632,10 +589,10 @@ ih264_hadamard_quant_4x4_av8:
 //* arguments         :  x0 :pointer to src buffer
 //                       x1 :pointer to dst buffer
 //                       x2 :pu2_scale_matrix
-//                       x2 :pu2_threshold_matrix
-//                       x3 :u4_qbits
-//                       x4 :u4_round_factor
-//                       x5 :pu1_nnz
+//                       x3 :pu2_threshold_matrix
+//                       w4 :u4_qbits
+//                       w5 :u4_round_factor
+//                       x6 :pu1_nnz
 // values returned   : none
 //
 // register usage    :
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index b039fba..475f690 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -103,28 +103,28 @@
 //                                     WORD32 src_strd1,
 //                                     WORD32 src_strd2,
 //                                     WORD32 dst_strd,
-//                                     UWORD16 log_WD,
-//                                     UWORD32 wt1,
-//                                     UWORD32 wt2,
-//                                     UWORD16 ofst1,
-//                                     UWORD16 ofst2,
-//                                     UWORD8 ht,
-//                                     UWORD8 wd)
+//                                     WORD32 log_WD,
+//                                     WORD32 wt1,
+//                                     WORD32 wt2,
+//                                     WORD32 ofst1,
+//                                     WORD32 ofst2,
+//                                     WORD32 ht,
+//                                     WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => log_WD    (x6)
-//    [sp+12] => wt1       (x7)
-//   [sp+16] => wt2       (x8)
-//   [sp+20] => ofst1     (x9)
-//   [sp+24] => ofst2     (x10)
-//    [sp+28] => ht        (x11)
-//    [sp+32] => wd        (x12)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => log_WD
+//    w7      => wt1
+//    [sp]    => wt2       (w8)
+//    [sp+8]  => ofst1     (w9)
+//    [sp+16] => ofst2     (w10)
+//    [sp+24] => ht        (w11)
+//    [sp+32] => wd        (w12)
 //
 .text
 .p2align 2
@@ -138,21 +138,23 @@ ih264_weighted_bi_pred_luma_av8:
 
     // STMFD sp!, {x4-x12,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     stp       x19, x20, [sp, #-16]!
-    ldr       x8, [sp, #80]             //Load wt2 in x8
-    ldr       x9, [sp, #88]             //Load ofst1 in x9
-    add       x6, x6, #1                //x6  = log_WD + 1
-    sub       x20, x6, #0               //x13 = -(log_WD + 1)
-    neg       x10, x20
+    ldr       w8, [sp, #80]             //Load wt2 in w8
+    ldr       w9, [sp, #88]             //Load ofst1 in w9
+    add       w6, w6, #1                //w6  = log_WD + 1
+    neg       w10, w6                   //w10 = -(log_WD + 1)
     dup       v0.8h, w10                //Q0  = -(log_WD + 1) (32-bit)
-    ldr       x10, [sp, #96]            //Load ofst2 in x10
-    ldr       x11, [sp, #104]           //Load ht in x11
-    ldr       x12, [sp, #112]           //Load wd in x12
-    add       x9, x9, #1                //x9 = ofst1 + 1
-    add       x9, x9, x10               //x9 = ofst1 + ofst2 + 1
+    ldr       w10, [sp, #96]            //Load ofst2 in w10
+    ldr       w11, [sp, #104]           //Load ht in w11
+    ldr       w12, [sp, #112]           //Load wd in w12
+    add       w9, w9, #1                //w9 = ofst1 + 1
+    add       w9, w9, w10               //w9 = ofst1 + ofst2 + 1
     mov       v2.s[0], w7
     mov       v2.s[1], w8               //D2 = {wt1(32-bit), wt2(32-bit)}
-    asr       x9, x9, #1                //x9 = ofst = (ofst1 + ofst2 + 1) >> 1
+    asr       w9, w9, #1                //w9 = ofst = (ofst1 + ofst2 + 1) >> 1
     dup       v3.8b, w9                 //D3 = ofst (8-bit)
     cmp       w12, #16
     beq       loop_16                   //branch if wd is 16
@@ -383,28 +385,28 @@ end_loops:
 //                                       WORD32 src_strd1,
 //                                       WORD32 src_strd2,
 //                                       WORD32 dst_strd,
-//                                       UWORD16 log_WD,
-//                                       UWORD32 wt1,
-//                                       UWORD32 wt2,
-//                                       UWORD16 ofst1,
-//                                       UWORD16 ofst2,
-//                                       UWORD8 ht,
-//                                       UWORD8 wd)
+//                                       WORD32 log_WD,
+//                                       WORD32 wt1,
+//                                       WORD32 wt2,
+//                                       WORD32 ofst1,
+//                                       WORD32 ofst2,
+//                                       WORD32 ht,
+//                                       WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src1
 //    x1      => puc_src2
 //    x2      => puc_dst
-//    x3      => src_strd1
-//    [sp]    => src_strd2 (x4)
-//    [sp+4]  => dst_strd  (x5)
-//    [sp+8]  => log_WD    (x6)
-//    [sp+12] => wt1       (x7)
-//   [sp+16] => wt2       (x8)
-//   [sp+20] => ofst1     (x9)
-//   [sp+24] => ofst2     (x10)
-//    [sp+28] => ht        (x11)
-//    [sp+32] => wd        (x12)
+//    w3      => src_strd1
+//    w4      => src_strd2
+//    w5      => dst_strd
+//    w6      => log_WD
+//    w7      => wt1
+//    [sp]    => wt2       (w8)
+//    [sp+8]  => ofst1     (w9)
+//    [sp+16] => ofst2     (w10)
+//    [sp+24] => ht        (w11)
+//    [sp+32] => wd        (w12)
 //
 
 
@@ -417,24 +419,22 @@ ih264_weighted_bi_pred_chroma_av8:
 
     // STMFD sp!, {x4-x12,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
     stp       x19, x20, [sp, #-16]!
 
 
-    ldr       x8, [sp, #80]             //Load wt2 in x8
+    ldr       w8, [sp, #80]             //Load wt2 in w8
     dup       v4.4s, w8                 //Q2 = (wt2_u, wt2_v) (32-bit)
     dup       v2.4s, w7                 //Q1 = (wt1_u, wt1_v) (32-bit)
-    add       x6, x6, #1                //x6  = log_WD + 1
-    ldr       w9, [sp, #88]             //Load ofst1 in x9
-    sxtw      x9, w9
-    ldr       w10, [sp, #96]            //Load ofst2 in x10
-    sxtw      x10, w10
-    sub       x20, x6, #0               //x12 = -(log_WD + 1)
-    neg       x20, x20
+    add       w6, w6, #1                //w6  = log_WD + 1
+    ldr       w9, [sp, #88]             //Load ofst1 in w9
+    ldr       w10, [sp, #96]            //Load ofst2 in w10
+    neg       w20, w6                   //w20 = -(log_WD + 1)
     dup       v0.8h, w20                //Q0  = -(log_WD + 1) (16-bit)
     ldr       w11, [sp, #104]           //Load ht in x11
     ldr       w12, [sp, #112]           //Load wd in x12
-    sxtw      x11, w11
-    sxtw      x12, w12
     dup       v20.8h, w9                //0ffset1
     dup       v21.8h, w10               //0ffset2
     srhadd    v6.8b, v20.8b, v21.8b
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
index 69ed3b0..f145217 100644
--- a/common/armv8/ih264_weighted_pred_av8.s
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -89,22 +89,22 @@
 //                                  UWORD8 *puc_dst,
 //                                  WORD32 src_strd,
 //                                  WORD32 dst_strd,
-//                                  UWORD8 log_WD,
-//                                  UWORD32 wt,
-//                                  UWORD16 ofst,
-//                                  UWORD8 ht,
-//                                  UWORD8 wd)
+//                                  WORD32 log_WD,
+//                                  WORD32 wt,
+//                                  WORD32 ofst,
+//                                  WORD32 ht,
+//                                  WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src
 //    x1      => puc_dst
-//    x2      => src_strd
-//    x3      => dst_strd
-//    [sp]    => log_WD (x4)
-//    [sp+4]  => wt     (x5)
-//   [sp+8]  => ofst   (x6)
-//    [sp+12] => ht     (x7)
-//    [sp+16] => wd     (x8)
+//    w2      => src_strd
+//    w3      => dst_strd
+//    w4      => log_WD
+//    w5      => wt
+//    w6      => ofst
+//    w7      => ht
+//    [sp]    => wd     (w8)
 //
 .text
 .p2align 2
@@ -118,13 +118,14 @@ ih264_weighted_pred_luma_av8:
 
     // STMFD sp!, {x4-x9,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
     ldr       w8, [sp, #80]             //Load wd
     sxtw      x8, w8
 
     dup       v2.4h, w5                 //D2 = wt (16-bit)
-    sub       x20, x4, #0               //x9 = -log_WD
-    neg       x9, x20
+    neg       w9, w4                    //w9 = -log_WD
     dup       v3.8b, w6                 //D3 = ofst (8-bit)
     cmp       w8, #16                   //check if wd is 16
     dup       v0.8h, w9                 //Q0 = -log_WD (16-bit)
@@ -318,22 +319,22 @@ end_loops:
 //                                    UWORD8 *puc_dst,
 //                                    WORD32 src_strd,
 //                                    WORD32 dst_strd,
-//                                    UWORD8 log_WD,
-//                                    UWORD32 wt,
-//                                    UWORD16 ofst,
-//                                    UWORD8 ht,
-//                                    UWORD8 wd)
+//                                    WORD32 log_WD,
+//                                    WORD32 wt,
+//                                    WORD32 ofst,
+//                                    WORD32 ht,
+//                                    WORD32 wd)
 //
 //**************Variables Vs Registers*****************************************
 //    x0      => puc_src
 //    x1      => puc_dst
-//    x2      => src_strd
-//    x3      => dst_strd
-//    [sp]    => log_WD (x4)
-//    [sp+4]  => wt     (x5)
-//   [sp+8]  => ofst   (x6)
-//    [sp+12] => ht     (x7)
-//    [sp+16] => wd     (x8)
+//    w2      => src_strd
+//    w3      => dst_strd
+//    w4      => log_WD
+//    w5      => wt
+//    w6      => ofst
+//    w7      => ht
+//    [sp]    => wd     (w8)
 //
 
 
@@ -345,13 +346,14 @@ ih264_weighted_pred_chroma_av8:
 
     // STMFD sp!, {x4-x9,x14}                //stack stores the values of the arguments
     push_v_regs
+    sxtw      x2, w2
+    sxtw      x3, w3
     stp       x19, x20, [sp, #-16]!
 
     ldr       w8, [sp, #80]             //Load wd
     sxtw      x8, w8
 
-    sub       x20, x4, #0               //x9 = -log_WD
-    neg       x9, x20
+    neg       w9, w4                    //w9 = -log_WD
     dup       v2.4s, w5                 //Q1 = {wt_u (16-bit), wt_v (16-bit)}