aboutsummaryrefslogtreecommitdiff
path: root/common
diff options
context:
space:
mode:
Diffstat (limited to 'common')
-rw-r--r-- common/arm/ih264_inter_pred_chroma_a9q.s | 4
-rw-r--r-- common/arm/ih264_intra_pred_luma_16x16_a9q.s | 2
-rw-r--r-- common/arm/ih264_mem_fns_neon.s | 10
-rw-r--r-- common/arm/ih264_padding_neon.s | 2
-rw-r--r-- common/arm/ih264_weighted_bi_pred_a9q.s | 4
-rw-r--r-- common/arm/ih264_weighted_pred_a9q.s | 4
-rw-r--r-- common/armv8/ih264_deblk_chroma_av8.s | 78
-rw-r--r-- common/armv8/ih264_deblk_luma_av8.s | 35
-rw-r--r-- common/armv8/ih264_default_weighted_pred_av8.s | 34
-rw-r--r-- common/armv8/ih264_inter_pred_chroma_av8.s | 22
-rw-r--r-- common/armv8/ih264_inter_pred_filters_luma_horz_av8.s | 12
-rw-r--r-- common/armv8/ih264_inter_pred_filters_luma_vert_av8.s | 12
-rw-r--r-- common/armv8/ih264_inter_pred_luma_copy_av8.s | 18
-rw-r--r-- common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s | 12
-rw-r--r-- common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s | 19
-rw-r--r-- common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s | 14
-rw-r--r-- common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s | 18
-rw-r--r-- common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s | 16
-rw-r--r-- common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s | 14
-rw-r--r-- common/armv8/ih264_intra_pred_chroma_av8.s | 48
-rw-r--r-- common/armv8/ih264_intra_pred_luma_16x16_av8.s | 104
-rw-r--r-- common/armv8/ih264_intra_pred_luma_4x4_av8.s | 110
-rw-r--r-- common/armv8/ih264_intra_pred_luma_8x8_av8.s | 100
-rw-r--r-- common/armv8/ih264_iquant_itrans_recon_av8.s | 24
-rw-r--r-- common/armv8/ih264_iquant_itrans_recon_dc_av8.s | 24
-rw-r--r-- common/armv8/ih264_mem_fns_neon_av8.s | 68
-rw-r--r-- common/armv8/ih264_padding_neon_av8.s | 120
-rw-r--r-- common/armv8/ih264_resi_trans_quant_av8.s | 93
-rw-r--r-- common/armv8/ih264_weighted_bi_pred_av8.s | 110
-rw-r--r-- common/armv8/ih264_weighted_pred_av8.s | 58
30 files changed, 569 insertions, 620 deletions
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
index 6681a7c..e2b8c99 100644
--- a/common/arm/ih264_inter_pred_chroma_a9q.s
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -91,8 +91,8 @@
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@ WORD32 dst_strd,
-@ UWORD8 u1_dx,
-@ UWORD8 u1_dy,
+@ WORD32 u1_dx,
+@ WORD32 u1_dy,
@ WORD32 ht,
@ WORD32 wd)
@**************Variables Vs Registers*****************************************
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
index 0dd82f3..7597444 100644
--- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -413,7 +413,7 @@ scrlbl1:
add r7, r0, r4, lsl #3
sub r0, r7, r4, lsl #1
- rsb lr, r4, #0x0
+ neg lr, r4
vpadd.s16 d0, d0, d1
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
index 39ad9b3..b9595d7 100644
--- a/common/arm/ih264_mem_fns_neon.s
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -68,7 +68,7 @@
@*
@void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
@ UWORD8 *pu1_src,
-@ UWORD8 num_bytes)
+@ UWORD32 num_bytes)
@**************Variables Vs Registers*************************
@ r0 => *pu1_dst
@ r1 => *pu1_src
@@ -97,7 +97,7 @@ loop_neon_memcpy_mul_8:
@*
@void ih264_memcpy(UWORD8 *pu1_dst,
@ UWORD8 *pu1_src,
-@ UWORD8 num_bytes)
+@ UWORD32 num_bytes)
@**************Variables Vs Registers*************************
@ r0 => *pu1_dst
@ r1 => *pu1_src
@@ -135,7 +135,7 @@ loop_memcpy:
@void ih264_memset_mul_8(UWORD8 *pu1_dst,
@ UWORD8 value,
-@ UWORD8 num_bytes)
+@ UWORD32 num_bytes)
@**************Variables Vs Registers*************************
@ r0 => *pu1_dst
@ r1 => value
@@ -202,7 +202,7 @@ loop_memset:
@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
@ UWORD16 value,
-@ UWORD8 num_words)
+@ UWORD32 num_words)
@**************Variables Vs Registers*************************
@ r0 => *pu2_dst
@ r1 => value
@@ -234,7 +234,7 @@ loop_memset_16bit_mul_8:
@void ih264_memset_16bit(UWORD16 *pu2_dst,
@ UWORD16 value,
-@ UWORD8 num_words)
+@ UWORD32 num_words)
@**************Variables Vs Registers*************************
@ r0 => *pu2_dst
@ r1 => value
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
index e7a1f91..819b0b3 100644
--- a/common/arm/ih264_padding_neon.s
+++ b/common/arm/ih264_padding_neon.s
@@ -88,7 +88,7 @@ ih264_pad_top_a9q:
stmfd sp!, {r4-r11, lr} @stack stores the values of the arguments
sub r5, r0, r1
- rsb r6, r1, #0
+ neg r6, r1
loop_neon_memcpy_mul_16:
@ Load 16 bytes
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
index 33859e6..304bd8a 100644
--- a/common/arm/ih264_weighted_bi_pred_a9q.s
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -144,7 +144,7 @@ ih264_weighted_bi_pred_luma_a9q:
ldr r4, [sp, #40] @Load src_strd2 in r4
ldr r5, [sp, #44] @Load dst_strd in r5
sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit
- rsb r10, r6, #0 @r13 = -(log_wd + 1)
+ neg r10, r6 @r10 = -(log_wd + 1)
ldr r11, [sp, #68] @Load ht in r11
ldr r12, [sp, #72] @Load wd in r12
vdup.16 q0, r10 @Q0 = -(log_wd + 1) (32-bit)
@@ -456,7 +456,7 @@ ih264_weighted_bi_pred_chroma_a9q:
ldr r9, [sp, #60] @Load ofst1 in r9
ldr r10, [sp, #64] @Load ofst2 in r10
- rsb r12, r6, #0 @r12 = -(log_wd + 1)
+ neg r12, r6 @r12 = -(log_wd + 1)
ldr r4, [sp, #40] @Load src_strd2 in r4
ldr r5, [sp, #44] @Load dst_strd in r5
vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit)
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
index 81d26d4..80c2c6d 100644
--- a/common/arm/ih264_weighted_pred_a9q.s
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -122,7 +122,7 @@ ih264_weighted_pred_luma_a9q:
vpush {d8-d15}
vdup.16 d2, r5 @D2 = wt (16-bit)
- rsb r9, r4, #0 @r9 = -log_wd
+ neg r9, r4 @r9 = -log_wd
vdup.8 d3, r6 @D3 = ofst (8-bit)
cmp r8, #16 @check if wd is 16
vdup.16 q0, r9 @Q0 = -log_wd (16-bit)
@@ -349,7 +349,7 @@ ih264_weighted_pred_chroma_a9q:
ldr r6, [sp, #36] @Load ofst = {ofst_u (8-bit), ofst_v (8-bit)}
ldr r8, [sp, #44] @Load wd
- rsb r9, r4, #0 @r9 = -log_wd
+ neg r9, r4 @r9 = -log_wd
vdup.32 q1, r5 @Q1 = {wt_u (16-bit), wt_v (16-bit)}
ldr r7, [sp, #40] @Load ht
vpush {d8-d15}
diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s
index a4dbd23..b7f2d58 100644
--- a/common/armv8/ih264_deblk_chroma_av8.s
+++ b/common/armv8/ih264_deblk_chroma_av8.s
@@ -56,19 +56,19 @@
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
//* Alpha Value for the boundary in U
//*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
//* Beta Value for the boundary in U
//*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
//* Alpha Value for the boundary in V
//*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
//* Beta Value for the boundary in V
//*
//* @returns
@@ -87,6 +87,7 @@ ih264_deblk_chroma_horz_bs4_av8:
// STMFD sp!,{x4-x6,x14} //
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x1, w1
mov x6, x5
mov x5, x4
sub x0, x0, x1, lsl #1 //x0 = uc_edgePixel pointing to p1 of chroma
@@ -155,19 +156,19 @@ ih264_deblk_chroma_horz_bs4_av8:
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
//* Alpha Value for the boundary in U
//*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
//* Beta Value for the boundary in U
//*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
//* Alpha Value for the boundary in V
//*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
//* Beta Value for the boundary in V
//*
//* @returns
@@ -186,12 +187,13 @@ ih264_deblk_chroma_vert_bs4_av8:
// STMFD sp!,{x4,x5,x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x1, w1
sub x0, x0, #4 //point x0 to p1u of row0.
mov x12, x0 //keep a back up of x0 for buffer write
- add x2, x2, x4, lsl #8 //x2 = (alpha_cr,alpha_cb)
- add x3, x3, x5, lsl #8 //x3 = (beta_cr,beta_cb)
+ add w2, w2, w4, lsl #8 //w2 = (alpha_cr,alpha_cb)
+ add w3, w3, w5, lsl #8 //w3 = (beta_cr,beta_cb)
ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
@@ -292,28 +294,28 @@ ih264_deblk_chroma_vert_bs4_av8:
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
//* Alpha Value for the boundary in U
//*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
//* Beta Value for the boundary in U
//*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
//* Alpha Value for the boundary in V
//*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
//* Beta Value for the boundary in V
//*
-//* @param[in] sp(8) - u4_bs
+//* @param[in] w6 - u4_bs
//* Packed Boundary strength array
//*
-//* @param[in] sp(12) - pu1_cliptab_cb
+//* @param[in] x7 - pu1_cliptab_cb
//* tc0_table for U
//*
-//* @param[in] sp(16) - pu1_cliptab_cr
+//* @param[in] sp(0) - pu1_cliptab_cr
//* tc0_table for V
//*
//* @returns
@@ -332,14 +334,13 @@ ih264_deblk_chroma_horz_bslt4_av8:
// STMFD sp!,{x4-x9,x14} //
push_v_regs
stp x19, x20, [sp, #-16]!
- mov x8, x7
- mov x7, x6
- ldr x9, [sp, #80]
+ sxtw x1, w1
+ ldr x8, [sp, #80]
sub x0, x0, x1, lsl #1 //x0 = uc_edgePixelU pointing to p1 of chroma U
- rev w7, w7 //
- mov v12.s[0], w7 //D12[0] = ui_Bs
- ld1 {v16.s}[0], [x8] //D16[0] contains cliptab_cb
- ld1 {v17.s}[0], [x9] //D17[0] contains cliptab_cr
+ rev w6, w6 //
+ mov v12.s[0], w6 //D12[0] = ui_Bs
+ ld1 {v16.s}[0], [x7] //D16[0] contains cliptab_cb
+ ld1 {v17.s}[0], [x8] //D17[0] contains cliptab_cr
ld2 {v6.8b, v7.8b}, [x0], x1 //Q3=p1
tbl v14.8b, {v16.16b}, v12.8b //Retreiving cliptab values for U
tbl v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V
@@ -428,28 +429,28 @@ ih264_deblk_chroma_horz_bslt4_av8:
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha_cb
+//* @param[in] w2 - alpha_cb
//* Alpha Value for the boundary in U
//*
-//* @param[in] x3 - beta_cb
+//* @param[in] w3 - beta_cb
//* Beta Value for the boundary in U
//*
-//* @param[in] sp(0) - alpha_cr
+//* @param[in] w4 - alpha_cr
//* Alpha Value for the boundary in V
//*
-//* @param[in] sp(4) - beta_cr
+//* @param[in] w5 - beta_cr
//* Beta Value for the boundary in V
//*
-//* @param[in] sp(8) - u4_bs
+//* @param[in] w6 - u4_bs
//* Packed Boundary strength array
//*
-//* @param[in] sp(12) - pu1_cliptab_cb
+//* @param[in] x7 - pu1_cliptab_cb
//* tc0_table for U
//*
-//* @param[in] sp(16) - pu1_cliptab_cr
+//* @param[in] sp(0) - pu1_cliptab_cr
//* tc0_table for V
//*
//* @returns
@@ -468,11 +469,12 @@ ih264_deblk_chroma_vert_bslt4_av8:
// STMFD sp!,{x4-x7,x10-x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x1, w1
mov x10, x7
- ldr x11, [sp, #80] //x6 = u4_bs
+ ldr x11, [sp, #80] //x11 = pu1_cliptab_cr
sub x0, x0, #4 //point x0 to p1u of row0.
- add x2, x2, x4, lsl #8
- add x3, x3, x5, lsl #8
+ add w2, w2, w4, lsl #8
+ add w3, w3, w5, lsl #8
mov x12, x0 //keep a back up of x0 for buffer write
ld4 {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
ld4 {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s
index 1b3950d..7705df2 100644
--- a/common/armv8/ih264_deblk_luma_av8.s
+++ b/common/armv8/ih264_deblk_luma_av8.s
@@ -60,19 +60,19 @@
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
-//* @param[in] sp(0) - u4_bs
+//* @param[in] w4 - u4_bs
//* Packed Boundary strength array
//*
-//* @param[in] sp(4) - pu1_cliptab
+//* @param[in] x5 - pu1_cliptab
//* tc0_table
//*
//* @returns
@@ -90,6 +90,7 @@ ih264_deblk_luma_horz_bslt4_av8:
// STMFD sp!,{x4-x7,x14}
push_v_regs
+ sxtw x1, w1
stp x19, x20, [sp, #-16]!
//LDRD x4,x5,[SP,#0x14] //x4 = ui_Bs , x5 = *puc_ClpTab
@@ -214,13 +215,13 @@ ih264_deblk_luma_horz_bslt4_av8:
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
//* @returns
@@ -240,6 +241,7 @@ ih264_deblk_luma_horz_bs4_av8:
// STMFD sp!,{x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x1, w1
// Init
dup v0.16b, w2 //duplicate alpha
@@ -401,19 +403,19 @@ ih264_deblk_luma_horz_bs4_av8:
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
-//* @param[in] sp(0) - u4_bs
+//* @param[in] w4 - u4_bs
//* Packed Boundary strength array
//*
-//* @param[in] sp(4) - pu1_cliptab
+//* @param[in] x5 - pu1_cliptab
//* tc0_table
//*
//* @returns
@@ -432,6 +434,7 @@ ih264_deblk_luma_vert_bslt4_av8:
// STMFD sp!,{x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x1, w1
sub x0, x0, #4 //pointer uc_edgePixel-4
mov x12, x4
@@ -743,13 +746,13 @@ ih264_deblk_luma_vert_bslt4_av8:
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
-//* @param[in] x1 - src_strd
+//* @param[in] w1 - src_strd
//* Source stride
//*
-//* @param[in] x2 - alpha
+//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
-//* @param[in] x3 - beta
+//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
//* @returns
diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s
index 6823015..d10047e 100644
--- a/common/armv8/ih264_default_weighted_pred_av8.s
+++ b/common/armv8/ih264_default_weighted_pred_av8.s
@@ -88,18 +88,18 @@
// WORD32 src_strd1,
// WORD32 src_strd2,
// WORD32 dst_strd,
-// UWORD8 ht,
-// UWORD8 wd)
+// WORD32 ht,
+// WORD32 wd)
//
//**************Variables Vs Registers*****************************************
// x0 => puc_src1
// x1 => puc_src2
// x2 => puc_dst
-// x3 => src_strd1
-// [sp] => src_strd2 (x4)
-// [sp+4] => dst_strd (x5)
-// [sp+8] => ht (x6)
-// [sp+12] => wd (x7)
+// w3 => src_strd1
+// w4 => src_strd2
+// w5 => dst_strd
+// w6 => ht
+// w7 => wd
//
.text
.p2align 2
@@ -113,6 +113,9 @@ ih264_default_weighted_pred_luma_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
cmp w7, #16
beq loop_16 //branch if wd is 16
cmp w7, #8
@@ -263,18 +266,18 @@ end_loops:
// WORD32 src_strd1,
// WORD32 src_strd2,
// WORD32 dst_strd,
-// UWORD8 ht,
-// UWORD8 wd)
+// WORD32 ht,
+// WORD32 wd)
//
//**************Variables Vs Registers*****************************************
// x0 => puc_src1
// x1 => puc_src2
// x2 => puc_dst
-// x3 => src_strd1
-// [sp] => src_strd2 (x4)
-// [sp+4] => dst_strd (x5)
-// [sp+8] => ht (x6)
-// [sp+12] => wd (x7)
+// w3 => src_strd1
+// w4 => src_strd2
+// w5 => dst_strd
+// w6 => ht
+// w7 => wd
//
@@ -286,6 +289,9 @@ ih264_default_weighted_pred_chroma_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
cmp w7, #8
beq loop_8_uv //branch if wd is 8
cmp w7, #4
diff --git a/common/armv8/ih264_inter_pred_chroma_av8.s b/common/armv8/ih264_inter_pred_chroma_av8.s
index 714e271..f6aef40 100644
--- a/common/armv8/ih264_inter_pred_chroma_av8.s
+++ b/common/armv8/ih264_inter_pred_chroma_av8.s
@@ -91,19 +91,19 @@
// UWORD8 *pu1_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
-// UWORD8 u1_dx,
-// UWORD8 u1_dy,
+// WORD32 u1_dx,
+// WORD32 u1_dy,
// WORD32 ht,
// WORD32 wd)
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => u1_dx
-// x5 => u1_dy
-// x6 => height
-// x7 => width
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => u1_dx
+// w5 => u1_dy
+// w6 => height
+// w7 => width
//
.text
.p2align 2
@@ -120,6 +120,12 @@ ih264_inter_pred_chroma_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
+ sxtw x6, w6
+ sxtw x7, w7
diff --git a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
index 6ad463a..e7c9f86 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_horz_av8.s
@@ -89,10 +89,10 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
.text
.p2align 2
@@ -111,6 +111,10 @@ ih264_inter_pred_luma_horz_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
sub x0, x0, #2 //pu1_src-2
sub x14, x4, #16
movi v0.8b, #5 //filter coeff
diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
index 9564f99..711d73e 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
@@ -89,10 +89,10 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
.text
.p2align 2
@@ -108,6 +108,10 @@ ih264_inter_pred_luma_vert_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd
diff --git a/common/armv8/ih264_inter_pred_luma_copy_av8.s b/common/armv8/ih264_inter_pred_luma_copy_av8.s
index 1a76c1c..007df30 100644
--- a/common/armv8/ih264_inter_pred_luma_copy_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_copy_av8.s
@@ -65,10 +65,10 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x7 => ht
-// x12 => wd
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
.text
.p2align 2
@@ -82,6 +82,10 @@ ih264_inter_pred_luma_copy_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
mov x12, x5
mov x7, x4
@@ -228,14 +232,16 @@ end_inner_loop_wd_16:
// Register Usage
// x0 : pi2_src
// x1 : pu1_out
-// x2 : src_strd
-// x3 : out_strd
+// w2 : src_strd
+// w3 : out_strd
// Neon registers d0-d7, d16-d30 are used
// No need for pushing arm and neon registers
.global ih264_interleave_copy_av8
ih264_interleave_copy_av8:
push_v_regs
+ sxtw x2, w2
+ sxtw x3, w3
ld1 {v2.8b}, [x0], x2 //load src plane 1 => d2 &pred palne 2 => d3
ld1 {v3.8b}, [x0], x2
mov v2.d[1], v3.d[0]
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
index d2897b6..dd4383e 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -52,10 +52,10 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
.text
@@ -71,6 +71,10 @@ ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
//store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd
sub x0, x0, #2 //pu1_src-2
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
index 546c807..3563ac0 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -105,12 +105,12 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
-// x7 => dydx
-// x9 => *pu1_tmp
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
+// x6 => *pu1_tmp
+// w7 => dydx
.text
.p2align 2
@@ -126,6 +126,10 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
// store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
@@ -134,7 +138,8 @@ ih264_inter_pred_luma_horz_hpel_vert_qpel_av8:
mov x9, x6
- lsr x7, x7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
+ // by writing to w7 here, we clear the upper half of x7
+ lsr w7, w7, #3 // dydx >> 2 followed by dydx & 0x3 and dydx>>1 to obtain the deciding bit
add x7, x7, #2
mov x6, #48
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
index 39e3253..38268c7 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_av8.s
@@ -94,11 +94,11 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
-// x7 => dydx
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
+// w7 => dydx
.text
.p2align 2
@@ -114,6 +114,10 @@ ih264_inter_pred_luma_horz_qpel_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
and x7, x7, #3 //Finds x-offset
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
index 3f3e297..6ccf11f 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s
@@ -105,12 +105,12 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
-// x6 => dydx
-// x9 => *pu1_tmp
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
+// x6 => *pu1_tmp
+// w7 => dydx
.text
.p2align 2
@@ -125,11 +125,15 @@ ih264_inter_pred_luma_horz_qpel_vert_hpel_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
sub x0, x0, x2, lsl #1 //pu1_src-2*src_strd
sub x0, x0, #2 //pu1_src-2
mov x9, x6
- mov x6, x7
+ mov w6, w7
and x6, x6, #2 // dydx & 0x3 followed by dydx>>1 and dydx<<1
diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
index ab663d0..a9dfbd1 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_qpel_av8.s
@@ -104,11 +104,11 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
-// x6 => dydx
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
+// w7 => dydx
.text
.p2align 2
@@ -122,7 +122,11 @@ ih264_inter_pred_luma_horz_qpel_vert_qpel_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
- mov x6, x7
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
+ mov w6, w7
and x7, x6, #3
add x7, x0, x7, lsr #1 //pu1_pred_vert = pu1_src + (x_offset>>1)
diff --git a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
index 9d19a2d..014faca 100644
--- a/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_vert_qpel_av8.s
@@ -94,11 +94,11 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ht
-// x5 => wd
-// x7 => dydx
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ht
+// w5 => wd
+// w7 => dydx
.text
.p2align 2
@@ -112,6 +112,10 @@ ih264_inter_pred_luma_vert_qpel_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x2, w2
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
and x7, x7, #12 //Finds y-offset
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
index 8f0f282..39c0256 100644
--- a/common/armv8/ih264_intra_pred_chroma_av8.s
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -100,9 +100,9 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
@@ -113,13 +113,14 @@ ih264_intra_pred_chroma_8x8_mode_dc_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
- mov x19, #5
- ands x6, x4, x19
+ mov w19, #5
+ ands w6, w4, w19
beq none_available
- cmp x6, #1
+ cmp w6, #1
beq left_only_available
- cmp x6, #4
+ cmp w6, #4
beq top_only_available
all_available:
@@ -251,9 +252,9 @@ end_func:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_chroma_8x8_mode_horz_av8
@@ -263,6 +264,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
push_v_regs
+ sxtw x3, w3
ld1 {v0.8h}, [x0]
dup v10.8h, v0.h[7]
@@ -332,9 +334,9 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_chroma_8x8_mode_vert_av8
@@ -342,6 +344,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
ih264_intra_pred_chroma_8x8_mode_vert_av8:
push_v_regs
+ sxtw x3, w3
add x0, x0, #18
ld1 {v0.8b, v1.8b}, [x0]
@@ -405,15 +408,16 @@ ih264_intra_pred_chroma_8x8_mode_vert_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_chroma_8x8_mode_plane_av8
ih264_intra_pred_chroma_8x8_mode_plane_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 {v0.2s}, [x0]
add x10, x0, #10
@@ -457,18 +461,14 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8:
rshrn v13.4h, v26.4s, #6
rshrn v14.4h, v28.4s, #6
ldrb w6, [x0], #1
- sxtw x6, w6
add x10, x0, #31
ldrb w8, [x0], #1
- sxtw x8, w8
ldrb w7, [x10], #1
- sxtw x7, w7
ldrb w9, [x10], #1
- sxtw x9, w9
- add x6, x6, x7
- add x8, x8, x9
- lsl x6, x6, #4
- lsl x8, x8, #4
+ add w6, w6, w7
+ add w8, w8, w9
+ lsl w6, w6, #4
+ lsl w8, w8, #4
dup v0.8h, w6
dup v2.8h, w8
dup v4.8h, v12.h[0]
diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
index c1847b5..fa19c12 100644
--- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s
@@ -98,9 +98,9 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_vert_av8
@@ -108,6 +108,7 @@
ih264_intra_pred_luma_16x16_mode_vert_av8:
push_v_regs
+ sxtw x3, w3
add x0, x0, #17
@@ -181,9 +182,9 @@ ih264_intra_pred_luma_16x16_mode_vert_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_horz_av8
@@ -192,6 +193,7 @@ ih264_intra_pred_luma_16x16_mode_horz_av8:
push_v_regs
+ sxtw x3, w3
ld1 {v0.16b}, [x0]
@@ -283,9 +285,9 @@ ih264_intra_pred_luma_16x16_mode_horz_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_dc_av8
@@ -295,18 +297,19 @@ ih264_intra_pred_luma_16x16_mode_dc_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
sub v0.16b, v0.16b, v0.16b
sub v1.16b, v1.16b, v1.16b
mov w10, #0
mov w11 , #3
- ands x6, x4, #0x01
+ ands w6, w4, #0x01
beq top_available //LEFT NOT AVAILABLE
ld1 {v0.16b}, [x0]
add w10, w10, #8
add w11, w11, #1
top_available:
- ands x6, x4, #0x04
+ ands w6, w4, #0x04
beq none_available
add x6, x0, #17
ld1 {v1.16b}, [x6]
@@ -314,7 +317,7 @@ top_available:
add w11, w11, #1
b summation
none_available:
- cmp x4, #0
+ cmp w4, #0
bne summation
mov w15, #128
dup v20.16b, w15
@@ -410,15 +413,16 @@ end_func:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_16x16_mode_plane_av8
ih264_intra_pred_luma_16x16_mode_plane_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
mov x2, x1
add x1, x0, #17
add x0, x0, #15
@@ -440,76 +444,58 @@ ih264_intra_pred_luma_16x16_mode_plane_av8:
uxtl v18.8h, v7.8b
add x7, x0, x4, lsl #3
sub x0, x7, x4, lsl #1
- sub x20, x4, #0x0
- neg x14, x20
+ neg x14, x4
addp v0.8h, v0.8h, v1.8h
ldrb w8, [x7], #-1
- sxtw x8, w8
ldrb w9, [x0], #1
- sxtw x9, w9
saddlp v0.2s, v0.4h
- sub x12, x8, x9
+ sub w12, w8, w9
ldrb w8, [x7], #-1
- sxtw x8, w8
saddlp v0.1d, v0.2s
ldrb w9, [x0], #1
- sxtw x9, w9
- sub x8, x8, x9
+ sub w8, w8, w9
shl v2.2s, v0.2s, #2
- add x12, x12, x8, lsl #1
+ add w12, w12, w8, lsl #1
add v0.2s, v0.2s , v2.2s
ldrb w8, [x7], #-1
- sxtw x8, w8
ldrb w9, [x0], #1
- sxtw x9, w9
srshr v0.2s, v0.2s, #6 // i_b = D0[0]
- sub x8, x8, x9
+ sub w8, w8, w9
ldrb w5, [x7], #-1
- sxtw x5, w5
- add x8, x8, x8, lsl #1
+ add w8, w8, w8, lsl #1
dup v4.8h, v0.h[0]
- add x12, x12, x8
+ add w12, w12, w8
ldrb w9, [x0], #1
- sxtw x9, w9
mul v0.8h, v4.8h , v16.8h
- sub x5, x5, x9
+ sub w5, w5, w9
mul v2.8h, v4.8h , v18.8h
- add x12, x12, x5, lsl #2
+ add w12, w12, w5, lsl #2
ldrb w8, [x7], #-1
- sxtw x8, w8
ldrb w9, [x0], #1
- sxtw x9, w9
- sub x8, x8, x9
+ sub w8, w8, w9
ldrb w5, [x7], #-1
- sxtw x5, w5
- add x8, x8, x8, lsl #2
+ add w8, w8, w8, lsl #2
ldrb w6, [x0], #1
- sxtw x6, w6
- add x12, x12, x8
+ add w12, w12, w8
ldrb w8, [x7], #-1
- sxtw x8, w8
ldrb w9, [x0], #1
- sxtw x9, w9
- sub x5, x5, x6
- sub x8, x8, x9
- add x5, x5, x5, lsl #1
- sub x20, x8, x8, lsl #3
- neg x8, x20
- add x12, x12, x5, lsl #1
+ sub w5, w5, w6
+ sub w8, w8, w9
+ add w5, w5, w5, lsl #1
+ sub w20, w8, w8, lsl #3
+ neg w8, w20
+ add w12, w12, w5, lsl #1
ldrb w5, [x7], #-1
- sxtw x5, w5
ldrb w6, [x10] //top_left
- sxtw x6, w6
- add x12, x12, x8
- sub x9, x5, x6
+ add w12, w12, w8
+ sub w9, w5, w6
ldrb w6, [x1, #7]
- sxtw x6, w6
- add x12, x12, x9, lsl #3 // i_c = x12
- add x8, x5, x6
- add x12, x12, x12, lsl #2
- lsl x8, x8, #4 // i_a = x8
- add x12, x12, #0x20
- lsr x12, x12, #6
+ add w12, w12, w9, lsl #3 // i_c = w12
+ add w8, w5, w6
+ add w12, w12, w12, lsl #2
+ lsl w8, w8, #4 // i_a = w8
+ add w12, w12, #0x20
+ lsr w12, w12, #6
shl v28.8h, v4.8h, #3
dup v6.8h, w12
dup v30.8h, w8
diff --git a/common/armv8/ih264_intra_pred_luma_4x4_av8.s b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
index 62e8cee..1f95131 100644
--- a/common/armv8/ih264_intra_pred_luma_4x4_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_4x4_av8.s
@@ -102,15 +102,16 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_vert_av8
ih264_intra_pred_luma_4x4_mode_vert_av8:
push_v_regs
+ sxtw x3, w3
add x0, x0, #5
@@ -171,9 +172,9 @@ ih264_intra_pred_luma_4x4_mode_vert_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
@@ -182,6 +183,7 @@ ih264_intra_pred_luma_4x4_mode_vert_av8:
ih264_intra_pred_luma_4x4_mode_horz_av8:
push_v_regs
+ sxtw x3, w3
ld1 {v1.s}[0], [x0]
dup v0.8b, v1.b[3]
@@ -246,9 +248,9 @@ ih264_intra_pred_luma_4x4_mode_horz_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
@@ -261,41 +263,34 @@ ih264_intra_pred_luma_4x4_mode_dc_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
- ands x5, x4, #0x01
+ ands w5, w4, #0x01
beq top_available //LEFT NOT AVAILABLE
add x10, x0, #3
mov x2, #-1
ldrb w5, [x10], #-1
- sxtw x5, w5
ldrb w6, [x10], #-1
- sxtw x6, w6
ldrb w7, [x10], #-1
- sxtw x7, w7
- add x5, x5, x6
+ add w5, w5, w6
ldrb w8, [x10], #-1
- sxtw x8, w8
- add x5, x5, x7
- ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
- add x5, x5, x8
+ add w5, w5, w7
+ ands w11, w4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ add w5, w5, w8
beq left_available
add x10, x0, #5
// BOTH LEFT AND TOP AVAILABLE
ldrb w6, [x10], #1
- sxtw x6, w6
ldrb w7, [x10], #1
- sxtw x7, w7
- add x5, x5, x6
+ add w5, w5, w6
ldrb w8, [x10], #1
- sxtw x8, w8
- add x5, x5, x7
+ add w5, w5, w7
ldrb w9, [x10], #1
- sxtw x9, w9
- add x5, x5, x8
- add x5, x5, x9
- add x5, x5, #4
- lsr x5, x5, #3
+ add w5, w5, w8
+ add w5, w5, w9
+ add w5, w5, #4
+ lsr w5, w5, #3
dup v0.8b, w5
st1 {v0.s}[0], [x1], x3
st1 {v0.s}[0], [x1], x3
@@ -304,23 +299,19 @@ ih264_intra_pred_luma_4x4_mode_dc_av8:
b end_func
top_available: // ONLT TOP AVAILABLE
- ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE
+ ands w11, w4, #0x04 // CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
beq none_available
add x10, x0, #5
ldrb w6, [x10], #1
- sxtw x6, w6
ldrb w7, [x10], #1
- sxtw x7, w7
ldrb w8, [x10], #1
- sxtw x8, w8
- add x5, x6, x7
+ add w5, w6, w7
ldrb w9, [x10], #1
- sxtw x9, w9
- add x5, x5, x8
- add x5, x5, x9
- add x5, x5, #2
- lsr x5, x5, #2
+ add w5, w5, w8
+ add w5, w5, w9
+ add w5, w5, #2
+ lsr w5, w5, #2
dup v0.8b, w5
st1 {v0.s}[0], [x1], x3
st1 {v0.s}[0], [x1], x3
@@ -401,9 +392,9 @@ end_func:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_diag_dl_av8
@@ -413,6 +404,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dl_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
add x0, x0, #5
sub x5, x3, #2
@@ -488,9 +480,9 @@ end_func_diag_dl:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_diag_dr_av8
@@ -499,6 +491,7 @@ ih264_intra_pred_luma_4x4_mode_diag_dr_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 {v0.8b}, [x0]
@@ -571,9 +564,9 @@ end_func_diag_dr:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_vert_r_av8
@@ -582,6 +575,7 @@ ih264_intra_pred_luma_4x4_mode_vert_r_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 {v0.8b}, [x0]
@@ -656,9 +650,9 @@ end_func_vert_r:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_horz_d_av8
@@ -667,6 +661,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 {v0.8b}, [x0]
add x0, x0, #1
@@ -743,9 +738,9 @@ end_func_horz_d:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_vert_l_av8
@@ -754,6 +749,7 @@ ih264_intra_pred_luma_4x4_mode_vert_l_av8:
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
add x0, x0, #4
ld1 {v0.8b}, [x0]
add x0, x0, #1
@@ -825,9 +821,9 @@ end_func_vert_l:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_4x4_mode_horz_u_av8
@@ -835,11 +831,11 @@ end_func_vert_l:
ih264_intra_pred_luma_4x4_mode_horz_u_av8:
push_v_regs
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x10, x0
ld1 {v0.8b}, [x0]
ldrb w9, [x0], #1
- sxtw x9, w9
ext v1.8b, v0.8b , v0.8b , #1
ld1 {v0.b}[7], [x10]
ext v2.8b, v1.8b , v1.8b , #1
diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
index bf9a4c1..273aa81 100644
--- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s
+++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s
@@ -102,9 +102,9 @@
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_vert_av8
@@ -114,6 +114,7 @@ ih264_intra_pred_luma_8x8_mode_vert_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
//stp x19, x20,[sp,#-16]!
+ sxtw x3, w3
add x0, x0, #9
ld1 {v0.8b}, [x0]
@@ -180,9 +181,9 @@ ih264_intra_pred_luma_8x8_mode_vert_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_horz_av8
@@ -194,38 +195,30 @@ ih264_intra_pred_luma_8x8_mode_horz_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
add x0, x0, #7
- mov x2 , #-1
ldrb w5, [x0], #-1
- sxtw x5, w5
ldrb w6, [x0], #-1
- sxtw x6, w6
dup v0.8b, w5
st1 {v0.8b}, [x1], x3
ldrb w7, [x0], #-1
- sxtw x7, w7
dup v1.8b, w6
st1 {v1.8b}, [x1], x3
dup v2.8b, w7
ldrb w8, [x0], #-1
- sxtw x8, w8
dup v3.8b, w8
st1 {v2.8b}, [x1], x3
ldrb w5, [x0], #-1
- sxtw x5, w5
st1 {v3.8b}, [x1], x3
dup v0.8b, w5
ldrb w6, [x0], #-1
- sxtw x6, w6
st1 {v0.8b}, [x1], x3
ldrb w7, [x0], #-1
- sxtw x7, w7
dup v1.8b, w6
dup v2.8b, w7
st1 {v1.8b}, [x1], x3
ldrb w8, [x0], #-1
- sxtw x8, w8
dup v3.8b, w8
st1 {v2.8b}, [x1], x3
st1 {v3.8b}, [x1], x3
@@ -285,9 +278,9 @@ ih264_intra_pred_luma_8x8_mode_horz_av8:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_dc_av8
@@ -298,37 +291,30 @@ ih264_intra_pred_luma_8x8_mode_dc_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
- ands x6, x4, #0x01
+ ands w6, w4, #0x01
beq top_available //LEFT NOT AVAILABLE
add x10, x0, #7
mov x2, #-1
ldrb w5, [x10], -1
- sxtw x5, w5
ldrb w6, [x10], -1
- sxtw x6, w6
ldrb w7, [x10], -1
- sxtw x7, w7
- add x5, x5, x6
+ add w5, w5, w6
ldrb w8, [x10], -1
- sxtw x8, w8
- add x5, x5, x7
+ add w5, w5, w7
ldrb w6, [x10], -1
- sxtw x6, w6
- add x5, x5, x8
+ add w5, w5, w8
ldrb w7, [x10], -1
- sxtw x7, w7
- add x5, x5, x6
+ add w5, w5, w6
ldrb w8, [x10], -1
- sxtw x8, w8
- add x5, x5, x7
- ands x11, x4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
- add x5, x5, x8
+ add w5, w5, w7
+ ands w11, w4, #0x04 // CHECKING IF TOP_AVAILABLE ELSE BRANCHING TO ONLY LEFT AVAILABLE
+ add w5, w5, w8
ldrb w6, [x10], -1
- sxtw x6, w6
- add x5, x5, x6
+ add w5, w5, w6
beq left_available
add x10, x0, #9
// BOTH LEFT AND TOP AVAILABLE
@@ -351,7 +337,7 @@ ih264_intra_pred_luma_8x8_mode_dc_av8:
b end_func
top_available: // ONLT TOP AVAILABLE
- ands x11, x4, #0x04 // CHECKING TOP AVAILABILTY OR ELSE BRANCH TO NONE AVAILABLE
+ ands w11, w4, #0x04 // CHECKING TOP AVAILABILITY OR ELSE BRANCH TO NONE AVAILABLE
beq none_available
add x10, x0, #9
@@ -452,9 +438,9 @@ end_func:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_diag_dl_av8
@@ -463,6 +449,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dl_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
add x0, x0, #9
sub x5, x3, #4
@@ -554,9 +541,9 @@ end_func_diag_dl:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_diag_dr_av8
@@ -566,6 +553,7 @@ ih264_intra_pred_luma_8x8_mode_diag_dr_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 { v0.16b}, [x0]
@@ -654,9 +642,9 @@ end_func_diag_dr:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_vert_r_av8
@@ -666,6 +654,7 @@ ih264_intra_pred_luma_8x8_mode_vert_r_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 { v0.16b}, [x0]
mov v1.d[0], v0.d[1]
@@ -780,9 +769,9 @@ end_func_vert_r:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_horz_d_av8
@@ -791,6 +780,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 { v0.16b}, [x0]
mov v1.d[0], v0.d[1]
@@ -910,9 +900,9 @@ end_func_horz_d:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_vert_l_av8
@@ -922,6 +912,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_av8:
// STMFD sp!, {x4-x12, x14} //Restoring registers from stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
add x0, x0, #9
ld1 { v0.16b}, [x0]
mov v1.d[0], v0.d[1]
@@ -1018,9 +1009,9 @@ end_func_vert_l:
//**************Variables Vs Registers*****************************************
// x0 => *pu1_src
// x1 => *pu1_dst
-// x2 => src_strd
-// x3 => dst_strd
-// x4 => ui_neighboravailability
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => ui_neighboravailability
.global ih264_intra_pred_luma_8x8_mode_horz_u_av8
@@ -1029,6 +1020,7 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8:
// STMFD sp!, {x4-x12, x14} //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
+ sxtw x3, w3
ld1 {v0.8b}, [x0]
ld1 {v1.b}[7], [x0]
diff --git a/common/armv8/ih264_iquant_itrans_recon_av8.s b/common/armv8/ih264_iquant_itrans_recon_av8.s
index 4c83036..003ee74 100644
--- a/common/armv8/ih264_iquant_itrans_recon_av8.s
+++ b/common/armv8/ih264_iquant_itrans_recon_av8.s
@@ -103,11 +103,11 @@
//x0 => *pi2_src
//x1 => *pu1_pred
//x2 => *pu1_out
-//x3 => pred_strd
-//x4 => out_strd
+//w3 => pred_strd
+//w4 => out_strd
//x5 => *pu2_iscal_mat
//x6 => *pu2_weigh_mat
-//x7 => u4_qp_div_6
+//w7 => u4_qp_div_6
// => pi4_tmp
// => iq_start_idx
// => pi2_dc_ld_addr
@@ -119,6 +119,8 @@
ih264_iquant_itrans_recon_4x4_av8:
push_v_regs
+ sxtw x3, w3
+ sxtw x4, w4
dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15
@@ -292,11 +294,11 @@ skip_loading_luma_dc_src:
//x0 => *pi2_src
//x1 => *pu1_pred
//x2 => *pu1_out
-//x3 => pred_strd
-//x4 => out_strd
+//w3 => pred_strd
+//w4 => out_strd
//x5 => *pu2_iscal_mat
//x6 => *pu2_weigh_mat
-//x7 => u4_qp_div_6
+//w7 => u4_qp_div_6
//sp => pi4_tmp
//sp#8 => *pi2_dc_src
@@ -315,6 +317,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8:
//reduce sp by 64
push_v_regs
+ sxtw x3, w3
+ sxtw x4, w4
dup v30.4s, w7 //Populate the u4_qp_div_6 in Q15
@@ -512,11 +516,11 @@ ih264_iquant_itrans_recon_chroma_4x4_av8:
//x0 => *pi2_src
//x1 => *pu1_pred
//x2 => *pu1_out
-//x3 => pred_strd
-//x4 => out_strd
+//w3 => pred_strd
+//w4 => out_strd
//x5 => *pu2_iscal_mat
//x6 => *pu2_weigh_mat
-//x7 => u4_qp_div_6
+//w7 => u4_qp_div_6
//NOT USED => pi4_tmp
//NOT USED => iq_start_idx
//NOT USED => pi2_dc_ld_addr
@@ -525,6 +529,8 @@ ih264_iquant_itrans_recon_chroma_4x4_av8:
ih264_iquant_itrans_recon_8x8_av8:
push_v_regs
+ sxtw x3, w3
+ sxtw x4, w4
ld1 {v8.8h -v11.8h}, [x5], #64
ld1 {v12.8h-v15.8h}, [x5]
diff --git a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
index 8bb9c32..13061ec 100644
--- a/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
+++ b/common/armv8/ih264_iquant_itrans_recon_dc_av8.s
@@ -104,11 +104,11 @@
//x0 => *pi2_src
//x1 => *pu1_pred
//x2 => *pu1_out
-//x3 => pred_strd
-//x4 => out_strd
+//w3 => pred_strd
+//w4 => out_strd
//x5 => *pu2_iscal_mat
//x6 => *pu2_weigh_mat
-//x7 => u4_qp_div_6
+//w7 => u4_qp_div_6
// => pi4_tmp
// => iq_start_idx
// => pi2_dc_ld_addr
@@ -119,6 +119,8 @@
.global ih264_iquant_itrans_recon_4x4_dc_av8
ih264_iquant_itrans_recon_4x4_dc_av8:
+ sxtw x3, w3
+ sxtw x4, w4
ldr w8, [sp, #8] //Loads iq_start_idx
subs w8, w8, #1 // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
@@ -209,11 +211,11 @@ donot_use_pi2_src_luma_dc:
// x0 : pi2_src
// x1 : pu1_pred
// x2 : pu1_out
-// x3 : pred_strd
-// x4 : out_strd
+// w3 : pred_strd
+// w4 : out_strd
// x5 : pu2_iscal_mat
// x6 : pu2_weigh_mat
-// x7 : u4_qp_div_6
+// w7 : u4_qp_div_6
// : pi2_tmp
// : pi2_dc_src
// Neon registers d0-d7, d16-d30 are used
@@ -223,6 +225,8 @@ donot_use_pi2_src_luma_dc:
.global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
+ sxtw x3, w3
+ sxtw x4, w4
ldr x0, [sp, #8]
push_v_regs
ld1 {v0.h}[0], [x0]
@@ -327,11 +331,11 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
//x0 => *pi2_src
//x1 => *pu1_pred
//x2 => *pu1_out
-//x3 => pred_strd
-//x4 => out_strd
+//w3 => pred_strd
+//w4 => out_strd
//x5 => *pu2_iscal_mat
//x6 => *pu2_weigh_mat
-//x7 => u4_qp_div_6
+//w7 => u4_qp_div_6
//NOT USED => pi4_tmp
//NOT USED => iq_start_idx
//NOT USED => pi2_dc_ld_addr
@@ -340,6 +344,8 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
ih264_iquant_itrans_recon_8x8_dc_av8:
push_v_regs
+ sxtw x3, w3
+ sxtw x4, w4
ld1 {v1.h}[0], [x5]
ld1 {v2.h}[0], [x6]
diff --git a/common/armv8/ih264_mem_fns_neon_av8.s b/common/armv8/ih264_mem_fns_neon_av8.s
index 4e9020d..802550d 100644
--- a/common/armv8/ih264_mem_fns_neon_av8.s
+++ b/common/armv8/ih264_mem_fns_neon_av8.s
@@ -70,11 +70,11 @@
//*/
//void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
// UWORD8 *pu1_src,
-// UWORD8 num_bytes)
+// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// x1 => *pu1_src
-// x2 => num_bytes
+// w2 => num_bytes
@@ -89,7 +89,7 @@ loop_neon_memcpy_mul_8:
ld1 {v0.8b}, [x1], #8
st1 {v0.8b}, [x0], #8
- subs x2, x2, #8
+ subs w2, w2, #8
bne loop_neon_memcpy_mul_8
ret
@@ -99,38 +99,36 @@ loop_neon_memcpy_mul_8:
//*/
//void ih264_memcpy(UWORD8 *pu1_dst,
// UWORD8 *pu1_src,
-// UWORD8 num_bytes)
+// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// x1 => *pu1_src
-// x2 => num_bytes
+// w2 => num_bytes
.global ih264_memcpy_av8
ih264_memcpy_av8:
- subs x2, x2, #8
+ subs w2, w2, #8
blt arm_memcpy
loop_neon_memcpy:
// Memcpy 8 bytes
ld1 {v0.8b}, [x1], #8
st1 {v0.8b}, [x0], #8
- subs x2, x2, #8
+ subs w2, w2, #8
bge loop_neon_memcpy
- cmn x2, #8
+ cmn w2, #8
beq end_func1
arm_memcpy:
- add x2, x2, #8
+ add w2, w2, #8
loop_arm_memcpy:
ldrb w3, [x1], #1
- sxtw x3, w3
strb w3, [x0], #1
- sxtw x3, w3
- subs x2, x2, #1
+ subs w2, w2, #1
bne loop_arm_memcpy
ret
end_func1:
@@ -139,7 +137,7 @@ end_func1:
//void ih264_memset_mul_8(UWORD8 *pu1_dst,
// UWORD8 value,
-// UWORD8 num_bytes)
+// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// x1 => value
@@ -156,7 +154,7 @@ loop_memset_mul_8:
// Memset 8 bytes
st1 {v0.8b}, [x0], #8
- subs x2, x2, #8
+ subs w2, w2, #8
bne loop_memset_mul_8
ret
@@ -164,36 +162,35 @@ loop_memset_mul_8:
//void ih264_memset(UWORD8 *pu1_dst,
// UWORD8 value,
-// UWORD8 num_bytes)
+// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
-// x1 => value
-// x2 => num_bytes
+// w1 => value
+// w2 => num_bytes
.global ih264_memset_av8
ih264_memset_av8:
- subs x2, x2, #8
+ subs w2, w2, #8
blt arm_memset
dup v0.8b, w1
loop_neon_memset:
// Memcpy 8 bytes
st1 {v0.8b}, [x0], #8
- subs x2, x2, #8
+ subs w2, w2, #8
bge loop_neon_memset
- cmn x2, #8
+ cmn w2, #8
beq end_func2
arm_memset:
- add x2, x2, #8
+ add w2, w2, #8
loop_arm_memset:
strb w1, [x0], #1
- sxtw x1, w1
- subs x2, x2, #1
+ subs w2, w2, #1
bne loop_arm_memset
ret
end_func2:
@@ -205,11 +202,11 @@ end_func2:
//void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
// UWORD16 value,
-// UWORD8 num_words)
+// UWORD32 num_words)
//**************Variables Vs Registers*************************
// x0 => *pu2_dst
-// x1 => value
-// x2 => num_words
+// w1 => value
+// w2 => num_words
.global ih264_memset_16bit_mul_8_av8
@@ -224,7 +221,7 @@ loop_memset_16bit_mul_8:
st1 {v0.4h}, [x0], #8
st1 {v0.4h}, [x0], #8
- subs x2, x2, #8
+ subs w2, w2, #8
bne loop_memset_16bit_mul_8
ret
@@ -233,18 +230,18 @@ loop_memset_16bit_mul_8:
//void ih264_memset_16bit(UWORD16 *pu2_dst,
// UWORD16 value,
-// UWORD8 num_words)
+// UWORD32 num_words)
//**************Variables Vs Registers*************************
// x0 => *pu2_dst
-// x1 => value
-// x2 => num_words
+// w1 => value
+// w2 => num_words
.global ih264_memset_16bit_av8
ih264_memset_16bit_av8:
- subs x2, x2, #8
+ subs w2, w2, #8
blt arm_memset_16bit
dup v0.4h, w1
loop_neon_memset_16bit:
@@ -252,18 +249,17 @@ loop_neon_memset_16bit:
st1 {v0.4h}, [x0], #8
st1 {v0.4h}, [x0], #8
- subs x2, x2, #8
+ subs w2, w2, #8
bge loop_neon_memset_16bit
- cmn x2, #8
+ cmn w2, #8
beq end_func3
arm_memset_16bit:
- add x2, x2, #8
+ add w2, w2, #8
loop_arm_memset_16bit:
strh w1, [x0], #2
- sxtw x1, w1
- subs x2, x2, #1
+ subs w2, w2, #1
bne loop_arm_memset_16bit
ret
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s
index 35d9c8a..e03fe2f 100644
--- a/common/armv8/ih264_padding_neon_av8.s
+++ b/common/armv8/ih264_padding_neon_av8.s
@@ -76,9 +76,9 @@
// WORD32 pad_size)
//**************Variables Vs Registers*************************
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => wd
-// x3 => pad_size
+// w1 => src_strd
+// w2 => wd
+// w3 => pad_size
.global ih264_pad_top_av8
@@ -86,25 +86,25 @@ ih264_pad_top_av8:
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
stp x19, x20, [sp, #-16]!
sub x5, x0, x1
- sub x20, x1, #0
- neg x6, x20
+ neg x6, x1
loop_neon_memcpy_mul_16:
// Load 16 bytes
ld1 {v0.8b, v1.8b}, [x0], #16
mov x4, x5
- mov x7, x3
+ mov w7, w3
add x5, x5, #16
loop_neon_pad_top:
st1 {v0.8b, v1.8b}, [x4], x6
- subs x7, x7, #1
+ subs w7, w7, #1
bne loop_neon_pad_top
- subs x2, x2, #16
+ subs w2, w2, #16
bne loop_neon_memcpy_mul_16
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
@@ -160,9 +160,9 @@ loop_neon_pad_top:
// WORD32 pad_size)
//**************Variables Vs Registers*************************
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -172,6 +172,8 @@ ih264_pad_left_luma_av8:
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
@@ -182,43 +184,35 @@ ih264_pad_left_luma_av8:
loop_16: // /*hard coded for width=16 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
st1 {v2.16b}, [x4], x1 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
dup v4.16b, w10
dup v6.16b, w11
st1 {v4.16b}, [x4], x1 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], x1 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], x1 // 16 bytes store
dup v4.16b, w10
dup v6.16b, w11
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v4.16b}, [x4], x1 // 16 bytes store
st1 {v6.16b}, [x4], x1 // 16 bytes store
bne loop_16
@@ -227,14 +221,11 @@ loop_16: // /*hard coded for width=16 ,height =
loop_32: // /*hard coded for width=32 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6
@@ -243,35 +234,30 @@ loop_32: // /*hard coded for width=32 ,height =8
st1 {v2.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], #16 // 16 bytes store
dup v0.16b, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
st1 {v6.16b}, [x4], x6 // 16 bytes store
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
bne loop_32
@@ -333,9 +319,9 @@ end_func:
// WORD32 pad_size)
//{
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
sub x4, x0, x3
@@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8:
loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
@@ -383,27 +367,23 @@ loop_32_l_c: // /*hard coded for width=32 ,height =
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
@@ -412,20 +392,16 @@ loop_32_l_c: // /*hard coded for width=32 ,height =
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
@@ -500,9 +476,9 @@ end_func_l_c:
//}
//
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -512,6 +488,8 @@ ih264_pad_right_luma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x4, x0
@@ -522,43 +500,35 @@ ih264_pad_right_luma_av8:
loop_16_r: // /*hard coded for width=16 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
st1 {v2.16b}, [x4], x1 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
dup v4.16b, w10
dup v6.16b, w11
st1 {v4.16b}, [x4], x1 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], x1 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], x1 // 16 bytes store
dup v4.16b, w10
dup v6.16b, w11
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v4.16b}, [x4], x1 // 16 bytes store
st1 {v6.16b}, [x4], x1 // 16 bytes store
bne loop_16_r
@@ -567,14 +537,11 @@ loop_16_r: // /*hard coded for width=16 ,height =8,16*/
loop_32_r: // /*hard coded for width=32 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6
@@ -583,35 +550,30 @@ loop_32_r: // /*hard coded for width=32 ,height =
st1 {v2.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], #16 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
st1 {v6.16b}, [x4], x6 // 16 bytes store
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
bne loop_32_r
@@ -672,9 +634,9 @@ end_func_r:
// WORD32 ht,
// WORD32 pad_size)
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x4, x0
@@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8:
loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
@@ -720,27 +680,23 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
dup v0.8h, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
@@ -748,20 +704,16 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
bne loop_32_r_c
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
dup v0.8h, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s
index 316c220..d2ba3cf 100644
--- a/common/armv8/ih264_resi_trans_quant_av8.s
+++ b/common/armv8/ih264_resi_trans_quant_av8.s
@@ -45,18 +45,6 @@
//* function name : ih264_resi_trans_quant_4x4
//* description : this function does cf4 of h264
//*
-//* arguments : x0 :pointer to src buffer
-// x1 :pointer to pred buffer
-// x2 :pointer to dst buffer
-// x3 :source stride
-// x4 :pred stride,
-// x5 :dst stride,
-// x6 :pointer to scaling matrix,
-// x7 :pointer to threshold matrix,
-// stack qbits,
-// rounding factor,
-// pointer to store nnz
-// pointer to store non quantized dc value
// values returned : none
//
// register usage :
@@ -77,34 +65,24 @@
.global ih264_resi_trans_quant_4x4_av8
ih264_resi_trans_quant_4x4_av8:
- //x0 :pointer to src buffer
- //x1 :pointer to pred buffer
- //x2 :pointer to dst buffer
- //x3 :source stride
- //x4 :pred stride
- //x5 :dst stride,
- //x6 :scale matirx,
- //x7 :threshold matrix
- // :qbits
- // :round factor
- // :nnz
- // :pointer to store non quantized dc value
push_v_regs
//x0 :pointer to src buffer
//x1 :pointer to pred buffer
//x2 :pointer to dst buffer
- //x3 :source stride
- //x4 :pred stride
- //x5 :scale matirx,
+ //w3 :source stride
+ //w4 :pred stride
+ //x5 :scale matrix,
//x6 :threshold matrix
- //x7 :qbits
- //x8 :round factor
+ //w7 :qbits
+ //w8 :round factor
//x9 :nnz
//x10 :pointer to store non quantized dc value
+ sxtw x3, w3
+ sxtw x4, w4
ldr w8, [sp, #64] //load round factor
ldr x10, [sp, #80] //load addres for non quant val
- neg x7, x7 //negate the qbit value for usiing lsl
+ neg w7, w7 //negate the qbit value for using lsl
ldr x9, [sp, #72]
//------------fucntion loading done----------------;
@@ -259,18 +237,6 @@ ih264_resi_trans_quant_4x4_av8:
//* description : this function does residue calculation, forward transform
//* and quantization for 4x4 chroma block.
//*
-//* arguments : x0 :pointer to src buffer
-// x1 :pointer to pred buffer
-// x2 :pointer to dst buffer
-// x3 :source stride
-// x4 :pred stride,
-// x5 :dst stride,
-// x6 :pointer to scaling matrix,
-// x7 :pointer to threshold matrix,
-// stack qbits,
-// rounding factor,
-// pointer to store nnz
-// pointer to store unquantized dc values
// values returned : none
//
// register usage :
@@ -290,33 +256,24 @@ ih264_resi_trans_quant_4x4_av8:
.global ih264_resi_trans_quant_chroma_4x4_av8
ih264_resi_trans_quant_chroma_4x4_av8:
- //x0 :pointer to src buffer
- //x1 :pointer to pred buffer
- //x2 :pointer to dst buffer
- //x3 :source stride
- //stack :pred stride
- // :scale matirx,
- // :threshold matrix
- // :qbits
- // :round factor
- // :nnz
- // :pu1_dc_alt_addr
push_v_regs
//x0 :pointer to src buffer
//x1 :pointer to pred buffer
//x2 :pointer to dst buffer
- //x3 :source stride
- //x4 :pred stride
+ //w3 :source stride
+ //w4 :pred stride
//x5 :scale matirx,
//x6 :threshold matrix
- //x7 :qbits
- //x8 :round factor
+ //w7 :qbits
+ //w8 :round factor
//x9 :nnz
//x10 :pointer to store non quantized dc value
+ sxtw x3, w3
+ sxtw x4, w4
ldr w8, [sp, #64] //load round factor
ldr x10, [sp, #80] //load addres for non quant val
- neg x7, x7 //negate the qbit value for usiing lsl
+ neg w7, w7 //negate the qbit value for using lsl
ldr x9, [sp, #72]
//------------fucntion loading done----------------;
@@ -485,10 +442,10 @@ ih264_resi_trans_quant_chroma_4x4_av8:
//* arguments : x0 :pointer to src buffer
// x1 :pointer to dst buffer
// x2 :pu2_scale_matrix
-// x2 :pu2_threshold_matrix
-// x3 :u4_qbits
-// x4 :u4_round_factor
-// x5 :pu1_nnz
+// x3 :pu2_threshold_matrix
+// w4 :u4_qbits
+// w5 :u4_round_factor
+// x6 :pu1_nnz
// values returned : none
//
// register usage :
@@ -516,8 +473,8 @@ ih264_hadamard_quant_4x4_av8:
//x1 :pointer to dst buffer
//x2 :pu2_scale_matrix
//x3 :pu2_threshold_matrix
-//x4 :u4_qbits
-//x5 :u4_round_factor
+//w4 :u4_qbits
+//w5 :u4_round_factor
//x6 :pu1_nnz
push_v_regs
@@ -632,10 +589,10 @@ ih264_hadamard_quant_4x4_av8:
//* arguments : x0 :pointer to src buffer
// x1 :pointer to dst buffer
// x2 :pu2_scale_matrix
-// x2 :pu2_threshold_matrix
-// x3 :u4_qbits
-// x4 :u4_round_factor
-// x5 :pu1_nnz
+// x3 :pu2_threshold_matrix
+// w4 :u4_qbits
+// w5 :u4_round_factor
+// x6 :pu1_nnz
// values returned : none
//
// register usage :
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index b039fba..475f690 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -103,28 +103,28 @@
// WORD32 src_strd1,
// WORD32 src_strd2,
// WORD32 dst_strd,
-// UWORD16 log_WD,
-// UWORD32 wt1,
-// UWORD32 wt2,
-// UWORD16 ofst1,
-// UWORD16 ofst2,
-// UWORD8 ht,
-// UWORD8 wd)
+// WORD32 log_WD,
+// WORD32 wt1,
+// WORD32 wt2,
+// WORD32 ofst1,
+// WORD32 ofst2,
+// WORD32 ht,
+// WORD32 wd)
//
//**************Variables Vs Registers*****************************************
// x0 => puc_src1
// x1 => puc_src2
// x2 => puc_dst
-// x3 => src_strd1
-// [sp] => src_strd2 (x4)
-// [sp+4] => dst_strd (x5)
-// [sp+8] => log_WD (x6)
-// [sp+12] => wt1 (x7)
-// [sp+16] => wt2 (x8)
-// [sp+20] => ofst1 (x9)
-// [sp+24] => ofst2 (x10)
-// [sp+28] => ht (x11)
-// [sp+32] => wd (x12)
+// w3 => src_strd1
+// w4 => src_strd2
+// w5 => dst_strd
+// w6 => log_WD
+// w7 => wt1
+// [sp] => wt2 (w8)
+// [sp+8] => ofst1 (w9)
+// [sp+16] => ofst2 (w10)
+// [sp+24] => ht (w11)
+// [sp+32] => wd (w12)
//
.text
.p2align 2
@@ -138,21 +138,23 @@ ih264_weighted_bi_pred_luma_av8:
// STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
stp x19, x20, [sp, #-16]!
- ldr x8, [sp, #80] //Load wt2 in x8
- ldr x9, [sp, #88] //Load ofst1 in x9
- add x6, x6, #1 //x6 = log_WD + 1
- sub x20, x6, #0 //x13 = -(log_WD + 1)
- neg x10, x20
+ ldr w8, [sp, #80] //Load wt2 in w8
+ ldr w9, [sp, #88] //Load ofst1 in w9
+ add w6, w6, #1 //w6 = log_WD + 1
+ neg w10, w6 //w10 = -(log_WD + 1)
dup v0.8h, w10 //Q0 = -(log_WD + 1) (32-bit)
- ldr x10, [sp, #96] //Load ofst2 in x10
- ldr x11, [sp, #104] //Load ht in x11
- ldr x12, [sp, #112] //Load wd in x12
- add x9, x9, #1 //x9 = ofst1 + 1
- add x9, x9, x10 //x9 = ofst1 + ofst2 + 1
+ ldr w10, [sp, #96] //Load ofst2 in w10
+ ldr w11, [sp, #104] //Load ht in w11
+ ldr w12, [sp, #112] //Load wd in w12
+ add w9, w9, #1 //w9 = ofst1 + 1
+ add w9, w9, w10 //w9 = ofst1 + ofst2 + 1
mov v2.s[0], w7
mov v2.s[1], w8 //D2 = {wt1(32-bit), wt2(32-bit)}
- asr x9, x9, #1 //x9 = ofst = (ofst1 + ofst2 + 1) >> 1
+ asr w9, w9, #1 //w9 = ofst = (ofst1 + ofst2 + 1) >> 1
dup v3.8b, w9 //D3 = ofst (8-bit)
cmp w12, #16
beq loop_16 //branch if wd is 16
@@ -383,28 +385,28 @@ end_loops:
// WORD32 src_strd1,
// WORD32 src_strd2,
// WORD32 dst_strd,
-// UWORD16 log_WD,
-// UWORD32 wt1,
-// UWORD32 wt2,
-// UWORD16 ofst1,
-// UWORD16 ofst2,
-// UWORD8 ht,
-// UWORD8 wd)
+// WORD32 log_WD,
+// WORD32 wt1,
+// WORD32 wt2,
+// WORD32 ofst1,
+// WORD32 ofst2,
+// WORD32 ht,
+// WORD32 wd)
//
//**************Variables Vs Registers*****************************************
// x0 => puc_src1
// x1 => puc_src2
// x2 => puc_dst
-// x3 => src_strd1
-// [sp] => src_strd2 (x4)
-// [sp+4] => dst_strd (x5)
-// [sp+8] => log_WD (x6)
-// [sp+12] => wt1 (x7)
-// [sp+16] => wt2 (x8)
-// [sp+20] => ofst1 (x9)
-// [sp+24] => ofst2 (x10)
-// [sp+28] => ht (x11)
-// [sp+32] => wd (x12)
+// w3 => src_strd1
+// w4 => src_strd2
+// w5 => dst_strd
+// w6 => log_WD
+// w7 => wt1
+// [sp] => wt2 (w8)
+// [sp+8] => ofst1 (w9)
+// [sp+16] => ofst2 (w10)
+// [sp+24] => ht (w11)
+// [sp+32] => wd (w12)
//
@@ -417,24 +419,22 @@ ih264_weighted_bi_pred_chroma_av8:
// STMFD sp!, {x4-x12,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x3, w3
+ sxtw x4, w4
+ sxtw x5, w5
stp x19, x20, [sp, #-16]!
- ldr x8, [sp, #80] //Load wt2 in x8
+ ldr w8, [sp, #80] //Load wt2 in w8
dup v4.4s, w8 //Q2 = (wt2_u, wt2_v) (32-bit)
dup v2.4s, w7 //Q1 = (wt1_u, wt1_v) (32-bit)
- add x6, x6, #1 //x6 = log_WD + 1
- ldr w9, [sp, #88] //Load ofst1 in x9
- sxtw x9, w9
- ldr w10, [sp, #96] //Load ofst2 in x10
- sxtw x10, w10
- sub x20, x6, #0 //x12 = -(log_WD + 1)
- neg x20, x20
+ add w6, w6, #1 //w6 = log_WD + 1
+ ldr w9, [sp, #88] //Load ofst1 in w9
+ ldr w10, [sp, #96] //Load ofst2 in w10
+ neg w20, w6 //w20 = -(log_WD + 1)
dup v0.8h, w20 //Q0 = -(log_WD + 1) (16-bit)
ldr w11, [sp, #104] //Load ht in x11
ldr w12, [sp, #112] //Load wd in x12
- sxtw x11, w11
- sxtw x12, w12
dup v20.8h, w9 //0ffset1
dup v21.8h, w10 //0ffset2
srhadd v6.8b, v20.8b, v21.8b
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
index 69ed3b0..f145217 100644
--- a/common/armv8/ih264_weighted_pred_av8.s
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -89,22 +89,22 @@
// UWORD8 *puc_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
-// UWORD8 log_WD,
-// UWORD32 wt,
-// UWORD16 ofst,
-// UWORD8 ht,
-// UWORD8 wd)
+// WORD32 log_WD,
+// WORD32 wt,
+// WORD32 ofst,
+// WORD32 ht,
+// WORD32 wd)
//
//**************Variables Vs Registers*****************************************
// x0 => puc_src
// x1 => puc_dst
-// x2 => src_strd
-// x3 => dst_strd
-// [sp] => log_WD (x4)
-// [sp+4] => wt (x5)
-// [sp+8] => ofst (x6)
-// [sp+12] => ht (x7)
-// [sp+16] => wd (x8)
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => log_WD
+// w5 => wt
+// w6 => ofst
+// w7 => ht
+// [sp] => wd (w8)
//
.text
.p2align 2
@@ -118,13 +118,14 @@ ih264_weighted_pred_luma_av8:
// STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x2, w2
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
ldr w8, [sp, #80] //Load wd
sxtw x8, w8
dup v2.4h, w5 //D2 = wt (16-bit)
- sub x20, x4, #0 //x9 = -log_WD
- neg x9, x20
+ neg w9, w4 //w9 = -log_WD
dup v3.8b, w6 //D3 = ofst (8-bit)
cmp w8, #16 //check if wd is 16
dup v0.8h, w9 //Q0 = -log_WD (16-bit)
@@ -318,22 +319,22 @@ end_loops:
// UWORD8 *puc_dst,
// WORD32 src_strd,
// WORD32 dst_strd,
-// UWORD8 log_WD,
-// UWORD32 wt,
-// UWORD16 ofst,
-// UWORD8 ht,
-// UWORD8 wd)
+// WORD32 log_WD,
+// WORD32 wt,
+// WORD32 ofst,
+// WORD32 ht,
+// WORD32 wd)
//
//**************Variables Vs Registers*****************************************
// x0 => puc_src
// x1 => puc_dst
-// x2 => src_strd
-// x3 => dst_strd
-// [sp] => log_WD (x4)
-// [sp+4] => wt (x5)
-// [sp+8] => ofst (x6)
-// [sp+12] => ht (x7)
-// [sp+16] => wd (x8)
+// w2 => src_strd
+// w3 => dst_strd
+// w4 => log_WD
+// w5 => wt
+// w6 => ofst
+// w7 => ht
+// [sp] => wd (w8)
//
@@ -345,13 +346,14 @@ ih264_weighted_pred_chroma_av8:
// STMFD sp!, {x4-x9,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x2, w2
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
ldr w8, [sp, #80] //Load wd
sxtw x8, w8
- sub x20, x4, #0 //x9 = -log_WD
- neg x9, x20
+ neg w9, w4 //w9 = -log_WD
dup v2.4s, w5 //Q1 = {wt_u (16-bit), wt_v (16-bit)}