about | summary | refs | log | tree | commit | diff
path: root/common/armv8/ih264_padding_neon_av8.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/armv8/ih264_padding_neon_av8.s')
-rw-r--r-- common/armv8/ih264_padding_neon_av8.s 120
1 file changed, 36 insertions, 84 deletions
diff --git a/common/armv8/ih264_padding_neon_av8.s b/common/armv8/ih264_padding_neon_av8.s
index 35d9c8a..e03fe2f 100644
--- a/common/armv8/ih264_padding_neon_av8.s
+++ b/common/armv8/ih264_padding_neon_av8.s
@@ -76,9 +76,9 @@
// WORD32 pad_size)
//**************Variables Vs Registers*************************
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => wd
-// x3 => pad_size
+// w1 => src_strd
+// w2 => wd
+// w3 => pad_size
.global ih264_pad_top_av8
@@ -86,25 +86,25 @@ ih264_pad_top_av8:
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
stp x19, x20, [sp, #-16]!
sub x5, x0, x1
- sub x20, x1, #0
- neg x6, x20
+ neg x6, x1
loop_neon_memcpy_mul_16:
// Load 16 bytes
ld1 {v0.8b, v1.8b}, [x0], #16
mov x4, x5
- mov x7, x3
+ mov w7, w3
add x5, x5, #16
loop_neon_pad_top:
st1 {v0.8b, v1.8b}, [x4], x6
- subs x7, x7, #1
+ subs w7, w7, #1
bne loop_neon_pad_top
- subs x2, x2, #16
+ subs w2, w2, #16
bne loop_neon_memcpy_mul_16
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
@@ -160,9 +160,9 @@ loop_neon_pad_top:
// WORD32 pad_size)
//**************Variables Vs Registers*************************
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -172,6 +172,8 @@ ih264_pad_left_luma_av8:
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
@@ -182,43 +184,35 @@ ih264_pad_left_luma_av8:
loop_16: // /*hard coded for width=16 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
st1 {v2.16b}, [x4], x1 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
dup v4.16b, w10
dup v6.16b, w11
st1 {v4.16b}, [x4], x1 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], x1 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], x1 // 16 bytes store
dup v4.16b, w10
dup v6.16b, w11
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v4.16b}, [x4], x1 // 16 bytes store
st1 {v6.16b}, [x4], x1 // 16 bytes store
bne loop_16
@@ -227,14 +221,11 @@ loop_16: // /*hard coded for width=16 ,height =
loop_32: // /*hard coded for width=32 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6
@@ -243,35 +234,30 @@ loop_32: // /*hard coded for width=32 ,height =8
st1 {v2.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], #16 // 16 bytes store
dup v0.16b, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
st1 {v6.16b}, [x4], x6 // 16 bytes store
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
bne loop_32
@@ -333,9 +319,9 @@ end_func:
// WORD32 pad_size)
//{
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -345,6 +331,8 @@ ih264_pad_left_chroma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
sub x4, x0, x3
@@ -354,27 +342,23 @@ ih264_pad_left_chroma_av8:
loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
@@ -383,27 +367,23 @@ loop_32_l_c: // /*hard coded for width=32 ,height =
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
@@ -412,20 +392,16 @@ loop_32_l_c: // /*hard coded for width=32 ,height =
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
@@ -500,9 +476,9 @@ end_func_l_c:
//}
//
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -512,6 +488,8 @@ ih264_pad_right_luma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x4, x0
@@ -522,43 +500,35 @@ ih264_pad_right_luma_av8:
loop_16_r: // /*hard coded for width=16 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
st1 {v2.16b}, [x4], x1 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
dup v4.16b, w10
dup v6.16b, w11
st1 {v4.16b}, [x4], x1 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], x1 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], x1 // 16 bytes store
dup v4.16b, w10
dup v6.16b, w11
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v4.16b}, [x4], x1 // 16 bytes store
st1 {v6.16b}, [x4], x1 // 16 bytes store
bne loop_16_r
@@ -567,14 +537,11 @@ loop_16_r: // /*hard coded for width=16 ,height =8,16*/
loop_32_r: // /*hard coded for width=32 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6
@@ -583,35 +550,30 @@ loop_32_r: // /*hard coded for width=32 ,height =
st1 {v2.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
- sxtw x8, w8
st1 {v6.16b}, [x4], #16 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.16b, w8
st1 {v6.16b}, [x4], x6 // 16 bytes store
ldrb w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #8
+ subs w2, w2, #8
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
bne loop_32_r
@@ -672,9 +634,9 @@ end_func_r:
// WORD32 ht,
// WORD32 pad_size)
// x0 => *pu1_src
-// x1 => src_strd
-// x2 => ht
-// x3 => pad_size
+// w1 => src_strd
+// w2 => ht
+// w3 => pad_size
@@ -684,6 +646,8 @@ ih264_pad_right_chroma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
+ sxtw x1, w1
+ sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x4, x0
@@ -692,24 +656,20 @@ ih264_pad_right_chroma_av8:
loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
@@ -720,27 +680,23 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
dup v0.8h, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
- subs x2, x2, #4
+ subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
@@ -748,20 +704,16 @@ loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
bne loop_32_r_c
ldrh w8, [x0]
add x0, x0, x1
- sxtw x8, w8
dup v0.8h, w8
ldrh w9, [x0]
add x0, x0, x1
- sxtw x9, w9
ldrh w10, [x0]
add x0, x0, x1
- sxtw x10, w10
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
- sxtw x11, w11
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store