author     Arpad Panyik <Arpad.Panyik@arm.com>    2024-04-26 17:51:35 +0200
committer  Arpad Panyik <Arpad.Panyik@arm.com>    2024-05-08 23:28:52 +0200
commit     b2eca1aca7b055ec6255ebb286edab080a377526 (patch)
tree       07a97759fb882e4d3eb8340ef1e81178659f8181
parent     d1bdf4f1ff4bae70834d9e5391bb68b75c1c9111 (diff)
download   libdav1d-b2eca1aca7b055ec6255ebb286edab080a377526.tar.gz
AArch64: Optimize vertical i8mm subpel filters
Replace the accumulator initializations of the vertical subpel filters with zeroing register fills (usually zero-latency operations on CPUs in this feature class); this requires using rounding shifts at the end in the prep cases. Out-of-order CPU cores can benefit from this change.

The width=16 case uses a simpler register duplication scheme that relies on MOV instructions for the subsequent shuffles. It also loads the data into a different register, which improves instruction scheduling and shortens the data dependency chain.

Relative performance of micro benchmarks (lower is better):

Cortex-X3:
  mct_8tap_sharp_w16_v_8bpc_i8mm: 0.910x
  mct_8tap_sharp_w8_v_8bpc_i8mm:  0.986x
  mc_8tap_sharp_w16_v_8bpc_i8mm:  0.864x
  mc_8tap_sharp_w8_v_8bpc_i8mm:   0.882x
  mc_8tap_sharp_w4_v_8bpc_i8mm:   0.933x
  mc_8tap_sharp_w2_v_8bpc_i8mm:   0.926x

Cortex-A715:
  mct_8tap_sharp_w16_v_8bpc_i8mm: 0.855x
  mct_8tap_sharp_w8_v_8bpc_i8mm:  0.784x
  mct_8tap_sharp_w4_v_8bpc_i8mm:  1.069x
  mc_8tap_sharp_w16_v_8bpc_i8mm:  0.850x
  mc_8tap_sharp_w8_v_8bpc_i8mm:   0.779x
  mc_8tap_sharp_w4_v_8bpc_i8mm:   0.971x
  mc_8tap_sharp_w2_v_8bpc_i8mm:   0.975x

Cortex-A510:
  mct_8tap_sharp_w16_v_8bpc_i8mm: 1.001x
  mct_8tap_sharp_w8_v_8bpc_i8mm:  0.979x
  mct_8tap_sharp_w4_v_8bpc_i8mm:  0.998x
  mc_8tap_sharp_w16_v_8bpc_i8mm:  0.998x
  mc_8tap_sharp_w8_v_8bpc_i8mm:   1.004x
  mc_8tap_sharp_w4_v_8bpc_i8mm:   1.003x
  mc_8tap_sharp_w2_v_8bpc_i8mm:   0.996x
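(Editor's note, not part of the patch: a minimal scalar sketch of why the prep-path change is bit-exact, assuming the shift amount of 2 used in the diff. Seeding the 32-bit accumulators with the rounding constant 2 and finishing with a plain sshr/shrn #2 gives the same value as seeding with zero and finishing with a rounding srshr/rshrn #2, because the rounding shift adds 1 << (2 - 1) = 2 before shifting. The helper names and harness below are illustrative only.)

    /* Illustration only (not dav1d code): scalar model of one 32-bit
     * accumulator lane in the prep path, with the shift amount of 2
     * from the patch. */
    #include <assert.h>
    #include <stdint.h>

    /* Old i8mm scheme: seed the accumulator with the rounding constant
     * (movi v4.4s, #2; mov v0.16b, v4.16b), then use a plain arithmetic
     * shift at the end (sshr/shrn #2). */
    static int32_t prep_old(int32_t dot_sum)
    {
        int32_t acc = 2;        /* accumulator pre-loaded with rounding */
        acc += dot_sum;         /* dot-product accumulation             */
        return acc >> 2;        /* sshr #2 (arithmetic shift)           */
    }

    /* New i8mm scheme: seed with zero (movi v0.4s, #0), then use a
     * rounding shift (srshr/rshrn #2), which adds 1 << (2 - 1) = 2
     * before shifting. */
    static int32_t prep_new(int32_t dot_sum)
    {
        int32_t acc = 0;        /* cheap zero fill                      */
        acc += dot_sum;         /* dot-product accumulation             */
        return (acc + 2) >> 2;  /* srshr #2                             */
    }

    int main(void)
    {
        /* Assumes arithmetic right shift of negative values, as AArch64
         * compilers provide; both paths reduce to (dot_sum + 2) >> 2.  */
        for (int32_t s = -100000; s <= 100000; s++)
            assert(prep_old(s) == prep_new(s));
        return 0;
    }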
-rw-r--r--  src/arm/64/mc_dotprod.S | 114
1 file changed, 83 insertions, 31 deletions
diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S
index 19431ab..b61ee26 100644
--- a/src/arm/64/mc_dotprod.S
+++ b/src/arm/64/mc_dotprod.S
@@ -113,13 +113,7 @@ L(\type\()_8tap_v_\isa):
madd \my, \my, w11, w10
ldr q6, L(v_tbl_neon_dotprod)
sub \src, \src, \s_strd
-.ifc \isa, neon_i8mm
- .ifc \type, prep
- movi v4.4s, #2 // rounding
- .else
- movi v4.4s, #0
- .endif
-.else // neon_dotprod
+.ifc \isa, neon_dotprod
.ifc \type, prep
mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding
dup v4.4s, w8
@@ -202,17 +196,21 @@ L(\type\()_8tap_v_\isa):
.endif
.align LOOP_ALIGN
16:
+.ifc \isa, neon_i8mm
+ ld1 {v18.16b}, [\lsrc], \s_strd
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movi v2.4s, #0
+ movi v3.4s, #0
+ mov v21.16b, v18.16b
+ mov v24.16b, v18.16b
+ mov v27.16b, v18.16b
+.else // neon_dotprod
ld1 {v27.16b}, [\lsrc], \s_strd
-
mov v0.16b, v4.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
-.ifc \isa, neon_i8mm
- mov v18.16b, v27.16b
- mov v21.16b, v27.16b
- mov v24.16b, v27.16b
-.else // neon_dotprod
sub v18.16b, v27.16b, v5.16b
sub v21.16b, v27.16b, v5.16b
sub v24.16b, v27.16b, v5.16b
@@ -242,8 +240,13 @@ L(\type\()_8tap_v_\isa):
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
.ifc \type, prep
+ .ifc \isa, neon_i8mm
+ srshr v0.8h, v0.8h, #2
+ srshr v1.8h, v2.8h, #2
+ .else
sshr v0.8h, v0.8h, #2
sshr v1.8h, v2.8h, #2
+ .endif
st1 {v0.8h, v1.8h}, [\ldst], \d_strd
.else // put
sqrshrun v0.8b, v0.8h, #6
@@ -252,11 +255,17 @@ L(\type\()_8tap_v_\isa):
.endif
b.gt 16b
+.ifc \isa, neon_i8mm
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movi v2.4s, #0
+ movi v3.4s, #0
+.else // neon_dotprod
mov v0.16b, v4.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
-
+.endif
\dot v0.4s, v16.16b, v7.4b[0]
\dot v1.4s, v19.16b, v7.4b[0]
\dot v2.4s, v22.16b, v7.4b[0]
@@ -271,8 +280,13 @@ L(\type\()_8tap_v_\isa):
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
.ifc \type, prep
+ .ifc \isa, neon_i8mm
+ srshr v0.8h, v0.8h, #2
+ srshr v1.8h, v2.8h, #2
+ .else
sshr v0.8h, v0.8h, #2
sshr v1.8h, v2.8h, #2
+ .endif
stp q0, q1, [\ldst]
add \dst, \dst, #32
.else // put
@@ -322,18 +336,24 @@ L(\type\()_8tap_v_\isa):
.endif
.align LOOP_ALIGN
8:
+.ifc \isa, neon_i8mm
+ ldr d18, [\src]
+ movi v0.4s, #0
+ movi v1.4s, #0
+ ldr d24, [\src, \s_strd]
+ add \src, \src, \s_strd, lsl #1
+ movi v2.4s, #0
+ movi v3.4s, #0
+ mov v21.8b, v18.8b
+ mov v27.8b, v24.8b
+.else // neon_dotprod
ldr d21, [\src]
ldr d27, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
-
mov v0.16b, v4.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
-.ifc \isa, neon_i8mm
- mov v18.16b, v21.16b
- mov v24.16b, v27.16b
-.else // neon_dotprod
sub v18.16b, v21.16b, v5.16b
sub v21.16b, v21.16b, v5.16b
sub v24.16b, v27.16b, v5.16b
@@ -363,8 +383,13 @@ L(\type\()_8tap_v_\isa):
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
.ifc \type, prep
+ .ifc \isa, neon_i8mm
+ srshr v0.8h, v0.8h, #2
+ srshr v1.8h, v2.8h, #2
+ .else
sshr v0.8h, v0.8h, #2
sshr v1.8h, v2.8h, #2
+ .endif
stp q0, q1, [\dst], #32
.else // put
sqrshrun v0.8b, v0.8h, #6
@@ -379,15 +404,19 @@ L(\type\()_8tap_v_\isa):
.align JUMP_ALIGN
82:
.endif
+.ifc \isa, neon_i8mm
+ ldr d18, [\src]
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movi v2.4s, #0
+ movi v3.4s, #0
+ mov v21.8b, v18.8b
+.else // neon_dotprod
ldr d21, [\src]
-
mov v0.16b, v4.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
-.ifc \isa, neon_i8mm
- mov v18.16b, v21.16b
-.else
sub v18.16b, v21.16b, v5.16b
sub v21.16b, v21.16b, v5.16b
.endif
@@ -409,8 +438,13 @@ L(\type\()_8tap_v_\isa):
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
.ifc \type, prep
+ .ifc \isa, neon_i8mm
+ srshr v0.8h, v0.8h, #2
+ srshr v1.8h, v2.8h, #2
+ .else
sshr v0.8h, v0.8h, #2
sshr v1.8h, v2.8h, #2
+ .endif
stp q0, q1, [\dst]
.else // put
sqrshrun v0.8b, v0.8h, #6
@@ -460,10 +494,12 @@ L(\type\()_8tap_v_\isa):
ldr s18, [\src]
ldr s21, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
-
+.ifc \isa, neon_i8mm
+ movi v0.4s, #0
+ movi v1.4s, #0
+.else // neon_dotprod
mov v0.16b, v4.16b
mov v1.16b, v4.16b
-.ifc \isa, neon_dotprod
sub v18.16b, v18.16b, v5.16b
sub v21.16b, v21.16b, v5.16b
.endif
@@ -480,8 +516,13 @@ L(\type\()_8tap_v_\isa):
\dot v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
subs \h, \h, #2
+ .ifc \isa, neon_i8mm
+ rshrn v0.4h, v0.4s, #2
+ rshrn2 v0.8h, v1.4s, #2
+ .else
shrn v0.4h, v0.4s, #2
shrn2 v0.8h, v1.4s, #2
+ .endif
str q0, [\dst], #16
.else
uzp1 v0.8h, v0.8h, v1.8h
@@ -500,10 +541,12 @@ L(\type\()_8tap_v_\isa):
42:
.endif
ldr s18, [\src]
-
+.ifc \isa, neon_i8mm
+ movi v0.4s, #0
+ movi v1.4s, #0
+.else // neon_dotprod
mov v0.16b, v4.16b
mov v1.16b, v4.16b
-.ifc \isa, neon_dotprod
sub v18.16b, v18.16b, v5.16b
.endif
tbl v19.16b, {v16.16b, v17.16b}, v6.16b
@@ -515,8 +558,13 @@ L(\type\()_8tap_v_\isa):
\dot v1.4s, v19.16b, v7.4b[0]
\dot v1.4s, v20.16b, v7.4b[1]
.ifc \type, prep
+ .ifc \isa, neon_i8mm
+ rshrn v0.4h, v0.4s, #2
+ rshrn2 v0.8h, v1.4s, #2
+ .else
shrn v0.4h, v0.4s, #2
shrn2 v0.8h, v1.4s, #2
+ .endif
str q0, [\dst]
.else
uzp1 v0.8h, v0.8h, v1.8h
@@ -564,10 +612,12 @@ L(\type\()_8tap_v_\isa):
ldr h18, [\src]
ldr h21, [\src, \s_strd]
add \src, \src, \s_strd, lsl #1
-
+ .ifc \isa, neon_i8mm
+ movi v0.4s, #0
+ movi v1.4s, #0
+ .else // put
mov v0.16b, v4.16b
mov v1.16b, v4.16b
- .ifc \isa, neon_dotprod
sub v18.8b, v18.8b, v5.8b
sub v21.8b, v21.8b, v5.8b
.endif
@@ -597,10 +647,12 @@ L(\type\()_8tap_v_\isa):
.align JUMP_ALIGN
22:
ldr h18, [\src]
-
+ .ifc \isa, neon_i8mm
+ movi v0.4s, #0
+ movi v1.4s, #0
+ .else // put
mov v0.16b, v4.16b
mov v1.16b, v4.16b
- .ifc \isa, neon_dotprod
sub v18.8b, v18.8b, v5.8b
.endif
tbl v19.16b, {v16.16b, v17.16b}, v6.16b