about summary refs log tree commit diff
path: root/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
diff options
context:
space:
mode:
Diffstat (limited to 'src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S')
-rw-r--r-- src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S 45
1 files changed, 24 insertions, 21 deletions
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
index 559a71a35..2fdf4f153 100644
--- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
@@ -122,7 +122,7 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
# Is there at least 4 floats (16 bytes)?
SUBS x0, x2, 16 // k = kc - 16
- B.LO 2f
+ B.LO 5f
# Main loop - 4 floats of A (16 bytes)
# 48 FMA + 6 ld128 A + 4 LDP B
@@ -191,12 +191,11 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
FMLA v31.4s, v19.4s, v5.s[3]
B.HS 1b
-2:
- # Is there a remainder?- 2 floats of A (8 bytes)
- TBNZ x0, 3, 4f
- # Is there a remainder?- 1 floats of A (4 bytes)
- TBNZ x0, 2, 5f
-3:
+ # Is there a remainder?- 2 floats of A (8 bytes) or less
+ TST x0, 15
+ B.NE 5f
+
+4:
# Clamp
FMIN v20.4s, v20.4s, v6.4s
SUBS x1, x1, 8
@@ -225,7 +224,7 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
FMAX v31.4s, v31.4s, v7.4s
# Store full 6 x 8
- B.LO 6f
+ B.LO 7f
ST1 {v30.16b, v31.16b}, [x7], x14
SUB x3, x3, x2 // a0 -= kc
@@ -241,10 +240,12 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
SUB x4, x4, x2 // a5 -= kc
B.HI 0b
-
RET
-4:
+5:
+ # Is there a remainder?- 2 floats of A (8 bytes)
+ TBZ x0, 3, 6f
+
# Remainder- 2 floats of A (8 bytes)
LDR d0, [x3], 8
LDP q16, q17, [x5], 32
@@ -279,10 +280,12 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
FMLA v27.4s, v19.4s, v3.s[1]
FMLA v29.4s, v19.4s, v4.s[1]
FMLA v31.4s, v19.4s, v5.s[1]
- TBZ x0, 2, 3b
-5:
- # Remainder- 1 floats of A (4 bytes)
+ # Is there a remainder?- 1 float of A (4 bytes)
+ TBZ x0, 2, 4b
+
+ # Remainder- 1 float of A (4 bytes)
+6:
LDR s0, [x3], 4
LDP q16, q17, [x5], 32
LDR s1, [x9], 4
@@ -302,11 +305,11 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
FMLA v27.4s, v17.4s, v3.s[0]
FMLA v29.4s, v17.4s, v4.s[0]
FMLA v31.4s, v17.4s, v5.s[0]
- B 3b
+ B 4b
# Store odd width
-6:
- TBZ x1, 2, 7f
+7:
+ TBZ x1, 2, 8f
STR q30, [x7], 16
MOV v30.16b, v31.16b
STR q28, [x13], 16
@@ -320,8 +323,8 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
STR q20, [x6], 16
MOV v20.16b, v21.16b
-7:
- TBZ x1, 1, 8f
+8:
+ TBZ x1, 1, 9f
STR d30, [x7], 8
DUP d30, v30.d[1]
STR d28, [x13], 8
@@ -335,15 +338,15 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
STR d20, [x6], 8
DUP d20, v20.d[1]
-8:
- TBZ x1, 0, 9f
+9:
+ TBZ x1, 0, 10f
STR s30, [x7]
STR s28, [x13]
STR s26, [x18]
STR s24, [x17]
STR s22, [x16]
STR s20, [x6]
-9:
+10:
RET
END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128