aboutsummaryrefslogtreecommitdiff
path: root/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
diff options
context:
space:
mode:
Diffstat (limited to 'src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S')
-rw-r--r--src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S89
1 files changed, 48 insertions, 41 deletions
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
index c2f496745..aa7e7a7e6 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
@@ -75,7 +75,7 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
# Is there at least 4 floats (16 bytes)?
SUBS x0, x2, 16 // k = kc - 16
- B.LO 2f
+ B.LO 5f
# Main loop - 4 floats of A (16 bytes)
1:
@@ -122,10 +122,50 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
FMLA v31.4s, v27.4s, v3.s[3]
B.HS 1b
+ TST x0, 15
+ B.NE 5f
+
+4:
+ # Clamp
+ FMIN v16.4s, v16.4s, v4.4s
+ SUBS x1, x1, 8
+ FMIN v17.4s, v17.4s, v4.4s
+ FMIN v18.4s, v18.4s, v4.4s
+ FMIN v19.4s, v19.4s, v4.4s
+ FMIN v28.4s, v28.4s, v4.4s
+ FMIN v29.4s, v29.4s, v4.4s
+ FMIN v30.4s, v30.4s, v4.4s
+ FMIN v31.4s, v31.4s, v4.4s
+ FMAX v16.4s, v16.4s, v5.4s
+ FMAX v17.4s, v17.4s, v5.4s
+ FMAX v18.4s, v18.4s, v5.4s
+ FMAX v19.4s, v19.4s, v5.4s
+ FMAX v28.4s, v28.4s, v5.4s
+ FMAX v29.4s, v29.4s, v5.4s
+ FMAX v30.4s, v30.4s, v5.4s
+ FMAX v31.4s, v31.4s, v5.4s
+
+ # Store full 4 x 8
+ B.LO 7f
+
+ ST1 {v16.16b, v17.16b}, [x6], x14
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v18.16b, v19.16b}, [x9], x14
+ SUB x11, x11, x2 // a1 -= kc
+ ST1 {v28.16b, v29.16b}, [x10], x14
+ SUB x12, x12, x2 // a2 -= kc
+ ST1 {v30.16b, v31.16b}, [x7], x14
+ SUB x4, x4, x2 // a3 -= kc
+
+ B.HI 0b
+ RET
+
# Remainder- 2 floats of A (8 bytes)
-2:
- TBZ x0, 3, 3f
+5:
+ # Is there a remainder?- 2 floats of A (8 bytes)
+ TBZ x0, 3, 6f
+ # Remainder- 2 floats of A (8 bytes)
LDR d0, [x3], 8
LDP q20, q21, [x5], 32
LDR d1, [x11], 8
@@ -149,10 +189,11 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
FMLA v30.4s, v22.4s, v3.s[1]
FMLA v31.4s, v23.4s, v3.s[1]
- # Remainder- 1 float of A (4 bytes)
-3:
- TBZ x0, 2, 6f
+ # Is there a remainder?- 1 float of A (4 bytes)
+ TBZ x0, 2, 4b
+ # Remainder- 1 float of A (4 bytes)
+6:
LDR s0, [x3], 4
LDP q20, q21, [x5], 32
LDR s1, [x11], 4
@@ -166,42 +207,8 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
FMLA v29.4s, v21.4s, v2.s[0]
FMLA v30.4s, v20.4s, v3.s[0]
FMLA v31.4s, v21.4s, v3.s[0]
+ B 4b
-6:
- # Clamp
- FMIN v16.4s, v16.4s, v4.4s
- SUBS x1, x1, 8
- FMIN v17.4s, v17.4s, v4.4s
- FMIN v18.4s, v18.4s, v4.4s
- FMIN v19.4s, v19.4s, v4.4s
- FMIN v28.4s, v28.4s, v4.4s
- FMIN v29.4s, v29.4s, v4.4s
- FMIN v30.4s, v30.4s, v4.4s
- FMIN v31.4s, v31.4s, v4.4s
- FMAX v16.4s, v16.4s, v5.4s
- FMAX v17.4s, v17.4s, v5.4s
- FMAX v18.4s, v18.4s, v5.4s
- FMAX v19.4s, v19.4s, v5.4s
- FMAX v28.4s, v28.4s, v5.4s
- FMAX v29.4s, v29.4s, v5.4s
- FMAX v30.4s, v30.4s, v5.4s
- FMAX v31.4s, v31.4s, v5.4s
-
- # Store full 4 x 8
- B.LO 7f
-
- ST1 {v16.16b, v17.16b}, [x6], x14
- SUB x3, x3, x2 // a0 -= kc
- ST1 {v18.16b, v19.16b}, [x9], x14
- SUB x11, x11, x2 // a1 -= kc
- ST1 {v28.16b, v29.16b}, [x10], x14
- SUB x12, x12, x2 // a2 -= kc
- ST1 {v30.16b, v31.16b}, [x7], x14
- SUB x4, x4, x2 // a3 -= kc
-
- B.HI 0b
-
- RET
# Store odd width
7: