diff options
Diffstat (limited to 'src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S')
-rw-r--r-- | src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S | 89 |
1 files changed, 48 insertions, 41 deletions
# Patch summary: the clamp/store block (old label 6) becomes label 4 and now sits
# directly after the main loop, so the no-remainder case falls through into it.
# Remainder handling (old labels 2/3) becomes labels 5/6, placed after the store,
# and branches back to the clamp with `4b`.
# NOTE(review): `TBZ x0, 3, 6f` enters the 1-float remainder unconditionally; this
# looks correct only if kc is a nonzero multiple of 4 bytes - confirm against caller.
diff --git a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
index c2f496745..aa7e7a7e6 100644
--- a/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
@@ -75,7 +75,7 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
 
         # Is there at least 4 floats (16 bytes)?
         SUBS x0, x2, 16 // k = kc - 16
-        B.LO 2f
+        B.LO 5f // kc < 16 bytes: skip the main loop, go to remainder handling
 
         # Main loop - 4 floats of A (16 bytes)
 1:
@@ -122,10 +122,50 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
         FMLA v31.4s, v27.4s, v3.s[3]
         B.HS 1b
 
+        TST x0, 15 // any of the low 4 bits of k set => remainder of A to process
+        B.NE 5f // remainder present; otherwise fall through to the clamp
+
+4: // clamp + store; the remainder paths branch back here via 4b
+        # Clamp
+        FMIN v16.4s, v16.4s, v4.4s
+        SUBS x1, x1, 8 // flags are consumed by B.LO 7f and B.HI 0b below; nothing in between sets flags
+        FMIN v17.4s, v17.4s, v4.4s
+        FMIN v18.4s, v18.4s, v4.4s
+        FMIN v19.4s, v19.4s, v4.4s
+        FMIN v28.4s, v28.4s, v4.4s
+        FMIN v29.4s, v29.4s, v4.4s
+        FMIN v30.4s, v30.4s, v4.4s
+        FMIN v31.4s, v31.4s, v4.4s
+        FMAX v16.4s, v16.4s, v5.4s
+        FMAX v17.4s, v17.4s, v5.4s
+        FMAX v18.4s, v18.4s, v5.4s
+        FMAX v19.4s, v19.4s, v5.4s
+        FMAX v28.4s, v28.4s, v5.4s
+        FMAX v29.4s, v29.4s, v5.4s
+        FMAX v30.4s, v30.4s, v5.4s
+        FMAX v31.4s, v31.4s, v5.4s
+
+        # Store full 4 x 8
+        B.LO 7f // x1 was below 8 before the SUBS: take the "Store odd width" path
+
+        ST1 {v16.16b, v17.16b}, [x6], x14
+        SUB x3, x3, x2 // a0 -= kc
+        ST1 {v18.16b, v19.16b}, [x9], x14
+        SUB x11, x11, x2 // a1 -= kc
+        ST1 {v28.16b, v29.16b}, [x10], x14
+        SUB x12, x12, x2 // a2 -= kc
+        ST1 {v30.16b, v31.16b}, [x7], x14
+        SUB x4, x4, x2 // a3 -= kc
+
+        B.HI 0b // x1 still above zero: branch back to label 0 (outside this diff)
+        RET
+
         # Remainder- 2 floats of A (8 bytes)
-2:
-        TBZ x0, 3, 3f
+5:
+        # Is there a remainder?- 2 floats of A (8 bytes)
+        TBZ x0, 3, 6f // bit 3 of k clear: no 2-float chunk, go to the 1-float remainder
 
+        # Remainder- 2 floats of A (8 bytes)
         LDR d0, [x3], 8
         LDP q20, q21, [x5], 32
         LDR d1, [x11], 8
@@ -149,10 +189,11 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
         FMLA v30.4s, v22.4s, v3.s[1]
         FMLA v31.4s, v23.4s, v3.s[1]
 
-        # Remainder- 1 float of A (4 bytes)
-3:
-        TBZ x0, 2, 6f
+        # Is there a remainder?- 1 float of A (4 bytes)
+        TBZ x0, 2, 4b // bit 2 of k clear: no 1-float remainder, back to the clamp
 
+        # Remainder- 1 float of A (4 bytes)
+6:
         LDR s0, [x3], 4
         LDP q20, q21, [x5], 32
         LDR s1, [x11], 4
@@ -166,42 +207,8 @@ BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128
         FMLA v29.4s, v21.4s, v2.s[0]
         FMLA v30.4s, v20.4s, v3.s[0]
         FMLA v31.4s, v21.4s, v3.s[0]
+        B 4b // remainder done: jump back to the clamp/store block
 
-6:
-        # Clamp
-        FMIN v16.4s, v16.4s, v4.4s
-        SUBS x1, x1, 8
-        FMIN v17.4s, v17.4s, v4.4s
-        FMIN v18.4s, v18.4s, v4.4s
-        FMIN v19.4s, v19.4s, v4.4s
-        FMIN v28.4s, v28.4s, v4.4s
-        FMIN v29.4s, v29.4s, v4.4s
-        FMIN v30.4s, v30.4s, v4.4s
-        FMIN v31.4s, v31.4s, v4.4s
-        FMAX v16.4s, v16.4s, v5.4s
-        FMAX v17.4s, v17.4s, v5.4s
-        FMAX v18.4s, v18.4s, v5.4s
-        FMAX v19.4s, v19.4s, v5.4s
-        FMAX v28.4s, v28.4s, v5.4s
-        FMAX v29.4s, v29.4s, v5.4s
-        FMAX v30.4s, v30.4s, v5.4s
-        FMAX v31.4s, v31.4s, v5.4s
-
-        # Store full 4 x 8
-        B.LO 7f
-
-        ST1 {v16.16b, v17.16b}, [x6], x14
-        SUB x3, x3, x2 // a0 -= kc
-        ST1 {v18.16b, v19.16b}, [x9], x14
-        SUB x11, x11, x2 // a1 -= kc
-        ST1 {v28.16b, v29.16b}, [x10], x14
-        SUB x12, x12, x2 // a2 -= kc
-        ST1 {v30.16b, v31.16b}, [x7], x14
-        SUB x4, x4, x2 // a3 -= kc
-
-        B.HI 0b
-
-        RET
 
         # Store odd width
 7:
|