diff options
Diffstat (limited to 'src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S')
-rw-r--r-- | src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S | 45 |
1 file changed, 24 insertions, 21 deletions
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S index 559a71a35..2fdf4f153 100644 --- a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S +++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S @@ -122,7 +122,7 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 # Is there at least 4 floats (16 bytes)? SUBS x0, x2, 16 // k = kc - 16 - B.LO 2f + B.LO 5f # Main loop - 4 floats of A (16 bytes) # 48 FMA + 6 ld128 A + 4 LDP B @@ -191,12 +191,11 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 FMLA v31.4s, v19.4s, v5.s[3] B.HS 1b -2: - # Is there a remainder?- 2 floats of A (8 bytes) - TBNZ x0, 3, 4f - # Is there a remainder?- 1 floats of A (4 bytes) - TBNZ x0, 2, 5f -3: + # Is there a remainder?- 2 floats of A (8 bytes) or less + TST x0, 15 + B.NE 5f + +4: # Clamp FMIN v20.4s, v20.4s, v6.4s SUBS x1, x1, 8 @@ -225,7 +224,7 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 FMAX v31.4s, v31.4s, v7.4s # Store full 6 x 8 - B.LO 6f + B.LO 7f ST1 {v30.16b, v31.16b}, [x7], x14 SUB x3, x3, x2 // a0 -= kc @@ -241,10 +240,12 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 SUB x4, x4, x2 // a5 -= kc B.HI 0b - RET -4: +5: + # Is there a remainder?- 2 floats of A (8 bytes) + TBZ x0, 3, 6f + # Remainder- 2 floats of A (8 bytes) LDR d0, [x3], 8 LDP q16, q17, [x5], 32 @@ -279,10 +280,12 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 FMLA v27.4s, v19.4s, v3.s[1] FMLA v29.4s, v19.4s, v4.s[1] FMLA v31.4s, v19.4s, v5.s[1] - TBZ x0, 2, 3b -5: - # Remainder- 1 floats of A (4 bytes) + # Is there a remainder?- 1 floats of A (4 bytes) + TBZ x0, 2, 4b + + # Remainder- 1 float of A (4 bytes) +6: LDR s0, [x3], 4 LDP q16, q17, [x5], 32 LDR s1, [x9], 4 @@ -302,11 +305,11 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 FMLA v27.4s, v17.4s, v3.s[0] FMLA v29.4s, v17.4s, v4.s[0] FMLA v31.4s, v17.4s, v5.s[0] - B 3b + B 4b # 
Store odd width -6: - TBZ x1, 2, 7f +7: + TBZ x1, 2, 8f STR q30, [x7], 16 MOV v30.16b, v31.16b STR q28, [x13], 16 @@ -320,8 +323,8 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 STR q20, [x6], 16 MOV v20.16b, v21.16b -7: - TBZ x1, 1, 8f +8: + TBZ x1, 1, 9f STR d30, [x7], 8 DUP d30, v30.d[1] STR d28, [x13], 8 @@ -335,15 +338,15 @@ BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 STR d20, [x6], 8 DUP d20, v20.d[1] -8: - TBZ x1, 0, 9f +9: + TBZ x1, 0, 10f STR s30, [x7] STR s28, [x13] STR s26, [x18] STR s24, [x17] STR s22, [x16] STR s20, [x6] -9: +10: RET END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128 |