diff options
author | Frank Barchard <fbarchard@google.com> | 2021-03-18 13:13:09 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2021-03-18 13:13:59 -0700 |
commit | b0da47a9fab5216f120c5abc3aa44a61b2dab932 (patch) | |
tree | 5ab397fe555b769220e3cf171d2912df997e132c | |
parent | f5f9cec5757985807d897d39bb2bcc934f4481de (diff) | |
download | XNNPACK-b0da47a9fab5216f120c5abc3aa44a61b2dab932.tar.gz |
QS8 C8 NEON microkernel: load B at end of loop and PADAL at top of loop.
Saves 1 cycle on Cortex A72 and A73
PiperOrigin-RevId: 363729112
-rw-r--r-- | src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S | 70 |
1 files changed, 36 insertions, 34 deletions
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S index f02204520..40106786c 100644 --- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S +++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S @@ -61,26 +61,27 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal LDP x10, x9, [sp, 64] // cn_stride, params MOV v31.4s, v30.4s # Is there at least 16 bytes for epilogue? - B.LO 3f + B.LO 4f # Prologue LDP d0, d6, [x3], 16 // Read A0 - - SUBS x0, x0, 16 // k = kc - 16 - # Is there at least 16 bytes for mainloop? - B.LO 1f - - # Main loop - 16 bytes of A - .p2align 3 -11: LDP d4, d5, [x5] LDP d1, d7, [x4], 16 LDP d8, d9, [x5, 64] SMULL v2.8h, v4.8b, v0.8b SMULL v3.8h, v4.8b, v1.8b + SUBS x0, x0, 16 // k = kc - 16 SMULL v10.8h, v5.8b, v0.8b SMULL v11.8h, v5.8b, v1.8b LDP d4, d5, [x5, 16] + + # Is there at least 16 bytes for mainloop? + B.LO 2f + + # Main loop - 16 bytes of A + + .p2align 3 +1: SMLAL v2.8h, v8.8b, v6.8b SMLAL v3.8h, v8.8b, v7.8b SMLAL v10.8h, v9.8b, v6.8b @@ -125,32 +126,33 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal SADALP v26.4s, v10.8h SMULL v15.8h, v5.8b, v1.8b SADALP v27.4s, v11.8h + LDP d4, d5, [x5, 128] // Read B0 SMLAL v12.8h, v8.8b, v6.8b SMLAL v13.8h, v8.8b, v7.8b + ADD x5, x5, 128 SMLAL v14.8h, v9.8b, v6.8b + LDP d0, d6, [x3], 16 // Read A0 SMLAL v15.8h, v9.8b, v7.8b - ADD x5, x5, 128 +# start of next loop + LDP d1, d7, [x4], 16 + LDP d8, d9, [x5, 64] + SMULL v2.8h, v4.8b, v0.8b SADALP v28.4s, v12.8h + SMULL v3.8h, v4.8b, v1.8b SADALP v29.4s, v13.8h - LDP d0, d6, [x3], 16 // Read A0 + SMULL v10.8h, v5.8b, v0.8b SADALP v30.4s, v14.8h + SMULL v11.8h, v5.8b, v1.8b SUBS x0, x0, 16 SADALP v31.4s, v15.8h - B.HS 11b + LDP d4, d5, [x5, 16] + B.HS 1b # Epilogue loop - 16 bytes of A - # Same as main loop except no read a0 + # Same as main loop except no read a0, b0 .p2align 3 -1: - LDP d4, d5, [x5] - LDP d1, d7, [x4], 16 - LDP d8, d9, [x5, 64] - SMULL 
v2.8h, v4.8b, v0.8b - SMULL v3.8h, v4.8b, v1.8b - SMULL v10.8h, v5.8b, v0.8b - SMULL v11.8h, v5.8b, v1.8b - LDP d4, d5, [x5, 16] +2: SMLAL v2.8h, v8.8b, v6.8b SMLAL v3.8h, v8.8b, v7.8b SMLAL v10.8h, v9.8b, v6.8b @@ -207,10 +209,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal SADALP v31.4s, v15.8h # Is there a remainder?- 8 bytes of A - TBNZ x0, 3, 3f + TBNZ x0, 3, 4f .p2align 3 -2: +3: # Add columns ADDP v16.4s, v16.4s, v18.4s ADDP v20.4s, v20.4s, v22.4s @@ -259,7 +261,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal LD1R {v2.16b}, [x9] SMAX v0.16b, v0.16b, v1.16b SMIN v0.16b, v0.16b, v2.16b - B.LO 4f + B.LO 5f # Store full 2 x 8 ST1 {v0.8b}, [x6], x10 @@ -277,7 +279,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal # Remainder - 8 bytes of A .p2align 3 -3: +4: LDR d0, [x3], 8 LDP d4, d5, [x5] LDR d1, [x4], 8 @@ -317,26 +319,26 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal SADALP v29.4s, v13.8h SADALP v30.4s, v14.8h SADALP v31.4s, v15.8h - B 2b + B 3b # Store odd width .p2align 3 -4: - TBZ x1, 2, 5f +5: + TBZ x1, 2, 6f STR s0, [x6], 4 ST1 {v0.s}[2], [x7], 4 EXT v0.16b, v0.16b, v0.16b, 4 -5: - TBZ x1, 1, 6f +6: + TBZ x1, 1, 7f ST1 {v0.h}[0], [x6], 2 ST1 {v0.h}[4], [x7], 2 EXT v0.16b, v0.16b, v0.16b, 2 -6: - TBZ x1, 0, 7f +7: + TBZ x1, 0, 8f ST1 {v0.b}[0], [x6] ST1 {v0.b}[8], [x7] -7: +8: # Restore d8-d15 from stack LDP d14, d15, [sp, 48] LDP d12, d13, [sp, 32] |