diff options
author | Frank Barchard <fbarchard@google.com> | 2021-03-15 16:44:29 -0700 |
---|---|---|
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2021-03-15 16:45:17 -0700 |
commit | 8e58994cb05074afc9a4f2e428d490c178d3c7be (patch) | |
tree | 2098cf6160a28b6df20fda48bebaa7f55322233b | |
parent | bbf51825bc2b96f691121611e8d1d262f76b8010 (diff) | |
download | XNNPACK-8e58994cb05074afc9a4f2e428d490c178d3c7be.tar.gz |
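2x8c8__aarch64_neon_mlal_padal GEMM microkernel: load A0 last
Move the A0 load to the end of the main loop: A0 is preloaded in a prologue before the loop, and a separate epilogue pass handles the last 16-byte block of A without loading A0 again. Saves 1 cycle on Pixel 3.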
PiperOrigin-RevId: 363060159
-rw-r--r-- | src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S | 82 |
1 file changed, 78 insertions, 4 deletions
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S index 8e6bccd70..f02204520 100644 --- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S +++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S @@ -60,13 +60,19 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal MOV v29.4s, v28.4s LDP x10, x9, [sp, 64] // cn_stride, params MOV v31.4s, v30.4s - # Is there at least 16 bytes? + # Is there at least 16 bytes for epilogue? B.LO 3f + # Prologue + LDP d0, d6, [x3], 16 // Read A0 + + SUBS x0, x0, 16 // k = kc - 16 + # Is there at least 16 bytes for mainloop? + B.LO 1f + # Main loop - 16 bytes of A .p2align 3 -1: - LDP d0, d6, [x3], 16 +11: LDP d4, d5, [x5] LDP d1, d7, [x4], 16 LDP d8, d9, [x5, 64] @@ -127,10 +133,78 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal SADALP v28.4s, v12.8h SADALP v29.4s, v13.8h + LDP d0, d6, [x3], 16 // Read A0 + SADALP v30.4s, v14.8h SUBS x0, x0, 16 + SADALP v31.4s, v15.8h + B.HS 11b + + # Epilogue loop - 16 bytes of A + # Same as main loop except no read a0 + .p2align 3 +1: + LDP d4, d5, [x5] + LDP d1, d7, [x4], 16 + LDP d8, d9, [x5, 64] + SMULL v2.8h, v4.8b, v0.8b + SMULL v3.8h, v4.8b, v1.8b + SMULL v10.8h, v5.8b, v0.8b + SMULL v11.8h, v5.8b, v1.8b + LDP d4, d5, [x5, 16] + SMLAL v2.8h, v8.8b, v6.8b + SMLAL v3.8h, v8.8b, v7.8b + SMLAL v10.8h, v9.8b, v6.8b + SMLAL v11.8h, v9.8b, v7.8b + + LDP d8, d9, [x5, 80] + SMULL v12.8h, v4.8b, v0.8b + SADALP v16.4s, v2.8h + SMULL v13.8h, v4.8b, v1.8b + SADALP v17.4s, v3.8h + SMULL v14.8h, v5.8b, v0.8b + SADALP v18.4s, v10.8h + SMULL v15.8h, v5.8b, v1.8b + SADALP v19.4s, v11.8h + LDP d4, d5, [x5, 32] + SMLAL v12.8h, v8.8b, v6.8b + SMLAL v13.8h, v8.8b, v7.8b + SMLAL v14.8h, v9.8b, v6.8b + SMLAL v15.8h, v9.8b, v7.8b + + LDP d8, d9, [x5, 96] + SMULL v2.8h, v4.8b, v0.8b + SADALP v20.4s, v12.8h + SMULL v3.8h, v4.8b, v1.8b + SADALP v21.4s, v13.8h + SMULL v10.8h, v5.8b, v0.8b + SADALP v22.4s, v14.8h + SMULL v11.8h, v5.8b, v1.8b + SADALP v23.4s, v15.8h + LDP d4, d5, [x5, 48] + SMLAL v2.8h, v8.8b, v6.8b + SMLAL v3.8h, v8.8b, v7.8b + SMLAL v10.8h, v9.8b, v6.8b + SMLAL v11.8h, v9.8b, v7.8b + + LDP d8, d9, [x5, 112] + SMULL v12.8h, v4.8b, v0.8b + SADALP v24.4s, v2.8h + SMULL v13.8h, v4.8b, v1.8b + SADALP v25.4s, v3.8h + SMULL v14.8h, v5.8b, v0.8b + SADALP v26.4s, v10.8h + SMULL v15.8h, v5.8b, v1.8b + SADALP v27.4s, v11.8h + SMLAL v12.8h, v8.8b, v6.8b + SMLAL v13.8h, v8.8b, v7.8b + SMLAL v14.8h, v9.8b, v6.8b + SMLAL v15.8h, v9.8b, v7.8b + ADD x5, x5, 128 + + SADALP v28.4s, v12.8h + SADALP v29.4s, v13.8h SADALP v30.4s, v14.8h SADALP v31.4s, v15.8h - B.HS 1b # Is there a remainder?- 8 bytes of A TBNZ x0, 3, 3f |