about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2021-03-15 16:44:29 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2021-03-15 16:45:17 -0700
commit: 8e58994cb05074afc9a4f2e428d490c178d3c7be (patch)
tree: 2098cf6160a28b6df20fda48bebaa7f55322233b
parent: bbf51825bc2b96f691121611e8d1d262f76b8010 (diff)
download: XNNPACK-8e58994cb05074afc9a4f2e428d490c178d3c7be.tar.gz
2x8c8__aarch64_neon_mlal_padal GEMM microkernel load A0 last
Move A0 load to end of loop. Saves 1 cycle on Pixel 3. PiperOrigin-RevId: 363060159
-rw-r--r-- src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S 82
1 files changed, 78 insertions, 4 deletions
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
index 8e6bccd70..f02204520 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
@@ -60,13 +60,19 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
MOV v29.4s, v28.4s
LDP x10, x9, [sp, 64] // cn_stride, params
MOV v31.4s, v30.4s
- # Is there at least 16 bytes?
+ # Is there at least 16 bytes for epilogue?
B.LO 3f
+ # Prologue
+ LDP d0, d6, [x3], 16 // Read A0
+
+ SUBS x0, x0, 16 // k = kc - 16
+ # Is there at least 16 bytes for mainloop?
+ B.LO 1f
+
# Main loop - 16 bytes of A
.p2align 3
-1:
- LDP d0, d6, [x3], 16
+11:
LDP d4, d5, [x5]
LDP d1, d7, [x4], 16
LDP d8, d9, [x5, 64]
@@ -127,10 +133,78 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
SADALP v28.4s, v12.8h
SADALP v29.4s, v13.8h
+ LDP d0, d6, [x3], 16 // Read A0
+ SADALP v30.4s, v14.8h
SUBS x0, x0, 16
+ SADALP v31.4s, v15.8h
+ B.HS 11b
+
+ # Epilogue loop - 16 bytes of A
+ # Same as main loop except no read a0
+ .p2align 3
+1:
+ LDP d4, d5, [x5]
+ LDP d1, d7, [x4], 16
+ LDP d8, d9, [x5, 64]
+ SMULL v2.8h, v4.8b, v0.8b
+ SMULL v3.8h, v4.8b, v1.8b
+ SMULL v10.8h, v5.8b, v0.8b
+ SMULL v11.8h, v5.8b, v1.8b
+ LDP d4, d5, [x5, 16]
+ SMLAL v2.8h, v8.8b, v6.8b
+ SMLAL v3.8h, v8.8b, v7.8b
+ SMLAL v10.8h, v9.8b, v6.8b
+ SMLAL v11.8h, v9.8b, v7.8b
+
+ LDP d8, d9, [x5, 80]
+ SMULL v12.8h, v4.8b, v0.8b
+ SADALP v16.4s, v2.8h
+ SMULL v13.8h, v4.8b, v1.8b
+ SADALP v17.4s, v3.8h
+ SMULL v14.8h, v5.8b, v0.8b
+ SADALP v18.4s, v10.8h
+ SMULL v15.8h, v5.8b, v1.8b
+ SADALP v19.4s, v11.8h
+ LDP d4, d5, [x5, 32]
+ SMLAL v12.8h, v8.8b, v6.8b
+ SMLAL v13.8h, v8.8b, v7.8b
+ SMLAL v14.8h, v9.8b, v6.8b
+ SMLAL v15.8h, v9.8b, v7.8b
+
+ LDP d8, d9, [x5, 96]
+ SMULL v2.8h, v4.8b, v0.8b
+ SADALP v20.4s, v12.8h
+ SMULL v3.8h, v4.8b, v1.8b
+ SADALP v21.4s, v13.8h
+ SMULL v10.8h, v5.8b, v0.8b
+ SADALP v22.4s, v14.8h
+ SMULL v11.8h, v5.8b, v1.8b
+ SADALP v23.4s, v15.8h
+ LDP d4, d5, [x5, 48]
+ SMLAL v2.8h, v8.8b, v6.8b
+ SMLAL v3.8h, v8.8b, v7.8b
+ SMLAL v10.8h, v9.8b, v6.8b
+ SMLAL v11.8h, v9.8b, v7.8b
+
+ LDP d8, d9, [x5, 112]
+ SMULL v12.8h, v4.8b, v0.8b
+ SADALP v24.4s, v2.8h
+ SMULL v13.8h, v4.8b, v1.8b
+ SADALP v25.4s, v3.8h
+ SMULL v14.8h, v5.8b, v0.8b
+ SADALP v26.4s, v10.8h
+ SMULL v15.8h, v5.8b, v1.8b
+ SADALP v27.4s, v11.8h
+ SMLAL v12.8h, v8.8b, v6.8b
+ SMLAL v13.8h, v8.8b, v7.8b
+ SMLAL v14.8h, v9.8b, v6.8b
+ SMLAL v15.8h, v9.8b, v7.8b
+ ADD x5, x5, 128
+
+ SADALP v28.4s, v12.8h
+ SADALP v29.4s, v13.8h
SADALP v30.4s, v14.8h
SADALP v31.4s, v15.8h
- B.HS 1b
# Is there a remainder?- 8 bytes of A
TBNZ x0, 3, 3f