aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Frank Barchard <fbarchard@google.com> 2021-03-18 13:13:09 -0700
committer: XNNPACK Team <xnnpack-github-robot@google.com> 2021-03-18 13:13:59 -0700
commit: b0da47a9fab5216f120c5abc3aa44a61b2dab932 (patch)
tree: 5ab397fe555b769220e3cf171d2912df997e132c
parent: f5f9cec5757985807d897d39bb2bcc934f4481de (diff)
download: XNNPACK-b0da47a9fab5216f120c5abc3aa44a61b2dab932.tar.gz
QS8 C8 NEON microkernel: load B at end of loop and PADAL at top of loop.
Saves 1 cycle on Cortex-A72 and Cortex-A73. PiperOrigin-RevId: 363729112
-rw-r--r-- src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S 70
1 files changed, 36 insertions, 34 deletions
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
index f02204520..40106786c 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S
@@ -61,26 +61,27 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
LDP x10, x9, [sp, 64] // cn_stride, params
MOV v31.4s, v30.4s
# Is there at least 16 bytes for epilogue?
- B.LO 3f
+ B.LO 4f
# Prologue
LDP d0, d6, [x3], 16 // Read A0
-
- SUBS x0, x0, 16 // k = kc - 16
- # Is there at least 16 bytes for mainloop?
- B.LO 1f
-
- # Main loop - 16 bytes of A
- .p2align 3
-11:
LDP d4, d5, [x5]
LDP d1, d7, [x4], 16
LDP d8, d9, [x5, 64]
SMULL v2.8h, v4.8b, v0.8b
SMULL v3.8h, v4.8b, v1.8b
+ SUBS x0, x0, 16 // k = kc - 16
SMULL v10.8h, v5.8b, v0.8b
SMULL v11.8h, v5.8b, v1.8b
LDP d4, d5, [x5, 16]
+
+ # Is there at least 16 bytes for mainloop?
+ B.LO 2f
+
+ # Main loop - 16 bytes of A
+
+ .p2align 3
+1:
SMLAL v2.8h, v8.8b, v6.8b
SMLAL v3.8h, v8.8b, v7.8b
SMLAL v10.8h, v9.8b, v6.8b
@@ -125,32 +126,33 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
SADALP v26.4s, v10.8h
SMULL v15.8h, v5.8b, v1.8b
SADALP v27.4s, v11.8h
+ LDP d4, d5, [x5, 128] // Read B0
SMLAL v12.8h, v8.8b, v6.8b
SMLAL v13.8h, v8.8b, v7.8b
+ ADD x5, x5, 128
SMLAL v14.8h, v9.8b, v6.8b
+ LDP d0, d6, [x3], 16 // Read A0
SMLAL v15.8h, v9.8b, v7.8b
- ADD x5, x5, 128
+# start of next loop
+ LDP d1, d7, [x4], 16
+ LDP d8, d9, [x5, 64]
+ SMULL v2.8h, v4.8b, v0.8b
SADALP v28.4s, v12.8h
+ SMULL v3.8h, v4.8b, v1.8b
SADALP v29.4s, v13.8h
- LDP d0, d6, [x3], 16 // Read A0
+ SMULL v10.8h, v5.8b, v0.8b
SADALP v30.4s, v14.8h
+ SMULL v11.8h, v5.8b, v1.8b
SUBS x0, x0, 16
SADALP v31.4s, v15.8h
- B.HS 11b
+ LDP d4, d5, [x5, 16]
+ B.HS 1b
# Epilogue loop - 16 bytes of A
- # Same as main loop except no read a0
+ # Same as main loop except no read a0, b0
.p2align 3
-1:
- LDP d4, d5, [x5]
- LDP d1, d7, [x4], 16
- LDP d8, d9, [x5, 64]
- SMULL v2.8h, v4.8b, v0.8b
- SMULL v3.8h, v4.8b, v1.8b
- SMULL v10.8h, v5.8b, v0.8b
- SMULL v11.8h, v5.8b, v1.8b
- LDP d4, d5, [x5, 16]
+2:
SMLAL v2.8h, v8.8b, v6.8b
SMLAL v3.8h, v8.8b, v7.8b
SMLAL v10.8h, v9.8b, v6.8b
@@ -207,10 +209,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
SADALP v31.4s, v15.8h
# Is there a remainder?- 8 bytes of A
- TBNZ x0, 3, 3f
+ TBNZ x0, 3, 4f
.p2align 3
-2:
+3:
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
@@ -259,7 +261,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
LD1R {v2.16b}, [x9]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
- B.LO 4f
+ B.LO 5f
# Store full 2 x 8
ST1 {v0.8b}, [x6], x10
@@ -277,7 +279,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
# Remainder - 8 bytes of A
.p2align 3
-3:
+4:
LDR d0, [x3], 8
LDP d4, d5, [x5]
LDR d1, [x4], 8
@@ -317,26 +319,26 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal
SADALP v29.4s, v13.8h
SADALP v30.4s, v14.8h
SADALP v31.4s, v15.8h
- B 2b
+ B 3b
# Store odd width
.p2align 3
-4:
- TBZ x1, 2, 5f
+5:
+ TBZ x1, 2, 6f
STR s0, [x6], 4
ST1 {v0.s}[2], [x7], 4
EXT v0.16b, v0.16b, v0.16b, 4
-5:
- TBZ x1, 1, 6f
+6:
+ TBZ x1, 1, 7f
ST1 {v0.h}[0], [x6], 2
ST1 {v0.h}[4], [x7], 2
EXT v0.16b, v0.16b, v0.16b, 2
-6:
- TBZ x1, 0, 7f
+7:
+ TBZ x1, 0, 8f
ST1 {v0.b}[0], [x6]
ST1 {v0.b}[8], [x7]
-7:
+8:
# Restore d8-d15 from stack
LDP d14, d15, [sp, 48]
LDP d12, d13, [sp, 32]