about summary refs log tree commit diff
diff options
context:
space:
mode:
author Frank Barchard <fbarchard@google.com> 2022-02-03 18:52:35 -0800
committer XNNPACK Team <xnnpack-github-robot@google.com> 2022-02-03 18:54:01 -0800
commit cde8bdfe9085e99106ee2f031ab8b8a490d416c5 (patch)
tree 60a0738782a9f86d85d693fdde4c123723bbd3d9
parent 16c09129fc7b1352a6570bfa5ec384afe1ec65c5 (diff)
download XNNPACK-cde8bdfe9085e99106ee2f031ab8b8a490d416c5.tar.gz
Q8 GEMM for Cortex A7 reduce prefetch to weights
- Remove prefetch for inputs. Improves performance on Cortex A7.
- Remove r2/lr push. Improves performance on Cortex A7 and A35.
- Adjust K near end of epilogue 2 instructions earlier.
- Comments updated in A53 and A7 microkernels

PiperOrigin-RevId: 426289306
-rw-r--r--src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S6
-rw-r--r--src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S21
-rw-r--r--src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S6
-rw-r--r--src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S25
-rw-r--r--src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S6
-rw-r--r--src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S6
-rw-r--r--src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in6
-rw-r--r--src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in34
-rw-r--r--src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S6
-rw-r--r--src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S21
-rw-r--r--src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S6
-rw-r--r--src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S25
-rw-r--r--src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S6
-rw-r--r--src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S20
-rw-r--r--src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S6
-rw-r--r--src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S24
16 files changed, 91 insertions, 133 deletions
diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S
index 6ef709d92..f77dd7262 100644
--- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S
+++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S
@@ -112,7 +112,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -331,16 +331,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# QC8 FP32 quantization
VLD1.8 {q0-q1}, [r9]!
diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S
index a79cd7fee..1062ee71a 100644
--- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S
+++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S
@@ -53,8 +53,8 @@
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
- SUB sp, sp, 8 // +8
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ SUB sp, sp, 16 // +16
VPUSH {d8-d13} // +48 = 96
LDR r7, [sp, 96] // a_stride
@@ -113,7 +113,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -320,16 +320,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# QC8 FP32 quantization
VLD1.8 {q0-q1}, [r9]!
@@ -413,8 +411,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort
BHI 0b
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -548,9 +547,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort
7:
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7
diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S
index 746fea17f..5ee434690 100644
--- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S
@@ -122,7 +122,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -346,16 +346,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# QC8 FP32 quantization
VLD1.8 {q0-q1}, [r9]!
diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S
index d444d5a28..a357465ad 100644
--- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S
+++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S
@@ -53,8 +53,8 @@
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
- SUB sp, sp, 8 // +8
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ SUB sp, sp, 16 // +16
VPUSH {d8-d13} // +48 = 96
LDR r7, [sp, 96] // a_stride
@@ -123,7 +123,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -131,15 +131,11 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
1:
// Extend - 5 cycles
VMOVL.S8 q0, d0
- PLD [r3, 128]
VMOVL.S8 q4, d8
PLD [r9, 448]
VMOVL.S8 q1, d2
- PLD [r12, 128]
VMOVL.S8 q2, d4
- PLD [r0, 128]
VMOVL.S8 q3, d6
- PLD [r10, 128]
// BLOCK 0 - 10 cycles
VLD1.8 {d10}, [r9]! // B1
@@ -335,16 +331,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# QC8 FP32 quantization
VLD1.8 {q0-q1}, [r9]!
@@ -428,8 +422,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
BHI 0b
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -563,9 +558,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm
7:
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7
diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S
index 4d4cef3bb..fda94fcb4 100644
--- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S
+++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S
@@ -110,7 +110,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_co
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -329,16 +329,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_co
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# QC8 FP32 quantization
VLD1.8 {q0-q1}, [r9]!
diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S
index 94dc2f504..b53c1e122 100644
--- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S
+++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S
@@ -120,7 +120,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_pr
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -344,16 +344,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_pr
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# QC8 FP32 quantization
VLD1.8 {q0-q1}, [r9]!
diff --git a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in
index a487bd77a..6d2aceae1 100644
--- a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in
+++ b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in
@@ -177,7 +177,7 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -454,16 +454,14 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
$if REQUANTIZATION == "RNDNU":
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
diff --git a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in
index d56f87b25..7047ae494 100644
--- a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in
+++ b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in
@@ -89,11 +89,12 @@ $else:
BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
$if DATATYPE == "QU8":
+ SUB sp, sp, 8 // +8
VPUSH {d8-d14} // +56 = 96
$else:
- SUB sp, sp, 8 // +8
+ SUB sp, sp, 16 // +16
VPUSH {d8-d13} // +48 = 96
LDR r7, [sp, 96] // a_stride
@@ -178,7 +179,7 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -186,8 +187,6 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
1:
// Extend - 5 cycles
${XXTL} q0, d0
- $if PREFETCH:
- PLD [r3, 128]
$if DATATYPE == "QU8":
VSUBL.U8 q4, d8, d14
$else:
@@ -195,14 +194,8 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
$if PREFETCH:
PLD [r9, 448]
${XXTL} q1, d2
- $if PREFETCH:
- PLD [r12, 128]
${XXTL} q2, d4
- $if PREFETCH:
- PLD [r0, 128]
${XXTL} q3, d6
- $if PREFETCH:
- PLD [r10, 128]
// BLOCK 0 - 10 cycles
VLD1.8 {d10}, [r9]! // B1
@@ -443,16 +436,14 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
$if REQUANTIZATION == "RNDNU":
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
@@ -607,11 +598,12 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
$if DATATYPE == "QU8":
VPOP {d8-d14}
- ADD sp, sp, 4 // skip r2
+ ADD sp, sp, 8 // skip pad of 8
$else:
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -767,12 +759,12 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke
7:
$if DATATYPE == "QU8":
VPOP {d8-d14}
- ADD sp, sp, 4 // skip r2
+ ADD sp, sp, 8 // skip pad of 8
$else:
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a7
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
index e11b7d479..47e7fac24 100644
--- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
@@ -113,7 +113,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -332,16 +332,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S
index 3da9e5ccf..10a2fc23e 100644
--- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S
@@ -55,8 +55,8 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
- SUB sp, sp, 8 // +8
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ SUB sp, sp, 16 // +16
VPUSH {d8-d13} // +48 = 96
LDR r7, [sp, 96] // a_stride
@@ -114,7 +114,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -321,16 +321,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
@@ -409,8 +407,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
BHI 0b
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -544,9 +543,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
7:
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
index 96f647dd0..7d0d514a4 100644
--- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
@@ -123,7 +123,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -347,16 +347,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
index 30af9cc2c..bf26435eb 100644
--- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
@@ -55,8 +55,8 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
- SUB sp, sp, 8 // +8
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ SUB sp, sp, 16 // +16
VPUSH {d8-d13} // +48 = 96
LDR r7, [sp, 96] // a_stride
@@ -124,7 +124,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -132,15 +132,11 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
1:
// Extend - 5 cycles
VMOVL.S8 q0, d0
- PLD [r3, 128]
VMOVL.S8 q4, d8
PLD [r9, 448]
VMOVL.S8 q1, d2
- PLD [r12, 128]
VMOVL.S8 q2, d4
- PLD [r0, 128]
VMOVL.S8 q3, d6
- PLD [r10, 128]
// BLOCK 0 - 10 cycles
VLD1.8 {d10}, [r9]! // B1
@@ -336,16 +332,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
@@ -424,8 +418,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
BHI 0b
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -559,9 +554,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
7:
VPOP {d8-d13}
- ADD sp, sp, 12 // skip pad of 8 + r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 16 // skip pad of 8 + d14
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7
diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
index f610bb364..b0350b6cd 100644
--- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
+++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
@@ -114,7 +114,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -333,16 +333,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S
index 06d930e2b..8b5798c40 100644
--- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S
+++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S
@@ -56,7 +56,8 @@
BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ SUB sp, sp, 8 // +8
VPUSH {d8-d14} // +56 = 96
LDR r7, [sp, 96] // a_stride
@@ -115,7 +116,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -322,16 +323,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
@@ -410,8 +409,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
BHI 0b
VPOP {d8-d14}
- ADD sp, sp, 4 // skip r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 8 // skip pad of 8
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -545,9 +545,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor
7:
VPOP {d8-d14}
- ADD sp, sp, 4 // skip r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 8 // skip pad of 8
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7
diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
index d1b628554..2a8794dd1 100644
--- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
+++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
@@ -124,7 +124,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -348,16 +348,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
index cc5f43461..b7231c21a 100644
--- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
+++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S
@@ -56,7 +56,8 @@
BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7
# Push 96 bytes
- PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ SUB sp, sp, 8 // +8
VPUSH {d8-d14} // +56 = 96
LDR r7, [sp, 96] // a_stride
@@ -125,7 +126,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
// Main loop - 8 bytes
// 64 bytes for weights.
- // 5 vmovl = 4 A and 1 B = 5 cycles
+ // 5 VMOVL = 4 A and 1 B = 5 cycles
// 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles
// 1 blocks with VLD B, VMLA = 9 cycles
// total = 84 cycles
@@ -133,15 +134,11 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
1:
// Extend - 5 cycles
VMOVL.U8 q0, d0
- PLD [r3, 128]
VSUBL.U8 q4, d8, d14
PLD [r9, 448]
VMOVL.U8 q1, d2
- PLD [r12, 128]
VMOVL.U8 q2, d4
- PLD [r0, 128]
VMOVL.U8 q3, d6
- PLD [r10, 128]
// BLOCK 0 - 10 cycles
VLD1.8 {d10}, [r9]! // B1
@@ -337,16 +334,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
VMLAL.S16 q11, d11, d3[3]
VMLAL.S16 q12, d10, d5[3]
VMLAL.S16 q13, d11, d5[3]
+ ADDS r5, r5, 8
VMLAL.S16 q14, d10, d7[3]
VMLAL.S16 q15, d11, d7[3]
-
# Is there a remainder?- 1-7 bytes of A
- ADDS r5, r5, 8
BNE 3f
2:
-
# RNDNU quantization
VDUP.32 q0, d12[0] // right_pre_shift
@@ -425,8 +420,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
BHI 0b
VPOP {d8-d14}
- ADD sp, sp, 4 // skip r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ ADD sp, sp, 8 // skip pad of 8
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
# Remainder- 1 to 7 bytes of A
.p2align 3
@@ -560,9 +556,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf
7:
VPOP {d8-d14}
- ADD sp, sp, 4 // skip r2
- POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
+ ADD sp, sp, 8 // skip pad of 8
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7