From cde8bdfe9085e99106ee2f031ab8b8a490d416c5 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Thu, 3 Feb 2022 18:52:35 -0800 Subject: Q8 GEMM for Cortex A7 reduce prefetch to weights - Remove prefetch for inputs. Improves performance on Cortex A7. - Remove r2/lf push. Improves performance on Cortex A7 and A35. - Adjust K near end of epilogue 2 instructions earlier. - Comments updated in A53 and A7 microkernels PiperOrigin-RevId: 426289306 --- ...minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S | 6 ++-- ...-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S | 21 +++++++------ ...x-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S | 6 ++-- ...ax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S | 25 +++++++--------- ...nmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S | 6 ++-- ...fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S | 6 ++-- .../4x8-aarch32-neon-mlal-lane-cortex-a53.S.in | 6 ++-- .../4x8-aarch32-neon-mlal-lane-cortex-a7.S.in | 34 +++++++++------------- ...inmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S | 6 ++-- ...minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S | 21 +++++++------ ...-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S | 6 ++-- ...x-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S | 25 +++++++--------- ...inmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S | 6 ++-- ...minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S | 20 ++++++------- ...-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S | 6 ++-- ...x-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S | 24 +++++++-------- 16 files changed, 91 insertions(+), 133 deletions(-) diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S index 6ef709d92..f77dd7262 100644 --- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S +++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a53.S @@ -112,7 +112,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -331,16 +331,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # QC8 FP32 quantization VLD1.8 {q0-q1}, [r9]! diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S index a79cd7fee..1062ee71a 100644 --- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S +++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-cortex-a7.S @@ -53,8 +53,8 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 - SUB sp, sp, 8 // +8 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 + SUB sp, sp, 16 // +16 VPUSH {d8-d13} // +48 = 96 LDR r7, [sp, 96] // a_stride @@ -113,7 +113,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -320,16 +320,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # QC8 FP32 quantization VLD1.8 {q0-q1}, [r9]! @@ -413,8 +411,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort BHI 0b VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -548,9 +547,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cort 7: VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7 diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S index 746fea17f..5ee434690 100644 --- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S +++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a53.S @@ -122,7 +122,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -346,16 +346,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # QC8 FP32 quantization VLD1.8 {q0-q1}, [r9]! diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S index d444d5a28..a357465ad 100644 --- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S +++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-cortex-a7.S @@ -53,8 +53,8 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 - SUB sp, sp, 8 // +8 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 + SUB sp, sp, 16 // +16 VPUSH {d8-d13} // +48 = 96 LDR r7, [sp, 96] // a_stride @@ -123,7 +123,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -131,15 +131,11 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm 1: // Extend - 5 cycles VMOVL.S8 q0, d0 - PLD [r3, 128] VMOVL.S8 q4, d8 PLD [r9, 448] VMOVL.S8 q1, d2 - PLD [r12, 128] VMOVL.S8 q2, d4 - PLD [r0, 128] VMOVL.S8 q3, d6 - PLD [r10, 128] // BLOCK 0 - 10 cycles VLD1.8 {d10}, [r9]! // B1 @@ -335,16 +331,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # QC8 FP32 quantization VLD1.8 {q0-q1}, [r9]! @@ -428,8 +422,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm BHI 0b VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -563,9 +558,9 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm 7: VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7 diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S index 4d4cef3bb..fda94fcb4 100644 --- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S +++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S @@ -110,7 +110,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_co // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -329,16 +329,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_co VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # QC8 FP32 quantization VLD1.8 {q0-q1}, [r9]! diff --git a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S index 94dc2f504..b53c1e122 100644 --- a/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S +++ b/src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S @@ -120,7 +120,7 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_pr // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -344,16 +344,14 @@ BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_pr VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # QC8 FP32 quantization VLD1.8 {q0-q1}, [r9]! diff --git a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in index a487bd77a..6d2aceae1 100644 --- a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in +++ b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a53.S.in @@ -177,7 +177,7 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -454,16 +454,14 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - $if REQUANTIZATION == "RNDNU": # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift diff --git a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in index d56f87b25..7047ae494 100644 --- a/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in +++ b/src/qs8-gemm/4x8-aarch32-neon-mlal-lane-cortex-a7.S.in @@ -89,11 +89,12 @@ $else: BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 $if DATATYPE == "QU8": + SUB sp, sp, 8 // +8 VPUSH {d8-d14} // +56 = 96 $else: - SUB sp, sp, 8 // +8 + SUB sp, sp, 16 // +16 VPUSH {d8-d13} // +48 = 96 LDR r7, [sp, 96] // a_stride @@ -178,7 +179,7 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -186,8 +187,6 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke 1: // Extend - 5 cycles ${XXTL} q0, d0 - $if PREFETCH: - PLD [r3, 128] $if DATATYPE == "QU8": VSUBL.U8 q4, d8, d14 $else: @@ -195,14 +194,8 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke $if PREFETCH: PLD [r9, 448] ${XXTL} q1, d2 - $if PREFETCH: - PLD [r12, 128] ${XXTL} q2, d4 - $if PREFETCH: - PLD [r0, 128] ${XXTL} q3, d6 - $if PREFETCH: - PLD [r10, 128] // BLOCK 0 - 10 cycles VLD1.8 {d10}, [r9]! 
// B1 @@ -443,16 +436,14 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - $if REQUANTIZATION == "RNDNU": # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift @@ -607,11 +598,12 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke $if DATATYPE == "QU8": VPOP {d8-d14} - ADD sp, sp, 4 // skip r2 + ADD sp, sp, 8 // skip pad of 8 $else: VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -767,12 +759,12 @@ BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_uke 7: $if DATATYPE == "QU8": VPOP {d8-d14} - ADD sp, sp, 4 // skip r2 + ADD sp, sp, 8 // skip pad of 8 $else: VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_a7 diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S index e11b7d479..47e7fac24 100644 --- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S +++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S @@ -113,7 +113,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -332,16 +332,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S index 3da9e5ccf..10a2fc23e 100644 --- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S +++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S @@ -55,8 +55,8 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 - SUB sp, sp, 8 // +8 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 + SUB sp, sp, 16 // +16 VPUSH {d8-d13} // +48 = 96 LDR r7, [sp, 96] // a_stride @@ -114,7 +114,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -321,16 +321,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift @@ -409,8 +407,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor BHI 0b VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -544,9 +543,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor 7: VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7 diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S index 96f647dd0..7d0d514a4 100644 --- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S +++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S @@ -123,7 +123,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -347,16 +347,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S index 30af9cc2c..bf26435eb 100644 --- a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S +++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S @@ -55,8 +55,8 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 - SUB sp, sp, 8 // +8 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 + SUB sp, sp, 16 // +16 VPUSH {d8-d13} // +48 = 96 LDR r7, [sp, 96] // a_stride @@ -124,7 +124,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -132,15 +132,11 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf 1: // Extend - 5 cycles VMOVL.S8 q0, d0 - PLD [r3, 128] VMOVL.S8 q4, d8 PLD [r9, 448] VMOVL.S8 q1, d2 - PLD [r12, 128] VMOVL.S8 q2, d4 - PLD [r0, 128] VMOVL.S8 q3, d6 - PLD [r10, 128] // BLOCK 0 - 10 cycles VLD1.8 {d10}, [r9]! // B1 @@ -336,16 +332,14 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift @@ -424,8 +418,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf BHI 0b VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -559,9 +554,9 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf 7: VPOP {d8-d13} - ADD sp, sp, 12 // skip pad of 8 + r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 16 // skip pad of 8 + d14 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7 diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S index f610bb364..b0350b6cd 100644 --- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S +++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S @@ -114,7 +114,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -333,16 +333,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S index 06d930e2b..8b5798c40 100644 --- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S +++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a7.S @@ -56,7 +56,8 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 + SUB sp, sp, 8 // +8 VPUSH {d8-d14} // +56 = 96 LDR r7, [sp, 96] // a_stride @@ -115,7 +116,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -322,16 +323,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift @@ -410,8 +409,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor BHI 0b VPOP {d8-d14} - ADD sp, sp, 4 // skip r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 8 // skip pad of 8 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -545,9 +545,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cor 7: VPOP {d8-d14} - ADD sp, sp, 4 // skip r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 8 // skip pad of 8 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7 diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S index d1b628554..2a8794dd1 100644 --- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S +++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S @@ -124,7 +124,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf // Main loop - 8 bytes // 64 bytes for weights. - // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -348,16 +348,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift diff --git a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S index cc5f43461..b7231c21a 100644 --- a/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S +++ b/src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a7.S @@ -56,7 +56,8 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7 # Push 96 bytes - PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11, lr} // 40 + PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32 + SUB sp, sp, 8 // +8 VPUSH {d8-d14} // +56 = 96 LDR r7, [sp, 96] // a_stride @@ -125,7 +126,7 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf // Main loop - 8 bytes // 64 bytes for weights. 
- // 5 vmovl = 4 A and 1 B = 5 cycles + // 5 VMOVL = 4 A and 1 B = 5 cycles // 7 blocks with VLD B, VMOVL, 8 VMLA = 10 cycles // 1 blocks with VLD B, VMLA = 9 cycles // total = 84 cycles @@ -133,15 +134,11 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf 1: // Extend - 5 cycles VMOVL.U8 q0, d0 - PLD [r3, 128] VSUBL.U8 q4, d8, d14 PLD [r9, 448] VMOVL.U8 q1, d2 - PLD [r12, 128] VMOVL.U8 q2, d4 - PLD [r0, 128] VMOVL.U8 q3, d6 - PLD [r10, 128] // BLOCK 0 - 10 cycles VLD1.8 {d10}, [r9]! // B1 @@ -337,16 +334,14 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf VMLAL.S16 q11, d11, d3[3] VMLAL.S16 q12, d10, d5[3] VMLAL.S16 q13, d11, d5[3] + ADDS r5, r5, 8 VMLAL.S16 q14, d10, d7[3] VMLAL.S16 q15, d11, d7[3] - # Is there a remainder?- 1-7 bytes of A - ADDS r5, r5, 8 BNE 3f 2: - # RNDNU quantization VDUP.32 q0, d12[0] // right_pre_shift @@ -425,8 +420,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf BHI 0b VPOP {d8-d14} - ADD sp, sp, 4 // skip r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + ADD sp, sp, 8 // skip pad of 8 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr # Remainder- 1 to 7 bytes of A .p2align 3 @@ -560,9 +556,9 @@ BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prf 7: VPOP {d8-d14} - ADD sp, sp, 4 // skip r2 - POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} - + ADD sp, sp, 8 // skip pad of 8 + POP {r4, r5, r6, r7, r8, r9, r10, r11} + BX lr END_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7 -- cgit v1.2.3
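A minimal sketch of the revised Cortex-A7 prologue/epilogue pattern that the QS8/QC8 hunks
above converge on (assembled here for illustration only; register roles and stack offsets are
taken from the kernels, this is not a standalone function). With r2 and lr no longer pushed,
the frame is padded back up to 96 bytes so the stack-relative parameter loads such as
LDR r7, [sp, 96] keep their offsets, and the kernels now return with BX lr instead of popping pc:

        # Push 96 bytes
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}  // 32 bytes of GPRs (r2/lr no longer saved)
        SUB     sp, sp, 16                          // +16 pad keeps the 96-byte frame
        VPUSH   {d8-d13}                            // +48 = 96
        LDR     r7, [sp, 96]                        // a_stride, offset unchanged

        // ... main loop and quantization epilogue ...

        VPOP    {d8-d13}
        ADD     sp, sp, 16                          // undo the 16-byte pad
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr                                  // return via LR instead of popping pc

The arithmetic matches the old layout: previously 40 (r2 + 8 GPRs + lr) + 8 pad + 48 VPUSH = 96;
now 32 + 16 + 48 = 96. The QU8 variants, which also save d14, pad by 8 instead (32 + 8 + 56 = 96).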