author      Wilco Dijkstra <wdijkstr@arm.com>        2020-01-02 13:36:34 +0000
committer   Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-01-02 13:36:34 +0000
commit      833e86096b1c38218670459a4c11bf9c790a96d0 (patch)
tree        88e48be64786d4edf2d75416d6cb8f0c8002d29e
parent      31b560bc3b82ae45044e6455493ce6783aa94d98 (diff)
string: Use L(name) for labels
Use L(name) for all assembler labels.
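For readers unfamiliar with the convention: the .S sources are run through the C preprocessor, so L() can be an ordinary macro that pastes the .L local-label prefix onto the name. Below is a minimal sketch of the idea under that assumption; the real definition lives in the project's shared assembler header (the same one that provides the ENTRY/END macros used throughout these files), and the routine shown is a made-up example, not code from this patch.

    /* Assumed definition: paste the .L prefix onto the label name. */
    #define L(name) .L ## name

    ENTRY (example_func)            /* hypothetical routine, for illustration only */
            cbz     x1, L(done)     /* was: cbz x1, .Ldone */
    L(loop):                        /* expands to .Lloop: */
            subs    x1, x1, #1
            b.ne    L(loop)         /* was: b.ne .Lloop */
    L(done):
            ret
    END (example_func)

Because L(name) expands to exactly the old .Lname spelling, the labels stay local (GNU as omits .L-prefixed labels from the object file's symbol table) and the generated code is unchanged; only the source spelling becomes uniform across the aarch64 and arm files.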
-rw-r--r--   string/aarch64/memchr.S       |  24
-rw-r--r--   string/aarch64/strchr.S       |  10
-rw-r--r--   string/aarch64/strchrnul.S    |  10
-rw-r--r--   string/aarch64/strcpy.S       |  40
-rw-r--r--   string/aarch64/strlen.S       |   2
-rw-r--r--   string/aarch64/strncmp.S      |  62
-rw-r--r--   string/aarch64/strnlen.S      |  24
-rw-r--r--   string/arm/memcpy.S           |  34
-rw-r--r--   string/arm/strcmp.S           | 134
-rw-r--r--   string/arm/strlen-armv6t2.S   |  20
10 files changed, 180 insertions(+), 180 deletions(-)
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 6ffade1..10be49e 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -48,7 +48,7 @@ ENTRY (__memchr_aarch64) /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. @@ -61,7 +61,7 @@ ENTRY (__memchr_aarch64) dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome @@ -84,25 +84,25 @@ ENTRY (__memchr_aarch64) lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b @@ -110,9 +110,9 @@ ENTRY (__memchr_aarch64) addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hi L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -121,7 +121,7 @@ ENTRY (__memchr_aarch64) lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -136,7 +136,7 @@ ENTRY (__memchr_aarch64) csel result, xzr, result, eq ret -.Lzero_length: +L(zero_length): mov result, #0 ret diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 66a1fdd..00d9be3 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -61,7 +61,7 @@ ENTRY (__strchr_aarch64) dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -87,9 +87,9 @@ ENTRY (__strchr_aarch64) mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -101,7 +101,7 @@ ENTRY (__strchr_aarch64) orr vend1.16b, vend1.16b, vend2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -115,7 +115,7 @@ ENTRY (__strchr_aarch64) addp vend1.16b, vend1.16b, vend2.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. 
*/ diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 697dbf4..81264ea 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -55,7 +55,7 @@ ENTRY (__strchrnul_aarch64) bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -79,9 +79,9 @@ ENTRY (__strchrnul_aarch64) mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -93,7 +93,7 @@ ENTRY (__strchrnul_aarch64) orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -103,7 +103,7 @@ ENTRY (__strchrnul_aarch64) addp vend1.16b, vend1.16b, vend1.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 766e71b..4edffcf 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -99,9 +99,9 @@ ENTRY (STRCPY) srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte aligned string will never fail the page align check, so will always take the fast path. */ - b.gt .Lpage_cross + b.gt L(page_cross) -.Lpage_cross_ok: +L(page_cross_ok): ldp data1, data2, [srcin] #ifdef __AARCH64EB__ /* Because we expect the end to be found within 16 characters @@ -113,7 +113,7 @@ ENTRY (STRCPY) sub tmp1, tmp2, zeroones orr tmp2, tmp2, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) rev tmp4, data2 sub tmp3, tmp4, zeroones orr tmp4, tmp4, #REP8_7f @@ -121,17 +121,17 @@ ENTRY (STRCPY) sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) sub tmp3, data2, zeroones orr tmp4, data2, #REP8_7f #endif bics has_nul2, tmp3, tmp4 - b.eq .Lbulk_entry + b.eq L(bulk_entry) /* The string is short (<=16 bytes). We don't know exactly how short though, yet. Work out the exact length so that we can quickly select the optimal copy strategy. */ -.Lfp_gt8: +L(fp_gt8): rev has_nul2, has_nul2 clz pos, has_nul2 mov tmp2, #56 @@ -149,12 +149,12 @@ ENTRY (STRCPY) #endif ret -.Lfp_le8: +L(fp_le8): rev has_nul1, has_nul1 clz pos, has_nul1 add dst, dstin, pos, lsr #3 /* Bits to bytes. */ subs tmp2, pos, #24 /* Pos in bits. */ - b.lt .Lfp_lt4 + b.lt L(fp_lt4) #ifdef __AARCH64EB__ mov tmp2, #56 sub pos, tmp2, pos @@ -170,15 +170,15 @@ ENTRY (STRCPY) mov dstin, dst #endif ret -.Lfp_lt4: - cbz pos, .Lfp_lt2 +L(fp_lt4): + cbz pos, L(fp_lt2) /* 2->3 bytes to copy. */ #ifdef __AARCH64EB__ lsr data1, data1, #48 #endif strh data1w, [dstin] /* Fall-through, one byte (max) to go. */ -.Lfp_lt2: +L(fp_lt2): /* Null-terminated string. Last character must be zero! */ strb wzr, [dst] #ifdef BUILD_STPCPY @@ -189,20 +189,20 @@ ENTRY (STRCPY) .p2align 6 /* Aligning here ensures that the entry code and main loop all lies within one 64-byte cache line. 
*/ -.Lbulk_entry: +L(bulk_entry): sub to_align, to_align, #16 stp data1, data2, [dstin] sub src, srcin, to_align sub dst, dstin, to_align - b .Lentry_no_page_cross + b L(entry_no_page_cross) /* The inner loop deals with two Dwords at a time. This has a slightly higher start-up cost, but we should win quite quickly, especially on cores with a high number of issue slots per cycle, as we get much better parallelism out of the operations. */ -.Lmain_loop: +L(main_loop): stp data1, data2, [dst], #16 -.Lentry_no_page_cross: +L(entry_no_page_cross): ldp data1, data2, [src], #16 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -211,7 +211,7 @@ ENTRY (STRCPY) bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lmain_loop + b.eq L(main_loop) /* Since we know we are copying at least 16 bytes, the fastest way to deal with the tail is to determine the location of the @@ -244,7 +244,7 @@ ENTRY (STRCPY) #endif ret -.Lpage_cross: +L(page_cross): bic src, srcin, #15 /* Start by loading two words at [srcin & ~15], then forcing the bytes that precede srcin to 0xff. This means they never look @@ -270,7 +270,7 @@ ENTRY (STRCPY) bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lpage_cross_ok + b.eq L(page_cross_ok) /* We now need to make data1 and data2 look like they've been loaded directly from srcin. Do a rotate on the 128-bit value. */ lsl tmp1, to_align, #3 /* Bytes->bits. */ @@ -301,8 +301,8 @@ ENTRY (STRCPY) orr tmp4, data2, #REP8_7f #endif bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lfp_le8 + cbnz has_nul1, L(fp_le8) bic has_nul2, tmp3, tmp4 - b .Lfp_gt8 + b L(fp_gt8) END (STRCPY) diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 8a7d753..2293f73 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -114,7 +114,7 @@ L(main_loop_entry): sub src, src, 16 L(main_loop): ldp data1, data2, [src, 32]! -.Lpage_cross_entry: +L(page_cross_entry): sub tmp1, data1, zeroones sub tmp3, data2, zeroones orr tmp2, tmp1, tmp3 diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index e29fb7d..fbd08ee 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -46,13 +46,13 @@ nop /* Pad so that the loop below fits a cache line. */ .endr ENTRY_ALIGN (__strncmp_aarch64, 0) - cbz limit, .Lret0 + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 - b.ne .Lmisaligned8 - cbnz count, .Lmutual_align + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -61,10 +61,10 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -72,15 +72,15 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned + b.eq L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ /* Not reached the limit, must have found the end or a diff. 
*/ - tbz limit_wd, #63, .Lnot_limit + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 @@ -95,7 +95,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) /* Make sure that the NUL byte is marked in the syndrome. */ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul #ifndef __AARCH64EB__ @@ -148,7 +148,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) ret #endif -.Lmutual_align: +L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. @@ -176,56 +176,56 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) .p2align 6 /* Don't bother with dwords for up to 16 bytes. */ -.Lmisaligned8: +L(misaligned8): cmp limit, #16 - b.hs .Ltry_misaligned_words + b.hs L(try_misaligned_words) -.Lbyte_loop: +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lbyte_loop -.Ldone: + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ -.Ltry_misaligned_words: +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz count, .Ldo_misaligned + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count lsr limit_wd, limit, #3 -.Lpage_end_loop: +L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne .Ldone + b.ne L(done) subs count, count, #1 - b.hi .Lpage_end_loop + b.hi L(page_end_loop) -.Ldo_misaligned: +L(do_misaligned): /* Prepare ourselves for the next page crossing. Unlike the aligned loop, we fetch 1 less dword because we risk crossing bounds on SRC2. */ mov count, #8 subs limit_wd, limit_wd, #1 - b.lo .Ldone_loop -.Lloop_misaligned: + b.lo L(done_loop) +L(loop_misaligned): and tmp2, src2, #0xff8 eor tmp2, tmp2, #0xff8 - cbz tmp2, .Lpage_end_loop + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -234,14 +234,14 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) subs limit_wd, limit_wd, #1 - b.pl .Lloop_misaligned + b.pl L(loop_misaligned) -.Ldone_loop: +L(done_loop): /* We found a difference or a NULL before the limit was reached. */ and limit, limit, #7 - cbz limit, .Lnot_limit + cbz limit, L(not_limit) /* Read the last word. */ sub src1, src1, 8 sub src2, src2, 8 @@ -252,9 +252,9 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index bf72686..df66b60 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -38,22 +38,22 @@ .text .p2align 6 -.Lstart: +L(start): /* Pre-pad to ensure critical loop begins an icache line. 
*/ .rep 7 nop .endr /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: +L(hit_limit): mov len, limit ret ENTRY_ALIGN (__strnlen_aarch64, 0) - cbz limit, .Lhit_limit + cbz limit, L(hit_limit) mov zeroones, #REP8_01 bic src, srcin, #15 ands tmp1, srcin, #15 - b.ne .Lmisaligned + b.ne L(misaligned) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ @@ -67,9 +67,9 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) cycle, as we get much better parallelism out of the operations. */ /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: +L(loop): ldp data1, data2, [src], #16 -.Lrealigned: +L(realigned): sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f sub tmp3, data2, zeroones @@ -79,24 +79,24 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) subs limit_wd, limit_wd, #1 orr tmp1, has_nul1, has_nul2 ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop + b.eq L(loop) /* End of critical section -- keep to one 64Byte cache line. */ orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ + cbz tmp1, L(hit_limit) /* No null in final Qword. */ /* We know there's a null in the final Qword. The easiest thing to do now is work out the length of the string and return MIN (len, limit). */ sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 + cbz has_nul1, L(nul_in_data2) #ifdef __AARCH64EB__ mov data2, data1 #endif sub len, len, #8 mov has_nul2, has_nul1 -.Lnul_in_data2: +L(nul_in_data2): #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul directly. The @@ -115,7 +115,7 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) csel len, len, limit, ls /* Return the lower value. */ ret -.Lmisaligned: +L(misaligned): /* Deal with a partial first word. We're doing two things in parallel here; 1) Calculate the number of words (but avoiding overflow if @@ -150,6 +150,6 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) csinv data1, data1, xzr, le csel data2, data2, data2a, le - b .Lrealigned + b L(realigned) END (__strnlen_aarch64) diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index e8f5843..aab78a2 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -124,11 +124,11 @@ ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. */ cmp count, #64 - bge .Lcpy_not_short + bge L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ -.Ltail63unaligned: +L(tail63unaligned): #ifdef USE_NEON and tmp1, count, #0x38 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) @@ -207,13 +207,13 @@ ENTRY (__memcpy_arm) strbne src, [dst] bx lr -.Lcpy_not_short: +L(cpy_not_short): /* At least 64 bytes to copy, but don't know the alignment yet. */ str tmp2, [sp, #-FRAME_SIZE]! and tmp2, src, #7 and tmp1, dst, #7 cmp tmp1, tmp2 - bne .Lcpy_notaligned + bne L(cpy_notaligned) #ifdef USE_VFP /* Magic dust alert! Force VFP on Cortex-A9. Experiments show @@ -239,12 +239,12 @@ ENTRY (__memcpy_arm) 1: subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned + blt L(tail63aligned) cmp tmp2, #512 - bge .Lcpy_body_long + bge L(cpy_body_long) -.Lcpy_body_medium: /* Count in tmp2. */ +L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP 1: vldr d0, [src, #0] @@ -268,9 +268,9 @@ ENTRY (__memcpy_arm) add dst, dst, #64 bge 1b tst tmp2, #0x3f - beq .Ldone + beq L(done) -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. 
*/ and tmp1, tmp2, #0x38 add dst, dst, tmp1 add src, src, tmp1 @@ -321,7 +321,7 @@ ENTRY (__memcpy_arm) add src, src, #8 add dst, dst, #8 -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. */ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but we know that the src and dest are 64-bit aligned so we can use LDRD/STRD to improve efficiency. */ @@ -358,11 +358,11 @@ ENTRY (__memcpy_arm) strhcs tmp1, [dst], #2 strbne tmp2, [dst] -.Ldone: +L(done): ldr tmp2, [sp], #FRAME_SIZE bx lr -.Lcpy_body_long: /* Count in tmp2. */ +L(cpy_body_long): /* Count in tmp2. */ /* Long copy. We know that there's at least (prefetch_lines * 64) bytes to go. */ @@ -419,7 +419,7 @@ ENTRY (__memcpy_arm) vstr d2, [dst, #64 + 56] add dst, dst, #128 add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium + b L(cpy_body_medium) #else /* Long copy. Use an SMS style loop to maximize the I/O bandwidth of the core. We don't have enough spare registers @@ -473,12 +473,12 @@ ENTRY (__memcpy_arm) ldrd D_l, D_h, [sp, #24] add dst, dst, #72 tst tmp2, #0x3f - bne .Ltail63aligned + bne L(tail63aligned) ldr tmp2, [sp], #FRAME_SIZE bx lr #endif -.Lcpy_notaligned: +L(cpy_notaligned): pld [src] pld [src, #64] /* There's at least 64 bytes to copy, but there is no mutual @@ -500,7 +500,7 @@ ENTRY (__memcpy_arm) pld [src, #(3 * 64)] subs count, count, #64 ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned + bmi L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON @@ -581,7 +581,7 @@ ENTRY (__memcpy_arm) ands count, tmp2, #0x3f #endif ldr tmp2, [sp], #FRAME_SIZE - bne .Ltail63unaligned + bne L(tail63unaligned) bx lr END (__memcpy_arm) diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index b46bc6d..295db8b 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -125,9 +125,9 @@ .text .p2align 5 -.Lstrcmp_start_addr: +L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 -.Lfastpath_exit: +L(fastpath_exit): sub r0, r2, r3 bx lr nop @@ -139,7 +139,7 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp r2, #1 it cs cmpcs r2, r3 - bne .Lfastpath_exit + bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! .cfi_def_cfa_offset 16 @@ -151,12 +151,12 @@ ENTRY_ALIGN (__strcmp_arm, 0) .cfi_offset 7, -4 mvn const_m1, #0 lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 + cbz r2, L(loop_aligned8) -.Lnot_aligned: +L(not_aligned): eor tmp1, src1, src2 tst tmp1, #7 - bne .Lmisaligned8 + bne L(misaligned8) /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ @@ -173,29 +173,29 @@ ENTRY_ALIGN (__strcmp_arm, 0) S2HI tmp1, const_m1, tmp2 orn data1a, data1a, tmp1 orn data2a, data2a, tmp1 - beq .Lstart_realigned8 + beq L(start_realigned8) orn data1b, data1b, tmp1 mov data1a, const_m1 orn data2b, data2b, tmp1 mov data2a, const_m1 - b .Lstart_realigned8 + b L(start_realigned8) /* Unwind the inner loop by a factor of 2, giving 16 bytes per pass. */ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: +L(loop_aligned8): ldrd data1a, data1b, [src1], #16 ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: +L(start_realigned8): uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a + cbnz syndrome_a, L(diff_in_a) uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. 
*/ eor syndrome_b, data1b, data2b sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b + cbnz syndrome_b, L(diff_in_b) ldrd data1a, data1b, [src1, #-8] ldrd data2a, data2b, [src2, #-8] @@ -207,47 +207,47 @@ ENTRY_ALIGN (__strcmp_arm, 0) sel syndrome_b, syndrome_b, const_m1 /* Can't use CBZ for backwards branch. */ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 + beq L(loop_aligned8) -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a +L(diff_found): + cbnz syndrome_a, L(diff_in_a) -.Ldiff_in_b: +L(diff_in_b): strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 -.Ldiff_in_a: +L(diff_in_a): .cfi_restore_state strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 .cfi_restore_state -.Lmisaligned8: +L(misaligned8): tst tmp1, #3 - bne .Lmisaligned4 + bne L(misaligned4) ands tmp1, src1, #3 - bne .Lmutual_align4 + bne L(mutual_align4) /* Unrolled by a factor of 2, to reduce the number of post-increment operations. */ -.Lloop_aligned4: +L(loop_aligned4): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned4: +L(start_realigned4): uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done + cbnz syndrome, L(aligned4_done) ldr data1, [src1, #-4] ldr data2, [src2, #-4] uadd8 syndrome, data1, const_m1 eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cmp syndrome, #0 - beq .Lloop_aligned4 + beq L(loop_aligned4) -.Laligned4_done: +L(aligned4_done): strcmp_epilogue_aligned syndrome, data1, data2, 0 -.Lmutual_align4: +L(mutual_align4): .cfi_restore_state /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ @@ -262,57 +262,57 @@ ENTRY_ALIGN (__strcmp_arm, 0) S2HI tmp1, const_m1, tmp1 orn data1, data1, tmp1 orn data2, data2, tmp1 - b .Lstart_realigned4 + b L(start_realigned4) -.Lmisaligned4: +L(misaligned4): ands tmp1, src1, #3 - beq .Lsrc1_aligned + beq L(src1_aligned) sub src2, src2, tmp1 bic src1, src1, #3 lsls tmp1, tmp1, #31 ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 + beq L(aligned_m2) + bcs L(aligned_m1) #if STRCMP_NO_PRECHECK == 1 ldrb data2, [src2, #1] uxtb tmp1, data1, ror #BYTE1_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m1: +L(aligned_m1): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit + bne L(misaligned_exit) add src2, src2, #4 - cbnz data2, .Lsrc1_aligned + cbnz data2, L(src1_aligned) #else /* STRCMP_NO_PRECHECK */ /* If we've done the pre-check, then we don't need to check the first byte again here. 
*/ ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 + bne L(misaligned_exit) + cbnz data2, L(aligned_m1) #endif -.Lmisaligned_exit: +L(misaligned_exit): .cfi_remember_state mov result, tmp1 ldr r4, [sp], #16 @@ -320,10 +320,10 @@ ENTRY_ALIGN (__strcmp_arm, 0) bx lr #if STRCMP_NO_PRECHECK == 0 -.Laligned_m1: +L(aligned_m1): add src2, src2, #4 #endif -.Lsrc1_aligned: +L(src1_aligned): .cfi_restore_state /* src1 is word aligned, but src2 has no common alignment with it. */ @@ -332,11 +332,11 @@ ENTRY_ALIGN (__strcmp_arm, 0) bic src2, src2, #3 ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: +L(overlap3): bic tmp1, data1, #MSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #8 @@ -348,14 +348,14 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp tmp1, data2, S2HI #24 bne 6f ldr data1, [src1], #4 - b .Loverlap3 + b L(overlap3) 4: S2LO data2, data2, #8 - b .Lstrcmp_tail + b L(strcmp_tail) 5: bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) /* We can only get here if the MSB of data1 contains 0, so fast-path the exit. */ @@ -374,10 +374,10 @@ ENTRY_ALIGN (__strcmp_arm, 0) .cfi_restore_state S2LO data1, data1, #24 and data2, data2, #LSB - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap2: +L(overlap2): and tmp1, data1, const_m1, S2LO #16 uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #16 @@ -389,28 +389,28 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp tmp1, data2, S2HI #16 bne 6f ldr data1, [src1], #4 - b .Loverlap2 + b L(overlap2) 4: S2LO data2, data2, #16 - b .Lstrcmp_tail + b L(strcmp_tail) 5: ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldrh data2, [src2] S2LO data1, data1, #16 #ifdef __ARM_BIG_ENDIAN lsl data2, data2, #16 #endif - b .Lstrcmp_tail + b L(strcmp_tail) 6: S2LO data1, data1, #16 and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ -.Loverlap1: +L(overlap1): and tmp1, data1, #LSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #24 @@ -422,20 +422,20 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp tmp1, data2, S2HI #8 bne 6f ldr data1, [src1], #4 - b .Loverlap1 + b L(overlap1) 4: S2LO data2, data2, #24 - b .Lstrcmp_tail + b L(strcmp_tail) 5: tst syndrome, #LSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldr data2, [src2] 6: S2LO data1, data1, #8 bic data2, data2, #MSB - b .Lstrcmp_tail + b L(strcmp_tail) -.Lstrcmp_done_equal: +L(strcmp_done_equal): mov result, #0 .cfi_remember_state ldrd r4, r5, [sp], #16 @@ -446,7 +446,7 @@ ENTRY_ALIGN (__strcmp_arm, 0) .cfi_restore 7 bx lr -.Lstrcmp_tail: +L(strcmp_tail): .cfi_restore_state #ifndef __ARM_BIG_ENDIAN rev data1, data1 diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 7245440..76e6930 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -45,20 +45,20 @@ ENTRY (__strlen_armv6t2) mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ pld [src, #32] - bne.w .Lmisaligned8 + bne.w L(misaligned8) mov const_0, #0 mov result, #-8 -.Lloop_aligned: +L(loop_aligned): /* Bytes 0-7. */ ldrd data1a, data1b, [src] pld [src, #64] add result, result, #8 -.Lstart_realigned: +L(start_realigned): uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 8-15. */ ldrd data1a, data1b, [src, #8] @@ -67,7 +67,7 @@ ENTRY (__strlen_armv6t2) sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 16-23. */ ldrd data1a, data1b, [src, #16] @@ -76,7 +76,7 @@ ENTRY (__strlen_armv6t2) sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 24-31. */ ldrd data1a, data1b, [src, #24] @@ -87,9 +87,9 @@ ENTRY (__strlen_armv6t2) uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cmp data1b, #0 - beq .Lloop_aligned + beq L(loop_aligned) -.Lnull_found: +L(null_found): cmp data1a, #0 itt eq addeq result, result, #4 @@ -102,7 +102,7 @@ ENTRY (__strlen_armv6t2) add result, result, data1a, lsr #3 /* Bits -> Bytes. */ bx lr -.Lmisaligned8: +L(misaligned8): ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 @@ -115,6 +115,6 @@ ENTRY (__strlen_armv6t2) ornne data1b, data1b, tmp2 movne data1a, const_m1 mov const_0, #0 - b .Lstart_realigned + b L(start_realigned) END (__strlen_armv6t2) |