author    Wilco Dijkstra <wdijkstr@arm.com>      2020-01-02 13:36:34 +0000
committer Szabolcs Nagy <szabolcs.nagy@arm.com>  2020-01-02 13:36:34 +0000
commit    833e86096b1c38218670459a4c11bf9c790a96d0 (patch)
tree      88e48be64786d4edf2d75416d6cb8f0c8002d29e
parent    31b560bc3b82ae45044e6455493ce6783aa94d98 (diff)
string: Use L(name) for labels
Use L(name) for all assembler labels.
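
For context, L(name) is the usual local-label helper in preprocessed (.S) assembly sources: it expands to a .L-prefixed symbol, which the assembler keeps local to the object file, while the macro keeps call sites terse and the prefix changeable in one place. A minimal sketch of such a macro and its use, assuming a definition along these lines (the project keeps its own copy in a shared asm header; register names below are illustrative):

#define L(label) .L ## label            /* paste the .L local-symbol prefix onto the name */

        /* A call site then reads L(name) rather than .Lname: */
        cbz     x1, L(zero_length)      /* assembles as: cbz x1, .Lzero_length */

L(zero_length):                         /* assembles as: .Lzero_length: */
        mov     x0, #0
        ret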
Diffstat:
-rw-r--r--  string/aarch64/memchr.S       |  24
-rw-r--r--  string/aarch64/strchr.S       |  10
-rw-r--r--  string/aarch64/strchrnul.S    |  10
-rw-r--r--  string/aarch64/strcpy.S       |  40
-rw-r--r--  string/aarch64/strlen.S       |   2
-rw-r--r--  string/aarch64/strncmp.S      |  62
-rw-r--r--  string/aarch64/strnlen.S      |  24
-rw-r--r--  string/arm/memcpy.S           |  34
-rw-r--r--  string/arm/strcmp.S           | 134
-rw-r--r--  string/arm/strlen-armv6t2.S   |  20
10 files changed, 180 insertions, 180 deletions
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 6ffade1..10be49e 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -48,7 +48,7 @@
ENTRY (__memchr_aarch64)
/* Do not dereference srcin if no bytes to compare. */
- cbz cntin, .Lzero_length
+ cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
@@ -61,7 +61,7 @@ ENTRY (__memchr_aarch64)
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
- b.eq .Lloop
+ b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
@@ -84,25 +84,25 @@ ENTRY (__memchr_aarch64)
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
- b.ls .Lmasklast
+ b.ls L(masklast)
/* Have we found something already? */
- cbnz synd, .Ltail
+ cbnz synd, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
- b.ls .Lend
+ b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
- cbz synd, .Lloop
+ cbz synd, L(loop)
-.Lend:
+L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
@@ -110,9 +110,9 @@ ENTRY (__memchr_aarch64)
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi .Ltail
+ b.hi L(tail)
-.Lmasklast:
+L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
@@ -121,7 +121,7 @@ ENTRY (__memchr_aarch64)
lsl synd, synd, tmp
lsr synd, synd, tmp
-.Ltail:
+L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
@@ -136,7 +136,7 @@ ENTRY (__memchr_aarch64)
csel result, xzr, result, eq
ret
-.Lzero_length:
+L(zero_length):
mov result, #0
ret
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 66a1fdd..00d9be3 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -61,7 +61,7 @@ ENTRY (__strchr_aarch64)
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -87,9 +87,9 @@ ENTRY (__strchr_aarch64)
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
@@ -101,7 +101,7 @@ ENTRY (__strchr_aarch64)
orr vend1.16b, vend1.16b, vend2.16b
addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.d[0]
- cbz tmp1, .Lloop
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
@@ -115,7 +115,7 @@ ENTRY (__strchr_aarch64)
addp vend1.16b, vend1.16b, vend2.16b // 128->64
mov tmp1, vend1.d[0]
-.Ltail:
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 697dbf4..81264ea 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -55,7 +55,7 @@ ENTRY (__strchrnul_aarch64)
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask.4s, wtmp2
ands tmp1, srcin, #31
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -79,9 +79,9 @@ ENTRY (__strchrnul_aarch64)
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
@@ -93,7 +93,7 @@ ENTRY (__strchrnul_aarch64)
orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.d[0]
- cbz tmp1, .Lloop
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
@@ -103,7 +103,7 @@ ENTRY (__strchrnul_aarch64)
addp vend1.16b, vend1.16b, vend1.16b // 128->64
mov tmp1, vend1.d[0]
-.Ltail:
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 766e71b..4edffcf 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -99,9 +99,9 @@ ENTRY (STRCPY)
srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
aligned string will never fail the page align check, so will
always take the fast path. */
- b.gt .Lpage_cross
+ b.gt L(page_cross)
-.Lpage_cross_ok:
+L(page_cross_ok):
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* Because we expect the end to be found within 16 characters
@@ -113,7 +113,7 @@ ENTRY (STRCPY)
sub tmp1, tmp2, zeroones
orr tmp2, tmp2, #REP8_7f
bics has_nul1, tmp1, tmp2
- b.ne .Lfp_le8
+ b.ne L(fp_le8)
rev tmp4, data2
sub tmp3, tmp4, zeroones
orr tmp4, tmp4, #REP8_7f
@@ -121,17 +121,17 @@ ENTRY (STRCPY)
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bics has_nul1, tmp1, tmp2
- b.ne .Lfp_le8
+ b.ne L(fp_le8)
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
#endif
bics has_nul2, tmp3, tmp4
- b.eq .Lbulk_entry
+ b.eq L(bulk_entry)
/* The string is short (<=16 bytes). We don't know exactly how
short though, yet. Work out the exact length so that we can
quickly select the optimal copy strategy. */
-.Lfp_gt8:
+L(fp_gt8):
rev has_nul2, has_nul2
clz pos, has_nul2
mov tmp2, #56
@@ -149,12 +149,12 @@ ENTRY (STRCPY)
#endif
ret
-.Lfp_le8:
+L(fp_le8):
rev has_nul1, has_nul1
clz pos, has_nul1
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
subs tmp2, pos, #24 /* Pos in bits. */
- b.lt .Lfp_lt4
+ b.lt L(fp_lt4)
#ifdef __AARCH64EB__
mov tmp2, #56
sub pos, tmp2, pos
@@ -170,15 +170,15 @@ ENTRY (STRCPY)
mov dstin, dst
#endif
ret
-.Lfp_lt4:
- cbz pos, .Lfp_lt2
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
/* 2->3 bytes to copy. */
#ifdef __AARCH64EB__
lsr data1, data1, #48
#endif
strh data1w, [dstin]
/* Fall-through, one byte (max) to go. */
-.Lfp_lt2:
+L(fp_lt2):
/* Null-terminated string. Last character must be zero! */
strb wzr, [dst]
#ifdef BUILD_STPCPY
@@ -189,20 +189,20 @@ ENTRY (STRCPY)
.p2align 6
/* Aligning here ensures that the entry code and main loop all lies
within one 64-byte cache line. */
-.Lbulk_entry:
+L(bulk_entry):
sub to_align, to_align, #16
stp data1, data2, [dstin]
sub src, srcin, to_align
sub dst, dstin, to_align
- b .Lentry_no_page_cross
+ b L(entry_no_page_cross)
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
-.Lmain_loop:
+L(main_loop):
stp data1, data2, [dst], #16
-.Lentry_no_page_cross:
+L(entry_no_page_cross):
ldp data1, data2, [src], #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@@ -211,7 +211,7 @@ ENTRY (STRCPY)
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lmain_loop
+ b.eq L(main_loop)
/* Since we know we are copying at least 16 bytes, the fastest way
to deal with the tail is to determine the location of the
@@ -244,7 +244,7 @@ ENTRY (STRCPY)
#endif
ret
-.Lpage_cross:
+L(page_cross):
bic src, srcin, #15
/* Start by loading two words at [srcin & ~15], then forcing the
bytes that precede srcin to 0xff. This means they never look
@@ -270,7 +270,7 @@ ENTRY (STRCPY)
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lpage_cross_ok
+ b.eq L(page_cross_ok)
/* We now need to make data1 and data2 look like they've been
loaded directly from srcin. Do a rotate on the 128-bit value. */
lsl tmp1, to_align, #3 /* Bytes->bits. */
@@ -301,8 +301,8 @@ ENTRY (STRCPY)
orr tmp4, data2, #REP8_7f
#endif
bic has_nul1, tmp1, tmp2
- cbnz has_nul1, .Lfp_le8
+ cbnz has_nul1, L(fp_le8)
bic has_nul2, tmp3, tmp4
- b .Lfp_gt8
+ b L(fp_gt8)
END (STRCPY)
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 8a7d753..2293f73 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -114,7 +114,7 @@ L(main_loop_entry):
sub src, src, 16
L(main_loop):
ldp data1, data2, [src, 32]!
-.Lpage_cross_entry:
+L(page_cross_entry):
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index e29fb7d..fbd08ee 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -46,13 +46,13 @@
nop /* Pad so that the loop below fits a cache line. */
.endr
ENTRY_ALIGN (__strncmp_aarch64, 0)
- cbz limit, .Lret0
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
- b.ne .Lmisaligned8
- cbnz count, .Lmutual_align
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -61,10 +61,10 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
/* Start of performance-critical section -- one 64B cache line. */
-.Lloop_aligned:
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@@ -72,15 +72,15 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
+ b.eq L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
/* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
+ tbz limit_wd, #63, L(not_limit)
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
- b.eq .Lnot_limit
+ b.eq L(not_limit)
lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
@@ -95,7 +95,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
-.Lnot_limit:
+L(not_limit):
orr syndrome, diff, has_nul
#ifndef __AARCH64EB__
@@ -148,7 +148,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
@@ -176,56 +176,56 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
.p2align 6
/* Don't bother with dwords for up to 16 bytes. */
-.Lmisaligned8:
+L(misaligned8):
cmp limit, #16
- b.hs .Ltry_misaligned_words
+ b.hs L(try_misaligned_words)
-.Lbyte_loop:
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
-.Ldone:
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
-.Ltry_misaligned_words:
+L(try_misaligned_words):
lsr limit_wd, limit, #3
- cbz count, .Ldo_misaligned
+ cbz count, L(do_misaligned)
neg count, count
and count, count, #7
sub limit, limit, count
lsr limit_wd, limit, #3
-.Lpage_end_loop:
+L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne .Ldone
+ b.ne L(done)
subs count, count, #1
- b.hi .Lpage_end_loop
+ b.hi L(page_end_loop)
-.Ldo_misaligned:
+L(do_misaligned):
/* Prepare ourselves for the next page crossing. Unlike the aligned
loop, we fetch 1 less dword because we risk crossing bounds on
SRC2. */
mov count, #8
subs limit_wd, limit_wd, #1
- b.lo .Ldone_loop
-.Lloop_misaligned:
+ b.lo L(done_loop)
+L(loop_misaligned):
and tmp2, src2, #0xff8
eor tmp2, tmp2, #0xff8
- cbz tmp2, .Lpage_end_loop
+ cbz tmp2, L(page_end_loop)
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -234,14 +234,14 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
subs limit_wd, limit_wd, #1
- b.pl .Lloop_misaligned
+ b.pl L(loop_misaligned)
-.Ldone_loop:
+L(done_loop):
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
- cbz limit, .Lnot_limit
+ cbz limit, L(not_limit)
/* Read the last word. */
sub src1, src1, 8
sub src2, src2, 8
@@ -252,9 +252,9 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
-.Lret0:
+L(ret0):
mov result, #0
ret
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index bf72686..df66b60 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -38,22 +38,22 @@
.text
.p2align 6
-.Lstart:
+L(start):
/* Pre-pad to ensure critical loop begins an icache line. */
.rep 7
nop
.endr
/* Put this code here to avoid wasting more space with pre-padding. */
-.Lhit_limit:
+L(hit_limit):
mov len, limit
ret
ENTRY_ALIGN (__strnlen_aarch64, 0)
- cbz limit, .Lhit_limit
+ cbz limit, L(hit_limit)
mov zeroones, #REP8_01
bic src, srcin, #15
ands tmp1, srcin, #15
- b.ne .Lmisaligned
+ b.ne L(misaligned)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
@@ -67,9 +67,9 @@ ENTRY_ALIGN (__strnlen_aarch64, 0)
cycle, as we get much better parallelism out of the operations. */
/* Start of critical section -- keep to one 64Byte cache line. */
-.Lloop:
+L(loop):
ldp data1, data2, [src], #16
-.Lrealigned:
+L(realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
@@ -79,24 +79,24 @@ ENTRY_ALIGN (__strnlen_aarch64, 0)
subs limit_wd, limit_wd, #1
orr tmp1, has_nul1, has_nul2
ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq .Lloop
+ b.eq L(loop)
/* End of critical section -- keep to one 64Byte cache line. */
orr tmp1, has_nul1, has_nul2
- cbz tmp1, .Lhit_limit /* No null in final Qword. */
+ cbz tmp1, L(hit_limit) /* No null in final Qword. */
/* We know there's a null in the final Qword. The easiest thing
to do now is work out the length of the string and return
MIN (len, limit). */
sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
+ cbz has_nul1, L(nul_in_data2)
#ifdef __AARCH64EB__
mov data2, data1
#endif
sub len, len, #8
mov has_nul2, has_nul1
-.Lnul_in_data2:
+L(nul_in_data2):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
@@ -115,7 +115,7 @@ ENTRY_ALIGN (__strnlen_aarch64, 0)
csel len, len, limit, ls /* Return the lower value. */
ret
-.Lmisaligned:
+L(misaligned):
/* Deal with a partial first word.
We're doing two things in parallel here;
1) Calculate the number of words (but avoiding overflow if
@@ -150,6 +150,6 @@ ENTRY_ALIGN (__strnlen_aarch64, 0)
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
- b .Lrealigned
+ b L(realigned)
END (__strnlen_aarch64)
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index e8f5843..aab78a2 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -124,11 +124,11 @@ ENTRY (__memcpy_arm)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
- bge .Lcpy_not_short
+ bge L(cpy_not_short)
/* Deal with small copies quickly by dropping straight into the
exit block. */
-.Ltail63unaligned:
+L(tail63unaligned):
#ifdef USE_NEON
and tmp1, count, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
@@ -207,13 +207,13 @@ ENTRY (__memcpy_arm)
strbne src, [dst]
bx lr
-.Lcpy_not_short:
+L(cpy_not_short):
/* At least 64 bytes to copy, but don't know the alignment yet. */
str tmp2, [sp, #-FRAME_SIZE]!
and tmp2, src, #7
and tmp1, dst, #7
cmp tmp1, tmp2
- bne .Lcpy_notaligned
+ bne L(cpy_notaligned)
#ifdef USE_VFP
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
@@ -239,12 +239,12 @@ ENTRY (__memcpy_arm)
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
- blt .Ltail63aligned
+ blt L(tail63aligned)
cmp tmp2, #512
- bge .Lcpy_body_long
+ bge L(cpy_body_long)
-.Lcpy_body_medium: /* Count in tmp2. */
+L(cpy_body_medium): /* Count in tmp2. */
#ifdef USE_VFP
1:
vldr d0, [src, #0]
@@ -268,9 +268,9 @@ ENTRY (__memcpy_arm)
add dst, dst, #64
bge 1b
tst tmp2, #0x3f
- beq .Ldone
+ beq L(done)
-.Ltail63aligned: /* Count in tmp2. */
+L(tail63aligned): /* Count in tmp2. */
and tmp1, tmp2, #0x38
add dst, dst, tmp1
add src, src, tmp1
@@ -321,7 +321,7 @@ ENTRY (__memcpy_arm)
add src, src, #8
add dst, dst, #8
-.Ltail63aligned: /* Count in tmp2. */
+L(tail63aligned): /* Count in tmp2. */
/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
we know that the src and dest are 64-bit aligned so we can use
LDRD/STRD to improve efficiency. */
@@ -358,11 +358,11 @@ ENTRY (__memcpy_arm)
strhcs tmp1, [dst], #2
strbne tmp2, [dst]
-.Ldone:
+L(done):
ldr tmp2, [sp], #FRAME_SIZE
bx lr
-.Lcpy_body_long: /* Count in tmp2. */
+L(cpy_body_long): /* Count in tmp2. */
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
@@ -419,7 +419,7 @@ ENTRY (__memcpy_arm)
vstr d2, [dst, #64 + 56]
add dst, dst, #128
add tmp2, tmp2, #prefetch_lines * 64
- b .Lcpy_body_medium
+ b L(cpy_body_medium)
#else
/* Long copy. Use an SMS style loop to maximize the I/O
bandwidth of the core. We don't have enough spare registers
@@ -473,12 +473,12 @@ ENTRY (__memcpy_arm)
ldrd D_l, D_h, [sp, #24]
add dst, dst, #72
tst tmp2, #0x3f
- bne .Ltail63aligned
+ bne L(tail63aligned)
ldr tmp2, [sp], #FRAME_SIZE
bx lr
#endif
-.Lcpy_notaligned:
+L(cpy_notaligned):
pld [src]
pld [src, #64]
/* There's at least 64 bytes to copy, but there is no mutual
@@ -500,7 +500,7 @@ ENTRY (__memcpy_arm)
pld [src, #(3 * 64)]
subs count, count, #64
ldrmi tmp2, [sp], #FRAME_SIZE
- bmi .Ltail63unaligned
+ bmi L(tail63unaligned)
pld [src, #(4 * 64)]
#ifdef USE_NEON
@@ -581,7 +581,7 @@ ENTRY (__memcpy_arm)
ands count, tmp2, #0x3f
#endif
ldr tmp2, [sp], #FRAME_SIZE
- bne .Ltail63unaligned
+ bne L(tail63unaligned)
bx lr
END (__memcpy_arm)
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index b46bc6d..295db8b 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -125,9 +125,9 @@
.text
.p2align 5
-.Lstrcmp_start_addr:
+L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
-.Lfastpath_exit:
+L(fastpath_exit):
sub r0, r2, r3
bx lr
nop
@@ -139,7 +139,7 @@ ENTRY_ALIGN (__strcmp_arm, 0)
cmp r2, #1
it cs
cmpcs r2, r3
- bne .Lfastpath_exit
+ bne L(fastpath_exit)
#endif
strd r4, r5, [sp, #-16]!
.cfi_def_cfa_offset 16
@@ -151,12 +151,12 @@ ENTRY_ALIGN (__strcmp_arm, 0)
.cfi_offset 7, -4
mvn const_m1, #0
lsl r2, tmp1, #29
- cbz r2, .Lloop_aligned8
+ cbz r2, L(loop_aligned8)
-.Lnot_aligned:
+L(not_aligned):
eor tmp1, src1, src2
tst tmp1, #7
- bne .Lmisaligned8
+ bne L(misaligned8)
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
@@ -173,29 +173,29 @@ ENTRY_ALIGN (__strcmp_arm, 0)
S2HI tmp1, const_m1, tmp2
orn data1a, data1a, tmp1
orn data2a, data2a, tmp1
- beq .Lstart_realigned8
+ beq L(start_realigned8)
orn data1b, data1b, tmp1
mov data1a, const_m1
orn data2b, data2b, tmp1
mov data2a, const_m1
- b .Lstart_realigned8
+ b L(start_realigned8)
/* Unwind the inner loop by a factor of 2, giving 16 bytes per
pass. */
.p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
.p2align 2 /* Always word aligned. */
-.Lloop_aligned8:
+L(loop_aligned8):
ldrd data1a, data1b, [src1], #16
ldrd data2a, data2b, [src2], #16
-.Lstart_realigned8:
+L(start_realigned8):
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
- cbnz syndrome_a, .Ldiff_in_a
+ cbnz syndrome_a, L(diff_in_a)
uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
eor syndrome_b, data1b, data2b
sel syndrome_b, syndrome_b, const_m1
- cbnz syndrome_b, .Ldiff_in_b
+ cbnz syndrome_b, L(diff_in_b)
ldrd data1a, data1b, [src1, #-8]
ldrd data2a, data2b, [src2, #-8]
@@ -207,47 +207,47 @@ ENTRY_ALIGN (__strcmp_arm, 0)
sel syndrome_b, syndrome_b, const_m1
/* Can't use CBZ for backwards branch. */
orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
- beq .Lloop_aligned8
+ beq L(loop_aligned8)
-.Ldiff_found:
- cbnz syndrome_a, .Ldiff_in_a
+L(diff_found):
+ cbnz syndrome_a, L(diff_in_a)
-.Ldiff_in_b:
+L(diff_in_b):
strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
-.Ldiff_in_a:
+L(diff_in_a):
.cfi_restore_state
strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
.cfi_restore_state
-.Lmisaligned8:
+L(misaligned8):
tst tmp1, #3
- bne .Lmisaligned4
+ bne L(misaligned4)
ands tmp1, src1, #3
- bne .Lmutual_align4
+ bne L(mutual_align4)
/* Unrolled by a factor of 2, to reduce the number of post-increment
operations. */
-.Lloop_aligned4:
+L(loop_aligned4):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned4:
+L(start_realigned4):
uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
- cbnz syndrome, .Laligned4_done
+ cbnz syndrome, L(aligned4_done)
ldr data1, [src1, #-4]
ldr data2, [src2, #-4]
uadd8 syndrome, data1, const_m1
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cmp syndrome, #0
- beq .Lloop_aligned4
+ beq L(loop_aligned4)
-.Laligned4_done:
+L(aligned4_done):
strcmp_epilogue_aligned syndrome, data1, data2, 0
-.Lmutual_align4:
+L(mutual_align4):
.cfi_restore_state
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
@@ -262,57 +262,57 @@ ENTRY_ALIGN (__strcmp_arm, 0)
S2HI tmp1, const_m1, tmp1
orn data1, data1, tmp1
orn data2, data2, tmp1
- b .Lstart_realigned4
+ b L(start_realigned4)
-.Lmisaligned4:
+L(misaligned4):
ands tmp1, src1, #3
- beq .Lsrc1_aligned
+ beq L(src1_aligned)
sub src2, src2, tmp1
bic src1, src1, #3
lsls tmp1, tmp1, #31
ldr data1, [src1], #4
- beq .Laligned_m2
- bcs .Laligned_m1
+ beq L(aligned_m2)
+ bcs L(aligned_m1)
#if STRCMP_NO_PRECHECK == 1
ldrb data2, [src2, #1]
uxtb tmp1, data1, ror #BYTE1_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
-.Laligned_m2:
+L(aligned_m2):
ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
-.Laligned_m1:
+L(aligned_m1):
ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
+ bne L(misaligned_exit)
add src2, src2, #4
- cbnz data2, .Lsrc1_aligned
+ cbnz data2, L(src1_aligned)
#else /* STRCMP_NO_PRECHECK */
/* If we've done the pre-check, then we don't need to check the
first byte again here. */
ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
-.Laligned_m2:
+L(aligned_m2):
ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbnz data2, .Laligned_m1
+ bne L(misaligned_exit)
+ cbnz data2, L(aligned_m1)
#endif
-.Lmisaligned_exit:
+L(misaligned_exit):
.cfi_remember_state
mov result, tmp1
ldr r4, [sp], #16
@@ -320,10 +320,10 @@ ENTRY_ALIGN (__strcmp_arm, 0)
bx lr
#if STRCMP_NO_PRECHECK == 0
-.Laligned_m1:
+L(aligned_m1):
add src2, src2, #4
#endif
-.Lsrc1_aligned:
+L(src1_aligned):
.cfi_restore_state
/* src1 is word aligned, but src2 has no common alignment
with it. */
@@ -332,11 +332,11 @@ ENTRY_ALIGN (__strcmp_arm, 0)
bic src2, src2, #3
ldr data2, [src2], #4
- bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
- bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
+ bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */
/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
-.Loverlap3:
+L(overlap3):
bic tmp1, data1, #MSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #8
@@ -348,14 +348,14 @@ ENTRY_ALIGN (__strcmp_arm, 0)
cmp tmp1, data2, S2HI #24
bne 6f
ldr data1, [src1], #4
- b .Loverlap3
+ b L(overlap3)
4:
S2LO data2, data2, #8
- b .Lstrcmp_tail
+ b L(strcmp_tail)
5:
bics syndrome, syndrome, #MSB
- bne .Lstrcmp_done_equal
+ bne L(strcmp_done_equal)
/* We can only get here if the MSB of data1 contains 0, so
fast-path the exit. */
@@ -374,10 +374,10 @@ ENTRY_ALIGN (__strcmp_arm, 0)
.cfi_restore_state
S2LO data1, data1, #24
and data2, data2, #LSB
- b .Lstrcmp_tail
+ b L(strcmp_tail)
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
-.Loverlap2:
+L(overlap2):
and tmp1, data1, const_m1, S2LO #16
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #16
@@ -389,28 +389,28 @@ ENTRY_ALIGN (__strcmp_arm, 0)
cmp tmp1, data2, S2HI #16
bne 6f
ldr data1, [src1], #4
- b .Loverlap2
+ b L(overlap2)
4:
S2LO data2, data2, #16
- b .Lstrcmp_tail
+ b L(strcmp_tail)
5:
ands syndrome, syndrome, const_m1, S2LO #16
- bne .Lstrcmp_done_equal
+ bne L(strcmp_done_equal)
ldrh data2, [src2]
S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
lsl data2, data2, #16
#endif
- b .Lstrcmp_tail
+ b L(strcmp_tail)
6:
S2LO data1, data1, #16
and data2, data2, const_m1, S2LO #16
- b .Lstrcmp_tail
+ b L(strcmp_tail)
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
-.Loverlap1:
+L(overlap1):
and tmp1, data1, #LSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #24
@@ -422,20 +422,20 @@ ENTRY_ALIGN (__strcmp_arm, 0)
cmp tmp1, data2, S2HI #8
bne 6f
ldr data1, [src1], #4
- b .Loverlap1
+ b L(overlap1)
4:
S2LO data2, data2, #24
- b .Lstrcmp_tail
+ b L(strcmp_tail)
5:
tst syndrome, #LSB
- bne .Lstrcmp_done_equal
+ bne L(strcmp_done_equal)
ldr data2, [src2]
6:
S2LO data1, data1, #8
bic data2, data2, #MSB
- b .Lstrcmp_tail
+ b L(strcmp_tail)
-.Lstrcmp_done_equal:
+L(strcmp_done_equal):
mov result, #0
.cfi_remember_state
ldrd r4, r5, [sp], #16
@@ -446,7 +446,7 @@ ENTRY_ALIGN (__strcmp_arm, 0)
.cfi_restore 7
bx lr
-.Lstrcmp_tail:
+L(strcmp_tail):
.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
rev data1, data1
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 7245440..76e6930 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -45,20 +45,20 @@ ENTRY (__strlen_armv6t2)
mvn const_m1, #0
ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
pld [src, #32]
- bne.w .Lmisaligned8
+ bne.w L(misaligned8)
mov const_0, #0
mov result, #-8
-.Lloop_aligned:
+L(loop_aligned):
/* Bytes 0-7. */
ldrd data1a, data1b, [src]
pld [src, #64]
add result, result, #8
-.Lstart_realigned:
+L(start_realigned):
uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
- cbnz data1b, .Lnull_found
+ cbnz data1b, L(null_found)
/* Bytes 8-15. */
ldrd data1a, data1b, [src, #8]
@@ -67,7 +67,7 @@ ENTRY (__strlen_armv6t2)
sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
- cbnz data1b, .Lnull_found
+ cbnz data1b, L(null_found)
/* Bytes 16-23. */
ldrd data1a, data1b, [src, #16]
@@ -76,7 +76,7 @@ ENTRY (__strlen_armv6t2)
sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
- cbnz data1b, .Lnull_found
+ cbnz data1b, L(null_found)
/* Bytes 24-31. */
ldrd data1a, data1b, [src, #24]
@@ -87,9 +87,9 @@ ENTRY (__strlen_armv6t2)
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
cmp data1b, #0
- beq .Lloop_aligned
+ beq L(loop_aligned)
-.Lnull_found:
+L(null_found):
cmp data1a, #0
itt eq
addeq result, result, #4
@@ -102,7 +102,7 @@ ENTRY (__strlen_armv6t2)
add result, result, data1a, lsr #3 /* Bits -> Bytes. */
bx lr
-.Lmisaligned8:
+L(misaligned8):
ldrd data1a, data1b, [src]
and tmp2, tmp1, #3
rsb result, tmp1, #0
@@ -115,6 +115,6 @@ ENTRY (__strlen_armv6t2)
ornne data1b, data1b, tmp2
movne data1a, const_m1
mov const_0, #0
- b .Lstart_realigned
+ b L(start_realigned)
END (__strlen_armv6t2)