author      Wilco Dijkstra <wdijkstr@arm.com>        2020-01-02 13:36:34 +0000
committer   Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-01-02 13:36:34 +0000
commit      833e86096b1c38218670459a4c11bf9c790a96d0 (patch)
tree        88e48be64786d4edf2d75416d6cb8f0c8002d29e
parent      31b560bc3b82ae45044e6455493ce6783aa94d98 (diff)
string: Use L(name) for labels
Use L(name) for all assembler labels.
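For readers unfamiliar with the convention: the .S sources are run through the C preprocessor, so L() can be an ordinary macro that pastes the .L local-label prefix onto the name. Below is a minimal sketch of the idea under that assumption; the real definition lives in the project's shared assembler header (the same one that provides the ENTRY/END macros used throughout these files), and the routine shown is a made-up example, not code from this patch.

    /* Assumed definition: paste the .L prefix onto the label name. */
    #define L(name) .L ## name

    ENTRY (example_func)            /* hypothetical routine, for illustration only */
            cbz     x1, L(done)     /* was: cbz x1, .Ldone */
    L(loop):                        /* expands to .Lloop: */
            subs    x1, x1, #1
            b.ne    L(loop)         /* was: b.ne .Lloop */
    L(done):
            ret
    END (example_func)

Because L(name) expands to exactly the old .Lname spelling, the labels stay local (GNU as omits .L-prefixed labels from the object file's symbol table) and the generated code is unchanged; only the source spelling becomes uniform across the aarch64 and arm files.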
-rw-r--r--   string/aarch64/memchr.S       |  24
-rw-r--r--   string/aarch64/strchr.S       |  10
-rw-r--r--   string/aarch64/strchrnul.S    |  10
-rw-r--r--   string/aarch64/strcpy.S       |  40
-rw-r--r--   string/aarch64/strlen.S       |   2
-rw-r--r--   string/aarch64/strncmp.S      |  62
-rw-r--r--   string/aarch64/strnlen.S      |  24
-rw-r--r--   string/arm/memcpy.S           |  34
-rw-r--r--   string/arm/strcmp.S           | 134
-rw-r--r--   string/arm/strlen-armv6t2.S   |  20
10 files changed, 180 insertions(+), 180 deletions(-)
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 6ffade1..10be49e 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -48,7 +48,7 @@ ENTRY (__memchr_aarch64) /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. @@ -61,7 +61,7 @@ ENTRY (__memchr_aarch64) dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome @@ -84,25 +84,25 @@ ENTRY (__memchr_aarch64) lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b @@ -110,9 +110,9 @@ ENTRY (__memchr_aarch64) addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hi L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -121,7 +121,7 @@ ENTRY (__memchr_aarch64) lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -136,7 +136,7 @@ ENTRY (__memchr_aarch64) csel result, xzr, result, eq ret -.Lzero_length: +L(zero_length): mov result, #0 ret diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 66a1fdd..00d9be3 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -61,7 +61,7 @@ ENTRY (__strchr_aarch64) dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -87,9 +87,9 @@ ENTRY (__strchr_aarch64) mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -101,7 +101,7 @@ ENTRY (__strchr_aarch64) orr vend1.16b, vend1.16b, vend2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -115,7 +115,7 @@ ENTRY (__strchr_aarch64) addp vend1.16b, vend1.16b, vend2.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. 
*/ diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 697dbf4..81264ea 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -55,7 +55,7 @@ ENTRY (__strchrnul_aarch64) bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -79,9 +79,9 @@ ENTRY (__strchrnul_aarch64) mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -93,7 +93,7 @@ ENTRY (__strchrnul_aarch64) orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -103,7 +103,7 @@ ENTRY (__strchrnul_aarch64) addp vend1.16b, vend1.16b, vend1.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 766e71b..4edffcf 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -99,9 +99,9 @@ ENTRY (STRCPY) srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte aligned string will never fail the page align check, so will always take the fast path. */ - b.gt .Lpage_cross + b.gt L(page_cross) -.Lpage_cross_ok: +L(page_cross_ok): ldp data1, data2, [srcin] #ifdef __AARCH64EB__ /* Because we expect the end to be found within 16 characters @@ -113,7 +113,7 @@ ENTRY (STRCPY) sub tmp1, tmp2, zeroones orr tmp2, tmp2, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) rev tmp4, data2 sub tmp3, tmp4, zeroones orr tmp4, tmp4, #REP8_7f @@ -121,17 +121,17 @@ ENTRY (STRCPY) sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) sub tmp3, data2, zeroones orr tmp4, data2, #REP8_7f #endif bics has_nul2, tmp3, tmp4 - b.eq .Lbulk_entry + b.eq L(bulk_entry) /* The string is short (<=16 bytes). We don't know exactly how short though, yet. Work out the exact length so that we can quickly select the optimal copy strategy. */ -.Lfp_gt8: +L(fp_gt8): rev has_nul2, has_nul2 clz pos, has_nul2 mov tmp2, #56 @@ -149,12 +149,12 @@ ENTRY (STRCPY) #endif ret -.Lfp_le8: +L(fp_le8): rev has_nul1, has_nul1 clz pos, has_nul1 add dst, dstin, pos, lsr #3 /* Bits to bytes. */ subs tmp2, pos, #24 /* Pos in bits. */ - b.lt .Lfp_lt4 + b.lt L(fp_lt4) #ifdef __AARCH64EB__ mov tmp2, #56 sub pos, tmp2, pos @@ -170,15 +170,15 @@ ENTRY (STRCPY) mov dstin, dst #endif ret -.Lfp_lt4: - cbz pos, .Lfp_lt2 +L(fp_lt4): + cbz pos, L(fp_lt2) /* 2->3 bytes to copy. */ #ifdef __AARCH64EB__ lsr data1, data1, #48 #endif strh data1w, [dstin] /* Fall-through, one byte (max) to go. */ -.Lfp_lt2: +L(fp_lt2): /* Null-terminated string. Last character must be zero! */ strb wzr, [dst] #ifdef BUILD_STPCPY @@ -189,20 +189,20 @@ ENTRY (STRCPY) .p2align 6 /* Aligning here ensures that the entry code and main loop all lies within one 64-byte cache line. 
*/ -.Lbulk_entry: +L(bulk_entry): sub to_align, to_align, #16 stp data1, data2, [dstin] sub src, srcin, to_align sub dst, dstin, to_align - b .Lentry_no_page_cross + b L(entry_no_page_cross) /* The inner loop deals with two Dwords at a time. This has a slightly higher start-up cost, but we should win quite quickly, especially on cores with a high number of issue slots per cycle, as we get much better parallelism out of the operations. */ -.Lmain_loop: +L(main_loop): stp data1, data2, [dst], #16 -.Lentry_no_page_cross: +L(entry_no_page_cross): ldp data1, data2, [src], #16 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -211,7 +211,7 @@ ENTRY (STRCPY) bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lmain_loop + b.eq L(main_loop) /* Since we know we are copying at least 16 bytes, the fastest way to deal with the tail is to determine the location of the @@ -244,7 +244,7 @@ ENTRY (STRCPY) #endif ret -.Lpage_cross: +L(page_cross): bic src, srcin, #15 /* Start by loading two words at [srcin & ~15], then forcing the bytes that precede srcin to 0xff. This means they never look @@ -270,7 +270,7 @@ ENTRY (STRCPY) bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lpage_cross_ok + b.eq L(page_cross_ok) /* We now need to make data1 and data2 look like they've been loaded directly from srcin. Do a rotate on the 128-bit value. */ lsl tmp1, to_align, #3 /* Bytes->bits. */ @@ -301,8 +301,8 @@ ENTRY (STRCPY) orr tmp4, data2, #REP8_7f #endif bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lfp_le8 + cbnz has_nul1, L(fp_le8) bic has_nul2, tmp3, tmp4 - b .Lfp_gt8 + b L(fp_gt8) END (STRCPY) diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 8a7d753..2293f73 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -114,7 +114,7 @@ L(main_loop_entry): sub src, src, 16 L(main_loop): ldp data1, data2, [src, 32]! -.Lpage_cross_entry: +L(page_cross_entry): sub tmp1, data1, zeroones sub tmp3, data2, zeroones orr tmp2, tmp1, tmp3 diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index e29fb7d..fbd08ee 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -46,13 +46,13 @@ nop /* Pad so that the loop below fits a cache line. */ .endr ENTRY_ALIGN (__strncmp_aarch64, 0) - cbz limit, .Lret0 + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 - b.ne .Lmisaligned8 - cbnz count, .Lmutual_align + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -61,10 +61,10 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -72,15 +72,15 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned + b.eq L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ /* Not reached the limit, must have found the end or a diff. 
*/ - tbz limit_wd, #63, .Lnot_limit + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 @@ -95,7 +95,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) /* Make sure that the NUL byte is marked in the syndrome. */ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul #ifndef __AARCH64EB__ @@ -148,7 +148,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) ret #endif -.Lmutual_align: +L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. @@ -176,56 +176,56 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) .p2align 6 /* Don't bother with dwords for up to 16 bytes. */ -.Lmisaligned8: +L(misaligned8): cmp limit, #16 - b.hs .Ltry_misaligned_words + b.hs L(try_misaligned_words) -.Lbyte_loop: +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lbyte_loop -.Ldone: + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ -.Ltry_misaligned_words: +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz count, .Ldo_misaligned + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count lsr limit_wd, limit, #3 -.Lpage_end_loop: +L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne .Ldone + b.ne L(done) subs count, count, #1 - b.hi .Lpage_end_loop + b.hi L(page_end_loop) -.Ldo_misaligned: +L(do_misaligned): /* Prepare ourselves for the next page crossing. Unlike the aligned loop, we fetch 1 less dword because we risk crossing bounds on SRC2. */ mov count, #8 subs limit_wd, limit_wd, #1 - b.lo .Ldone_loop -.Lloop_misaligned: + b.lo L(done_loop) +L(loop_misaligned): and tmp2, src2, #0xff8 eor tmp2, tmp2, #0xff8 - cbz tmp2, .Lpage_end_loop + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -234,14 +234,14 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) subs limit_wd, limit_wd, #1 - b.pl .Lloop_misaligned + b.pl L(loop_misaligned) -.Ldone_loop: +L(done_loop): /* We found a difference or a NULL before the limit was reached. */ and limit, limit, #7 - cbz limit, .Lnot_limit + cbz limit, L(not_limit) /* Read the last word. */ sub src1, src1, 8 sub src2, src2, 8 @@ -252,9 +252,9 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index bf72686..df66b60 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -38,22 +38,22 @@ .text .p2align 6 -.Lstart: +L(start): /* Pre-pad to ensure critical loop begins an icache line. 
*/ .rep 7 nop .endr /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: +L(hit_limit): mov len, limit ret ENTRY_ALIGN (__strnlen_aarch64, 0) - cbz limit, .Lhit_limit + cbz limit, L(hit_limit) mov zeroones, #REP8_01 bic src, srcin, #15 ands tmp1, srcin, #15 - b.ne .Lmisaligned + b.ne L(misaligned) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ @@ -67,9 +67,9 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) cycle, as we get much better parallelism out of the operations. */ /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: +L(loop): ldp data1, data2, [src], #16 -.Lrealigned: +L(realigned): sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f sub tmp3, data2, zeroones @@ -79,24 +79,24 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) subs limit_wd, limit_wd, #1 orr tmp1, has_nul1, has_nul2 ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop + b.eq L(loop) /* End of critical section -- keep to one 64Byte cache line. */ orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ + cbz tmp1, L(hit_limit) /* No null in final Qword. */ /* We know there's a null in the final Qword. The easiest thing to do now is work out the length of the string and return MIN (len, limit). */ sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 + cbz has_nul1, L(nul_in_data2) #ifdef __AARCH64EB__ mov data2, data1 #endif sub len, len, #8 mov has_nul2, has_nul1 -.Lnul_in_data2: +L(nul_in_data2): #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul directly. The @@ -115,7 +115,7 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) csel len, len, limit, ls /* Return the lower value. */ ret -.Lmisaligned: +L(misaligned): /* Deal with a partial first word. We're doing two things in parallel here; 1) Calculate the number of words (but avoiding overflow if @@ -150,6 +150,6 @@ ENTRY_ALIGN (__strnlen_aarch64, 0) csinv data1, data1, xzr, le csel data2, data2, data2a, le - b .Lrealigned + b L(realigned) END (__strnlen_aarch64) diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index e8f5843..aab78a2 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -124,11 +124,11 @@ ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. */ cmp count, #64 - bge .Lcpy_not_short + bge L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ -.Ltail63unaligned: +L(tail63unaligned): #ifdef USE_NEON and tmp1, count, #0x38 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) @@ -207,13 +207,13 @@ ENTRY (__memcpy_arm) strbne src, [dst] bx lr -.Lcpy_not_short: +L(cpy_not_short): /* At least 64 bytes to copy, but don't know the alignment yet. */ str tmp2, [sp, #-FRAME_SIZE]! and tmp2, src, #7 and tmp1, dst, #7 cmp tmp1, tmp2 - bne .Lcpy_notaligned + bne L(cpy_notaligned) #ifdef USE_VFP /* Magic dust alert! Force VFP on Cortex-A9. Experiments show @@ -239,12 +239,12 @@ ENTRY (__memcpy_arm) 1: subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned + blt L(tail63aligned) cmp tmp2, #512 - bge .Lcpy_body_long + bge L(cpy_body_long) -.Lcpy_body_medium: /* Count in tmp2. */ +L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP 1: vldr d0, [src, #0] @@ -268,9 +268,9 @@ ENTRY (__memcpy_arm) add dst, dst, #64 bge 1b tst tmp2, #0x3f - beq .Ldone + beq L(done) -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. 
*/ and tmp1, tmp2, #0x38 add dst, dst, tmp1 add src, src, tmp1 @@ -321,7 +321,7 @@ ENTRY (__memcpy_arm) add src, src, #8 add dst, dst, #8 -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. */ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but we know that the src and dest are 64-bit aligned so we can use LDRD/STRD to improve efficiency. */ @@ -358,11 +358,11 @@ ENTRY (__memcpy_arm) strhcs tmp1, [dst], #2 strbne tmp2, [dst] -.Ldone: +L(done): ldr tmp2, [sp], #FRAME_SIZE bx lr -.Lcpy_body_long: /* Count in tmp2. */ +L(cpy_body_long): /* Count in tmp2. */ /* Long copy. We know that there's at least (prefetch_lines * 64) bytes to go. */ @@ -419,7 +419,7 @@ ENTRY (__memcpy_arm) vstr d2, [dst, #64 + 56] add dst, dst, #128 add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium + b L(cpy_body_medium) #else /* Long copy. Use an SMS style loop to maximize the I/O bandwidth of the core. We don't have enough spare registers @@ -473,12 +473,12 @@ ENTRY (__memcpy_arm) ldrd D_l, D_h, [sp, #24] add dst, dst, #72 tst tmp2, #0x3f - bne .Ltail63aligned + bne L(tail63aligned) ldr tmp2, [sp], #FRAME_SIZE bx lr #endif -.Lcpy_notaligned: +L(cpy_notaligned): pld [src] pld [src, #64] /* There's at least 64 bytes to copy, but there is no mutual @@ -500,7 +500,7 @@ ENTRY (__memcpy_arm) pld [src, #(3 * 64)] subs count, count, #64 ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned + bmi L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON @@ -581,7 +581,7 @@ ENTRY (__memcpy_arm) ands count, tmp2, #0x3f #endif ldr tmp2, [sp], #FRAME_SIZE - bne .Ltail63unaligned + bne L(tail63unaligned) bx lr END (__memcpy_arm) diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index b46bc6d..295db8b 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -125,9 +125,9 @@ .text .p2align 5 -.Lstrcmp_start_addr: +L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 -.Lfastpath_exit: +L(fastpath_exit): sub r0, r2, r3 bx lr nop @@ -139,7 +139,7 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp r2, #1 it cs cmpcs r2, r3 - bne .Lfastpath_exit + bne L(fastpath_exit) #endif strd r4, r5, [sp, #-16]! .cfi_def_cfa_offset 16 @@ -151,12 +151,12 @@ ENTRY_ALIGN (__strcmp_arm, 0) .cfi_offset 7, -4 mvn const_m1, #0 lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 + cbz r2, L(loop_aligned8) -.Lnot_aligned: +L(not_aligned): eor tmp1, src1, src2 tst tmp1, #7 - bne .Lmisaligned8 + bne L(misaligned8) /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ @@ -173,29 +173,29 @@ ENTRY_ALIGN (__strcmp_arm, 0) S2HI tmp1, const_m1, tmp2 orn data1a, data1a, tmp1 orn data2a, data2a, tmp1 - beq .Lstart_realigned8 + beq L(start_realigned8) orn data1b, data1b, tmp1 mov data1a, const_m1 orn data2b, data2b, tmp1 mov data2a, const_m1 - b .Lstart_realigned8 + b L(start_realigned8) /* Unwind the inner loop by a factor of 2, giving 16 bytes per pass. */ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: +L(loop_aligned8): ldrd data1a, data1b, [src1], #16 ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: +L(start_realigned8): uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a + cbnz syndrome_a, L(diff_in_a) uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. 
*/ eor syndrome_b, data1b, data2b sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b + cbnz syndrome_b, L(diff_in_b) ldrd data1a, data1b, [src1, #-8] ldrd data2a, data2b, [src2, #-8] @@ -207,47 +207,47 @@ ENTRY_ALIGN (__strcmp_arm, 0) sel syndrome_b, syndrome_b, const_m1 /* Can't use CBZ for backwards branch. */ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 + beq L(loop_aligned8) -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a +L(diff_found): + cbnz syndrome_a, L(diff_in_a) -.Ldiff_in_b: +L(diff_in_b): strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 -.Ldiff_in_a: +L(diff_in_a): .cfi_restore_state strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 .cfi_restore_state -.Lmisaligned8: +L(misaligned8): tst tmp1, #3 - bne .Lmisaligned4 + bne L(misaligned4) ands tmp1, src1, #3 - bne .Lmutual_align4 + bne L(mutual_align4) /* Unrolled by a factor of 2, to reduce the number of post-increment operations. */ -.Lloop_aligned4: +L(loop_aligned4): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned4: +L(start_realigned4): uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done + cbnz syndrome, L(aligned4_done) ldr data1, [src1, #-4] ldr data2, [src2, #-4] uadd8 syndrome, data1, const_m1 eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cmp syndrome, #0 - beq .Lloop_aligned4 + beq L(loop_aligned4) -.Laligned4_done: +L(aligned4_done): strcmp_epilogue_aligned syndrome, data1, data2, 0 -.Lmutual_align4: +L(mutual_align4): .cfi_restore_state /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ @@ -262,57 +262,57 @@ ENTRY_ALIGN (__strcmp_arm, 0) S2HI tmp1, const_m1, tmp1 orn data1, data1, tmp1 orn data2, data2, tmp1 - b .Lstart_realigned4 + b L(start_realigned4) -.Lmisaligned4: +L(misaligned4): ands tmp1, src1, #3 - beq .Lsrc1_aligned + beq L(src1_aligned) sub src2, src2, tmp1 bic src1, src1, #3 lsls tmp1, tmp1, #31 ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 + beq L(aligned_m2) + bcs L(aligned_m1) #if STRCMP_NO_PRECHECK == 1 ldrb data2, [src2, #1] uxtb tmp1, data1, ror #BYTE1_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m1: +L(aligned_m1): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit + bne L(misaligned_exit) add src2, src2, #4 - cbnz data2, .Lsrc1_aligned + cbnz data2, L(src1_aligned) #else /* STRCMP_NO_PRECHECK */ /* If we've done the pre-check, then we don't need to check the first byte again here. 
*/ ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 + bne L(misaligned_exit) + cbnz data2, L(aligned_m1) #endif -.Lmisaligned_exit: +L(misaligned_exit): .cfi_remember_state mov result, tmp1 ldr r4, [sp], #16 @@ -320,10 +320,10 @@ ENTRY_ALIGN (__strcmp_arm, 0) bx lr #if STRCMP_NO_PRECHECK == 0 -.Laligned_m1: +L(aligned_m1): add src2, src2, #4 #endif -.Lsrc1_aligned: +L(src1_aligned): .cfi_restore_state /* src1 is word aligned, but src2 has no common alignment with it. */ @@ -332,11 +332,11 @@ ENTRY_ALIGN (__strcmp_arm, 0) bic src2, src2, #3 ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: +L(overlap3): bic tmp1, data1, #MSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #8 @@ -348,14 +348,14 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp tmp1, data2, S2HI #24 bne 6f ldr data1, [src1], #4 - b .Loverlap3 + b L(overlap3) 4: S2LO data2, data2, #8 - b .Lstrcmp_tail + b L(strcmp_tail) 5: bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) /* We can only get here if the MSB of data1 contains 0, so fast-path the exit. */ @@ -374,10 +374,10 @@ ENTRY_ALIGN (__strcmp_arm, 0) .cfi_restore_state S2LO data1, data1, #24 and data2, data2, #LSB - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap2: +L(overlap2): and tmp1, data1, const_m1, S2LO #16 uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #16 @@ -389,28 +389,28 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp tmp1, data2, S2HI #16 bne 6f ldr data1, [src1], #4 - b .Loverlap2 + b L(overlap2) 4: S2LO data2, data2, #16 - b .Lstrcmp_tail + b L(strcmp_tail) 5: ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldrh data2, [src2] S2LO data1, data1, #16 #ifdef __ARM_BIG_ENDIAN lsl data2, data2, #16 #endif - b .Lstrcmp_tail + b L(strcmp_tail) 6: S2LO data1, data1, #16 and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ -.Loverlap1: +L(overlap1): and tmp1, data1, #LSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #24 @@ -422,20 +422,20 @@ ENTRY_ALIGN (__strcmp_arm, 0) cmp tmp1, data2, S2HI #8 bne 6f ldr data1, [src1], #4 - b .Loverlap1 + b L(overlap1) 4: S2LO data2, data2, #24 - b .Lstrcmp_tail + b L(strcmp_tail) 5: tst syndrome, #LSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldr data2, [src2] 6: S2LO data1, data1, #8 bic data2, data2, #MSB - b .Lstrcmp_tail + b L(strcmp_tail) -.Lstrcmp_done_equal: +L(strcmp_done_equal): mov result, #0 .cfi_remember_state ldrd r4, r5, [sp], #16 @@ -446,7 +446,7 @@ ENTRY_ALIGN (__strcmp_arm, 0) .cfi_restore 7 bx lr -.Lstrcmp_tail: +L(strcmp_tail): .cfi_restore_state #ifndef __ARM_BIG_ENDIAN rev data1, data1 diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 7245440..76e6930 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -45,20 +45,20 @@ ENTRY (__strlen_armv6t2) mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ pld [src, #32] - bne.w .Lmisaligned8 + bne.w L(misaligned8) mov const_0, #0 mov result, #-8 -.Lloop_aligned: +L(loop_aligned): /* Bytes 0-7. */ ldrd data1a, data1b, [src] pld [src, #64] add result, result, #8 -.Lstart_realigned: +L(start_realigned): uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 8-15. */ ldrd data1a, data1b, [src, #8] @@ -67,7 +67,7 @@ ENTRY (__strlen_armv6t2) sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 16-23. */ ldrd data1a, data1b, [src, #16] @@ -76,7 +76,7 @@ ENTRY (__strlen_armv6t2) sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 24-31. */ ldrd data1a, data1b, [src, #24] @@ -87,9 +87,9 @@ ENTRY (__strlen_armv6t2) uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cmp data1b, #0 - beq .Lloop_aligned + beq L(loop_aligned) -.Lnull_found: +L(null_found): cmp data1a, #0 itt eq addeq result, result, #4 @@ -102,7 +102,7 @@ ENTRY (__strlen_armv6t2) add result, result, data1a, lsr #3 /* Bits -> Bytes. */ bx lr -.Lmisaligned8: +L(misaligned8): ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 @@ -115,6 +115,6 @@ ENTRY (__strlen_armv6t2) ornne data1b, data1b, tmp2 movne data1a, const_m1 mov const_0, #0 - b .Lstart_realigned + b L(start_realigned) END (__strlen_armv6t2) |