-rwxr-xr-x  Android.bp  88
-rw-r--r--  METADATA  6
-rw-r--r--  Makefile  4
l---------  NOTICE  1
-rw-r--r--  README  6
-rw-r--r--  TEST_MAPPING  3
-rw-r--r--  arm-optimized-routines-tests.xml  26
-rw-r--r--  config.mk.dist  32
-rw-r--r--  math/cosf.c  2
-rw-r--r--  math/erf.c  244
-rw-r--r--  math/erf_data.c  85
-rw-r--r--  math/erff.c  104
-rw-r--r--  math/erff_data.c  22
-rw-r--r--  math/exp.c  2
-rw-r--r--  math/exp2.c  2
-rw-r--r--  math/expf.c  2
-rw-r--r--  math/include/mathlib.h  2
-rw-r--r--  math/log.c  2
-rw-r--r--  math/log2.c  2
-rw-r--r--  math/logf.c  2
-rw-r--r--  math/logf_data.c  2
-rw-r--r--  math/math_config.h  56
-rw-r--r--  math/math_errf.c  16
-rw-r--r--  math/pow.c  2
-rw-r--r--  math/powf.c  2
-rw-r--r--  math/powf_log2_data.c  2
-rw-r--r--  math/sincosf.c  2
-rw-r--r--  math/sincosf_data.c  2
-rw-r--r--  math/sinf.c  2
-rw-r--r--  math/test/mathbench.c  4
-rw-r--r--  math/test/mathtest.c  2
-rw-r--r--  math/test/rtest/dotest.c  2
-rw-r--r--  math/test/rtest/intern.h  2
-rw-r--r--  math/test/rtest/main.c  2
-rw-r--r--  math/test/rtest/random.c  2
-rw-r--r--  math/test/rtest/random.h  2
-rw-r--r--  math/test/rtest/semi.c  2
-rw-r--r--  math/test/rtest/semi.h  2
-rw-r--r--  math/test/rtest/types.h  2
-rw-r--r--  math/test/rtest/wrappers.c  2
-rw-r--r--  math/test/rtest/wrappers.h  2
-rwxr-xr-x  math/test/runulp.sh  23
-rw-r--r--  math/test/testcases/directed/cosf.tst  2
-rw-r--r--  math/test/testcases/directed/erf.tst  17
-rw-r--r--  math/test/testcases/directed/erff.tst  17
-rw-r--r--  math/test/testcases/directed/exp.tst  2
-rw-r--r--  math/test/testcases/directed/exp2.tst  2
-rw-r--r--  math/test/testcases/directed/exp2f.tst  2
-rw-r--r--  math/test/testcases/directed/expf.tst  2
-rw-r--r--  math/test/testcases/directed/log.tst  2
-rw-r--r--  math/test/testcases/directed/log2.tst  2
-rw-r--r--  math/test/testcases/directed/log2f.tst  2
-rw-r--r--  math/test/testcases/directed/logf.tst  2
-rw-r--r--  math/test/testcases/directed/pow.tst  2
-rw-r--r--  math/test/testcases/directed/powf.tst  2
-rw-r--r--  math/test/testcases/directed/sincosf.tst  2
-rw-r--r--  math/test/testcases/directed/sinf.tst  2
-rw-r--r--  math/test/testcases/random/double.tst  2
-rw-r--r--  math/test/testcases/random/float.tst  2
-rw-r--r--  math/test/ulp.c  4
-rwxr-xr-x  math/tools/remez.jl  2
-rw-r--r--  math/v_math.h  2
-rw-r--r--  networking/Dir.mk  76
-rw-r--r--  networking/aarch64/chksum_simd.c  146
-rw-r--r--  networking/arm/chksum_simd.c  149
-rw-r--r--  networking/chksum.c  81
-rw-r--r--  networking/chksum_common.h  132
-rw-r--r--  networking/include/networking.h  14
-rw-r--r--  networking/test/chksum.c  381
-rwxr-xr-x  run-arm-optimized-routines-tests-on-android.sh  12
-rw-r--r--  string/Dir.mk  62
-rw-r--r--  string/aarch64/__mtag_tag_region.S  100
-rw-r--r--  string/aarch64/__mtag_tag_zero_region.S  100
-rw-r--r--  string/aarch64/check-arch.S  13
-rw-r--r--  string/aarch64/memchr-mte.S  116
-rw-r--r--  string/aarch64/memchr-sve.S  22
-rw-r--r--  string/aarch64/memchr.S  7
-rw-r--r--  string/aarch64/memcmp-sve.S  21
-rw-r--r--  string/aarch64/memcmp.S  6
-rw-r--r--  string/aarch64/memcpy-advsimd.S  206
-rw-r--r--  string/aarch64/memcpy.S  136
-rw-r--r--  string/aarch64/memcpy_simd.S  265
-rw-r--r--  string/aarch64/memrchr.S  117
-rw-r--r--  string/aarch64/memset.S  128
-rw-r--r--  string/aarch64/stpcpy-mte.S  10
-rw-r--r--  string/aarch64/stpcpy-sve.S  10
-rw-r--r--  string/aarch64/stpcpy.S  10
-rw-r--r--  string/aarch64/strchr-mte.S  105
-rw-r--r--  string/aarch64/strchr-sve.S  19
-rw-r--r--  string/aarch64/strchr.S  47
-rw-r--r--  string/aarch64/strchrnul-mte.S  84
-rw-r--r--  string/aarch64/strchrnul-sve.S  2
-rw-r--r--  string/aarch64/strchrnul.S  30
-rw-r--r--  string/aarch64/strcmp-mte.S  189
-rw-r--r--  string/aarch64/strcmp-sve.S  22
-rw-r--r--  string/aarch64/strcmp.S  5
-rw-r--r--  string/aarch64/strcpy-mte.S  161
-rw-r--r--  string/aarch64/strcpy-sve.S  20
-rw-r--r--  string/aarch64/strcpy.S  5
-rw-r--r--  string/aarch64/strlen-mte.S  80
-rw-r--r--  string/aarch64/strlen-sve.S  22
-rw-r--r--  string/aarch64/strlen.S  274
-rw-r--r--  string/aarch64/strncmp-mte.S  307
-rw-r--r--  string/aarch64/strncmp-sve.S  21
-rw-r--r--  string/aarch64/strncmp.S  19
-rw-r--r--  string/aarch64/strnlen-sve.S  22
-rw-r--r--  string/aarch64/strnlen.S  221
-rw-r--r--  string/aarch64/strrchr-mte.S  127
-rw-r--r--  string/aarch64/strrchr-sve.S  19
-rw-r--r--  string/aarch64/strrchr.S  22
-rw-r--r--  string/arm/check-arch.S  10
-rw-r--r--  string/arm/memchr.S  3
-rw-r--r--  string/arm/memcpy.S  24
-rw-r--r--  string/arm/memset.S  3
-rw-r--r--  string/arm/strcmp-armv6m.S  6
-rw-r--r--  string/arm/strcmp.S  7
-rw-r--r--  string/arm/strcpy.c  6
-rw-r--r--  string/arm/strlen-armv6t2.S  6
-rw-r--r--  string/asmdefs.h  69
-rw-r--r--  string/bench/memcpy.c  260
-rw-r--r--  string/bench/strlen.c  221
-rw-r--r--  string/include/benchlib.h  33
-rw-r--r--  string/include/stringlib.h  18
-rw-r--r--  string/memchr.S  15
-rw-r--r--  string/memcmp.S  13
-rw-r--r--  string/memcpy.S  15
-rw-r--r--  string/memset.S  12
-rw-r--r--  string/strchr.S  13
-rw-r--r--  string/strchrnul.S  13
-rw-r--r--  string/strcmp.S  19
-rw-r--r--  string/strcpy-c.c  10
-rw-r--r--  string/strcpy.S  13
-rw-r--r--  string/strlen.S  17
-rw-r--r--  string/strncmp.S  13
-rw-r--r--  string/strnlen.S  13
-rw-r--r--  string/strrchr.S  13
-rw-r--r--  string/test/__mtag_tag_region.c  147
-rw-r--r--  string/test/__mtag_tag_zero_region.c  147
-rw-r--r--  string/test/memchr.c  131
-rw-r--r--  string/test/memcmp.c  149
-rw-r--r--  string/test/memcpy.c  146
-rw-r--r--  string/test/memmove.c  234
-rw-r--r--  string/test/memrchr.c  106
-rw-r--r--  string/test/memset.c  162
-rw-r--r--  string/test/mte.h  142
-rw-r--r--  string/test/stpcpy.c  125
-rw-r--r--  string/test/strchr.c  148
-rw-r--r--  string/test/strchrnul.c  151
-rw-r--r--  string/test/strcmp.c  155
-rw-r--r--  string/test/strcpy.c  154
-rw-r--r--  string/test/stringtest.h  55
-rw-r--r--  string/test/strlen.c  119
-rw-r--r--  string/test/strncmp.c  170
-rw-r--r--  string/test/strnlen.c  132
-rw-r--r--  string/test/strrchr.c  148
-rw-r--r--  string/x86_64/check-arch.S  10
156 files changed, 2033 insertions, 6530 deletions
diff --git a/Android.bp b/Android.bp
index ba814eb..ea477a1 100755
--- a/Android.bp
+++ b/Android.bp
@@ -1,20 +1,3 @@
-package {
- default_applicable_licenses: ["external_arm-optimized-routines_license"],
-}
-
-// Added automatically by a large-scale-change
-// See: http://go/android-license-faq
-license {
- name: "external_arm-optimized-routines_license",
- visibility: [":__subpackages__"],
- license_kinds: [
- "SPDX-license-identifier-MIT",
- ],
- license_text: [
- "LICENSE",
- ],
-}
-
cc_defaults {
name: "arm-optimized-routines-defaults",
host_supported: true,
@@ -43,37 +26,12 @@ cc_defaults {
local_include_dirs: ["math/include"],
}
-cc_defaults {
- name: "libarm-optimized-routines-defaults",
+cc_library {
+ name: "libarm-optimized-routines-math",
defaults: ["arm-optimized-routines-defaults"],
ramdisk_available: true,
- vendor_ramdisk_available: true,
recovery_available: true,
native_bridge_supported: true,
- apex_available: [
- "//apex_available:platform",
- "com.android.runtime",
- ],
-
- stl: "none",
- static: {
- system_shared_libs: [],
- },
- header_libs: ["libc_headers"],
-}
-
-cc_library_static {
- name: "libarm-optimized-routines-math",
- defaults: ["libarm-optimized-routines-defaults"],
- exclude_srcs: [
- // Provided by:
- // bionic/libm/upstream-freebsd/lib/msun/src/s_erf.c
- // bionic/libm/upstream-freebsd/lib/msun/src/s_erff.c
- "math/erf.c",
- "math/erf_data.c",
- "math/erff.c",
- "math/erff_data.c",
- ],
srcs: [
"math/*.c",
],
@@ -95,43 +53,9 @@ cc_library_static {
enabled: true,
},
},
-}
-
-cc_library_static {
- name: "libarm-optimized-routines-string",
- defaults: ["libarm-optimized-routines-defaults"],
-
- arch: {
- arm64: {
- srcs: [
- "string/aarch64/memchr-mte.S",
- "string/aarch64/memchr.S",
- "string/aarch64/memcmp.S",
- "string/aarch64/memrchr.S",
- "string/aarch64/stpcpy-mte.S",
- "string/aarch64/stpcpy.S",
- "string/aarch64/strchrnul-mte.S",
- "string/aarch64/strchrnul.S",
- "string/aarch64/strchr-mte.S",
- "string/aarch64/strchr.S",
- "string/aarch64/strcmp-mte.S",
- "string/aarch64/strcmp.S",
- "string/aarch64/strcpy-mte.S",
- "string/aarch64/strcpy.S",
- "string/aarch64/strlen-mte.S",
- "string/aarch64/strlen.S",
- "string/aarch64/strncmp-mte.S",
- "string/aarch64/strncmp.S",
- "string/aarch64/strnlen.S",
- "string/aarch64/strrchr-mte.S",
- "string/aarch64/strrchr.S",
- ],
- asflags: [
- "-D__memcmp_aarch64=memcmp",
- "-D__memrchr_aarch64=memrchr",
- "-D__strnlen_aarch64=strnlen",
- ]
- },
+ stl: "none",
+ static: {
+ system_shared_libs: [],
},
}
@@ -169,7 +93,7 @@ sh_test {
test_suites: ["general-tests"],
host_supported: true,
device_supported: false,
- require_root: true,
+ test_config: "arm-optimized-routines-tests.xml",
target_required: [
"mathtest",
"ulp",
diff --git a/METADATA b/METADATA
index 59af591..94791ae 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "v21.02"
+ version: "33ba19089a261964e1e84ba4edf90263b468c161"
license_type: NOTICE
last_upgrade_date {
- year: 2021
+ year: 2020
month: 2
- day: 18
+ day: 1
}
}
diff --git a/Makefile b/Makefile
index 169f89e..dee6134 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
# Makefile - requires GNU make
#
-# Copyright (c) 2018-2020, Arm Limited.
+# Copyright (c) 2018-2019, Arm Limited.
# SPDX-License-Identifier: MIT
srcdir = .
@@ -10,7 +10,7 @@ libdir = $(prefix)/lib
includedir = $(prefix)/include
# Configure these in config.mk, do not make changes in this file.
-SUBS = math string networking
+SUBS = math string
HOST_CC = cc
HOST_CFLAGS = -std=c99 -O2
HOST_LDFLAGS =
diff --git a/NOTICE b/NOTICE
new file mode 120000
index 0000000..7a694c9
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1 @@
+LICENSE
\ No newline at end of file
diff --git a/README b/README
index ae465e9..440f08a 100644
--- a/README
+++ b/README
@@ -8,8 +8,7 @@ Assignment Agreement, please follow the instructions in
contributor-agreement.pdf. This is needed so upstreaming code
to projects that require copyright assignment is possible.
-Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.11.
+Regular quarterly releases are tagged as vYY.MM (e.g. v19.11).
Source code layout:
@@ -18,9 +17,6 @@ math/ - math subproject sources.
math/include/ - math library public headers.
math/test/ - math test and benchmark related sources.
math/tools/ - tools used for designing the algorithms.
-networking/ - networking subproject sources.
-networking/include/ - networking library public headers.
-networking/test/ - networking test and benchmark related sources.
string/ - string routines subproject sources.
string/include/ - string library public headers.
string/test/ - string test and benchmark related sources.
diff --git a/TEST_MAPPING b/TEST_MAPPING
index 66bdc01..e4d3d5e 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -2,9 +2,6 @@
"presubmit": [
{
"name": "CtsBionicTestCases"
- },
- {
- "name": "arm-optimized-routines-tests"
}
]
}
diff --git a/arm-optimized-routines-tests.xml b/arm-optimized-routines-tests.xml
new file mode 100644
index 0000000..96db90c
--- /dev/null
+++ b/arm-optimized-routines-tests.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Copyright (C) 2019 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<configuration description="Config for running arm-optimized-routines-tests through Atest or in Infra">
+ <option name="test-suite-tag" value="arm-optimized-routines-tests" />
+ <!-- This test requires a device, so it's not annotated with a null-device. -->
+ <test class="com.android.tradefed.testtype.binary.ExecutableHostTest" >
+ <option name="binary" value="run-arm-optimized-routines-tests-on-android.sh" />
+ <!-- Test script assumes a relative path with the tests/ folders. -->
+ <option name="relative-path-execution" value="true" />
+ <!-- Tests shouldn't be that long but set 15m to be safe. -->
+ <option name="per-binary-timeout" value="15m" />
+ </test>
+</configuration>
diff --git a/config.mk.dist b/config.mk.dist
index 177e1ac..301b5f9 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,28 +1,20 @@
# Example config.mk
#
-# Copyright (c) 2018-2020, Arm Limited.
+# Copyright (c) 2018-2019, Arm Limited.
# SPDX-License-Identifier: MIT
# Subprojects to build
-SUBS = math string networking
+SUBS = math string
-# Target architecture: aarch64, arm or x86_64
-ARCH = aarch64
-
-# Use for cross compilation with gcc.
-#CROSS_COMPILE = aarch64-none-linux-gnu-
+HOST_CC = gcc
+HOST_CFLAGS = -std=c99 -O2
+HOST_CFLAGS += -Wall -Wno-unused-function
-# Compiler for the target
CC = $(CROSS_COMPILE)gcc
CFLAGS = -std=c99 -pipe -O3
CFLAGS += -Wall -Wno-missing-braces
CFLAGS += -Werror=implicit-function-declaration
-# Used for test case generator that is executed on the host
-HOST_CC = gcc
-HOST_CFLAGS = -std=c99 -O2
-HOST_CFLAGS += -Wall -Wno-unused-function
-
# Enable debug info.
HOST_CFLAGS += -g
CFLAGS += -g
@@ -30,8 +22,8 @@ CFLAGS += -g
# Optimize the shared libraries on aarch64 assuming they fit in 1M.
#CFLAGS_SHARED = -fPIC -mcmodel=tiny
-# Enable MTE support.
-#CFLAGS += -march=armv8.5-a+memtag -DWANT_MTE_TEST=1
+# Use for cross compilation with gcc.
+#CROSS_COMPILE = aarch64-none-linux-gnu-
# Use with cross testing.
#EMULATOR = qemu-aarch64-static
@@ -43,7 +35,6 @@ math-ldlibs =
math-ulpflags =
math-testflags =
string-cflags =
-networking-cflags =
# Use if mpfr is available on the target for ulp error checking.
#math-ldlibs += -lmpfr -lgmp
@@ -62,12 +53,3 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Disable fenv checks
#math-ulpflags = -q -f
#math-testflags = -nostatus
-
-# Remove GNU Property Notes from asm files.
-#string-cflags += -DWANT_GNU_PROPERTY=0
-
-# Enable assertion checks.
-#networking-cflags += -DWANT_ASSERT
-
-# Avoid auto-vectorization of scalar code and unroll loops
-networking-cflags += -O2 -fno-tree-vectorize -funroll-loops
diff --git a/math/cosf.c b/math/cosf.c
index f29f194..831b39e 100644
--- a/math/cosf.c
+++ b/math/cosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision cos function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/erf.c b/math/erf.c
deleted file mode 100644
index 12d7e51..0000000
--- a/math/erf.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Double-precision erf(x) function.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "math_config.h"
-#include <math.h>
-#include <stdint.h>
-
-#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
-#define C 0x1.b0ac16p-1
-#define PA __erf_data.erf_poly_A
-#define NA __erf_data.erf_ratio_N_A
-#define DA __erf_data.erf_ratio_D_A
-#define NB __erf_data.erf_ratio_N_B
-#define DB __erf_data.erf_ratio_D_B
-#define PC __erf_data.erfc_poly_C
-#define PD __erf_data.erfc_poly_D
-#define PE __erf_data.erfc_poly_E
-#define PF __erf_data.erfc_poly_F
-
-/* Top 32 bits of a double. */
-static inline uint32_t
-top32 (double x)
-{
- return asuint64 (x) >> 32;
-}
-
-/* Fast erf implementation using a mix of
- rational and polynomial approximations.
- Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */
-double
-erf (double x)
-{
- /* Get top word and sign. */
- uint32_t ix = top32 (x);
- uint32_t ia = ix & 0x7fffffff;
- uint32_t sign = ix >> 31;
-
- /* Normalized and subnormal cases */
- if (ia < 0x3feb0000)
- { /* a = |x| < 0.84375. */
-
- if (ia < 0x3e300000)
- { /* a < 2^(-28). */
- if (ia < 0x00800000)
- { /* a < 2^(-1015). */
- double y = fma (TwoOverSqrtPiMinusOne, x, x);
- return check_uflow (y);
- }
- return x + TwoOverSqrtPiMinusOne * x;
- }
-
- double x2 = x * x;
-
- if (ia < 0x3fe00000)
- { /* a < 0.5 - Use polynomial approximation. */
- double r1 = fma (x2, PA[1], PA[0]);
- double r2 = fma (x2, PA[3], PA[2]);
- double r3 = fma (x2, PA[5], PA[4]);
- double r4 = fma (x2, PA[7], PA[6]);
- double r5 = fma (x2, PA[9], PA[8]);
- double x4 = x2 * x2;
- double r = r5;
- r = fma (x4, r, r4);
- r = fma (x4, r, r3);
- r = fma (x4, r, r2);
- r = fma (x4, r, r1);
- return fma (r, x, x); /* This fma is crucial for accuracy. */
- }
- else
- { /* 0.5 <= a < 0.84375 - Use rational approximation. */
- double x4, x8, r1n, r2n, r1d, r2d, r3d;
-
- r1n = fma (x2, NA[1], NA[0]);
- x4 = x2 * x2;
- r2n = fma (x2, NA[3], NA[2]);
- x8 = x4 * x4;
- r1d = fma (x2, DA[0], 1.0);
- r2d = fma (x2, DA[2], DA[1]);
- r3d = fma (x2, DA[4], DA[3]);
- double P = r1n + x4 * r2n + x8 * NA[4];
- double Q = r1d + x4 * r2d + x8 * r3d;
- return fma (P / Q, x, x);
- }
- }
- else if (ia < 0x3ff40000)
- { /* 0.84375 <= |x| < 1.25. */
- double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
- double a = fabs (x) - 1.0;
- r1n = fma (a, NB[1], NB[0]);
- a2 = a * a;
- r1d = fma (a, DB[0], 1.0);
- a4 = a2 * a2;
- r2n = fma (a, NB[3], NB[2]);
- a6 = a4 * a2;
- r2d = fma (a, DB[2], DB[1]);
- r3n = fma (a, NB[5], NB[4]);
- r3d = fma (a, DB[4], DB[3]);
- r4n = NB[6];
- r4d = DB[5];
- double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
- double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
- if (sign)
- return -C - P / Q;
- else
- return C + P / Q;
- }
- else if (ia < 0x40000000)
- { /* 1.25 <= |x| < 2.0. */
- double a = fabs (x);
- a = a - 1.25;
-
- double r1 = fma (a, PC[1], PC[0]);
- double r2 = fma (a, PC[3], PC[2]);
- double r3 = fma (a, PC[5], PC[4]);
- double r4 = fma (a, PC[7], PC[6]);
- double r5 = fma (a, PC[9], PC[8]);
- double r6 = fma (a, PC[11], PC[10]);
- double r7 = fma (a, PC[13], PC[12]);
- double r8 = fma (a, PC[15], PC[14]);
-
- double a2 = a * a;
-
- double r = r8;
- r = fma (a2, r, r7);
- r = fma (a2, r, r6);
- r = fma (a2, r, r5);
- r = fma (a2, r, r4);
- r = fma (a2, r, r3);
- r = fma (a2, r, r2);
- r = fma (a2, r, r1);
-
- if (sign)
- return -1.0 + r;
- else
- return 1.0 - r;
- }
- else if (ia < 0x400a0000)
- { /* 2 <= |x| < 3.25. */
- double a = fabs (x);
- a = fma (0.5, a, -1.0);
-
- double r1 = fma (a, PD[1], PD[0]);
- double r2 = fma (a, PD[3], PD[2]);
- double r3 = fma (a, PD[5], PD[4]);
- double r4 = fma (a, PD[7], PD[6]);
- double r5 = fma (a, PD[9], PD[8]);
- double r6 = fma (a, PD[11], PD[10]);
- double r7 = fma (a, PD[13], PD[12]);
- double r8 = fma (a, PD[15], PD[14]);
- double r9 = fma (a, PD[17], PD[16]);
-
- double a2 = a * a;
-
- double r = r9;
- r = fma (a2, r, r8);
- r = fma (a2, r, r7);
- r = fma (a2, r, r6);
- r = fma (a2, r, r5);
- r = fma (a2, r, r4);
- r = fma (a2, r, r3);
- r = fma (a2, r, r2);
- r = fma (a2, r, r1);
-
- if (sign)
- return -1.0 + r;
- else
- return 1.0 - r;
- }
- else if (ia < 0x40100000)
- { /* 3.25 <= |x| < 4.0. */
- double a = fabs (x);
- a = a - 3.25;
-
- double r1 = fma (a, PE[1], PE[0]);
- double r2 = fma (a, PE[3], PE[2]);
- double r3 = fma (a, PE[5], PE[4]);
- double r4 = fma (a, PE[7], PE[6]);
- double r5 = fma (a, PE[9], PE[8]);
- double r6 = fma (a, PE[11], PE[10]);
- double r7 = fma (a, PE[13], PE[12]);
-
- double a2 = a * a;
-
- double r = r7;
- r = fma (a2, r, r6);
- r = fma (a2, r, r5);
- r = fma (a2, r, r4);
- r = fma (a2, r, r3);
- r = fma (a2, r, r2);
- r = fma (a2, r, r1);
-
- if (sign)
- return -1.0 + r;
- else
- return 1.0 - r;
- }
- else if (ia < 0x4017a000)
- { /* 4 <= |x| < 5.90625. */
- double a = fabs (x);
- a = fma (0.5, a, -2.0);
-
- double r1 = fma (a, PF[1], PF[0]);
- double r2 = fma (a, PF[3], PF[2]);
- double r3 = fma (a, PF[5], PF[4]);
- double r4 = fma (a, PF[7], PF[6]);
- double r5 = fma (a, PF[9], PF[8]);
- double r6 = fma (a, PF[11], PF[10]);
- double r7 = fma (a, PF[13], PF[12]);
- double r8 = fma (a, PF[15], PF[14]);
- double r9 = PF[16];
-
- double a2 = a * a;
-
- double r = r9;
- r = fma (a2, r, r8);
- r = fma (a2, r, r7);
- r = fma (a2, r, r6);
- r = fma (a2, r, r5);
- r = fma (a2, r, r4);
- r = fma (a2, r, r3);
- r = fma (a2, r, r2);
- r = fma (a2, r, r1);
-
- if (sign)
- return -1.0 + r;
- else
- return 1.0 - r;
- }
- else
- {
- /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */
- if (unlikely (ia >= 0x7ff00000))
- return (double) (1.0 - (sign << 1)) + 1.0 / x;
-
- if (sign)
- return -1.0;
- else
- return 1.0;
- }
-}
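
(Not part of the patch: the deleted erf.c above selects its approximation interval by comparing the top 32 bits of |x| against hard-coded double bit patterns. A minimal C sketch of that dispatch follows; top32 mirrors the deleted helper, erf_interval is an illustrative name, and the hex bounds are the same constants used above, e.g. 0x3feb0000 is the top word of 0.84375.)

#include <stdint.h>
#include <string.h>

static inline uint32_t
top32 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);   /* portable bit cast (asuint64 above) */
  return (uint32_t) (u >> 32);
}

/* Return which approximation branch the erf above would take. */
int
erf_interval (double x)
{
  uint32_t ia = top32 (x) & 0x7fffffff;   /* drop the sign bit */
  if (ia < 0x3feb0000) return 0;  /* |x| < 0.84375: poly/rational in x^2 */
  if (ia < 0x3ff40000) return 1;  /* |x| < 1.25:    rational in |x|-1    */
  if (ia < 0x40000000) return 2;  /* |x| < 2.0:     poly in |x|-1.25     */
  if (ia < 0x400a0000) return 3;  /* |x| < 3.25:    poly in |x|/2-1      */
  if (ia < 0x40100000) return 4;  /* |x| < 4.0:     poly in |x|-3.25     */
  if (ia < 0x4017a000) return 5;  /* |x| < 5.90625: poly in |x|/2-2      */
  return 6;                       /* saturated: +/-1, inf and NaN cases  */
}
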
diff --git a/math/erf_data.c b/math/erf_data.c
deleted file mode 100644
index 807875b..0000000
--- a/math/erf_data.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Shared data between erf and erfc.
- *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "math_config.h"
-
-/*
-Minimax approximation of erf
-*/
-const struct erf_data __erf_data = {
-.erf_poly_A = {
-#if ERF_POLY_A_NCOEFFS == 10
-0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,
--0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,
-0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,
--0x1.18c47fd143c5ep-23
-#endif
-},
-/* Rational approximation on [0x1p-28, 0.84375] */
-.erf_ratio_N_A = {
-0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,
--0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16
-},
-.erf_ratio_D_A = {
-0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,
-0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18
-},
-/* Rational approximation on [0.84375, 1.25] */
-.erf_ratio_N_B = {
--0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2,
-0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5,
--0x1.1bf380a96073fp-9
-},
-.erf_ratio_D_B = {
-0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4,
-0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7
-},
-.erfc_poly_C = {
-#if ERFC_POLY_C_NCOEFFS == 16
-/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 */
-0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2,
--0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5,
--0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8,
--0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12,
-0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16,
--0x1.578c9e375d37p-19
-#endif
-},
-.erfc_poly_D = {
-#if ERFC_POLY_D_NCOEFFS == 18
-/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 */
-0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3,
--0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2,
--0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2,
-0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3,
--0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3,
-0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10
-#endif
-},
-.erfc_poly_E = {
-#if ERFC_POLY_E_NCOEFFS == 14
-/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 */
-0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14,
--0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12,
-0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14,
--0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20,
--0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23
-#endif
-},
-.erfc_poly_F = {
-#if ERFC_POLY_F_NCOEFFS == 17
-/* Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 */
-0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19,
--0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14,
-0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11,
--0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11,
-0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14,
--0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19
-#endif
-}
-};
-
diff --git a/math/erff.c b/math/erff.c
deleted file mode 100644
index a58e825..0000000
--- a/math/erff.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Single-precision erf(x) function.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include <stdint.h>
-#include <math.h>
-#include "math_config.h"
-
-#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
-#define A __erff_data.erff_poly_A
-#define B __erff_data.erff_poly_B
-
-/* Top 12 bits of a float. */
-static inline uint32_t
-top12 (float x)
-{
- return asuint (x) >> 20;
-}
-
-/* Efficient implementation of erff
- using either a pure polynomial approximation or
- the exponential of a polynomial.
- Worst-case error is 1.09ulps at 0x1.c111acp-1. */
-float
-erff (float x)
-{
- float r, x2, u;
-
- /* Get top word. */
- uint32_t ix = asuint (x);
- uint32_t sign = ix >> 31;
- uint32_t ia12 = top12 (x) & 0x7ff;
-
- /* Limit of both intervals is 0.875 for performance reasons but coefficients
- computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
- from 0.94 to 1.1ulps. */
- if (ia12 < 0x3f6)
- { /* a = |x| < 0.875. */
-
- /* Tiny and subnormal cases. */
- if (unlikely (ia12 < 0x318))
- { /* |x| < 2^(-28). */
- if (unlikely (ia12 < 0x040))
- { /* |x| < 2^(-119). */
- float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
- return check_uflowf (y);
- }
- return x + TwoOverSqrtPiMinusOne * x;
- }
-
- x2 = x * x;
-
- /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */
- r = A[5];
- r = fmaf (r, x2, A[4]);
- r = fmaf (r, x2, A[3]);
- r = fmaf (r, x2, A[2]);
- r = fmaf (r, x2, A[1]);
- r = fmaf (r, x2, A[0]);
- r = fmaf (r, x, x);
- }
- else if (ia12 < 0x408)
- { /* |x| < 4.0 - Use a custom Estrin scheme. */
-
- float a = fabsf (x);
- /* Start with Estrin scheme on high order (small magnitude) coefficients. */
- r = fmaf (B[6], a, B[5]);
- u = fmaf (B[4], a, B[3]);
- x2 = x * x;
- r = fmaf (r, x2, u);
- /* Then switch to pure Horner scheme. */
- r = fmaf (r, a, B[2]);
- r = fmaf (r, a, B[1]);
- r = fmaf (r, a, B[0]);
- r = fmaf (r, a, a);
- /* Single precision exponential with ~0.5ulps,
- ensures erff has max. rel. error
- < 1ulp on [0.921875, 4.0],
- < 1.1ulps on [0.875, 4.0]. */
- r = expf (-r);
- /* Explicit copysign (calling copysignf increases latency). */
- if (sign)
- r = -1.0f + r;
- else
- r = 1.0f - r;
- }
- else
- { /* |x| >= 4.0. */
-
- /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
- if (unlikely (ia12 >= 0x7f8))
- return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
-
- /* Explicit copysign (calling copysignf increases latency). */
- if (sign)
- r = -1.0f;
- else
- r = 1.0f;
- }
- return r;
-}
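
(Not part of the patch: the deleted erff.c mixes a Horner chain with an Estrin start because Estrin exposes independent fmaf operations to the pipeline. Here is an illustrative comparison on a generic degree-4 polynomial c0 + c1*a + ... + c4*a^4; horner4 and estrin4 are made-up names, not functions from this library.)

#include <math.h>

float
horner4 (const float *c, float a)
{
  float r = c[4];              /* serial chain: each fmaf waits on the last */
  r = fmaf (r, a, c[3]);
  r = fmaf (r, a, c[2]);
  r = fmaf (r, a, c[1]);
  return fmaf (r, a, c[0]);
}

float
estrin4 (const float *c, float a)
{
  float a2 = a * a;                  /* computed in parallel with the pairs */
  float lo = fmaf (c[1], a, c[0]);   /* c0 + c1*a   */
  float hi = fmaf (c[3], a, c[2]);   /* c2 + c3*a   */
  hi = fmaf (c[4], a2, hi);          /* + c4*a^2    */
  return fmaf (hi, a2, lo);          /* lo + hi*a^2 */
}
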
diff --git a/math/erff_data.c b/math/erff_data.c
deleted file mode 100644
index fa6b1ef..0000000
--- a/math/erff_data.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Data for approximation of erff.
- *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "math_config.h"
-
-/* Minimax approximation of erff. */
-const struct erff_data __erff_data = {
-.erff_poly_A = {
-0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
--0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f
-},
-.erff_poly_B = {
-0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f,
--0x1.8d6300p-6f, 0x1.fd1336p-9f, -0x1.91d2ccp-12f,
-0x1.222900p-16f
-}
-};
-
diff --git a/math/exp.c b/math/exp.c
index 7f5024c..1909b8e 100644
--- a/math/exp.c
+++ b/math/exp.c
@@ -1,7 +1,7 @@
/*
* Double-precision e^x function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/exp2.c b/math/exp2.c
index 35ab39f..47aa479 100644
--- a/math/exp2.c
+++ b/math/exp2.c
@@ -1,7 +1,7 @@
/*
* Double-precision 2^x function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/expf.c b/math/expf.c
index 9b2f0c3..0fe1f7d 100644
--- a/math/expf.c
+++ b/math/expf.c
@@ -1,7 +1,7 @@
/*
* Single-precision e^x function.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 279d829..4493008 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,7 +1,7 @@
/*
* Public API.
*
- * Copyright (c) 2015-2020, Arm Limited.
+ * Copyright (c) 2015-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/log.c b/math/log.c
index d3b7bc6..b85d3ff 100644
--- a/math/log.c
+++ b/math/log.c
@@ -1,7 +1,7 @@
/*
* Double-precision log(x) function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/log2.c b/math/log2.c
index 55102b7..804fb85 100644
--- a/math/log2.c
+++ b/math/log2.c
@@ -1,7 +1,7 @@
/*
* Double-precision log2(x) function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/logf.c b/math/logf.c
index cfbaee1..ee3120a 100644
--- a/math/logf.c
+++ b/math/logf.c
@@ -1,7 +1,7 @@
/*
* Single-precision log function.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/logf_data.c b/math/logf_data.c
index e8973ce..53c5f62 100644
--- a/math/logf_data.c
+++ b/math/logf_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for logf.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/math_config.h b/math/math_config.h
index e851043..7a1cc81 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
/*
* Configuration for math routines.
*
- * Copyright (c) 2017-2020, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -12,17 +12,12 @@
#include <stdint.h>
#ifndef WANT_ROUNDING
-/* If defined to 1, return correct results for special cases in non-nearest
- rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f).
- This may be set to 0 if there is no fenv support or if math functions only
- get called in round to nearest mode. */
+/* Correct special case results in non-nearest rounding modes. */
# define WANT_ROUNDING 1
#endif
#ifndef WANT_ERRNO
-/* If defined to 1, set errno in math functions according to ISO C. Many math
- libraries do not set errno, so this is 0 by default. It may need to be
- set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */
-# define WANT_ERRNO 0
+/* Set errno according to ISO C with (math_errhandling & MATH_ERRNO) != 0. */
+# define WANT_ERRNO 1
#endif
#ifndef WANT_ERRNO_UFLOW
/* Set errno to ERANGE if result underflows to 0 (in all rounding modes). */
@@ -298,24 +293,6 @@ check_uflow (double x)
return WANT_ERRNO ? __math_check_uflow (x) : x;
}
-/* Check if the result overflowed to infinity. */
-HIDDEN float __math_check_oflowf (float);
-/* Check if the result underflowed to 0. */
-HIDDEN float __math_check_uflowf (float);
-
-/* Check if the result overflowed to infinity. */
-static inline float
-check_oflowf (float x)
-{
- return WANT_ERRNO ? __math_check_oflowf (x) : x;
-}
-
-/* Check if the result underflowed to 0. */
-static inline float
-check_uflowf (float x)
-{
- return WANT_ERRNO ? __math_check_uflowf (x) : x;
-}
/* Shared between expf, exp2f and powf. */
#define EXP2F_TABLE_BITS 5
@@ -434,29 +411,4 @@ extern const struct pow_log_data
struct {double invc, pad, logc, logctail;} tab[1 << POW_LOG_TABLE_BITS];
} __pow_log_data HIDDEN;
-extern const struct erff_data
-{
- float erff_poly_A[6];
- float erff_poly_B[7];
-} __erff_data HIDDEN;
-
-#define ERF_POLY_A_ORDER 19
-#define ERF_POLY_A_NCOEFFS 10
-#define ERFC_POLY_C_NCOEFFS 16
-#define ERFC_POLY_D_NCOEFFS 18
-#define ERFC_POLY_E_NCOEFFS 14
-#define ERFC_POLY_F_NCOEFFS 17
-extern const struct erf_data
-{
- double erf_poly_A[ERF_POLY_A_NCOEFFS];
- double erf_ratio_N_A[5];
- double erf_ratio_D_A[5];
- double erf_ratio_N_B[7];
- double erf_ratio_D_B[6];
- double erfc_poly_C[ERFC_POLY_C_NCOEFFS];
- double erfc_poly_D[ERFC_POLY_D_NCOEFFS];
- double erfc_poly_E[ERFC_POLY_E_NCOEFFS];
- double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
-} __erf_data HIDDEN;
-
#endif
diff --git a/math/math_errf.c b/math/math_errf.c
index d5350b8..07154c5 100644
--- a/math/math_errf.c
+++ b/math/math_errf.c
@@ -1,7 +1,7 @@
/*
* Single-precision math error handling.
*
- * Copyright (c) 2017-2020, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -64,17 +64,3 @@ __math_invalidf (float x)
float y = (x - x) / (x - x);
return isnan (x) ? y : with_errnof (y, EDOM);
}
-
-/* Check result and set errno if necessary. */
-
-HIDDEN float
-__math_check_uflowf (float y)
-{
- return y == 0.0f ? with_errnof (y, ERANGE) : y;
-}
-
-HIDDEN float
-__math_check_oflowf (float y)
-{
- return isinf (y) ? with_errnof (y, ERANGE) : y;
-}
diff --git a/math/pow.c b/math/pow.c
index 86842c6..ced7c4f 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -1,7 +1,7 @@
/*
* Double-precision x^y function.
*
- * Copyright (c) 2018-2020, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/powf.c b/math/powf.c
index 6ba45d3..1534a09 100644
--- a/math/powf.c
+++ b/math/powf.c
@@ -1,7 +1,7 @@
/*
* Single-precision pow function.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c
index 97e0d98..b9fbdc4 100644
--- a/math/powf_log2_data.c
+++ b/math/powf_log2_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for powf.
*
- * Copyright (c) 2017-2019, Arm Limited.
+ * Copyright (c) 2017-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/sincosf.c b/math/sincosf.c
index 9746f1c..e6cd41e 100644
--- a/math/sincosf.c
+++ b/math/sincosf.c
@@ -1,7 +1,7 @@
/*
* Single-precision sin/cos function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/sincosf_data.c b/math/sincosf_data.c
index ab4ac47..5d0b58e 100644
--- a/math/sincosf_data.c
+++ b/math/sincosf_data.c
@@ -1,7 +1,7 @@
/*
* Data definition for sinf, cosf and sincosf.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/sinf.c b/math/sinf.c
index ddbc1da..770b294 100644
--- a/math/sinf.c
+++ b/math/sinf.c
@@ -1,7 +1,7 @@
/*
* Single-precision sin function.
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 0c17826..33ceda3 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,7 +1,7 @@
/*
* Microbenchmark for math functions.
*
- * Copyright (c) 2018-2020, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -248,7 +248,6 @@ D (log2, 0.999, 1.001)
{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
D (xpow, 0.01, 11.1)
D (ypow, -9.9, 9.9)
-D (erf, -6.0, 6.0)
F (dummyf, 1.0, 2.0)
F (expf, -9.9, 9.9)
@@ -276,7 +275,6 @@ F (cosf, -3.1, 3.1)
F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
-F (erff, -4.0, 4.0)
#if WANT_VMATH
D (__s_sin, -3.1, 3.1)
D (__s_cos, -3.1, 3.1)
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 3108967..2ff8c3f 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,7 +1,7 @@
/*
* mathtest.c - test rig for mathlib
*
- * Copyright (c) 1998-2019, Arm Limited.
+ * Copyright (c) 1998-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c
index 6be79e1..f416477 100644
--- a/math/test/rtest/dotest.c
+++ b/math/test/rtest/dotest.c
@@ -1,7 +1,7 @@
/*
* dotest.c - actually generate mathlib test cases
*
- * Copyright (c) 1999-2019, Arm Limited.
+ * Copyright (c) 1999-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h
index 12a9c74..af574b0 100644
--- a/math/test/rtest/intern.h
+++ b/math/test/rtest/intern.h
@@ -1,7 +1,7 @@
/*
* intern.h
*
- * Copyright (c) 1999-2019, Arm Limited.
+ * Copyright (c) 1999-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c
index 0d8ead8..e94e455 100644
--- a/math/test/rtest/main.c
+++ b/math/test/rtest/main.c
@@ -1,7 +1,7 @@
/*
* main.c
*
- * Copyright (c) 1999-2019, Arm Limited.
+ * Copyright (c) 1999-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c
index 5612396..e97a8c6 100644
--- a/math/test/rtest/random.c
+++ b/math/test/rtest/random.c
@@ -1,7 +1,7 @@
/*
* random.c - random number generator for producing mathlib test cases
*
- * Copyright (c) 1998-2019, Arm Limited.
+ * Copyright (c) 1998-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h
index b4b22df..c1ce956 100644
--- a/math/test/rtest/random.h
+++ b/math/test/rtest/random.h
@@ -1,7 +1,7 @@
/*
* random.h - header for random.c
*
- * Copyright (c) 2009-2019, Arm Limited.
+ * Copyright (c) 2009-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c
index c9f0daf..938dc3a 100644
--- a/math/test/rtest/semi.c
+++ b/math/test/rtest/semi.c
@@ -1,7 +1,7 @@
/*
* semi.c: test implementations of mathlib seminumerical functions
*
- * Copyright (c) 1999-2019, Arm Limited.
+ * Copyright (c) 1999-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h
index 17dc415..da473a2 100644
--- a/math/test/rtest/semi.h
+++ b/math/test/rtest/semi.h
@@ -1,7 +1,7 @@
/*
* semi.h: header for semi.c
*
- * Copyright (c) 1999-2019, Arm Limited.
+ * Copyright (c) 1999-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h
index 53cd557..1a76c2e 100644
--- a/math/test/rtest/types.h
+++ b/math/test/rtest/types.h
@@ -1,7 +1,7 @@
/*
* types.h
*
- * Copyright (c) 2005-2019, Arm Limited.
+ * Copyright (c) 2005-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c
index de45ac5..acaf671 100644
--- a/math/test/rtest/wrappers.c
+++ b/math/test/rtest/wrappers.c
@@ -1,7 +1,7 @@
/*
* wrappers.c - wrappers to modify output of MPFR/MPC test functions
*
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h
index 7b09c85..5804935 100644
--- a/math/test/rtest/wrappers.h
+++ b/math/test/rtest/wrappers.h
@@ -1,7 +1,7 @@
/*
* wrappers.h - wrappers to modify output of MPFR/MPC test functions
*
- * Copyright (c) 2014-2019, Arm Limited.
+ * Copyright (c) 2014-2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 0190d9a..a8c391b 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
# ULP error check script.
#
-# Copyright (c) 2019-2020, Arm Limited.
+# Copyright (c) 2019, Arm Limited.
# SPDX-License-Identifier: MIT
#set -x
@@ -72,16 +72,6 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
-L=1.0
-Ldir=0.9
-t erf 0 0xffff000000000000 10000
-t erf 0x1p-1022 0x1p-26 40000
-t erf -0x1p-1022 -0x1p-26 40000
-t erf 0x1p-26 0x1p3 40000
-t erf -0x1p-26 -0x1p3 40000
-t erf 0 inf 40000
-Ldir=0.5
-
L=0.01
t expf 0 0xffff0000 10000
t expf 0x1p-14 0x1p8 50000
@@ -129,17 +119,6 @@ t powf 0x1p-70 0x1p70 x 0x1p-1 0x1p1 50000
t powf 0x1p-70 0x1p70 x -0x1p-1 -0x1p1 50000
t powf 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p14 50000
t powf 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p14 50000
-
-L=0.6
-Ldir=0.9
-t erff 0 0xffff0000 10000
-t erff 0x1p-127 0x1p-26 40000
-t erff -0x1p-127 -0x1p-26 40000
-t erff 0x1p-26 0x1p3 40000
-t erff -0x1p-26 -0x1p3 40000
-t erff 0 inf 40000
-Ldir=0.5
-
done
# vector functions
diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst
index 7916044..5dc0994 100644
--- a/math/test/testcases/directed/cosf.tst
+++ b/math/test/testcases/directed/cosf.tst
@@ -1,6 +1,6 @@
; cosf.tst - Directed test cases for SP cosine
;
-; Copyright (c) 2007-2019, Arm Limited.
+; Copyright (c) 2007-2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=cosf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst
deleted file mode 100644
index 7fa4d18..0000000
--- a/math/test/testcases/directed/erf.tst
+++ /dev/null
@@ -1,17 +0,0 @@
-; erf.tst - Directed test cases for erf
-;
-; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT
-
-func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
-func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
-func=erf op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
-func=erf op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
-func=erf op1=7ff00000.00000000 result=3ff00000.00000000 errno=0
-func=erf op1=fff00000.00000000 result=bff00000.00000000 errno=0
-func=erf op1=00000000.00000000 result=00000000.00000000 errno=ERANGE
-func=erf op1=80000000.00000000 result=80000000.00000000 errno=ERANGE
-func=erf op1=00000000.00000001 result=00000000.00000001 errno=0 status=ux
-func=erf op1=80000000.00000001 result=80000000.00000001 errno=0 status=ux
-func=erf op1=3ff00000.00000000 result=3feaf767.a741088a.c6d errno=0
-func=erf op1=bff00000.00000000 result=bfeaf767.a741088a.c6d errno=0
diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst
deleted file mode 100644
index d05b7b1..0000000
--- a/math/test/testcases/directed/erff.tst
+++ /dev/null
@@ -1,17 +0,0 @@
-; erff.tst
-;
-; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT
-
-func=erff op1=7fc00001 result=7fc00001 errno=0
-func=erff op1=ffc00001 result=7fc00001 errno=0
-func=erff op1=7f800001 result=7fc00001 errno=0 status=i
-func=erff op1=ff800001 result=7fc00001 errno=0 status=i
-func=erff op1=7f800000 result=3f800000 errno=0
-func=erff op1=ff800000 result=bf800000 errno=0
-func=erff op1=00000000 result=00000000 errno=ERANGE
-func=erff op1=80000000 result=80000000 errno=ERANGE
-func=erff op1=00000001 result=00000001 errno=0 status=ux
-func=erff op1=80000001 result=80000001 errno=0 status=ux
-func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0
-func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0
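
(Not part of the patch: each directed-test line above packs operands and results as raw IEEE-754 bit patterns. A hypothetical decoder for the positive case above, func=erff op1=3f800000 result=3f57bb3d.3a0, i.e. erff(1.0f) should produce the float with bits 0x3f57bb3d; the trailing .3a0 appears to carry extra low-order result bits that the harness uses for sub-ulp error accounting.)

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  uint32_t op1 = 0x3f800000, res = 0x3f57bb3d;
  float x, y;
  memcpy (&x, &op1, sizeof x);   /* 1.0f */
  memcpy (&y, &res, sizeof y);   /* ~0.8427008, i.e. erf(1) */
  printf ("erff(%g) ~ %.9g\n", x, y);
  return 0;
}
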
diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst
index 85d556c..addfc0a 100644
--- a/math/test/testcases/directed/exp.tst
+++ b/math/test/testcases/directed/exp.tst
@@ -1,6 +1,6 @@
; Directed test cases for exp
;
-; Copyright (c) 2018-2019, Arm Limited.
+; Copyright (c) 2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst
index fa56c9f..04a5a50 100644
--- a/math/test/testcases/directed/exp2.tst
+++ b/math/test/testcases/directed/exp2.tst
@@ -1,6 +1,6 @@
; Directed test cases for exp2
;
-; Copyright (c) 2018-2019, Arm Limited.
+; Copyright (c) 2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst
index 38cfc3f..2b6a9b5 100644
--- a/math/test/testcases/directed/exp2f.tst
+++ b/math/test/testcases/directed/exp2f.tst
@@ -1,6 +1,6 @@
; exp2f.tst - Directed test cases for exp2f
;
-; Copyright (c) 2017-2019, Arm Limited.
+; Copyright (c) 2017-2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=exp2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst
index ff0f671..74664c7 100644
--- a/math/test/testcases/directed/expf.tst
+++ b/math/test/testcases/directed/expf.tst
@@ -1,6 +1,6 @@
; expf.tst - Directed test cases for expf
;
-; Copyright (c) 2007-2019, Arm Limited.
+; Copyright (c) 2007-2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=expf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst
index a0aa398..eeb762c 100644
--- a/math/test/testcases/directed/log.tst
+++ b/math/test/testcases/directed/log.tst
@@ -1,6 +1,6 @@
; Directed test cases for log
;
-; Copyright (c) 2018-2019, Arm Limited.
+; Copyright (c) 2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst
index ff1286c..e0765d8 100644
--- a/math/test/testcases/directed/log2.tst
+++ b/math/test/testcases/directed/log2.tst
@@ -1,6 +1,6 @@
; Directed test cases for log2
;
-; Copyright (c) 2018-2019, Arm Limited.
+; Copyright (c) 2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst
index 5832c4f..8d685ba 100644
--- a/math/test/testcases/directed/log2f.tst
+++ b/math/test/testcases/directed/log2f.tst
@@ -1,6 +1,6 @@
; log2f.tst - Directed test cases for log2f
;
-; Copyright (c) 2017-2019, Arm Limited.
+; Copyright (c) 2017-2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=log2f op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst
index 6e68a36..7ccc873 100644
--- a/math/test/testcases/directed/logf.tst
+++ b/math/test/testcases/directed/logf.tst
@@ -1,6 +1,6 @@
; logf.tst - Directed test cases for logf
;
-; Copyright (c) 2007-2019, Arm Limited.
+; Copyright (c) 2007-2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=logf op1=7fc00001 result=7fc00001 errno=0
diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst
index 1966581..a4c42be 100644
--- a/math/test/testcases/directed/pow.tst
+++ b/math/test/testcases/directed/pow.tst
@@ -1,6 +1,6 @@
; Directed test cases for pow
;
-; Copyright (c) 2018-2019, Arm Limited.
+; Copyright (c) 2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst
index 3fa8b11..efd1dd5 100644
--- a/math/test/testcases/directed/powf.tst
+++ b/math/test/testcases/directed/powf.tst
@@ -1,6 +1,6 @@
; powf.tst - Directed test cases for powf
;
-; Copyright (c) 2007-2019, Arm Limited.
+; Copyright (c) 2007-2018, Arm Limited.
; SPDX-License-Identifier: MIT
func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst
index 4b33d22..b4b2526 100644
--- a/math/test/testcases/directed/sincosf.tst
+++ b/math/test/testcases/directed/sincosf.tst
@@ -1,6 +1,6 @@
; Directed test cases for SP sincos
;
-; Copyright (c) 2007-2019, Arm Limited.
+; Copyright (c) 2007-2018, Arm Limited.
; SPDX-License-Identifier: MIT
diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst
index ded80b1..13cfdca 100644
--- a/math/test/testcases/directed/sinf.tst
+++ b/math/test/testcases/directed/sinf.tst
@@ -1,6 +1,6 @@
; sinf.tst - Directed test cases for SP sine
;
-; Copyright (c) 2007-2019, Arm Limited.
+; Copyright (c) 2007-2018, Arm Limited.
; SPDX-License-Identifier: MIT
diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst
index c24ff80..c37e837 100644
--- a/math/test/testcases/random/double.tst
+++ b/math/test/testcases/random/double.tst
@@ -1,6 +1,6 @@
!! double.tst - Random test case specification for DP functions
!!
-!! Copyright (c) 1999-2019, Arm Limited.
+!! Copyright (c) 1999-2018, Arm Limited.
!! SPDX-License-Identifier: MIT
test exp 10000
diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst
index d02a227..baf62b9 100644
--- a/math/test/testcases/random/float.tst
+++ b/math/test/testcases/random/float.tst
@@ -1,6 +1,6 @@
!! single.tst - Random test case specification for SP functions
!!
-!! Copyright (c) 1999-2019, Arm Limited.
+!! Copyright (c) 1999-2018, Arm Limited.
!! SPDX-License-Identifier: MIT
test sinf 10000
diff --git a/math/test/ulp.c b/math/test/ulp.c
index 51479b8..371567a 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,7 +1,7 @@
/*
* ULP error checking tool for math functions.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -331,13 +331,11 @@ static const struct fun fun[] = {
F1 (log)
F1 (log2)
F2 (pow)
- F1 (erf)
D1 (exp)
D1 (exp2)
D1 (log)
D1 (log2)
D2 (pow)
- D1 (erf)
#if WANT_VMATH
F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index 2ff436f..f479fc5 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -3,7 +3,7 @@
# remez.jl - implementation of the Remez algorithm for polynomial approximation
#
-# Copyright (c) 2015-2019, Arm Limited.
+# Copyright (c) 2015-2018, Arm Limited.
# SPDX-License-Identifier: MIT
import Base.\
diff --git a/math/v_math.h b/math/v_math.h
index f2cc467..3db22e5 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -1,7 +1,7 @@
/*
* Vector math abstractions.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/networking/Dir.mk b/networking/Dir.mk
deleted file mode 100644
index b496103..0000000
--- a/networking/Dir.mk
+++ /dev/null
@@ -1,76 +0,0 @@
-# Makefile fragment - requires GNU make
-#
-# Copyright (c) 2019-2020, Arm Limited.
-# SPDX-License-Identifier: MIT
-
-S := $(srcdir)/networking
-B := build/networking
-
-ifeq ($(ARCH),)
-all-networking check-networking install-networking clean-networking:
- @echo "*** Please set ARCH in config.mk. ***"
- @exit 1
-else
-
-networking-lib-srcs := $(wildcard $(S)/*.[cS]) $(wildcard $(S)/$(ARCH)/*.[cS])
-networking-test-srcs := $(wildcard $(S)/test/*.c)
-
-networking-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
-
-networking-libs := \
- build/lib/libnetworking.so \
- build/lib/libnetworking.a \
-
-networking-tools := \
- build/bin/test/chksum
-
-networking-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-lib-srcs)))
-networking-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(networking-test-srcs)))
-
-networking-objs := \
- $(networking-lib-objs) \
- $(networking-lib-objs:%.o=%.os) \
- $(networking-test-objs) \
-
-networking-files := \
- $(networking-objs) \
- $(networking-libs) \
- $(networking-tools) \
- $(networking-includes) \
-
-all-networking: $(networking-libs) $(networking-tools) $(networking-includes)
-
-$(networking-objs): $(networking-includes)
-$(networking-objs): CFLAGS_ALL += $(networking-cflags)
-
-build/lib/libnetworking.so: $(networking-lib-objs:%.o=%.os)
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
-
-build/lib/libnetworkinglib.a: $(networking-lib-objs)
- rm -f $@
- $(AR) rc $@ $^
- $(RANLIB) $@
-
-build/bin/test/%: $(B)/test/%.o build/lib/libnetworkinglib.a
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
-
-build/include/%.h: $(S)/include/%.h
- cp $< $@
-
-build/bin/%.sh: $(S)/test/%.sh
- cp $< $@
-
-check-networking: $(networking-tools)
- $(EMULATOR) build/bin/test/chksum -i simple
- $(EMULATOR) build/bin/test/chksum -i scalar
- $(EMULATOR) build/bin/test/chksum -i simd || true # simd is not always available
-
-install-networking: \
- $(networking-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
- $(networking-includes:build/include/%=$(DESTDIR)$(includedir)/%)
-
-clean-networking:
- rm -f $(networking-files)
-endif
-
-.PHONY: all-networking check-networking install-networking clean-networking
diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c
deleted file mode 100644
index 6d5be58..0000000
--- a/networking/aarch64/chksum_simd.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * AArch64-specific checksum implementation using NEON
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "networking.h"
-#include "../chksum_common.h"
-
-#ifndef __ARM_NEON
-#pragma GCC target("+simd")
-#endif
-
-#include <arm_neon.h>
-
-always_inline
-static inline uint64_t
-slurp_head64(const void **pptr, uint32_t *nbytes)
-{
- Assert(*nbytes >= 8);
- uint64_t sum = 0;
- uint32_t off = (uintptr_t) *pptr % 8;
- if (likely(off != 0))
- {
- /* Get rid of bytes 0..off-1 */
- const unsigned char *ptr64 = align_ptr(*pptr, 8);
- uint64_t mask = ALL_ONES << (CHAR_BIT * off);
- uint64_t val = load64(ptr64) & mask;
- /* Fold 64-bit sum to 33 bits */
- sum = val >> 32;
- sum += (uint32_t) val;
- *pptr = ptr64 + 8;
- *nbytes -= 8 - off;
- }
- return sum;
-}
-
-always_inline
-static inline uint64_t
-slurp_tail64(uint64_t sum, const void *ptr, uint32_t nbytes)
-{
- Assert(nbytes < 8);
- if (likely(nbytes != 0))
- {
- /* Get rid of bytes 7..nbytes */
- uint64_t mask = ALL_ONES >> (CHAR_BIT * (8 - nbytes));
- Assert(__builtin_popcountl(mask) / CHAR_BIT == nbytes);
- uint64_t val = load64(ptr) & mask;
- sum += val >> 32;
- sum += (uint32_t) val;
- nbytes = 0;
- }
- Assert(nbytes == 0);
- return sum;
-}
-
-unsigned short
-__chksum_aarch64_simd(const void *ptr, unsigned int nbytes)
-{
- bool swap = (uintptr_t) ptr & 1;
- uint64_t sum;
-
- if (unlikely(nbytes < 50))
- {
- sum = slurp_small(ptr, nbytes);
- swap = false;
- goto fold;
- }
-
- /* 8-byte align pointer */
- Assert(nbytes >= 8);
- sum = slurp_head64(&ptr, &nbytes);
- Assert(((uintptr_t) ptr & 7) == 0);
-
- const uint32_t *may_alias ptr32 = ptr;
-
- uint64x2_t vsum0 = { 0, 0 };
- uint64x2_t vsum1 = { 0, 0 };
- uint64x2_t vsum2 = { 0, 0 };
- uint64x2_t vsum3 = { 0, 0 };
-
- /* Sum groups of 64 bytes */
- for (uint32_t i = 0; i < nbytes / 64; i++)
- {
- uint32x4_t vtmp0 = vld1q_u32(ptr32);
- uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
- uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
- uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
- vsum0 = vpadalq_u32(vsum0, vtmp0);
- vsum1 = vpadalq_u32(vsum1, vtmp1);
- vsum2 = vpadalq_u32(vsum2, vtmp2);
- vsum3 = vpadalq_u32(vsum3, vtmp3);
- ptr32 += 16;
- }
- nbytes %= 64;
-
- /* Fold vsum2 and vsum3 into vsum0 and vsum1 */
- vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
- vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
-
- /* Add any trailing group of 32 bytes */
- if (nbytes & 32)
- {
- uint32x4_t vtmp0 = vld1q_u32(ptr32);
- uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
- vsum0 = vpadalq_u32(vsum0, vtmp0);
- vsum1 = vpadalq_u32(vsum1, vtmp1);
- ptr32 += 8;
- nbytes -= 32;
- }
- Assert(nbytes < 32);
-
- /* Fold vsum1 into vsum0 */
- vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
-
- /* Add any trailing group of 16 bytes */
- if (nbytes & 16)
- {
- uint32x4_t vtmp = vld1q_u32(ptr32);
- vsum0 = vpadalq_u32(vsum0, vtmp);
- ptr32 += 4;
- nbytes -= 16;
- }
- Assert(nbytes < 16);
-
- /* Add any trailing group of 8 bytes */
- if (nbytes & 8)
- {
- uint32x2_t vtmp = vld1_u32(ptr32);
- vsum0 = vaddw_u32(vsum0, vtmp);
- ptr32 += 2;
- nbytes -= 8;
- }
- Assert(nbytes < 8);
-
- uint64_t val = vaddlvq_u32(vreinterpretq_u32_u64(vsum0));
- sum += val >> 32;
- sum += (uint32_t) val;
-
- /* Handle any trailing 0..7 bytes */
- sum = slurp_tail64(sum, ptr32, nbytes);
-
-fold:
- return fold_and_swap(sum, swap);
-}
diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c
deleted file mode 100644
index 7f69adf..0000000
--- a/networking/arm/chksum_simd.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Armv7-A specific checksum implementation using NEON
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "networking.h"
-#include "../chksum_common.h"
-
-#ifndef __ARM_NEON
-#pragma GCC target("+simd")
-#endif
-
-#include <arm_neon.h>
-
-unsigned short
-__chksum_arm_simd(const void *ptr, unsigned int nbytes)
-{
- bool swap = (uintptr_t) ptr & 1;
- uint64x1_t vsum = { 0 };
-
- if (unlikely(nbytes < 40))
- {
- uint64_t sum = slurp_small(ptr, nbytes);
- return fold_and_swap(sum, false);
- }
-
- /* 8-byte align pointer */
- /* Inline slurp_head-like code since we use NEON here */
- Assert(nbytes >= 8);
- uint32_t off = (uintptr_t) ptr & 7;
- if (likely(off != 0))
- {
- const uint64_t *may_alias ptr64 = align_ptr(ptr, 8);
- uint64x1_t vword64 = vld1_u64(ptr64);
- /* Get rid of bytes 0..off-1 */
- uint64x1_t vmask = vdup_n_u64(ALL_ONES);
- int64x1_t vshiftl = vdup_n_s64(CHAR_BIT * off);
- vmask = vshl_u64(vmask, vshiftl);
- vword64 = vand_u64(vword64, vmask);
- uint32x2_t vtmp = vreinterpret_u32_u64(vword64);
- /* Set accumulator */
- vsum = vpaddl_u32(vtmp);
- /* Update pointer and remaining size */
- ptr = (char *) ptr64 + 8;
- nbytes -= 8 - off;
- }
- Assert(((uintptr_t) ptr & 7) == 0);
-
- /* Sum groups of 64 bytes */
- uint64x2_t vsum0 = { 0, 0 };
- uint64x2_t vsum1 = { 0, 0 };
- uint64x2_t vsum2 = { 0, 0 };
- uint64x2_t vsum3 = { 0, 0 };
- const uint32_t *may_alias ptr32 = ptr;
- for (uint32_t i = 0; i < nbytes / 64; i++)
- {
- uint32x4_t vtmp0 = vld1q_u32(ptr32);
- uint32x4_t vtmp1 = vld1q_u32(ptr32 + 4);
- uint32x4_t vtmp2 = vld1q_u32(ptr32 + 8);
- uint32x4_t vtmp3 = vld1q_u32(ptr32 + 12);
- vsum0 = vpadalq_u32(vsum0, vtmp0);
- vsum1 = vpadalq_u32(vsum1, vtmp1);
- vsum2 = vpadalq_u32(vsum2, vtmp2);
- vsum3 = vpadalq_u32(vsum3, vtmp3);
- ptr32 += 16;
- }
- nbytes %= 64;
-
- /* Fold vsum1/vsum2/vsum3 into vsum0 */
- vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum2));
- vsum1 = vpadalq_u32(vsum1, vreinterpretq_u32_u64(vsum3));
- vsum0 = vpadalq_u32(vsum0, vreinterpretq_u32_u64(vsum1));
-
- /* Add any trailing 16-byte groups */
- while (likely(nbytes >= 16))
- {
- uint32x4_t vtmp0 = vld1q_u32(ptr32);
- vsum0 = vpadalq_u32(vsum0, vtmp0);
- ptr32 += 4;
- nbytes -= 16;
- }
- Assert(nbytes < 16);
-
- /* Fold vsum0 into vsum */
- {
- /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
- vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
- /* 4xu32 (2x(1b+32b)) -> 2xu64 (2x(0b+32b)) */
- vsum0 = vpaddlq_u32(vreinterpretq_u32_u64(vsum0));
- /* 4xu32 (4x32b) -> 2xu64 (2x33b) */
- Assert((vgetq_lane_u64(vsum0, 0) >> 32) == 0);
- Assert((vgetq_lane_u64(vsum0, 1) >> 32) == 0);
- uint32x2_t vtmp = vmovn_u64(vsum0);
- /* Add to accumulator */
- vsum = vpadal_u32(vsum, vtmp);
- }
-
- /* Add any trailing group of 8 bytes */
- if (nbytes & 8)
- {
- uint32x2_t vtmp = vld1_u32(ptr32);
- /* Add to accumulator */
- vsum = vpadal_u32(vsum, vtmp);
- ptr32 += 2;
- nbytes -= 8;
- }
- Assert(nbytes < 8);
-
- /* Handle any trailing 1..7 bytes */
- if (likely(nbytes != 0))
- {
- Assert(((uintptr_t) ptr32 & 7) == 0);
- Assert(nbytes < 8);
- uint64x1_t vword64 = vld1_u64((const uint64_t *) ptr32);
- /* Get rid of bytes nbytes..7 */
- uint64x1_t vmask = vdup_n_u64(ALL_ONES);
- int64x1_t vshiftr = vdup_n_s64(-CHAR_BIT * (8 - nbytes));
- vmask = vshl_u64(vmask, vshiftr); /* Shift right */
- vword64 = vand_u64(vword64, vmask);
- /* Fold 64-bit sum to 33 bits */
- vword64 = vpaddl_u32(vreinterpret_u32_u64(vword64));
- /* Add to accumulator */
- vsum = vpadal_u32(vsum, vreinterpret_u32_u64(vword64));
- }
-
- /* Fold 64-bit vsum to 32 bits */
- vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
- vsum = vpaddl_u32(vreinterpret_u32_u64(vsum));
- Assert(vget_lane_u32(vreinterpret_u32_u64(vsum), 1) == 0);
-
- /* Fold 32-bit vsum to 16 bits */
- uint32x2_t vsum32 = vreinterpret_u32_u64(vsum);
- vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
- vsum32 = vpaddl_u16(vreinterpret_u16_u32(vsum32));
- Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 1) == 0);
- Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 2) == 0);
- Assert(vget_lane_u16(vreinterpret_u16_u32(vsum32), 3) == 0);
-
- /* Convert to 16-bit scalar */
- uint16_t sum = vget_lane_u16(vreinterpret_u16_u32(vsum32), 0);
-
- if (unlikely(swap)) /* Odd base pointer is unexpected */
- {
- sum = bswap16(sum);
- }
- return sum;
-}
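
Note the trailing-byte handling above: NEON vshl_u64 shifts right when the signed shift count is negative, which builds ALL_ONES >> (CHAR_BIT * (8 - nbytes)) without a separate right-shift intrinsic. A scalar sketch of the resulting mask (little-endian assumed, nbytes in 1..7):

#include <stdint.h>

static uint64_t keep_low_bytes(uint64_t word, unsigned nbytes)
{
    /* e.g. nbytes == 3 gives mask 0x0000000000ffffff */
    uint64_t mask = ~UINT64_C(0) >> (8 * (8 - nbytes));
    return word & mask;
}
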
diff --git a/networking/chksum.c b/networking/chksum.c
deleted file mode 100644
index 95ce5ba..0000000
--- a/networking/chksum.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
- * This sum is often used as a simple checksum in networking.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "networking.h"
-#include "chksum_common.h"
-
-always_inline
-static inline uint32_t
-slurp_head32(const void **pptr, uint32_t *nbytes)
-{
- uint32_t sum = 0;
- Assert(*nbytes >= 4);
- uint32_t off = (uintptr_t) *pptr % 4;
- if (likely(off != 0))
- {
- /* Get rid of bytes 0..off-1 */
- const unsigned char *ptr32 = align_ptr(*pptr, 4);
- uint32_t mask = ~0U << (CHAR_BIT * off);
- sum = load32(ptr32) & mask;
- *pptr = ptr32 + 4;
- *nbytes -= 4 - off;
- }
- return sum;
-}
-
-/* Additional loop unrolling would help when not auto-vectorizing */
-unsigned short
-__chksum(const void *ptr, unsigned int nbytes)
-{
- bool swap = false;
- uint64_t sum = 0;
-
- if (nbytes > 300)
- {
- /* 4-byte align pointer */
- swap = (uintptr_t) ptr & 1;
- sum = slurp_head32(&ptr, &nbytes);
- }
- /* Else benefit of aligning not worth the overhead */
-
- /* Sum all 16-byte chunks */
- const char *cptr = ptr;
- for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
- {
- uint64_t h0 = load32(cptr + 0);
- uint64_t h1 = load32(cptr + 4);
- uint64_t h2 = load32(cptr + 8);
- uint64_t h3 = load32(cptr + 12);
- sum += h0 + h1 + h2 + h3;
- cptr += 16;
- }
- nbytes %= 16;
- Assert(nbytes < 16);
-
- /* Handle any trailing 4-byte chunks */
- while (nbytes >= 4)
- {
- sum += load32(cptr);
- cptr += 4;
- nbytes -= 4;
- }
- Assert(nbytes < 4);
-
- if (nbytes & 2)
- {
- sum += load16(cptr);
- cptr += 2;
- }
-
- if (nbytes & 1)
- {
- sum += *(uint8_t *)cptr;
- }
-
- return fold_and_swap(sum, swap);
-}
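
In slurp_head32 above, the bytes preceding the buffer sit in the low-order lanes of the word loaded from the aligned-down address (little-endian), so ones shifted left clear exactly those bytes. A minimal sketch of that mask (hypothetical helper; off in 1..3):

#include <limits.h>
#include <stdint.h>

static uint32_t drop_head_bytes(uint32_t word, unsigned off)
{
    return word & (~0U << (CHAR_BIT * off));
}
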
diff --git a/networking/chksum_common.h b/networking/chksum_common.h
deleted file mode 100644
index 958c8cc..0000000
--- a/networking/chksum_common.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Common code for checksum implementations
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef CHKSUM_COMMON_H
-#define CHKSUM_COMMON_H
-
-#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
-#error Only little endian supported
-#endif
-
-#include <limits.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <string.h>
-
-/* Assertions must be explicitly enabled */
-#if WANT_ASSERT
-#undef NDEBUG
-#include <assert.h>
-#define Assert(exp) assert(exp)
-#else
-#define Assert(exp) (void) (exp)
-#endif
-
-#ifdef __GNUC__
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#define may_alias __attribute__((__may_alias__))
-#define always_inline __attribute__((always_inline))
-#ifdef __clang__
-#define no_unroll_loops
-#else
-#define no_unroll_loops __attribute__((optimize("no-unroll-loops")))
-#endif
-#define bswap16(x) __builtin_bswap16((x))
-#else
-#define likely(x) (x)
-#define unlikely(x) (x)
-#define may_alias
-#define always_inline
-#define no_unroll_loops
-#define bswap16(x) ((uint8_t)((x) >> 8) | ((uint8_t)(x) << 8))
-#endif
-
-#define ALL_ONES ~UINT64_C(0)
-
-static inline
-uint64_t load64(const void *ptr)
-{
- /* GCC will optimise this to a normal load instruction */
- uint64_t v;
- memcpy(&v, ptr, sizeof v);
- return v;
-}
-
-static inline
-uint32_t load32(const void *ptr)
-{
- /* GCC will optimise this to a normal load instruction */
- uint32_t v;
- memcpy(&v, ptr, sizeof v);
- return v;
-}
-
-static inline
-uint16_t load16(const void *ptr)
-{
- /* GCC will optimise this to a normal load instruction */
- uint16_t v;
- memcpy(&v, ptr, sizeof v);
- return v;
-}
-
-/* slurp_small() is for small buffers, don't waste cycles on alignment */
-no_unroll_loops
-always_inline
-static inline uint64_t
-slurp_small(const void *ptr, uint32_t nbytes)
-{
- const unsigned char *cptr = ptr;
- uint64_t sum = 0;
- while (nbytes >= 4)
- {
- sum += load32(cptr);
- cptr += 4;
- nbytes -= 4;
- }
- if (nbytes & 2)
- {
- sum += load16(cptr);
- cptr += 2;
- }
- if (nbytes & 1)
- {
- sum += (uint8_t) *cptr;
- }
- return sum;
-}
-
-static inline const void *
-align_ptr(const void *ptr, size_t bytes)
-{
- return (void *) ((uintptr_t) ptr & -(uintptr_t) bytes);
-}
-
-always_inline
-static inline uint16_t
-fold_and_swap(uint64_t sum, bool swap)
-{
- /* Fold 64-bit sum to 32 bits */
- sum = (sum & 0xffffffff) + (sum >> 32);
- sum = (sum & 0xffffffff) + (sum >> 32);
- Assert(sum == (uint32_t) sum);
-
- /* Fold 32-bit sum to 16 bits */
- sum = (sum & 0xffff) + (sum >> 16);
- sum = (sum & 0xffff) + (sum >> 16);
- Assert(sum == (uint16_t) sum);
-
- if (unlikely(swap)) /* Odd base pointer is unexpected */
- {
- sum = bswap16(sum);
- }
-
- return (uint16_t) sum;
-}
-
-#endif
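
fold_and_swap above relies on end-around carry: two folds per width are enough because the first fold can produce at most one further carry. A self-contained check of that arithmetic (a sketch mirroring the folds above):

#include <assert.h>
#include <stdint.h>

static uint16_t fold16(uint64_t sum)
{
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffffffff) + (sum >> 32);
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t) sum;
}

int main(void)
{
    /* 0xffff + 0xffff = 0x1fffe; the end-around carry restores 0xffff */
    assert(fold16(0xffffu + 0xffffu) == 0xffff);
    return 0;
}
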
diff --git a/networking/include/networking.h b/networking/include/networking.h
deleted file mode 100644
index a88feff..0000000
--- a/networking/include/networking.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Public API.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-unsigned short __chksum (const void *, unsigned int);
-#if __aarch64__ && __ARM_NEON
-unsigned short __chksum_aarch64_simd (const void *, unsigned int);
-#endif
-#if __arm__ && __ARM_NEON
-unsigned short __chksum_arm_simd (const void *, unsigned int);
-#endif
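
These routines return the 16-bit ones' complement sum itself; an RFC 1071 Internet checksum is the bitwise complement of that sum. A hypothetical caller-side wrapper (internet_checksum is not part of this API):

#include <stdint.h>

unsigned short __chksum(const void *, unsigned int);

static uint16_t internet_checksum(const void *buf, unsigned int len)
{
    return (uint16_t) ~__chksum(buf, len);
}
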
diff --git a/networking/test/chksum.c b/networking/test/chksum.c
deleted file mode 100644
index 41b9812..0000000
--- a/networking/test/chksum.c
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Ones' complement checksum test & benchmark
- *
- * Copyright (c) 2016-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define _GNU_SOURCE
-#include <inttypes.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <time.h>
-#include <unistd.h>
-#include "../include/networking.h"
-
-#if WANT_ASSERT
-#undef NDEBUG
-#include <assert.h>
-#define Assert(exp) assert(exp)
-#else
-#define Assert(exp) (void) (exp)
-#endif
-
-#ifdef __GNUC__
-#define may_alias __attribute__((__may_alias__))
-#else
-#define may_alias
-#endif
-
-#define CACHE_LINE 64
-#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
-
-/* Reference implementation - do not modify! */
-static uint16_t
-checksum_simple(const void *ptr, uint32_t nbytes)
-{
- const uint16_t *may_alias hptr = ptr;
- uint64_t sum = 0; /* Need 64-bit accumulator when nbytes > 64K */
-
- /* Sum all halfwords, assume misaligned accesses are handled in HW */
- for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
- {
- sum += *hptr++;
- }
-
- /* Add any trailing odd byte */
- if ((nbytes & 0x01) != 0)
- {
- sum += *(uint8_t *) hptr;
- }
-
- /* Fold 64-bit sum to 32 bits */
- sum = (sum & 0xffffffff) + (sum >> 32);
- sum = (sum & 0xffffffff) + (sum >> 32);
- Assert(sum == (uint32_t) sum);
-
- /* Fold 32-bit sum to 16 bits */
- sum = (sum & 0xffff) + (sum >> 16);
- sum = (sum & 0xffff) + (sum >> 16);
- Assert(sum == (uint16_t) sum);
-
- return (uint16_t) sum;
-}
-
-static struct
-{
- uint16_t (*cksum_fp)(const void *, uint32_t);
- const char *name;
-} implementations[] =
-{
- { checksum_simple, "simple"},
- { __chksum, "scalar"},
-#if __arm__
- { __chksum_arm_simd, "simd" },
-#elif __aarch64__
- { __chksum_aarch64_simd, "simd" },
-#endif
- { NULL, NULL}
-};
-
-static int
-find_impl(const char *name)
-{
- for (int i = 0; implementations[i].name != NULL; i++)
- {
- if (strcmp(implementations[i].name, name) == 0)
- {
- return i;
- }
- }
- return -1;
-}
-
-static uint16_t (*CKSUM_FP)(const void *, uint32_t);
-static volatile uint16_t SINK;
-
-static bool
-verify(const void *data, uint32_t offset, uint32_t size)
-{
-
- uint16_t csum_expected = checksum_simple(data, size);
- uint16_t csum_actual = CKSUM_FP(data, size);
- if (csum_actual != csum_expected)
- {
- fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
- "actual %04x expected %04x (valid)",
- offset, size, csum_actual, csum_expected);
- if (size < 65536)
- {
- /* Fatal error */
- exit(EXIT_FAILURE);
- }
- /* Else some implementations only support sizes up to 2^16 */
- return false;
- }
- return true;
-}
-
-static uint64_t
-clock_get_ns(void)
-{
- struct timespec ts;
- clock_gettime(CLOCK_MONOTONIC, &ts);
- return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
-}
-
-static void
-benchmark(const uint8_t *base,
- size_t poolsize,
- uint32_t blksize,
- uint32_t numops,
- uint64_t cpufreq)
-{
- printf("%11u ", (unsigned int) blksize); fflush(stdout);
-
- uint64_t start = clock_get_ns();
- for (uint32_t i = 0; i < numops; i ++)
- {
- /* Read a random value from the pool */
- uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
- /* Generate a random starting address */
- const void *data = &base[random % (poolsize - blksize)];
- SINK = CKSUM_FP(data, blksize);
- }
- uint64_t end = clock_get_ns();
-
-#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
- uint64_t elapsed_ns = end - start;
- uint64_t elapsed_ms = elapsed_ns / 1000000;
- uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
- uint64_t accbytes = (uint64_t) numops * blksize;
- printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
- unsigned int cyc_per_blk = cpufreq / blks_per_s;
- printf("%11u ", cyc_per_blk);
- if (blksize != 0)
- {
- unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
- printf("%7u.%03u ",
- cyc_per_byte / 1000, cyc_per_byte % 1000);
- }
- printf("\n");
-}
-
-int main(int argc, char *argv[])
-{
- int c;
- bool DUMP = false;
- uint32_t IMPL = 0; /* Simple implementation */
- uint64_t CPUFREQ = 0;
- uint32_t BLKSIZE = 0;
- uint32_t NUMOPS = 1000000;
- uint32_t POOLSIZE = 512 * 1024; /* Typical ARM L2 cache size */
-
- setvbuf(stdout, NULL, _IOLBF, 160);
- while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
- {
- switch (c)
- {
- case 'b' :
- {
- int blksize = atoi(optarg);
- if (blksize < 1 || blksize > POOLSIZE / 2)
- {
- fprintf(stderr, "Invalid block size %d\n", blksize);
- exit(EXIT_FAILURE);
- }
- BLKSIZE = (unsigned) blksize;
- break;
- }
- case 'd' :
- DUMP = true;
- break;
- case 'f' :
- {
- int64_t cpufreq = atoll(optarg);
- if (cpufreq < 1)
- {
- fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
- cpufreq);
- exit(EXIT_FAILURE);
- }
- CPUFREQ = cpufreq;
- break;
- }
- case 'i' :
- {
- int impl = find_impl(optarg);
- if (impl < 0)
- {
- fprintf(stderr, "Invalid implementation %s\n", optarg);
- goto usage;
- }
- IMPL = (unsigned) impl;
- break;
- }
- case 'n' :
- {
- int numops = atoi(optarg);
- if (numops < 1)
- {
- fprintf(stderr, "Invalid number of operations %d\n", numops);
- exit(EXIT_FAILURE);
- }
- NUMOPS = (unsigned) numops;
- break;
- }
- case 'p' :
- {
- int poolsize = atoi(optarg);
- if (poolsize < 4096)
- {
- fprintf(stderr, "Invalid pool size %d\n", poolsize);
- exit(EXIT_FAILURE);
- }
- char c = optarg[strlen(optarg) - 1];
- if (c == 'M')
- {
- POOLSIZE = (unsigned) poolsize * 1024 * 1024;
- }
- else if (c == 'K')
- {
- POOLSIZE = (unsigned) poolsize * 1024;
- }
- else
- {
- POOLSIZE = (unsigned) poolsize;
- }
- break;
- }
- default :
-usage :
- fprintf(stderr, "Usage: checksum <options>\n"
- "-b <blksize> Block size\n"
- "-d Dump first 96 bytes of data\n"
- "-f <cpufreq> CPU frequency (Hz)\n"
- "-i <impl> Implementation\n"
- "-n <numops> Number of operations\n"
- "-p <poolsize> Pool size (K or M suffix)\n"
- );
- printf("Implementations:");
- for (int i = 0; implementations[i].name != NULL; i++)
- {
- printf(" %s", implementations[i].name);
- }
- printf("\n");
- exit(EXIT_FAILURE);
- }
- }
- if (optind < argc)
- {
- goto usage;
- }
-
- CKSUM_FP = implementations[IMPL].cksum_fp;
- POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
- uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
- MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
- if (base == MAP_FAILED)
- {
- perror("aligned_alloc"), exit(EXIT_FAILURE);
- }
- for (size_t i = 0; i < POOLSIZE / 4; i++)
- {
- ((uint32_t *) base)[i] = rand();
- }
-
- printf("Implementation: %s\n", implementations[IMPL].name);
- printf("numops %u, poolsize ", NUMOPS);
- if (POOLSIZE % (1024 * 1024) == 0)
- {
- printf("%uMiB", POOLSIZE / (1024 * 1024));
- }
- else if (POOLSIZE % 1024 == 0)
- {
- printf("%uKiB", POOLSIZE / 1024);
- }
- else
- {
- printf("%uB", POOLSIZE);
- }
- printf(", blocksize %u, CPU frequency %juMHz\n",
- BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
-#if WANT_ASSERT
- printf("Warning: assertions are enabled\n");
-#endif
-
- if (DUMP)
- {
- /* Print out first 96 bytes of data for human debugging */
- for (int i = 0; i < 96; i++)
- {
- if (i % 8 == 0)
- printf("%2u:", i);
- printf(" %02x", base[i]);
- if (i % 8 == 7)
- printf("\n");
- }
- }
-
- /* Verify that chosen algorithm handles all combinations of offsets and sizes */
- printf("Verifying..."); fflush(stdout);
- bool success = true;
- /* Check all (relevant) combinations of size and offset */
- for (int size = 0; size <= 256; size++)
- {
- for (int offset = 0; offset < 255; offset++)
- {
- /* Check at start of mapped memory */
- success &= verify(&base[offset], offset, size);
- /* Check at end of mapped memory */
- uint8_t *p = base + POOLSIZE - (size + offset);
- success &= verify(p, (uintptr_t) p % 64, size);
- }
- }
- /* Check increasingly larger sizes */
- for (size_t size = 1; size < POOLSIZE; size *= 2)
- {
- success &= verify(base, 0, size);
- }
- /* Check the full size, this can detect accumulator overflows */
- success &= verify(base, 0, POOLSIZE);
- printf("%s\n", success ? "OK" : "failure");
-
- /* Print throughput in decimal megabyte (1000000B) per second */
- if (CPUFREQ != 0)
- {
- printf("%11s %11s %11s %11s\n",
- "block size", "MB/s", "cycles/blk", "cycles/byte");
- }
- else
- {
- printf("%11s %11s %11s %11s\n",
- "block size", "MB/s", "ns/blk", "ns/byte");
- CPUFREQ = 1000000000;
- }
- if (BLKSIZE != 0)
- {
- benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
- }
- else
- {
- static const uint16_t sizes[] =
- { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
- for (int i = 0; sizes[i] != 0; i++)
- {
- uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
- benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
- }
- }
-
- if (munmap(base, POOLSIZE) != 0)
- {
- perror("munmap"), exit(EXIT_FAILURE);
- }
-
- return success ? EXIT_SUCCESS : EXIT_FAILURE;
-}
diff --git a/run-arm-optimized-routines-tests-on-android.sh b/run-arm-optimized-routines-tests-on-android.sh
index 21163a3..61efeaf 100755
--- a/run-arm-optimized-routines-tests-on-android.sh
+++ b/run-arm-optimized-routines-tests-on-android.sh
@@ -25,20 +25,16 @@ check_failure() {
}
# Run the 32-bit tests.
-if [ -e "$ANDROID_PRODUCT_OUT/data/nativetest/mathtest/mathtest" ]; then
- adb shell /data/nativetest/mathtest/mathtest /data/nativetest/mathtest/math/test/testcases/directed/*
- check_failure
-fi
+adb shell /data/nativetest/mathtest/mathtest /data/nativetest/mathtest/math/test/testcases/directed/*
+check_failure
# TODO: these tests are currently a bloodbath.
#adb shell 'cp /data/nativetest/ulp/math/test/runulp.sh /data/nativetest/ulp/ && sh /data/nativetest/ulp/runulp.sh'
#check_failure
# Run the 64-bit tests.
-if [ -e "$ANDROID_PRODUCT_OUT/data/nativetest64/mathtest/mathtest" ]; then
- adb shell /data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/math/test/testcases/directed/*
- check_failure
-fi
+adb shell /data/nativetest64/mathtest/mathtest /data/nativetest64/mathtest/math/test/testcases/directed/*
+check_failure
# TODO: these tests are currently a bloodbath.
#adb shell 'cp /data/nativetest64/ulp/math/test/runulp.sh /data/nativetest64/ulp/ && sh /data/nativetest64/ulp/runulp.sh'
diff --git a/string/Dir.mk b/string/Dir.mk
index cf3453f..470917a 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -1,20 +1,13 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2021, Arm Limited.
+# Copyright (c) 2019, Arm Limited.
# SPDX-License-Identifier: MIT
S := $(srcdir)/string
B := build/string
-ifeq ($(ARCH),)
-all-string bench-string check-string install-string clean-string:
- @echo "*** Please set ARCH in config.mk. ***"
- @exit 1
-else
-
-string-lib-srcs := $(wildcard $(S)/$(ARCH)/*.[cS])
+string-lib-srcs := $(wildcard $(S)/*.[cS])
string-test-srcs := $(wildcard $(S)/test/*.c)
-string-bench-srcs := $(wildcard $(S)/bench/*.c)
string-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
@@ -22,17 +15,13 @@ string-libs := \
build/lib/libstringlib.so \
build/lib/libstringlib.a \
-string-tests := \
+string-tools := \
build/bin/test/memcpy \
build/bin/test/memmove \
build/bin/test/memset \
build/bin/test/memchr \
- build/bin/test/memrchr \
build/bin/test/memcmp \
- build/bin/test/__mtag_tag_region \
- build/bin/test/__mtag_tag_zero_region \
build/bin/test/strcpy \
- build/bin/test/stpcpy \
build/bin/test/strcmp \
build/bin/test/strchr \
build/bin/test/strrchr \
@@ -41,34 +30,25 @@ string-tests := \
build/bin/test/strnlen \
build/bin/test/strncmp
-string-benches := \
- build/bin/bench/memcpy \
- build/bin/bench/strlen
-
string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
-string-bench-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-bench-srcs)))
string-objs := \
$(string-lib-objs) \
$(string-lib-objs:%.o=%.os) \
$(string-test-objs) \
- $(string-bench-objs)
string-files := \
$(string-objs) \
$(string-libs) \
- $(string-tests) \
- $(string-benches) \
+ $(string-tools) \
$(string-includes) \
-all-string: $(string-libs) $(string-tests) $(string-benches) $(string-includes)
+all-string: $(string-libs) $(string-tools) $(string-includes)
$(string-objs): $(string-includes)
$(string-objs): CFLAGS_ALL += $(string-cflags)
-$(string-test-objs): CFLAGS_ALL += -D_GNU_SOURCE
-
build/lib/libstringlib.so: $(string-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -shared -o $@ $^
@@ -80,27 +60,26 @@ build/lib/libstringlib.a: $(string-lib-objs)
build/bin/test/%: $(B)/test/%.o build/lib/libstringlib.a
$(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
-build/bin/bench/%: $(B)/bench/%.o build/lib/libstringlib.a
- $(CC) $(CFLAGS_ALL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS)
-
build/include/%.h: $(S)/include/%.h
cp $< $@
build/bin/%.sh: $(S)/test/%.sh
cp $< $@
-string-tests-out = $(string-tests:build/bin/test/%=build/string/test/%.out)
-
-build/string/test/%.out: build/bin/test/%
- $(EMULATOR) $^ | tee $@.tmp
- mv $@.tmp $@
-
-check-string: $(string-tests-out)
- ! grep FAIL $^
-
-bench-string: $(string-benches)
- $(EMULATOR) build/bin/bench/strlen
- $(EMULATOR) build/bin/bench/memcpy
+check-string: $(string-tools)
+ $(EMULATOR) build/bin/test/memcpy
+ $(EMULATOR) build/bin/test/memmove
+ $(EMULATOR) build/bin/test/memset
+ $(EMULATOR) build/bin/test/memchr
+ $(EMULATOR) build/bin/test/memcmp
+ $(EMULATOR) build/bin/test/strcpy
+ $(EMULATOR) build/bin/test/strcmp
+ $(EMULATOR) build/bin/test/strchr
+ $(EMULATOR) build/bin/test/strrchr
+ $(EMULATOR) build/bin/test/strchrnul
+ $(EMULATOR) build/bin/test/strlen
+ $(EMULATOR) build/bin/test/strnlen
+ $(EMULATOR) build/bin/test/strncmp
install-string: \
$(string-libs:build/lib/%=$(DESTDIR)$(libdir)/%) \
@@ -108,6 +87,5 @@ install-string: \
clean-string:
rm -f $(string-files)
-endif
-.PHONY: all-string bench-string check-string install-string clean-string
+.PHONY: all-string check-string install-string clean-string
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
deleted file mode 100644
index 84339f7..0000000
--- a/string/aarch64/__mtag_tag_region.S
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * __mtag_tag_region - tag memory
- *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, MTE, LP64 ABI.
- *
- * Interface contract:
- * Address is 16-byte aligned and size is a multiple of 16.
- * Returns the passed pointer.
- * The memory region may remain untagged if tagging is not enabled.
- */
-
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_MEMORY_TAGGING
-
-#define dstin x0
-#define count x1
-#define dst x2
-#define dstend x3
-#define tmp x4
-#define zva_val x4
-
-ENTRY (__mtag_tag_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
- add dstend, dstin, count
-
- cmp count, 96
- b.hi L(set_long)
-
- tbnz count, 6, L(set96)
-
- /* Set 0, 16, 32, or 48 bytes. */
- lsr tmp, count, 5
- add tmp, dstin, tmp, lsl 4
- cbz count, L(end)
- stg dstin, [dstin]
- stg dstin, [tmp]
- stg dstin, [dstend, -16]
-L(end):
- ret
-
- .p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- st2g dstin, [dstin]
- st2g dstin, [dstin, 32]
- st2g dstin, [dstend, -32]
- ret
-
- .p2align 4
- /* Size is > 96 bytes. */
-L(set_long):
- cmp count, 160
- b.lo L(no_zva)
-
-#ifndef SKIP_ZVA_CHECK
- mrs zva_val, dczid_el0
- and zva_val, zva_val, 31
- cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(no_zva)
-#endif
- st2g dstin, [dstin]
- st2g dstin, [dstin, 32]
- bic dst, dstin, 63
- sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
-
- .p2align 4
-L(zva_loop):
- add dst, dst, 64
- dc gva, dst
- subs count, count, 64
- b.hi L(zva_loop)
- st2g dstin, [dstend, -64]
- st2g dstin, [dstend, -32]
- ret
-
-L(no_zva):
- sub dst, dstin, 32 /* Dst is biased by -32. */
- sub count, count, 64 /* Adjust count for loop. */
-L(no_zva_loop):
- st2g dstin, [dst, 32]
- st2g dstin, [dst, 64]!
- subs count, count, 64
- b.hi L(no_zva_loop)
- st2g dstin, [dstend, -64]
- st2g dstin, [dstend, -32]
- ret
-
-END (__mtag_tag_region)
-#endif
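
The "Set 0, 16, 32, or 48 bytes" sequence above is branchless: three (possibly overlapping) 16-byte tag stores at offsets 0, 16*(count/32) and count-16 cover every granule for count in {16, 32, 48}. A C model, with tag16() as a hypothetical stand-in for the STG instruction:

extern void tag16(unsigned char *granule);  /* hypothetical STG wrapper */

static void tag_upto48(unsigned char *p, unsigned count)
{
    if (count == 0)
        return;
    tag16(p);                       /* count 16: all three hit offset 0 */
    tag16(p + 16 * (count / 32));   /* count 32: offsets 0, 16, 16 */
    tag16(p + count - 16);          /* count 48: offsets 0, 16, 32 */
}
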
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
deleted file mode 100644
index f58364c..0000000
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * __mtag_tag_zero_region - tag memory and fill it with zero bytes
- *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, MTE, LP64 ABI.
- *
- * Interface contract:
- * Address is 16-byte aligned and size is a multiple of 16.
- * Returns the passed pointer.
- * The memory region may remain untagged if tagging is not enabled.
- */
-
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_MEMORY_TAGGING
-
-#define dstin x0
-#define count x1
-#define dst x2
-#define dstend x3
-#define tmp x4
-#define zva_val x4
-
-ENTRY (__mtag_tag_zero_region)
- PTR_ARG (0)
- SIZE_ARG (1)
-
- add dstend, dstin, count
-
- cmp count, 96
- b.hi L(set_long)
-
- tbnz count, 6, L(set96)
-
- /* Set 0, 16, 32, or 48 bytes. */
- lsr tmp, count, 5
- add tmp, dstin, tmp, lsl 4
- cbz count, L(end)
- stzg dstin, [dstin]
- stzg dstin, [tmp]
- stzg dstin, [dstend, -16]
-L(end):
- ret
-
- .p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- stz2g dstin, [dstin]
- stz2g dstin, [dstin, 32]
- stz2g dstin, [dstend, -32]
- ret
-
- .p2align 4
- /* Size is > 96 bytes. */
-L(set_long):
- cmp count, 160
- b.lo L(no_zva)
-
-#ifndef SKIP_ZVA_CHECK
- mrs zva_val, dczid_el0
- and zva_val, zva_val, 31
- cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(no_zva)
-#endif
- stz2g dstin, [dstin]
- stz2g dstin, [dstin, 32]
- bic dst, dstin, 63
- sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
-
- .p2align 4
-L(zva_loop):
- add dst, dst, 64
- dc gzva, dst
- subs count, count, 64
- b.hi L(zva_loop)
- stz2g dstin, [dstend, -64]
- stz2g dstin, [dstend, -32]
- ret
-
-L(no_zva):
- sub dst, dstin, 32 /* Dst is biased by -32. */
- sub count, count, 64 /* Adjust count for loop. */
-L(no_zva_loop):
- stz2g dstin, [dst, 32]
- stz2g dstin, [dst, 64]!
- subs count, count, 64
- b.hi L(no_zva_loop)
- stz2g dstin, [dstend, -64]
- stz2g dstin, [dstend, -32]
- ret
-
-END (__mtag_tag_zero_region)
-#endif
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
deleted file mode 100644
index 5a54242..0000000
--- a/string/aarch64/check-arch.S
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * check ARCH setting.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if !__aarch64__
-# error ARCH setting does not match the compiler.
-#endif
-
-/* Include for GNU property notes. */
-#include "../asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
deleted file mode 100644
index c2e967d..0000000
--- a/string/aarch64/memchr-mte.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * memchr - find a character in a memory zone
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define srcin x0
-#define chrin w1
-#define cntin x2
-#define result x0
-
-#define src x3
-#define cntrem x4
-#define synd x5
-#define shift x6
-#define tmp x7
-#define wtmp w7
-
-#define vrepchr v0
-#define qdata q1
-#define vdata v1
-#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
-
-/*
- Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
- odd bytes so that adjacent bytes can be merged. Since the bits in the
- syndrome reflect the order in which things occur in the original string,
- counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (__memchr_aarch64_mte)
- PTR_ARG (0)
- SIZE_ARG (2)
- bic src, srcin, 15
- cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src]
- dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- lsr synd, synd, shift
- cbz synd, L(start_loop)
-
- rbit synd, synd
- clz synd, synd
- add result, srcin, synd, lsr 2
- cmp cntin, synd, lsr 2
- csel result, result, xzr, hi
- ret
-
-L(start_loop):
- sub tmp, src, srcin
- add tmp, tmp, 16
- subs cntrem, cntin, tmp
- b.ls L(nomatch)
-
- /* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
- .p2align 4
-L(loop32):
- ldr qdata, [src, 16]!
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- cbnz synd, L(end)
-
-L(loop32_2):
- ldr qdata, [src, 16]!
- subs cntrem, cntrem, 32
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.ls L(end)
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- cbz synd, L(loop32)
-L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- add tmp, srcin, cntin
- sub cntrem, tmp, src
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- clz synd, synd
- cmp cntrem, synd, lsr 2
- add result, src, synd, lsr 2
- csel result, result, xzr, hi
- ret
-
-L(nomatch):
- mov result, 0
- ret
-
-END (__memchr_aarch64_mte)
-
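
A scalar model of the deleted routine's syndrome (a sketch, little-endian assumed; the real code builds it with cmeq, the 0xf00f mask and a pairwise add): one nibble per byte means ctz(synd)/4 is the index of the first matching byte in a 16-byte chunk.

#include <stdint.h>

static int first_match16(const unsigned char *chunk, unsigned char c)
{
    uint64_t synd = 0;
    for (int i = 0; i < 16; i++)
        if (chunk[i] == c)
            synd |= UINT64_C(0xf) << (4 * i);
    return synd ? __builtin_ctzll(synd) / 4 : -1;   /* -1: no match */
}
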
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index c22e659..0d75acd 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,27 +1,28 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__memchr_aarch64_sve)
- PTR_ARG (0)
- SIZE_ARG (2)
+ .arch armv8-a+sve
+ .text
+
+ .globl __memchr_aarch64_sve
+ .type __memchr_aarch64_sve, %function
+ .p2align 4
+__memchr_aarch64_sve:
dup z1.b, w1 /* duplicate c to a vector */
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
+ nop
- .p2align 4
0: whilelo p1.b, x3, x2 /* make sure off < max */
b.none 9f
@@ -58,7 +59,4 @@ ENTRY (__memchr_aarch64_sve)
9: mov x0, 0 /* return null */
ret
-END (__memchr_aarch64_sve)
-
-#endif
-
+ .size __memchr_aarch64_sve, . - __memchr_aarch64_sve
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 353f0d1..10be49e 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,7 +1,7 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -47,8 +47,6 @@
*/
ENTRY (__memchr_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
@@ -112,7 +110,7 @@ L(end):
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hs L(tail)
+ b.hi L(tail)
L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
@@ -143,4 +141,3 @@ L(zero_length):
ret
END (__memchr_aarch64)
-
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index 78c5eca..d4f6026 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,23 +1,23 @@
/*
* memcmp - compare memory
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__memcmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
+ .arch armv8-a+sve
+ .text
+
+ .globl __memcmp_aarch64_sve
+ .type __memcmp_aarch64_sve, %function
+ .p2align 4
+__memcmp_aarch64_sve:
mov x3, 0 /* initialize off */
0: whilelo p0.b, x3, x2 /* while off < max */
@@ -45,7 +45,4 @@ ENTRY (__memcmp_aarch64_sve)
9: mov x0, 0 /* return equality */
ret
-END (__memcmp_aarch64_sve)
-
-#endif
-
+ .size __memcmp_aarch64_sve, . - __memcmp_aarch64_sve
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 3b10266..6722516 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,6 +1,6 @@
/* memcmp - compare memory
*
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -28,9 +28,6 @@
#define tmp2 x8
ENTRY (__memcmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
subs limit, limit, 8
b.lo L(less8)
@@ -134,4 +131,3 @@ L(byte_loop):
ret
END (__memcmp_aarch64)
-
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
deleted file mode 100644
index f97f2c3..0000000
--- a/string/aarch64/memcpy-advsimd.S
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * memcpy - copy memory area
- *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#include "../asmdefs.h"
-
-#define dstin x0
-#define src x1
-#define count x2
-#define dst x3
-#define srcend x4
-#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define B_l x8
-#define B_lw w8
-#define B_h x9
-#define C_lw w10
-#define tmp1 x14
-
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-
-/* This implementation handles overlaps and supports both memcpy and memmove
- from a single entry point. It uses unaligned accesses and branchless
- sequences to keep the code small, simple and improve performance.
-
- Copies are split into 3 main cases: small copies of up to 32 bytes, medium
- copies of up to 128 bytes, and large copies. The overhead of the overlap
- check is negligible since it is only required for large copies.
-
- Large copies use a software pipelined loop processing 64 bytes per iteration.
- The source pointer is 16-byte aligned to minimize unaligned accesses.
- The loop tail is handled by always copying 64 bytes from the end.
-*/
-
-ENTRY_ALIAS (__memmove_aarch64_simd)
-ENTRY (__memcpy_aarch64_simd)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 128
- b.hi L(copy_long)
- cmp count, 32
- b.hi L(copy32_128)
-
- /* Small copies: 0..32 bytes. */
- cmp count, 16
- b.lo L(copy16)
- ldr A_q, [src]
- ldr B_q, [srcend, -16]
- str A_q, [dstin]
- str B_q, [dstend, -16]
- ret
-
- /* Copy 8-15 bytes. */
-L(copy16):
- tbz count, 3, L(copy8)
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
-
- .p2align 3
- /* Copy 4-7 bytes. */
-L(copy8):
- tbz count, 2, L(copy4)
- ldr A_lw, [src]
- ldr B_lw, [srcend, -4]
- str A_lw, [dstin]
- str B_lw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb C_lw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb C_lw, [dstend, -1]
-L(copy0):
- ret
-
- .p2align 4
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
- cmp count, 64
- b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
- ret
-
- .p2align 4
- /* Copy 65..128 bytes. */
-L(copy128):
- ldp E_q, F_q, [src, 32]
- cmp count, 96
- b.ls L(copy96)
- ldp G_q, H_q, [srcend, -64]
- stp G_q, H_q, [dstend, -64]
-L(copy96):
- stp A_q, B_q, [dstin]
- stp E_q, F_q, [dstin, 32]
- stp C_q, D_q, [dstend, -32]
- ret
-
- /* Copy more than 128 bytes. */
-L(copy_long):
- /* Use backwards copy if there is an overlap. */
- sub tmp1, dstin, src
- cmp tmp1, count
- b.lo L(copy_long_backwards)
-
- /* Copy 16 bytes and then align src to 16-byte alignment. */
- ldr D_q, [src]
- and tmp1, src, 15
- bic src, src, 15
- sub dst, dstin, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_q, B_q, [src, 16]
- str D_q, [dstin]
- ldp C_q, D_q, [src, 48]
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(copy64_from_end)
-L(loop64):
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [src, 80]
- stp C_q, D_q, [dst, 48]
- ldp C_q, D_q, [src, 112]
- add src, src, 64
- add dst, dst, 64
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
- ldp E_q, F_q, [srcend, -64]
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dst, 48]
- stp E_q, F_q, [dstend, -64]
- stp A_q, B_q, [dstend, -32]
- ret
-
- /* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align srcend to 16-byte alignment. */
-L(copy_long_backwards):
- cbz tmp1, L(copy0)
- ldr D_q, [srcend, -16]
- and tmp1, srcend, 15
- bic srcend, srcend, 15
- sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls L(copy64_from_start)
-
-L(loop64_backwards):
- str B_q, [dstend, -16]
- str A_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -96]
- str D_q, [dstend, -48]
- str C_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -128]
- sub srcend, srcend, 64
- subs count, count, 64
- b.hi L(loop64_backwards)
-
- /* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
- ldp E_q, F_q, [src, 32]
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [src]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp A_q, B_q, [dstin]
- ret
-
-END (__memcpy_aarch64_simd)
-
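
The overlap check in L(copy_long) above is a single unsigned compare: dst - src computed modulo 2^64 is below count exactly when a forward copy would overwrite source bytes not yet read; when dst precedes src the subtraction wraps to a huge value and the test fails. A C sketch:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool must_copy_backwards(const void *dst, const void *src,
                                size_t count)
{
    /* dst == src also reports true; a backward copy of identical
       ranges is harmless (the new memcpy.S returns early via cbz) */
    return (uintptr_t) dst - (uintptr_t) src < count;
}
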
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index dd254f6..1aad88e 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,7 +1,7 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2012-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -22,11 +22,11 @@
#define A_l x6
#define A_lw w6
#define A_h x7
+#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
-#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
@@ -40,117 +40,119 @@
#define H_h srcend
#define tmp1 x14
-/* This implementation handles overlaps and supports both memcpy and memmove
- from a single entry point. It uses unaligned accesses and branchless
- sequences to keep the code small, simple and improve performance.
+/* This implementation of memcpy correctly handles overlaps, therefore
+ __memmove_aarch64 aliases to __memcpy_aarch64. By moving the src and
+ dst buffer overlap check from the start of memmove code to the
+ beginning of large copy code, the overhead of combining memcpy
+ and memmove implementations is negligible.
- Copies are split into 3 main cases: small copies of up to 32 bytes, medium
- copies of up to 128 bytes, and large copies. The overhead of the overlap
- check is negligible since it is only required for large copies.
+ Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..128 bytes which are fully unrolled, and large
+ copies (moves).
- Large copies use a software pipelined loop processing 64 bytes per iteration.
- The destination pointer is 16-byte aligned to minimize unaligned accesses.
- The loop tail is handled by always copying 64 bytes from the end.
+ Large forward moves align the destination and use an unrolled loop
+ processing 64 bytes per iteration.
+
+ Large backward moves align dstend and use an unrolled loop processing
+ 64 bytes per iteration.
*/
-ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
+ENTRY_ALIAS (__memmove_aarch64)
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 128
- b.hi L(copy_long)
- cmp count, 32
- b.hi L(copy32_128)
+ b.hi L(move_long)
- /* Small copies: 0..32 bytes. */
- cmp count, 16
- b.lo L(copy16)
+ /* Medium copies: 17..128 bytes. */
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
+ cmp count, 32
+ b.hi L(copy33_128)
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
- /* Copy 8-15 bytes. */
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, L(copy8)
+ /* 8-15 bytes. */
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 3
- /* Copy 4-7 bytes. */
-L(copy8):
- tbz count, 2, L(copy4)
+ .p2align 4
+1:
+ /* 4-7 bytes. */
+ tbz count, 2, 1f
ldr A_lw, [src]
- ldr B_lw, [srcend, -4]
+ ldr A_hw, [srcend, -4]
str A_lw, [dstin]
- str B_lw, [dstend, -4]
+ str A_hw, [dstend, -4]
ret
- /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
+ .p2align 4
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb C_lw, [srcend, -1]
+ ldrb A_hw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb C_lw, [dstend, -1]
-L(copy0):
- ret
+ strb A_hw, [dstend, -1]
+2: ret
.p2align 4
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- ldp A_l, A_h, [src]
+ /* Copy 33..128 bytes. */
+L(copy33_128):
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
- ldp D_l, D_h, [srcend, -16]
cmp count, 64
- b.hi L(copy128)
+ b.hi L(copy65_128)
stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
-L(copy128):
+L(copy65_128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
- cmp count, 96
- b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
-L(copy96):
stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
ret
.p2align 4
- /* Copy more than 128 bytes. */
-L(copy_long):
- /* Use backwards copy if there is an overlap. */
- sub tmp1, dstin, src
+ /* Move more than 128 bytes. */
+L(move_long):
+ sub tmp1, dstin, src /* Overlap check. */
cbz tmp1, L(copy0)
cmp tmp1, count
- b.lo L(copy_long_backwards)
+ b.lo L(move_long_backwards)
- /* Copy 16 bytes and then align dst to 16-byte alignment. */
+ /* Align dst to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 128 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
ldp D_l, D_h, [src]
and tmp1, dstin, 15
@@ -177,7 +179,9 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
- /* Write the last iteration and copy 64 bytes from the end. */
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
@@ -191,13 +195,20 @@ L(copy64_from_end):
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
+
+L(copy0):
ret
.p2align 4
- /* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align dst to 16-byte alignment. */
-L(copy_long_backwards):
+ /* Move more than 128 bytes where src and dst buffers overlap
+ and dst > src.
+
+ Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 128 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+L(move_long_backwards):
ldp D_l, D_h, [srcend, -16]
and tmp1, dstend, 15
sub srcend, srcend, tmp1
@@ -223,7 +234,9 @@ L(loop64_backwards):
subs count, count, 64
b.hi L(loop64_backwards)
- /* Write the last iteration and copy 64 bytes from the start. */
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
@@ -240,4 +253,3 @@ L(copy64_from_start):
ret
END (__memcpy_aarch64)
-
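
The "copy 64 bytes from the end" tail strategy noted in the comments above can be modeled in C for the non-overlapping case (the assembly additionally loads the tail before the loop's final stores so overlapping moves stay correct):

#include <string.h>

static void copy_tail64(char *dst, const char *src, size_t n)  /* n > 128 */
{
    /* after the 64-byte loop, 1..64 bytes remain; one unconditional
       64-byte copy from the end covers them, re-writing up to 63
       already-copied bytes with identical data */
    memcpy(dst + n - 64, src + n - 64, 64);
}
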
diff --git a/string/aarch64/memcpy_simd.S b/string/aarch64/memcpy_simd.S
new file mode 100644
index 0000000..fa2442f
--- /dev/null
+++ b/string/aarch64/memcpy_simd.S
@@ -0,0 +1,265 @@
+/*
+ * memcpy/memmove using SIMD registers
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation of memcpy correctly handles overlaps, therefore
+ __memmove_aarch64_simd aliases to __memcpy_aarch64_simd. By moving the
+ src and dst buffer overlap check from the start of memmove code to the
+ beginning of large copy code, the overhead of combining memcpy
+ and memmove implementations is negligible.
+
+ Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..128 bytes which are fully unrolled, and large
+ copies (moves).
+
+ Large forward moves align the source and use an unrolled loop
+ processing 64 bytes per iteration.
+
+ Large backward moves align srcend and use an unrolled loop processing
+ 64 bytes per iteration.
+*/
+
+ENTRY (__memcpy_aarch64_simd)
+ENTRY_ALIAS (__memmove_aarch64_simd)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16_simd)
+ cmp count, 128
+ b.hi L(move_long_simd)
+
+ /* Medium copies: 17..128 bytes. */
+ ldr A_q, [src]
+ ldr D_q, [srcend, -16]
+ cmp count, 32
+ b.hi L(copy33_128_simd)
+ str A_q, [dstin]
+ str D_q, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16_simd):
+ /* 8-15 bytes. */
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 4
+1:
+ /* 4-7 bytes. */
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ .p2align 4
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 33..128 bytes. */
+L(copy33_128_simd):
+ ldr B_q, [src, 16]
+ ldr C_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy65_128_simd)
+ str A_q, [dstin]
+ str D_q, [dstend, -16]
+ str B_q, [dstin, 16]
+ str C_q, [dstend, -32]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy65_128_simd):
+ ldr E_q, [src, 32]
+ ldr F_q, [src, 48]
+ ldr G_q, [srcend, -64]
+ ldr H_q, [srcend, -48]
+ str A_q, [dstin]
+ str D_q, [dstend, -16]
+ str B_q, [dstin, 16]
+ str C_q, [dstend, -32]
+ str E_q, [dstin, 32]
+ str F_q, [dstin, 48]
+ str G_q, [dstend, -64]
+ str H_q, [dstend, -48]
+ ret
+
+ .p2align 4
+ /* Move more than 128 bytes. */
+L(move_long_simd):
+ sub tmp1, dstin, src /* Overlap check. */
+ cbz tmp1, L(copy0_simd)
+ cmp tmp1, count
+ b.lo L(move_long_backwards_simd)
+
+ /* Align src to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 128 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldr A_q, [src, 16]
+ str D_q, [dstin]
+ ldr B_q, [src, 32]
+ ldr C_q, [src, 48]
+ ldr D_q, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end_simd)
+
+L(loop64_simd):
+ str A_q, [dst, 16]
+ ldr A_q, [src, 16]
+ str B_q, [dst, 32]
+ ldr B_q, [src, 32]
+ str C_q, [dst, 48]
+ ldr C_q, [src, 48]
+ str D_q, [dst, 64]!
+ ldr D_q, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64_simd)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(copy64_from_end_simd):
+ ldr E_q, [srcend, -64]
+ str A_q, [dst, 16]
+ ldr A_q, [srcend, -48]
+ str B_q, [dst, 32]
+ ldr B_q, [srcend, -32]
+ str C_q, [dst, 48]
+ ldr C_q, [srcend, -16]
+ str D_q, [dst, 64]
+ str E_q, [dstend, -64]
+ str A_q, [dstend, -48]
+ str B_q, [dstend, -32]
+ str C_q, [dstend, -16]
+
+L(copy0_simd):
+ ret
+
+ .p2align 4
+
+ /* Move more than 128 bytes where src and dst buffers overlap
+ and dst > src.
+
+ Align srcend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 128 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+L(move_long_backwards_simd):
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldr A_q, [srcend, -16]
+ str D_q, [dstend, -16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -48]
+ ldr D_q, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start_simd)
+
+L(loop64_backwards_simd):
+ str A_q, [dstend, -16]
+ ldr A_q, [srcend, -16]
+ str B_q, [dstend, -32]
+ ldr B_q, [srcend, -32]
+ str C_q, [dstend, -48]
+ ldr C_q, [srcend, -48]
+ str D_q, [dstend, -64]!
+ ldr D_q, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards_simd)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+L(copy64_from_start_simd):
+ ldr G_q, [src, 48]
+ str A_q, [dstend, -16]
+ ldr A_q, [src, 32]
+ str B_q, [dstend, -32]
+ ldr B_q, [src, 16]
+ str C_q, [dstend, -48]
+ ldr C_q, [src]
+ str D_q, [dstend, -64]
+ str G_q, [dstin, 48]
+ str A_q, [dstin, 32]
+ str B_q, [dstin, 16]
+ str C_q, [dstin]
+ ret
+
+END (__memcpy_aarch64_simd)
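
The 0..3 byte case above is branchless: loads at offsets 0, count/2 and count-1 happen before any store, so the same byte is simply written more than once when count is 1 or 2. The same trick in C (a sketch):

static void copy_upto3(unsigned char *dst, const unsigned char *src,
                       unsigned count)  /* count in 0..3 */
{
    if (count == 0)
        return;
    unsigned mid = count >> 1;
    unsigned char a = src[0], b = src[mid], c = src[count - 1];
    dst[0] = a;         /* count 1: a == b == c, offset 0 three times */
    dst[mid] = b;       /* count 2: byte 1 written twice */
    dst[count - 1] = c;
}
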
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
deleted file mode 100644
index 7b4be84..0000000
--- a/string/aarch64/memrchr.S
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * memrchr - find last character in a memory zone.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define srcin x0
-#define chrin w1
-#define cntin x2
-#define result x0
-
-#define src x3
-#define cntrem x4
-#define synd x5
-#define shift x6
-#define tmp x7
-#define wtmp w7
-#define end x8
-#define endm1 x9
-
-#define vrepchr v0
-#define qdata q1
-#define vdata v1
-#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
-
-/*
- Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character. Bits 4-7 must be zero. Bits 4-7 are set likewise for
- odd bytes so that adjacent bytes can be merged. Since the bits in the
- syndrome reflect the order in which things occur in the original string,
- counting leading zeros identifies exactly the last byte that matched. */
-
-ENTRY (__memrchr_aarch64)
- PTR_ARG (0)
- add end, srcin, cntin
- sub endm1, end, 1
- bic src, endm1, 15
- cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src]
- dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- neg shift, end, lsl 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- lsl synd, synd, shift
- cbz synd, L(start_loop)
-
- clz synd, synd
- sub result, endm1, synd, lsr 2
- cmp cntin, synd, lsr 2
- csel result, result, xzr, hi
- ret
-
-L(start_loop):
- sub tmp, end, src
- subs cntrem, cntin, tmp
- b.ls L(nomatch)
-
- /* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
- .p2align 4
-L(loop32):
- ldr qdata, [src, -16]!
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- cbnz synd, L(end)
-
-L(loop32_2):
- ldr qdata, [src, -16]!
- subs cntrem, cntrem, 32
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.ls L(end)
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- cbz synd, L(loop32)
-L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
-
- add tmp, src, 15
-#ifdef __AARCH64EB__
- rbit synd, synd
-#endif
- clz synd, synd
- sub tmp, tmp, synd, lsr 2
- cmp tmp, srcin
- csel result, tmp, xzr, hs
- ret
-
-L(nomatch):
- mov result, 0
- ret
-
-END (__memrchr_aarch64)
-
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 9fcd975..3868141 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,13 +1,13 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2021, Arm Limited.
+ * Copyright (c) 2012, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * ARMv8-a, AArch64, unaligned accesses
*
*/
@@ -19,11 +19,14 @@
#define count x2
#define dst x3
#define dstend x4
-#define zva_val x5
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
ENTRY (__memset_aarch64)
- PTR_ARG (0)
- SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
@@ -39,7 +42,7 @@ ENTRY (__memset_aarch64)
str val, [dstin]
str val, [dstend, -8]
ret
- .p2align 4
+ nop
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
@@ -69,49 +72,108 @@ L(set96):
stp q0, q0, [dstend, -32]
ret
- .p2align 4
+ .p2align 3
+ nop
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
- cmp count, 160
- ccmp valw, 0, 0, hs
- b.ne L(no_zva)
-
-#ifndef SKIP_ZVA_CHECK
- mrs zva_val, dczid_el0
- and zva_val, zva_val, 31
- cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(no_zva)
-#endif
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 3
+L(try_zva):
+ mrs tmp1, dczid_el0
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
+L(zva_64):
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
- sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
-
- .p2align 4
-L(zva_loop):
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
add dst, dst, 64
- dc zva, dst
subs count, count, 64
- b.hi L(zva_loop)
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-L(no_zva_loop):
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
- subs count, count, 64
- b.hi L(no_zva_loop)
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-END (__memset_aarch64)
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
+
+END (__memset_aarch64)
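The restored L(try_zva)/L(zva_other) paths decode the DC ZVA block size from dczid_el0 instead of assuming 64 bytes. A minimal sketch of that decode (GCC/Clang inline asm assumed, AArch64 only):

    #include <stdint.h>

    static uint64_t zva_block_size (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      if (dczid & 16)               /* Bit 4 (DZP): DC ZVA prohibited. */
        return 0;
      /* Bits 0-3 give log2(words): 4 -> 64 bytes, 5 -> 128 bytes,
         matching `mov tmp2w, 4; lsl zva_lenw, tmp2w, tmp1w`. */
      return 4ull << (dczid & 15);
    }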
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
deleted file mode 100644
index f1c7119..0000000
--- a/string/aarch64/stpcpy-mte.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
deleted file mode 100644
index 82dd971..0000000
--- a/string/aarch64/stpcpy-sve.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-sve.S"
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
deleted file mode 100644
index 4f62aa4..0000000
--- a/string/aarch64/stpcpy.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy.S"
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
deleted file mode 100644
index dcb0e46..0000000
--- a/string/aarch64/strchr-mte.S
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * strchr - find a character in a string
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define srcin x0
-#define chrin w1
-#define result x0
-
-#define src x2
-#define tmp1 x1
-#define wtmp2 w3
-#define tmp3 x3
-
-#define vrepchr v0
-#define vdata v1
-#define qdata q1
-#define vhas_nul v2
-#define vhas_chr v3
-#define vrepmask v4
-#define vrepmask2 v5
-#define vend v6
-#define dend d6
-
-/* Core algorithm.
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
- requested character, bits 2-3 are set if the byte is NUL (or matched), and
- bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
- bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
- in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (__strchr_aarch64_mte)
- PTR_ARG (0)
- bic src, srcin, 15
- dup vrepchr.16b, chrin
- ld1 {vdata.16b}, [src]
- mov wtmp2, 0x3003
- dup vrepmask.8h, wtmp2
- cmeq vhas_nul.16b, vdata.16b, 0
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp2, 0xf00f
- dup vrepmask2.8h, wtmp2
-
- bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- lsl tmp3, srcin, 2
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
-
- fmov tmp1, dend
- lsr tmp1, tmp1, tmp3
- cbz tmp1, L(loop)
-
- rbit tmp1, tmp1
- clz tmp1, tmp1
- /* Tmp1 is a multiple of 4 if the target character was
- found first. Otherwise we've found the end of the string. */
- tst tmp1, 2
- add result, srcin, tmp1, lsr 2
- csel result, result, xzr, eq
- ret
-
- .p2align 4
-L(loop):
- ldr qdata, [src, 16]!
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov tmp1, dend
- cbz tmp1, L(loop)
-
-#ifdef __AARCH64EB__
- bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov tmp1, dend
-#else
- bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov tmp1, dend
- rbit tmp1, tmp1
-#endif
- clz tmp1, tmp1
- /* Tmp1 is a multiple of 4 if the target character was
- found first. Otherwise we've found the end of the string. */
- tst tmp1, 2
- add result, src, tmp1, lsr 2
- csel result, result, xzr, eq
- ret
-
-END (__strchr_aarch64_mte)
-
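A hypothetical scalar model of the 2+2-bit syndrome described in the deleted strchr-mte comment (little-endian; simplified in that the real code also sets the NUL bits on a match, which does not change the result):

    #include <stdint.h>

    static const char *strchr_chunk_model (const char s[16], char c)
    {
      uint64_t synd = 0;
      for (int i = 0; i < 16; i++)
        {
          if (s[i] == c)
            synd |= 0x3ull << (4 * i);     /* match: bits 0-1 of nibble */
          else if (s[i] == 0)
            synd |= 0xcull << (4 * i);     /* NUL: bits 2-3 of nibble */
        }
      if (synd == 0)
        return (const char *) -1;          /* sentinel: keep scanning */
      int tz = __builtin_ctzll (synd);
      /* tz is a multiple of 4 iff the match came before any NUL; this
         is the `tst tmp1, 2; csel` at the end of the assembly. */
      return (tz & 2) ? 0 : s + tz / 4;
    }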
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 13ba9f4..8d8a319 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,19 +1,19 @@
/*
* strchr/strchrnul - find a character in a string
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
+ .arch armv8-a+sve
+ .text
+
/* To build as strchrnul, define BUILD_STRCHRNUL before compiling this file. */
#ifdef BUILD_STRCHRNUL
#define FUNC __strchrnul_aarch64_sve
@@ -21,8 +21,10 @@
#define FUNC __strchr_aarch64_sve
#endif
-ENTRY (FUNC)
- PTR_ARG (0)
+ .globl FUNC
+ .type FUNC, %function
+ .p2align 4
+FUNC:
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -64,7 +66,4 @@ ENTRY (FUNC)
incp x0, p0.b
b 0b
-END (FUNC)
-
-#endif
-
+ .size FUNC, . - FUNC
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 1063cbf..00d9be3 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,7 +1,7 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -51,12 +51,11 @@
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
- PTR_ARG (0)
- /* Magic constant 0xc0300c03 to allow us to identify which lane
- matches the requested byte. Even bits are set if the character
- matches, odd bits if either the char is NUL or matches. */
- mov wtmp2, 0x0c03
- movk wtmp2, 0xc030, lsl 16
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
@@ -74,10 +73,12 @@ ENTRY (__strchr_aarch64)
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
- bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
- and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
- and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
@@ -88,26 +89,31 @@ ENTRY (__strchr_aarch64)
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
- .p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
- cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
- orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
- umaxp vend1.16b, vend1.16b, vend1.16b
+ /* Use a fast check for the termination condition. */
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vend1.16b, vend2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
- bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
- and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
- and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
+
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
@@ -123,4 +129,3 @@ L(tail):
ret
END (__strchr_aarch64)
-
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
deleted file mode 100644
index 1b0d0a6..0000000
--- a/string/aarch64/strchrnul-mte.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * strchrnul - find a character or nul in a string
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define srcin x0
-#define chrin w1
-#define result x0
-
-#define src x2
-#define tmp1 x1
-#define tmp2 x3
-#define tmp2w w3
-
-#define vrepchr v0
-#define vdata v1
-#define qdata q1
-#define vhas_nul v2
-#define vhas_chr v3
-#define vrepmask v4
-#define vend v5
-#define dend d5
-
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (__strchrnul_aarch64_mte)
- PTR_ARG (0)
- bic src, srcin, 15
- dup vrepchr.16b, chrin
- ld1 {vdata.16b}, [src]
- mov tmp2w, 0xf00f
- dup vrepmask.8h, tmp2w
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
- lsl tmp2, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov tmp1, dend
- lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
- cbz tmp1, L(loop)
-
- rbit tmp1, tmp1
- clz tmp1, tmp1
- add result, srcin, tmp1, lsr 2
- ret
-
- .p2align 4
-L(loop):
- ldr qdata, [src, 16]!
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
- fmov tmp1, dend
- cbz tmp1, L(loop)
-
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov tmp1, dend
-#ifndef __AARCH64EB__
- rbit tmp1, tmp1
-#endif
- clz tmp1, tmp1
- add result, src, tmp1, lsr 2
- ret
-
-END (__strchrnul_aarch64_mte)
-
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 428ff1a..5140e59 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -1,7 +1,7 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2018-2019, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index a4230d9..81264ea 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,7 +1,7 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -47,7 +47,6 @@
/* Locals and temporaries. */
ENTRY (__strchrnul_aarch64)
- PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
@@ -64,12 +63,14 @@ ENTRY (__strchrnul_aarch64)
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
- cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
- and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ orr vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
+ orr vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
@@ -80,22 +81,24 @@ ENTRY (__strchrnul_aarch64)
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
- .p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
- cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
- orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
- umaxp vend1.16b, vend1.16b, vend1.16b
+ /* Use a fast check for the termination condition. */
+ orr vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
- and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
- and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
@@ -111,4 +114,3 @@ L(tail):
ret
END (__strchrnul_aarch64)
-
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
deleted file mode 100644
index 12d1a6b..0000000
--- a/string/aarch64/strcmp-mte.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * strcmp - compare two strings
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-#define src1 x0
-#define src2 x1
-#define result x0
-
-#define data1 x2
-#define data1w w2
-#define data2 x3
-#define data2w w3
-#define has_nul x4
-#define diff x5
-#define off1 x5
-#define syndrome x6
-#define tmp x6
-#define data3 x7
-#define zeroones x8
-#define shift x9
-#define off2 x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes. */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word.
- Since carry propagation makes 0x1 bytes before a NUL byte appear
- NUL too in big-endian, byte-reverse the data before the NUL check. */
-
-
-ENTRY (__strcmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- sub off2, src2, src1
- mov zeroones, REP8_01
- and tmp, src1, 7
- tst off2, 7
- b.ne L(misaligned8)
- cbnz tmp, L(mutual_align)
-
- .p2align 4
-
-L(loop_aligned):
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
-L(start_realigned):
-#ifdef __AARCH64EB__
- rev tmp, data1
- sub has_nul, tmp, zeroones
- orr tmp, tmp, REP8_7f
-#else
- sub has_nul, data1, zeroones
- orr tmp, data1, REP8_7f
-#endif
- bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
- ccmp data1, data2, 0, eq
- b.eq L(loop_aligned)
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
-L(end):
-#ifndef __AARCH64EB__
- rev syndrome, syndrome
- rev data1, data1
- rev data2, data2
-#endif
- clz shift, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- lsl data1, data1, shift
- lsl data2, data2, shift
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, 56
- sub result, data1, data2, lsr 56
- ret
-
- .p2align 4
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, 7
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
- neg shift, src2, lsl 3 /* Bits to alignment -64. */
- mov tmp, -1
- LS_FW tmp, tmp, shift
- orr data1, data1, tmp
- orr data2, data2, tmp
- b L(start_realigned)
-
-L(misaligned8):
- /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond the end of SRC2. */
- cbz tmp, L(src1_aligned)
-L(do_misaligned):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- cmp data1w, 0
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.ne L(done)
- tst src1, 7
- b.ne L(do_misaligned)
-
-L(src1_aligned):
- neg shift, src2, lsl 3
- bic src2, src2, 7
- ldr data3, [src2], 8
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- lsr tmp, zeroones, shift
- orr data3, data3, tmp
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- bics has_nul, has_nul, tmp
- b.ne L(tail)
-
- sub off1, src2, src1
-
- .p2align 4
-
-L(loop_unaligned):
- ldr data3, [src1, off1]
- ldr data2, [src1, off2]
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- ldr data1, [src1], 8
- bics has_nul, has_nul, tmp
- ccmp data1, data2, 0, eq
- b.eq L(loop_unaligned)
-
- lsl tmp, has_nul, shift
-#ifdef __AARCH64EB__
- rev tmp, tmp
-#endif
- eor diff, data1, data2
- orr syndrome, diff, tmp
- cbnz syndrome, L(end)
-L(tail):
- ldr data1, [src1]
- neg shift, shift
- lsr data2, data3, shift
- lsr has_nul, has_nul, shift
-#ifdef __AARCH64EB__
- rev data2, data2
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
- b L(end)
-
-L(done):
- sub result, data1, data2
- ret
-
-END (__strcmp_aarch64_mte)
-
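Several of these files reason about the same word-at-a-time NUL test. A sketch of the identity the comments quote, in plain C:

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ull
    #define REP8_7f 0x7f7f7f7f7f7f7f7full

    /* (X - 1) & ~X & 0x80 per byte, written as (X - 1) & ~(X | 0x7f)
       to match the sub/orr/bics sequence in L(loop_aligned); the result
       is non-zero iff some byte of x is zero. */
    static uint64_t nul_syndrome (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }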
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index e6d2da5..91bac19 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,28 +1,29 @@
/*
* __strcmp_aarch64_sve - compare two strings
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__strcmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
+ .arch armv8-a+sve
+ .text
+
+ .globl __strcmp_aarch64_sve
+ .type __strcmp_aarch64_sve, %function
+ .p2align 4
+__strcmp_aarch64_sve:
setffr /* initialize FFR */
ptrue p1.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
+ nop
/* Read a vector's worth of bytes, stopping on first fault. */
- .p2align 4
0: ldff1b z0.b, p1/z, [x0, x2]
ldff1b z1.b, p1/z, [x1, x2]
rdffrs p0.b, p1/z
@@ -53,7 +54,4 @@ ENTRY (__strcmp_aarch64_sve)
b.none 0b
b 1b
-END (__strcmp_aarch64_sve)
-
-#endif
-
+ .size __strcmp_aarch64_sve, . - __strcmp_aarch64_sve
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 7714ebf..65af5ce 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,7 +1,7 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012-2020, Arm Limited.
+ * Copyright (c) 2012, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -37,8 +37,6 @@
/* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
@@ -170,4 +168,3 @@ L(done):
ret
END (__strcmp_aarch64)
-
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d..0000000
--- a/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin x0
-#define srcin x1
-#define result x0
-
-#define src x2
-#define dst x3
-#define len x4
-#define synd x4
-#define tmp x5
-#define wtmp w5
-#define shift x5
-#define data1 x6
-#define dataw1 w6
-#define data2 x7
-#define dataw2 w7
-
-#define dataq q0
-#define vdata v0
-#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
-#define dataq2 q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- bic src, srcin, 15
- mov wtmp, 0xf00f
- ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
- cmeq vhas_nul.16b, vdata.16b, 0
- lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- lsr synd, synd, shift
- cbnz synd, L(tail)
-
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(start_loop)
-
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- sub tmp, src, srcin
- clz len, synd
- add len, tmp, len, lsr 2
- tbz len, 4, L(less16)
- sub tmp, len, 15
- ldr dataq, [srcin]
- ldr dataq2, [srcin, tmp]
- str dataq, [dstin]
- str dataq2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4,,8
-L(tail):
- rbit synd, synd
- clz len, synd
- lsr len, len, 2
-
- .p2align 4
-L(less16):
- tbz len, 3, L(less8)
- sub tmp, len, 7
- ldr data1, [srcin]
- ldr data2, [srcin, tmp]
- str data1, [dstin]
- str data2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(less8):
- subs tmp, len, 3
- b.lo L(less4)
- ldr dataw1, [srcin]
- ldr dataw2, [srcin, tmp]
- str dataw1, [dstin]
- str dataw2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
-L(less4):
- cbz len, L(zerobyte)
- ldrh dataw1, [srcin]
- strh dataw1, [dstin]
-L(zerobyte):
- strb wzr, [dstin, len]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(start_loop):
- sub len, src, srcin
- ldr dataq2, [srcin]
- add dst, dstin, len
- str dataq2, [dstin]
-
- .p2align 5
-L(loop):
- str dataq, [dst], 16
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov synd, dend
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- clz len, synd
- lsr len, len, 2
- sub tmp, len, 15
- ldr dataq, [src, tmp]
- str dataq, [dst, tmp]
- IFSTPCPY (add result, dst, len)
- ret
-
-END (STRCPY)
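The deleted strcpy-mte avoids byte loops by issuing overlapping copies once the length is known. A sketch of the L(less16) case (len is the strlen result, 8..15 here; the helper name is hypothetical):

    #include <string.h>

    static void copy_8_to_15 (char *dst, const char *src, size_t len)
    {
      memcpy (dst, src, 8);                      /* bytes 0..7 */
      /* The second copy overlaps the first and ends exactly at the
         NUL terminator, i.e. bytes len-7..len. */
      memcpy (dst + len - 7, src + len - 7, 8);
    }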
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index f515462..c929f37 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,19 +1,19 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
+ .arch armv8-a+sve
+ .text
+
/* To build as stpcpy, define BUILD_STPCPY before compiling this file. */
#ifdef BUILD_STPCPY
#define FUNC __stpcpy_aarch64_sve
@@ -21,9 +21,10 @@
#define FUNC __strcpy_aarch64_sve
#endif
-ENTRY (FUNC)
- PTR_ARG (0)
- PTR_ARG (1)
+ .globl FUNC
+ .type FUNC, %function
+ .p2align 4
+FUNC:
setffr /* initialize FFR */
ptrue p2.b, all /* all ones; loop invariant */
mov x2, 0 /* initialize offset */
@@ -65,7 +66,4 @@ ENTRY (FUNC)
#endif
ret
-END (FUNC)
-
-#endif
-
+ .size FUNC, . - FUNC
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 6e9ed42..4edffcf 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,7 +1,7 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -80,8 +80,6 @@
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
@@ -308,4 +306,3 @@ L(page_cross):
b L(fp_gt8)
END (STRCPY)
-
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
deleted file mode 100644
index 7cf41d5..0000000
--- a/string/aarch64/strlen-mte.S
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * strlen - calculate the length of a string.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define srcin x0
-#define result x0
-
-#define src x1
-#define synd x2
-#define tmp x3
-#define wtmp w3
-#define shift x4
-
-#define data q0
-#define vdata v0
-#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
-
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (__strlen_aarch64_mte)
- PTR_ARG (0)
- bic src, srcin, 15
- mov wtmp, 0xf00f
- ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
- cmeq vhas_nul.16b, vdata.16b, 0
- lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov synd, dend
- lsr synd, synd, shift
- cbz synd, L(loop)
-
- rbit synd, synd
- clz result, synd
- lsr result, result, 2
- ret
-
- .p2align 5
-L(loop):
- ldr data, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- clz tmp, synd
- add result, result, tmp, lsr 2
- ret
-
-END (__strlen_aarch64_mte)
-
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 2392493..64ede85 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,28 +1,31 @@
/*
* __strlen_aarch64_sve - compute the length of a string
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__strlen_aarch64_sve)
- PTR_ARG (0)
+ .arch armv8-a+sve
+ .text
+
+ .globl __strlen_aarch64_sve
+ .type __strlen_aarch64_sve, %function
+ .p2align 4
+__strlen_aarch64_sve:
setffr /* initialize FFR */
ptrue p2.b /* all ones; loop invariant */
mov x1, 0 /* initialize length */
+ nop
/* Read a vector's worth of bytes, stopping on first fault. */
- .p2align 4
0: ldff1b z0.b, p2/z, [x0, x1]
+ nop
rdffrs p0.b, p2/z
b.nlast 2f
@@ -49,7 +52,4 @@ ENTRY (__strlen_aarch64_sve)
incp x1, p0.b
b 0b
-END (__strlen_aarch64_sve)
-
-#endif
-
+ .size __strlen_aarch64_sve, . - __strlen_aarch64_sve
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index a1b164a..2293f73 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,88 +1,84 @@
/*
- * strlen - calculate the length of a string.
+ * strlen - calculate the length of a string
*
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2013, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- * Not MTE compatible.
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
#include "../asmdefs.h"
-#define srcin x0
-#define len x0
-
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
-#define maskv v0
-#define maskd d0
-#define dataq1 q1
-#define dataq2 q2
-#define datav1 v1
-#define datav2 v2
-#define tmp x2
-#define tmpw w2
-#define synd x3
-#define shift x4
-
-/* For the first 32 bytes, NUL detection works on the principle that
- (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
- byte is zero, and can be done in parallel across the entire word. */
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+
+/* Locals and temporaries. */
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
+#define REP8_80 0x8080808080808080
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 32
+# define MIN_PAGE_SIZE 15
#else
# define MIN_PAGE_SIZE 4096
#endif
-/* Core algorithm:
-
- Since strings are short on average, we check the first 32 bytes of the
- string for a NUL character without aligning the string. In order to use
- unaligned loads safely we must do a page cross check first.
-
- If there is a NUL byte we calculate the length from the 2 8-byte words
- using conditional select to reduce branch mispredictions (it is unlikely
- strlen will be repeatedly called on strings with the same length).
-
- If the string is longer than 32 bytes, align src so we don't need further
- page cross checks, and process 32 bytes per iteration using a fast SIMD
- loop.
-
- If the page cross check fails, we read 32 bytes from an aligned address,
- and ignore any characters before the string. If it contains a NUL
- character, return the length, if not, continue in the main loop. */
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ __strlen_aarch64 will be repeatedly called on strings with the same length).
+
+ If the string is longer than 16 bytes, we align src so we don't need
+ further page cross checks, and process 32 bytes per iteration
+ using the fast NUL check. If we encounter non-ASCII characters,
+ fallback to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
ENTRY (__strlen_aarch64)
- PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- cmp tmp1, MIN_PAGE_SIZE - 32
- b.hi L(page_cross)
-
- /* Look for a NUL byte in the first 16 bytes. */
- ldp data1, data2, [srcin]
mov zeroones, REP8_01
-
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -98,103 +94,113 @@ ENTRY (__strlen_aarch64)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- b.eq L(bytes16_31)
+ beq L(main_loop_entry)
- /* Find the exact offset of the first NUL byte in the first 16 bytes
- from the string start. Enter with C = has_nul1 == 0. */
+ /* Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- csel len, xzr, len, cc
clz tmp1, has_nul1
+ csel len, xzr, len, cc
add len, len, tmp1, lsr 3
ret
- .p2align 3
- /* Look for a NUL byte at offset 16..31 in the string. */
-L(bytes16_31):
- ldp data1, data2, [srcin, 16]
-#ifdef __AARCH64EB__
- rev data1, data1
- rev data2, data2
-#endif
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+L(page_cross_entry):
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- b.eq L(loop_entry)
+ beq L(nonascii_loop)
- /* Find the exact offset of the first NUL byte at offset 16..31 from
- the string start. Enter with C = has_nul1 == 0. */
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
csel has_nul1, has_nul1, has_nul2, cc
- mov len, 24
+#endif
+ sub len, src, srcin
rev has_nul1, has_nul1
- mov tmp3, 16
+ add tmp2, len, 8
clz tmp1, has_nul1
- csel len, tmp3, len, cc
+ csel len, len, tmp2, cc
add len, len, tmp1, lsr 3
ret
-L(loop_entry):
- bic src, srcin, 31
-
- .p2align 5
-L(loop):
- ldp dataq1, dataq2, [src, 32]!
- uminp maskv.16b, datav1.16b, datav2.16b
- uminp maskv.16b, maskv.16b, maskv.16b
- cmeq maskv.8b, maskv.8b, 0
- fmov synd, maskd
- cbz synd, L(loop)
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
- /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
- cmeq maskv.16b, datav1.16b, 0
- sub len, src, srcin
- tst synd, 0xffffffff
- b.ne 1f
- cmeq maskv.16b, datav2.16b, 0
- add len, len, 16
-1:
- /* Generate a bitmask and compute correct byte offset. */
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
#ifdef __AARCH64EB__
- bic maskv.8h, 0xf0
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#else
- bic maskv.8h, 0x0f, lsl 8
-#endif
- umaxp maskv.16b, maskv.16b, maskv.16b
- fmov synd, maskd
-#ifndef __AARCH64EB__
- rbit synd, synd
+ /* Little-endian. Early bytes are at LSB. */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#endif
- clz tmp, synd
- add len, len, tmp, lsr 2
- ret
-
- .p2align 4
-
-L(page_cross):
- bic src, srcin, 31
- mov tmpw, 0x0c03
- movk tmpw, 0xc030, lsl 16
- ld1 {datav1.16b, datav2.16b}, [src]
- dup maskv.4s, tmpw
- cmeq datav1.16b, datav1.16b, 0
- cmeq datav2.16b, datav2.16b, 0
- and datav1.16b, datav1.16b, maskv.16b
- and datav2.16b, datav2.16b, maskv.16b
- addp maskv.16b, datav1.16b, datav2.16b
- addp maskv.16b, maskv.16b, maskv.16b
- fmov synd, maskd
- lsl shift, srcin, 1
- lsr synd, synd, shift
- cbz synd, L(loop)
-
- rbit synd, synd
- clz len, synd
- lsr len, len, 1
- ret
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
END (__strlen_aarch64)
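Two checks carry most of the logic in the restored strlen. Sketches of both, assuming MIN_PAGE_SIZE == 4096 (names are illustrative):

    #include <stdint.h>

    #define ONES  0x0101010101010101ull
    #define HIGHS 0x8080808080808080ull

    /* The first unaligned 16-byte ldp is safe iff it cannot cross a
       page: the offset within the page must be at most 4096 - 16. */
    static int ldp16_crosses_page (const char *s)
    {
      return (((uintptr_t) s) & 4095) > 4096 - 16;
    }

    /* The fast probe from the main loop; as the comment warns, it can
       false-hit on bytes 0x81..0xff, so hits are re-validated with the
       accurate (X - 1) & ~X & 0x80 form. */
    static int maybe_has_nul (uint64_t x)
    {
      return ((x - ONES) & HIGHS) != 0;
    }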
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
deleted file mode 100644
index c9d6fc8..0000000
--- a/string/aarch64/strncmp-mte.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * strncmp - compare two strings
- *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result x0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data2 x4
-#define data2w w4
-#define has_nul x5
-#define diff x6
-#define syndrome x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define zeroones x11
-#define pos x12
-#define mask x13
-#define endloop x14
-#define count mask
-#define offset pos
-#define neg_offset x15
-
-/* Define endian dependent shift operations.
- On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes.
- LS_BK means shifting towards later bytes.
- */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
-
-ENTRY (__strncmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
- cbz limit, L(ret0)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
- and count, src1, #7
- b.ne L(misaligned8)
- cbnz count, L(mutual_align)
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- .p2align 4
-L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-L(start_realigned):
- subs limit, limit, #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, hi /* Last Dword or differences. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp endloop, #0, #0, eq
- b.eq L(loop_aligned)
- /* End of main loop */
-
-L(full_check):
-#ifndef __AARCH64EB__
- orr syndrome, diff, has_nul
- add limit, limit, 8 /* Rewind limit to before last subs. */
-L(syndrome_check):
- /* Limit was reached. Check if the NUL byte or the difference
- is before the limit. */
- rev syndrome, syndrome
- rev data1, data1
- clz pos, syndrome
- rev data2, data2
- lsl data1, data1, pos
- cmp limit, pos, lsr #3
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- csel result, result, xzr, hi
- ret
-#else
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit, #63, L(not_limit)
- add tmp1, limit, 8
- cbz tmp1, L(not_limit)
-
- lsl limit, tmp1, #3 /* Bytes -> bits. */
- mov mask, #~0
- lsr mask, mask, limit
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
-L(end_quick):
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#endif
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point.
- We also need to adjust the limit calculations, but without
- overflowing if the limit is near ULONG_MAX. */
- bic src1, src1, #7
- bic src2, src2, #7
- ldr data1, [src1], #8
- neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
- ldr data2, [src2], #8
- mov tmp2, #~0
- LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
- /* Adjust the limit and ensure it doesn't overflow. */
- adds limit, limit, count
- csinv limit, limit, xzr, lo
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b L(start_realigned)
-
- .p2align 4
- /* Don't bother with dwords for up to 16 bytes. */
-L(misaligned8):
- cmp limit, #16
- b.hs L(try_misaligned_words)
-
-L(byte_loop):
- /* Perhaps we can do better than this. */
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
- ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(byte_loop)
-L(done):
- sub result, data1, data2
- ret
- /* Align the SRC1 to a dword by doing a bytewise compare and then do
- the dword loop. */
-L(try_misaligned_words):
- cbz count, L(src1_aligned)
-
- neg count, count
- and count, count, #7
- sub limit, limit, count
-
-L(page_end_loop):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne L(done)
- subs count, count, #1
- b.hi L(page_end_loop)
-
- /* The following diagram explains the comparison of misaligned strings.
- The bytes are shown in natural order. For little-endian, it is
- reversed in the registers. The "x" bytes are before the string.
- The "|" separates data that is loaded at one time.
- src1 | a a a a a a a a | b b b c c c c c | . . .
- src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
-
- After shifting in each step, the data looks like this:
- STEP_A STEP_B STEP_C
- data1 a a a a a a a a b b b c c c c c b b b c c c c c
- data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
-
- The bytes with "0" are eliminated from the syndrome via mask.
-
- Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
- time from SRC2. The comparison happens in 3 steps. After each step
- the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
- /* Calculate the offset from 8-byte alignment to the string start in
- bits. No need to mask the offset since shifts ignore the upper bits. */
- lsl offset, src2, #3
- bic src2, src2, #0xf
- mov mask, -1
- neg neg_offset, offset
- ldr data1, [src1], #8
- ldp tmp1, tmp2, [src2], #16
- LS_BK mask, mask, neg_offset
- and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
- /* Skip the first compare if data in tmp1 is irrelevant. */
- tbnz offset, 6, L(misaligned_mid_loop)
-
-L(loop_misaligned):
- /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
- LS_FW data2, tmp1, offset
- LS_BK tmp1, tmp2, neg_offset
- subs limit, limit, #8
- orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
- sub has_nul, data1, zeroones
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr tmp3, data1, #REP8_7f
- csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
- orr tmp3, endloop, has_nul
- cbnz tmp3, L(full_check)
-
- ldr data1, [src1], #8
-L(misaligned_mid_loop):
- /* STEP_B: Compare first part of data1 to second part of tmp2. */
- LS_FW data2, tmp2, offset
-#ifdef __AARCH64EB__
- /* For big-endian we do a byte reverse to avoid carry-propagation
- problem described above. This way we can reuse the has_nul in the
- next step and also use syndrome value trick at the end. */
- rev tmp3, data1
- #define data1_fixed tmp3
-#else
- #define data1_fixed data1
-#endif
- sub has_nul, data1_fixed, zeroones
- orr tmp3, data1_fixed, #REP8_7f
- eor diff, data2, data1 /* Non-zero if differences found. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- cmp limit, neg_offset, lsr #3
- orr syndrome, diff, has_nul
- bic syndrome, syndrome, mask /* Ignore later bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- /* STEP_C: Compare second part of data1 to first part of tmp1. */
- ldp tmp1, tmp2, [src2], #16
- cmp limit, #8
- LS_BK data2, tmp1, neg_offset
- eor diff, data2, data1 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- and syndrome, syndrome, mask /* Ignore earlier bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- ldr data1, [src1], #8
- sub limit, limit, #8
- b L(loop_misaligned)
-
-#ifdef __AARCH64EB__
-L(syndrome_check):
- clz pos, syndrome
- cmp pos, limit, lsl #3
- b.lo L(end_quick)
-#endif
-
-L(ret0):
- mov result, #0
- ret
-END(__strncmp_aarch64_mte)
-
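The deleted strncmp-mte's L(mutual_align) grows the byte limit without wrapping when it is near ULONG_MAX. A sketch of what the adds/csinv pair computes:

    #include <stdint.h>

    static uint64_t adjust_limit (uint64_t limit, uint64_t count)
    {
      uint64_t sum = limit + count;
      /* csinv on carry-out: saturate to all-ones instead of wrapping. */
      return sum < limit ? UINT64_MAX : sum;
    }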
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index 234190e..6f31eca 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,23 +1,23 @@
/*
* strncmp - compare two strings with limit
*
- * Copyright (c) 2018-2021, Arm Limited.
+ * Copyright (c) 2018, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__strncmp_aarch64_sve)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
+ .arch armv8-a+sve
+ .text
+
+ .globl __strncmp_aarch64_sve
+ .type __strncmp_aarch64_sve, %function
+ .p2align 4
+__strncmp_aarch64_sve:
setffr /* initialize FFR */
mov x3, 0 /* initialize off */
@@ -63,7 +63,4 @@ ENTRY (__strncmp_aarch64_sve)
9: mov x0, 0 /* return equal */
ret
-END (__strncmp_aarch64_sve)
-
-#endif
-
+ .size __strncmp_aarch64_sve, . - __strncmp_aarch64_sve
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 738b653..fbd08ee 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,7 +1,7 @@
/*
* strncmp - compare two strings
*
- * Copyright (c) 2013-2021, Arm Limited.
+ * Copyright (c) 2013, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -40,10 +40,12 @@
#define endloop x15
#define count mask
-ENTRY (__strncmp_aarch64)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
+ .text
+ .p2align 6
+ .rep 7
+ nop /* Pad so that the loop below fits a cache line. */
+ .endr
+ENTRY_ALIGN (__strncmp_aarch64, 0)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -58,7 +60,7 @@ ENTRY (__strncmp_aarch64)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- .p2align 4
+ /* Start of performance-critical section -- one 64B cache line. */
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -71,7 +73,7 @@ L(start_realigned):
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
- /* End of main loop */
+ /* End of performance-critical section -- one 64B cache line. */
/* Not reached the limit, must have found the end or a diff. */
tbz limit_wd, #63, L(not_limit)
@@ -176,7 +178,7 @@ L(mutual_align):
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
- .p2align 4
+ .p2align 6
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
@@ -257,4 +259,3 @@ L(ret0):
ret
END ( __strncmp_aarch64)
-
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 5b9ebf7..3a9be08 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,22 +1,23 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2019-2021, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__strnlen_aarch64_sve)
- PTR_ARG (0)
- SIZE_ARG (1)
+ .arch armv8-a+sve
+ .text
+
+ .globl __strnlen_aarch64_sve
+ .type __strnlen_aarch64_sve, %function
+ .p2align 4
+__strnlen_aarch64_sve:
setffr /* initialize FFR */
mov x2, 0 /* initialize len */
b 1f
@@ -65,10 +66,7 @@ ENTRY (__strnlen_aarch64_sve)
b 1b
/* End of count. Return max. */
-9: mov x0, x1
+9: mov x0, x2
ret
-END (__strnlen_aarch64_sve)
-
-#endif
-
+ .size __strnlen_aarch64_sve, . - __strnlen_aarch64_sve
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index 48d2495..df66b60 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -1,112 +1,155 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2020, Arm Limited.
+ * Copyright (c) 2013, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
+ * ARMv8-a, AArch64
*/
#include "../asmdefs.h"
+/* Arguments and results. */
#define srcin x0
-#define cntin x1
-#define result x0
+#define len x0
+#define limit x1
+/* Locals and temporaries. */
#define src x2
-#define synd x3
-#define shift x4
-#define wtmp w4
-#define tmp x4
-#define cntrem x5
-
-#define qdata q0
-#define vdata v0
-#define vhas_chr v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define data1 x3
+#define data2 x4
+#define data2a x5
+#define has_nul1 x6
+#define has_nul2 x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define tmp4 x11
+#define zeroones x12
+#define pos x13
+#define limit_wd x14
-/*
- Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (__strnlen_aarch64)
- PTR_ARG (0)
- SIZE_ARG (1)
- bic src, srcin, 15
- mov wtmp, 0xf00f
- cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src], 16
- dup vrepmask.8h, wtmp
- cmeq vhas_chr.16b, vdata.16b, 0
- lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- lsr synd, synd, shift
- cbz synd, L(start_loop)
-L(finish):
- rbit synd, synd
- clz synd, synd
- lsr result, synd, 2
- cmp cntin, result
- csel result, cntin, result, ls
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ .text
+ .p2align 6
+L(start):
+ /* Pre-pad to ensure the critical loop begins on an icache line. */
+ .rep 7
+ nop
+ .endr
+ /* Put this code here to avoid wasting more space with pre-padding. */
+L(hit_limit):
+ mov len, limit
ret
-L(start_loop):
- sub tmp, src, srcin
- subs cntrem, cntin, tmp
- b.ls L(nomatch)
-
- /* Make sure the loop won't over-read by a 16-byte chunk. */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
- .p2align 5
-L(loop32):
- ldr qdata, [src], 16
- cmeq vhas_chr.16b, vdata.16b, 0
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- cbnz synd, L(end)
-L(loop32_2):
- ldr qdata, [src], 16
- subs cntrem, cntrem, 32
- cmeq vhas_chr.16b, vdata.16b, 0
- b.ls L(end)
- umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- fmov synd, dend
- cbz synd, L(loop32)
-
-L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- sub src, src, 16
- mov synd, vend.d[0]
- sub result, src, srcin
-#ifndef __AARCH64EB__
- rbit synd, synd
+ENTRY_ALIGN (__strnlen_aarch64, 0)
+ cbz limit, L(hit_limit)
+ mov zeroones, #REP8_01
+ bic src, srcin, #15
+ ands tmp1, srcin, #15
+ b.ne L(misaligned)
+ /* Calculate the number of full and partial words -1. */
+ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
+ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+
+ /* Start of critical section -- keep to one 64-byte cache line. */
+L(loop):
+ ldp data1, data2, [src], #16
+L(realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ subs limit_wd, limit_wd, #1
+ orr tmp1, has_nul1, has_nul2
+ ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
+ b.eq L(loop)
+ /* End of critical section -- keep to one 64-byte cache line. */
+
+ orr tmp1, has_nul1, has_nul2
+ cbz tmp1, L(hit_limit) /* No null in final Qword. */
+
+ /* We know there's a null in the final Qword. The easiest thing
+ to do now is work out the length of the string and return
+ MIN (len, limit). */
+
+ sub len, src, srcin
+ cbz has_nul1, L(nul_in_data2)
+#ifdef __AARCH64EB__
+ mov data2, data1
+#endif
+ sub len, len, #8
+ mov has_nul2, has_nul1
+L(nul_in_data2):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ rev data2, data2
+ sub tmp1, data2, zeroones
+ orr tmp2, data2, #REP8_7f
+ bic has_nul2, tmp1, tmp2
#endif
- clz synd, synd
- add result, result, synd, lsr 2
- cmp cntin, result
- csel result, cntin, result, ls
+ sub len, len, #8
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ add len, len, pos, lsr #3 /* Bits to bytes. */
+ cmp len, limit
+ csel len, len, limit, ls /* Return the lower value. */
ret
-L(nomatch):
- mov result, cntin
- ret
+L(misaligned):
+ /* Deal with a partial first word.
+	   We're doing two things in parallel here:
+ 1) Calculate the number of words (but avoiding overflow if
+ limit is near ULONG_MAX) - to do this we need to work out
+ limit + tmp1 - 1 as a 65-bit value before shifting it;
+ 2) Load and mask the initial data words - we force the bytes
+ before the ones we are interested in to 0xff - this ensures
+ early bytes will not hit any zero detection. */
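(For offsets of 1-7 bytes the little-endian masking step can be
modelled in C as below; the names are assumed, and offsets of 8-15 are
handled by the csinv/csel pair at the end of this block.)

#include <stdint.h>

/* offset = srcin & 15, assumed here to be 1..7; word is the first
   aligned 8-byte load.  OR-ing 0xff into the bytes before the string
   start guarantees they cannot match the zero-byte test.  */
static inline uint64_t
mask_early_bytes (uint64_t word, unsigned offset)
{
  uint64_t mask = ~0ULL >> (8 * (8 - offset)); /* low 'offset' bytes */
  return word | mask;
}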
+ sub limit_wd, limit, #1
+ neg tmp4, tmp1
+ cmp tmp1, #8
-END (__strnlen_aarch64)
+ and tmp3, limit_wd, #15
+ lsr limit_wd, limit_wd, #4
+ mov tmp2, #~0
+
+ ldp data1, data2, [src], #16
+ lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */
+ add tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit_wd, tmp3, lsr #4
+
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ csinv data1, data1, xzr, le
+ csel data2, data2, data2a, le
+ b L(realigned)
+
+END (__strnlen_aarch64)
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
deleted file mode 100644
index 1e4fb1a..0000000
--- a/string/aarch64/strrchr-mte.S
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * strrchr - find last position of a character in a string.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define srcin x0
-#define chrin w1
-#define result x0
-
-#define src x2
-#define tmp x3
-#define wtmp w3
-#define synd x3
-#define shift x4
-#define src_match x4
-#define nul_match x5
-#define chr_match x6
-
-#define vrepchr v0
-#define vdata v1
-#define vhas_nul v2
-#define vhas_chr v3
-#define vrepmask v4
-#define vrepmask2 v5
-#define vend v5
-#define dend d5
-
-/* Core algorithm.
-
- For each 16-byte chunk we calculate a 64-bit syndrome value, with
- four bits per byte (LSB is always in bits 0 and 1, for both big
- and little-endian systems). For each tuple, bits 0-1 are set if
- the relevant byte matched the requested character; bits 2-3 are set
- if the relevant byte matched the NUL end of string. */
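(A C sketch of the tail decode used below, with assumed names: keep
only character matches strictly before the first NUL, then locate the
last of them with a leading-zero count.)

#include <stdint.h>

static inline int
last_match_before_nul (uint64_t synd)
{
  uint64_t nul_match = synd & 0xccccccccccccccccULL; /* NUL bits 2-3 */
  uint64_t chr_match = synd & 0x3333333333333333ULL; /* chr bits 0-1 */
  chr_match &= nul_match - 1; /* drop matches at or after the NUL */
  if (chr_match == 0)
    return -1; /* no match before the NUL */
  return (63 - __builtin_clzll (chr_match)) >> 2; /* byte index */
}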
-
-ENTRY (__strrchr_aarch64_mte)
- PTR_ARG (0)
- bic src, srcin, 15
- dup vrepchr.16b, chrin
- mov wtmp, 0x3003
- dup vrepmask.8h, wtmp
- tst srcin, 15
- beq L(loop1)
-
- ld1 {vdata.16b}, [src], 16
- cmeq vhas_nul.16b, vdata.16b, 0
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp, 0xf00f
- dup vrepmask2.8h, wtmp
- bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- lsl shift, srcin, 2
- fmov synd, dend
- lsr synd, synd, shift
- lsl synd, synd, shift
- ands nul_match, synd, 0xcccccccccccccccc
- bne L(tail)
- cbnz synd, L(loop2)
-
- .p2align 5
-L(loop1):
- ld1 {vdata.16b}, [src], 16
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop1)
-
- cmeq vhas_nul.16b, vdata.16b, 0
- bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- bic vhas_nul.8h, 0x0f, lsl 8
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- ands nul_match, synd, 0xcccccccccccccccc
- beq L(loop2)
-
-L(tail):
- sub nul_match, nul_match, 1
- and chr_match, synd, 0x3333333333333333
- ands chr_match, chr_match, nul_match
- sub result, src, 1
- clz tmp, chr_match
- sub result, result, tmp, lsr 2
- csel result, result, xzr, ne
- ret
-
- .p2align 4
-L(loop2):
- cmp synd, 0
- csel src_match, src, src_match, ne
- csel chr_match, synd, chr_match, ne
- ld1 {vdata.16b}, [src], 16
- cmeq vhas_nul.16b, vdata.16b, 0
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- tst synd, 0xcccccccccccccccc
- beq L(loop2)
-
- bic vhas_nul.8h, 0x0f, lsl 8
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- and nul_match, synd, 0xcccccccccccccccc
- sub nul_match, nul_match, 1
- and tmp, synd, 0x3333333333333333
- ands tmp, tmp, nul_match
- csel chr_match, tmp, chr_match, ne
- csel src_match, src, src_match, ne
- sub src_match, src_match, 1
- clz tmp, chr_match
- sub result, src_match, tmp, lsr 2
- ret
-
-END (__strrchr_aarch64_mte)
-
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index d36d69a..bb522e7 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,21 +1,23 @@
/*
 * strrchr - find the last occurrence of a character in a string
*
- * Copyright (c) 2019-2021, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#include "../asmdefs.h"
-
-#if __ARM_FEATURE_SVE
/* Assumptions:
*
* ARMv8-a, AArch64
* SVE Available.
*/
-ENTRY (__strrchr_aarch64_sve)
- PTR_ARG (0)
+ .arch armv8-a+sve
+ .text
+
+ .globl __strrchr_aarch64_sve
+ .type __strrchr_aarch64_sve, %function
+ .p2align 4
+__strrchr_aarch64_sve:
dup z1.b, w1 /* replicate byte across vector */
setffr /* initialize FFR */
ptrue p1.b /* all ones; loop invariant */
@@ -78,7 +80,4 @@ ENTRY (__strrchr_aarch64_sve)
5: mov x0, 0
ret
-END (__strrchr_aarch64_sve)
-
-#endif
-
+ .size __strrchr_aarch64_sve, . - __strrchr_aarch64_sve
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 56185ff..1b4caac 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -55,7 +55,6 @@
identify exactly which byte is causing the termination, and why. */
ENTRY (__strrchr_aarch64)
- PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
@@ -85,38 +84,38 @@ ENTRY (__strrchr_aarch64)
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vend1.d[0]
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vhas_nul1.d[0]
lsl tmp1, tmp1, #1
mov const_m1, #~0
+ mov chr_match, vhas_chr1.d[0]
lsr tmp3, const_m1, tmp1
- mov chr_match, vend1.d[1]
bic nul_match, nul_match, tmp3 // Mask padding bits.
bic chr_match, chr_match, tmp3 // Mask padding bits.
cbnz nul_match, L(tail)
- .p2align 4
L(loop):
cmp chr_match, #0
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
L(aligned):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- uminp vend1.16b, vdata1.16b, vdata2.16b
+ addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- cmeq vend1.16b, vend1.16b, 0
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
- mov chr_match, vend1.d[1]
+ mov chr_match, vhas_chr1.d[0]
cbz nul_match, L(loop)
- cmeq vhas_nul1.16b, vdata1.16b, #0
- cmeq vhas_nul2.16b, vdata2.16b, #0
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
@@ -146,4 +145,3 @@ L(tail):
ret
END (__strrchr_aarch64)
-
diff --git a/string/arm/check-arch.S b/string/arm/check-arch.S
deleted file mode 100644
index 1cff934..0000000
--- a/string/arm/check-arch.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * check ARCH setting.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if !__arm__
-# error ARCH setting does not match the compiler.
-#endif
diff --git a/string/arm/memchr.S b/string/arm/memchr.S
index 3f1ac4d..2eff4d1 100644
--- a/string/arm/memchr.S
+++ b/string/arm/memchr.S
@@ -1,7 +1,7 @@
/*
* memchr - scan memory for a character
*
- * Copyright (c) 2010-2021, Arm Limited.
+ * Copyright (c) 2010, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -31,6 +31,7 @@
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
+ .text
.thumb
@ ---------------------------------------------------------------------------
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index 86e6493..aab78a2 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -1,7 +1,7 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2013-2020, Arm Limited.
+ * Copyright (c) 2013, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -124,7 +124,7 @@ ENTRY (__memcpy_arm)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
- bhs L(cpy_not_short)
+ bge L(cpy_not_short)
/* Deal with small copies quickly by dropping straight into the
exit block. */
@@ -239,10 +239,10 @@ L(cpy_not_short):
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
- blo L(tail63aligned)
+ blt L(tail63aligned)
cmp tmp2, #512
- bhs L(cpy_body_long)
+ bge L(cpy_body_long)
L(cpy_body_medium): /* Count in tmp2. */
#ifdef USE_VFP
@@ -266,7 +266,7 @@ L(cpy_body_medium): /* Count in tmp2. */
add src, src, #64
vstr d1, [dst, #56]
add dst, dst, #64
- bhs 1b
+ bge 1b
tst tmp2, #0x3f
beq L(done)
@@ -312,7 +312,7 @@ L(tail63aligned): /* Count in tmp2. */
ldrd A_l, A_h, [src, #64]!
strd A_l, A_h, [dst, #64]!
subs tmp2, tmp2, #64
- bhs 1b
+ bge 1b
tst tmp2, #0x3f
bne 1f
ldr tmp2,[sp], #FRAME_SIZE
@@ -383,7 +383,7 @@ L(cpy_body_long): /* Count in tmp2. */
add src, src, #32
subs tmp2, tmp2, #prefetch_lines * 64 * 2
- blo 2f
+ blt 2f
1:
cpy_line_vfp d3, 0
cpy_line_vfp d4, 64
@@ -395,7 +395,7 @@ L(cpy_body_long): /* Count in tmp2. */
add dst, dst, #2 * 64
add src, src, #2 * 64
subs tmp2, tmp2, #prefetch_lines * 64
- bhs 1b
+ bge 1b
2:
cpy_tail_vfp d3, 0
@@ -499,15 +499,15 @@ L(cpy_notaligned):
1:
pld [src, #(3 * 64)]
subs count, count, #64
- ldrlo tmp2, [sp], #FRAME_SIZE
- blo L(tail63unaligned)
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi L(tail63unaligned)
pld [src, #(4 * 64)]
#ifdef USE_NEON
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
- blo 2f
+ bmi 2f
1:
pld [src, #(4 * 64)]
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
@@ -515,7 +515,7 @@ L(cpy_notaligned):
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
- bhs 1b
+ bpl 1b
2:
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
diff --git a/string/arm/memset.S b/string/arm/memset.S
index 11e9273..3ee5238 100644
--- a/string/arm/memset.S
+++ b/string/arm/memset.S
@@ -1,7 +1,7 @@
/*
* memset - fill memory with a constant
*
- * Copyright (c) 2010-2021, Arm Limited.
+ * Copyright (c) 2010, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -25,6 +25,7 @@
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
+ .text
.thumb
@ ---------------------------------------------------------------------------
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index b75d414..d615231 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -1,12 +1,10 @@
/*
* strcmp for ARMv6-M (optimized for performance, not size)
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2014-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#if __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
-
.thumb_func
.syntax unified
.arch armv6-m
@@ -113,5 +111,3 @@ ENTRY_ALIGN (__strcmp_armv6m, 4)
pop {r4, r5, r6, pc}
END (__strcmp_armv6m)
-
-#endif /* __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1 */
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index 51443e3..295db8b 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -1,12 +1,10 @@
/*
* strcmp for ARMv7
*
- * Copyright (c) 2012-2021, Arm Limited.
+ * Copyright (c) 2012-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
-
/* Implementation of strcmp for ARMv7 when DSP instructions are
available. Use ldrd to support wider loads, provided the data
is sufficiently aligned. Use saturating arithmetic to optimize
@@ -125,6 +123,7 @@
#endif
.endm
+ .text
.p2align 5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
@@ -471,5 +470,3 @@ L(strcmp_tail):
bx lr
END (__strcmp_arm)
-
-#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */
diff --git a/string/arm/strcpy.c b/string/arm/strcpy.c
index 02cf94f..48ebbe8 100644
--- a/string/arm/strcpy.c
+++ b/string/arm/strcpy.c
@@ -1,12 +1,10 @@
/*
* strcpy
*
- * Copyright (c) 2008-2020, Arm Limited.
+ * Copyright (c) 2008-2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#if defined (__thumb2__) && !defined (__thumb__)
-
/* For GLIBC:
#include <string.h>
#include <memcopy.h>
@@ -129,5 +127,3 @@ __strcpy_arm (char* dst, const char* src)
"BX LR");
}
/* For GLIBC: libc_hidden_builtin_def (strcpy) */
-
-#endif /* defined (__thumb2__) && !defined (__thumb__) */
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 5ad30c9..76e6930 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -1,12 +1,10 @@
/*
* strlen - calculate the length of a string
*
- * Copyright (c) 2010-2020, Arm Limited.
+ * Copyright (c) 2010, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
-
/*
Assumes:
ARMv6T2, AArch32
@@ -120,5 +118,3 @@ L(misaligned8):
b L(start_realigned)
END (__strlen_armv6t2)
-
-#endif /* __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 */
diff --git a/string/asmdefs.h b/string/asmdefs.h
index 340b427..7d143a9 100644
--- a/string/asmdefs.h
+++ b/string/asmdefs.h
@@ -1,64 +1,13 @@
/*
* Macros for asm code.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
-#if defined(__aarch64__)
-
-/* Branch Target Identification support.  */
-#define BTI_C hint 34
-#define BTI_J hint 36
-/* Return address signing support (pac-ret). */
-#define PACIASP hint 25; .cfi_window_save
-#define AUTIASP hint 29; .cfi_window_save
-
-/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
-#define FEATURE_1_AND 0xc0000000
-#define FEATURE_1_BTI 1
-#define FEATURE_1_PAC 2
-
-/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
-#define GNU_PROPERTY(type, value) \
- .section .note.gnu.property, "a"; \
- .p2align 3; \
- .word 4; \
- .word 16; \
- .word 5; \
- .asciz "GNU"; \
- .word type; \
- .word 4; \
- .word value; \
- .word 0; \
- .text
-
-/* If set, the GNU Property Note section will be added to
-   mark objects as supporting BTI and PAC-RET.  */
-#ifndef WANT_GNU_PROPERTY
-#define WANT_GNU_PROPERTY 1
-#endif
-
-#if WANT_GNU_PROPERTY
-/* Add property note with supported features to all asm files. */
-GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
-#endif
-
-#define ENTRY_ALIGN(name, alignment) \
- .global name; \
- .type name,%function; \
- .align alignment; \
- name: \
- .cfi_startproc; \
- BTI_C;
-
-#else
-
-#define END_FILE
-
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
@@ -66,8 +15,6 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
name: \
.cfi_startproc;
-#endif
-
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
@@ -81,18 +28,4 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#define L(l) .L ## l
-#ifdef __ILP32__
- /* Sanitize padding bits of pointer arguments as per aapcs64 */
-#define PTR_ARG(n) mov w##n, w##n
-#else
-#define PTR_ARG(n)
-#endif
-
-#ifdef __ILP32__
- /* Sanitize padding bits of size arguments as per aapcs64 */
-#define SIZE_ARG(n) mov w##n, w##n
-#else
-#define SIZE_ARG(n)
-#endif
-
#endif
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
deleted file mode 100644
index d5d4ea7..0000000
--- a/string/bench/memcpy.c
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * memcpy benchmark.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define _GNU_SOURCE
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "stringlib.h"
-#include "benchlib.h"
-
-#define ITERS 5000
-#define ITERS2 20000000
-#define ITERS3 500000
-#define MAX_COPIES 8192
-#define SIZE (256*1024)
-
-static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64)));
-static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64)));
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun)(void *, const void *, size_t);
-} funtab[] =
-{
- F(memcpy)
-#if __aarch64__
- F(__memcpy_aarch64)
-# if __ARM_NEON
- F(__memcpy_aarch64_simd)
-# endif
-#elif __arm__
- F(__memcpy_arm)
-#endif
-#undef F
- {0, 0}
-};
-
-typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
-typedef struct { uint8_t align; uint16_t freq; } align_data_t;
-
-#define SIZE_NUM 65536
-#define SIZE_MASK (SIZE_NUM-1)
-static uint8_t size_arr[SIZE_NUM];
-
-/* Frequency data for memcpy sizes below 4096 bytes, based on SPEC2017.  */
-static freq_data_t size_freq[] =
-{
-{32,22320}, { 16,9554}, { 8,8915}, {152,5327}, { 4,2159}, {292,2035},
-{ 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721},
-{120, 661}, { 2, 649}, {882, 550}, { 5, 475}, { 7, 461}, {108, 460},
-{ 10, 361}, { 9, 361}, { 6, 334}, { 3, 326}, {464, 308}, {2048,303},
-{ 1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185},
-{192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288, 96},
-{104, 96}, {1144, 83}, { 18, 80}, { 23, 78}, { 40, 77}, { 19, 68},
-{ 48, 63}, { 17, 57}, { 72, 54}, {1280, 51}, { 20, 49}, { 28, 47},
-{ 22, 46}, {640, 45}, { 25, 41}, { 14, 40}, { 56, 37}, { 27, 35},
-{ 35, 33}, {384, 33}, { 29, 32}, { 80, 30}, {4095, 22}, {232, 22},
-{ 36, 19}, {184, 17}, { 21, 17}, {256, 16}, { 44, 15}, { 26, 15},
-{ 31, 14}, { 88, 14}, {176, 13}, { 33, 12}, {1024, 12}, {208, 11},
-{ 62, 11}, {128, 10}, {704, 10}, {324, 10}, { 96, 10}, { 60, 9},
-{136, 9}, {124, 9}, { 34, 8}, { 30, 8}, {480, 8}, {1344, 8},
-{273, 7}, {520, 7}, {112, 6}, { 52, 6}, {344, 6}, {336, 6},
-{504, 5}, {168, 5}, {424, 5}, { 0, 4}, { 76, 3}, {200, 3},
-{512, 3}, {312, 3}, {240, 3}, {960, 3}, {264, 2}, {672, 2},
-{ 38, 2}, {328, 2}, { 84, 2}, { 39, 2}, {216, 2}, { 42, 2},
-{ 37, 2}, {1608, 2}, { 70, 2}, { 46, 2}, {536, 2}, {280, 1},
-{248, 1}, { 47, 1}, {1088, 1}, {1288, 1}, {224, 1}, { 41, 1},
-{ 50, 1}, { 49, 1}, {808, 1}, {360, 1}, {440, 1}, { 43, 1},
-{ 45, 1}, { 78, 1}, {968, 1}, {392, 1}, { 54, 1}, { 53, 1},
-{ 59, 1}, {376, 1}, {664, 1}, { 58, 1}, {272, 1}, { 66, 1},
-{2688, 1}, {472, 1}, {568, 1}, {720, 1}, { 51, 1}, { 63, 1},
-{ 86, 1}, {496, 1}, {776, 1}, { 57, 1}, {680, 1}, {792, 1},
-{122, 1}, {760, 1}, {824, 1}, {552, 1}, { 67, 1}, {456, 1},
-{984, 1}, { 74, 1}, {408, 1}, { 75, 1}, { 92, 1}, {576, 1},
-{116, 1}, { 65, 1}, {117, 1}, { 82, 1}, {352, 1}, { 55, 1},
-{100, 1}, { 90, 1}, {696, 1}, {111, 1}, {880, 1}, { 79, 1},
-{488, 1}, { 61, 1}, {114, 1}, { 94, 1}, {1032, 1}, { 98, 1},
-{ 87, 1}, {584, 1}, { 85, 1}, {648, 1}, {0, 0}
-};
-
-#define ALIGN_NUM 1024
-#define ALIGN_MASK (ALIGN_NUM-1)
-static uint8_t src_align_arr[ALIGN_NUM];
-static uint8_t dst_align_arr[ALIGN_NUM];
-
-/* Source alignment frequency for memcpy based on SPEC2017. */
-static align_data_t src_align_freq[] =
-{
- {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0}
-};
-
-static align_data_t dst_align_freq[] =
-{
- {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0}
-};
-
-typedef struct
-{
- uint64_t src : 24;
- uint64_t dst : 24;
- uint64_t len : 16;
-} copy_t;
-
-static copy_t copy[MAX_COPIES];
-
-typedef char *(*proto_t) (char *, const char *, size_t);
-
-static void
-init_copy_distribution (void)
-{
- int i, j, freq, size, n;
-
- for (n = i = 0; (freq = size_freq[i].freq) != 0; i++)
- for (j = 0, size = size_freq[i].size; j < freq; j++)
- size_arr[n++] = size;
- assert (n == SIZE_NUM);
-
- for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++)
- for (j = 0, size = src_align_freq[i].align; j < freq; j++)
- src_align_arr[n++] = size - 1;
- assert (n == ALIGN_NUM);
-
- for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++)
- for (j = 0, size = dst_align_freq[i].align; j < freq; j++)
- dst_align_arr[n++] = size - 1;
- assert (n == ALIGN_NUM);
-}
-
-static size_t
-init_copies (size_t max_size)
-{
- size_t total = 0;
- /* Create a random set of copies with the given size and alignment
- distributions. */
- for (int i = 0; i < MAX_COPIES; i++)
- {
- copy[i].dst = (rand32 (0) & (max_size - 1));
- copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
- copy[i].src = (rand32 (0) & (max_size - 1));
- copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
- copy[i].len = size_arr[rand32 (0) & SIZE_MASK];
- total += copy[i].len;
- }
-
- return total;
-}
-
-int main (void)
-{
- init_copy_distribution ();
-
- memset (a, 1, sizeof (a));
- memset (b, 2, sizeof (b));
-
- printf("Random memcpy:\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t total = 0;
- uint64_t tsum = 0;
- printf ("%22s (B/ns) ", funtab[f].name);
- rand32 (0x12345678);
-
- for (int size = 16384; size <= SIZE; size *= 2)
- {
- size_t copy_size = init_copies (size) * ITERS;
-
- for (int c = 0; c < MAX_COPIES; c++)
- funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < MAX_COPIES; c++)
- funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len);
- t = clock_get_ns () - t;
- total += copy_size;
- tsum += t;
- printf ("%dK: %.2f ", size / 1024, (double)copy_size / t);
- }
- printf( "avg %.2f\n", (double)total / tsum);
- }
-
- printf ("\nMedium memcpy:\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s (B/ns) ", funtab[f].name);
-
- for (int size = 16; size <= 512; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (b, a, size);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("\nLarge memcpy:\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s (B/ns) ", funtab[f].name);
-
- for (int size = 1024; size <= 32768; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (b, a, size);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("\nUnaligned forwards memmove:\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s (B/ns) ", funtab[f].name);
-
- for (int size = 1024; size <= 32768; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a, a + 256 + (i & 31), size);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
-
- printf ("\nUnaligned backwards memmove:\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s (B/ns) ", funtab[f].name);
-
- for (int size = 1024; size <= 32768; size *= 2)
- {
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a + 256 + (i & 31), a, size);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- return 0;
-}
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
deleted file mode 100644
index cc0f04b..0000000
--- a/string/bench/strlen.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * strlen benchmark.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define _GNU_SOURCE
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "stringlib.h"
-#include "benchlib.h"
-
-#define ITERS 2000
-#define ITERS2 20000000
-#define ITERS3 2000000
-#define NUM_STRLEN 16384
-
-#define MAX_ALIGN 32
-#define MAX_STRLEN 256
-
-static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
-
-#define F(x, mte) {#x, x, mte},
-
-static const struct fun
-{
- const char *name;
- size_t (*fun) (const char *s);
- int test_mte;
-} funtab[] = {
- // clang-format off
- F(strlen, 0)
-#if __aarch64__
- F(__strlen_aarch64, 0)
- F(__strlen_aarch64_mte, 1)
-# if __ARM_FEATURE_SVE
- F(__strlen_aarch64_sve, 1)
-# endif
-#elif __arm__
-# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
- F(__strlen_armv6t2, 0)
-# endif
-#endif
- {0, 0, 0}
- // clang-format on
-};
-#undef F
-
-static uint16_t strlen_tests[NUM_STRLEN];
-
-typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
-typedef struct { uint8_t align; uint16_t freq; } align_data_t;
-
-#define SIZE_NUM 65536
-#define SIZE_MASK (SIZE_NUM - 1)
-static uint8_t strlen_len_arr[SIZE_NUM];
-
-/* Frequency data for strlen sizes up to 128 based on SPEC2017. */
-static freq_data_t strlen_len_freq[] =
-{
- { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115},
- { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418},
- { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79},
- { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21},
- { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9},
- { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5},
- { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2},
- { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1},
- { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1},
- { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1},
- { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1},
- {107, 1}, { 0, 0}
-};
-
-#define ALIGN_NUM 1024
-#define ALIGN_MASK (ALIGN_NUM - 1)
-static uint8_t strlen_align_arr[ALIGN_NUM];
-
-/* Alignment data for strlen based on SPEC2017. */
-static align_data_t string_align_freq[] =
-{
- {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
-};
-
-static void
-init_strlen_distribution (void)
-{
- int i, j, freq, size, n;
-
- for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
- for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
- strlen_len_arr[n++] = size;
- assert (n == SIZE_NUM);
-
- for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
- for (j = 0, size = string_align_freq[i].align; j < freq; j++)
- strlen_align_arr[n++] = size;
- assert (n == ALIGN_NUM);
-}
-
-static void
-init_strlen_tests (void)
-{
- uint16_t index[MAX_ALIGN];
-
- memset (a, 'x', sizeof (a));
-
- /* Create indices for strings at all alignments. */
- for (int i = 0; i < MAX_ALIGN; i++)
- {
- index[i] = i * (MAX_STRLEN + 1);
- a[index[i] + MAX_STRLEN] = 0;
- }
-
- /* Create a random set of strlen input strings using the string length
- and alignment distributions. */
- for (int n = 0; n < NUM_STRLEN; n++)
- {
- int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
- int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
-
- strlen_tests[n] =
- index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
- }
-}
-
-static volatile size_t maskv = 0;
-
-int main (void)
-{
- rand32 (0x12345678);
- init_strlen_distribution ();
- init_strlen_tests ();
-
- printf ("\nRandom strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- size_t res = 0, strlen_size = 0, mask = maskv;
- printf ("%22s ", funtab[f].name);
-
- for (int c = 0; c < NUM_STRLEN; c++)
- strlen_size += funtab[f].fun (a + strlen_tests[c]);
- strlen_size *= ITERS;
-
-      /* Measure latency by chaining each strlen result into the next
-	 input address via (res & mask); mask is read from a volatile
-	 zero, so the dependency is real but the address is unchanged.  */
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS; i++)
- for (int c = 0; c < NUM_STRLEN; c++)
- res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
- t = clock_get_ns () - t;
- printf ("%.2f\n", (double)strlen_size / t);
- }
-
- printf ("\nSmall aligned strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 1; size <= 64; size *= 2)
- {
- memset (a, 'x', size);
- a[size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("\nSmall unaligned strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- int align = 9;
- for (int size = 1; size <= 64; size *= 2)
- {
- memset (a + align, 'x', size);
- a[align + size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS2; i++)
- funtab[f].fun (a + align);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
- }
- printf ("\n");
- }
-
- printf ("\nMedium strlen (bytes/ns):\n");
- for (int f = 0; funtab[f].name != 0; f++)
- {
- printf ("%22s ", funtab[f].name);
-
- for (int size = 128; size <= 4096; size *= 2)
- {
- memset (a, 'x', size);
- a[size - 1] = 0;
-
- uint64_t t = clock_get_ns ();
- for (int i = 0; i < ITERS3; i++)
- funtab[f].fun (a);
- t = clock_get_ns () - t;
- printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
- size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
- }
- printf ("\n");
- }
-
- printf ("\n");
-
- return 0;
-}
diff --git a/string/include/benchlib.h b/string/include/benchlib.h
deleted file mode 100644
index 0f2ce2e..0000000
--- a/string/include/benchlib.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Benchmark support functions.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include <stdint.h>
-#include <time.h>
-
-/* Fast and accurate timer returning nanoseconds. */
-static inline uint64_t
-clock_get_ns (void)
-{
- struct timespec ts;
- clock_gettime (CLOCK_MONOTONIC, &ts);
- return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
-}
-
-/* Fast 32-bit random number generator. Passing a non-zero seed
- value resets the internal state. */
-static inline uint32_t
-rand32 (uint32_t seed)
-{
- static uint64_t state = 0xb707be451df0bb19ULL;
- if (seed != 0)
- state = seed;
- uint32_t res = state >> 32;
- state = state * 6364136223846793005ULL + 1;
- return res;
-}
-
-
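(The deleted benchlib.h pairs a monotonic nanosecond timer with a
small 64-bit LCG; a hypothetical benchmark skeleton using the two
helpers above might look like this.)

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  rand32 (0x12345678); /* non-zero seed: reproducible stream */
  uint64_t t = clock_get_ns ();
  uint64_t sum = 0;
  for (int i = 0; i < 1000000; i++)
    sum += rand32 (0); /* zero argument: continue the stream */
  t = clock_get_ns () - t;
  printf ("%.2f ns/call (checksum %llu)\n", t / 1e6,
	  (unsigned long long) sum);
  return 0;
}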
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 378c3cd..b3b6181 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,7 +1,7 @@
/*
* Public API.
*
- * Copyright (c) 2019-2021, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -17,10 +17,8 @@ void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64 (void *, const void *, size_t);
void *__memset_aarch64 (void *, int, size_t);
void *__memchr_aarch64 (const void *, int, size_t);
-void *__memrchr_aarch64 (const void *, int, size_t);
int __memcmp_aarch64 (const void *, const void *, size_t);
char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64 (char *__restrict, const char *__restrict);
int __strcmp_aarch64 (const char *, const char *);
char *__strchr_aarch64 (const char *, int);
char *__strrchr_aarch64 (const char *, int);
@@ -28,15 +26,6 @@ char *__strchrnul_aarch64 (const char *, int );
size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
-void * __memchr_aarch64_mte (const void *, int, size_t);
-char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
-char *__strchr_aarch64_mte (const char *, int);
-char * __strchrnul_aarch64_mte (const char *, int );
-size_t __strlen_aarch64_mte (const char *);
-char *__strrchr_aarch64_mte (const char *, int);
-int __strcmp_aarch64_mte (const char *, const char *);
-int __strncmp_aarch64_mte (const char *, const char *, size_t);
#if __ARM_NEON
void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64_simd (void *, const void *, size_t);
@@ -49,15 +38,10 @@ char *__strrchr_aarch64_sve (const char *, int);
char *__strchrnul_aarch64_sve (const char *, int );
int __strcmp_aarch64_sve (const char *, const char *);
char *__strcpy_aarch64_sve (char *__restrict, const char *__restrict);
-char *__stpcpy_aarch64_sve (char *__restrict, const char *__restrict);
size_t __strlen_aarch64_sve (const char *);
size_t __strnlen_aarch64_sve (const char *, size_t);
int __strncmp_aarch64_sve (const char *, const char *, size_t);
# endif
-# if __ARM_FEATURE_MEMORY_TAGGING
-void *__mtag_tag_region (void *, size_t);
-void *__mtag_tag_zero_region (void *, size_t);
-# endif
#elif __arm__
void *__memcpy_arm (void *__restrict, const void *__restrict, size_t);
void *__memset_arm (void *, int, size_t);
diff --git a/string/memchr.S b/string/memchr.S
new file mode 100644
index 0000000..0a564d8
--- /dev/null
+++ b/string/memchr.S
@@ -0,0 +1,15 @@
+/*
+ * Selected possible memchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memchr-sve.S"
+# endif
+#elif __arm__
+#include "arm/memchr.S"
+#endif
diff --git a/string/memcmp.S b/string/memcmp.S
new file mode 100644
index 0000000..22da685
--- /dev/null
+++ b/string/memcmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible memcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/memcmp-sve.S"
+# endif
+#endif
diff --git a/string/memcpy.S b/string/memcpy.S
new file mode 100644
index 0000000..b52b603
--- /dev/null
+++ b/string/memcpy.S
@@ -0,0 +1,15 @@
+/*
+ * Selected possible memcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memcpy.S"
+# if __ARM_NEON
+#include "aarch64/memcpy_simd.S"
+# endif
+#elif __arm__
+#include "arm/memcpy.S"
+#endif
diff --git a/string/memset.S b/string/memset.S
new file mode 100644
index 0000000..57542ef
--- /dev/null
+++ b/string/memset.S
@@ -0,0 +1,12 @@
+/*
+ * Selected possible memset implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/memset.S"
+#elif __arm__
+#include "arm/memset.S"
+#endif
diff --git a/string/strchr.S b/string/strchr.S
new file mode 100644
index 0000000..8cead02
--- /dev/null
+++ b/string/strchr.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchr-sve.S"
+# endif
+#endif
diff --git a/string/strchrnul.S b/string/strchrnul.S
new file mode 100644
index 0000000..3dfdeef
--- /dev/null
+++ b/string/strchrnul.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strchrnul implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strchrnul.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strchrnul-sve.S"
+# endif
+#endif
diff --git a/string/strcmp.S b/string/strcmp.S
new file mode 100644
index 0000000..12530ec
--- /dev/null
+++ b/string/strcmp.S
@@ -0,0 +1,19 @@
+/*
+ * Selected possible strcmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcmp-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
+#include "arm/strcmp.S"
+# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
+#include "arm/strcmp-armv6m.S"
+# endif
+#endif
diff --git a/string/strcpy-c.c b/string/strcpy-c.c
new file mode 100644
index 0000000..6bde24a
--- /dev/null
+++ b/string/strcpy-c.c
@@ -0,0 +1,10 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __arm__ && defined (__thumb2__) && !defined (__thumb__)
+#include "arm/strcpy.c"
+#endif
diff --git a/string/strcpy.S b/string/strcpy.S
new file mode 100644
index 0000000..a604b22
--- /dev/null
+++ b/string/strcpy.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strcpy implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strcpy.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strcpy-sve.S"
+# endif
+#endif
diff --git a/string/strlen.S b/string/strlen.S
new file mode 100644
index 0000000..d681033
--- /dev/null
+++ b/string/strlen.S
@@ -0,0 +1,17 @@
+/*
+ * Selected possible strlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strlen-sve.S"
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+#include "arm/strlen-armv6t2.S"
+# endif
+#endif
diff --git a/string/strncmp.S b/string/strncmp.S
new file mode 100644
index 0000000..26b56b7
--- /dev/null
+++ b/string/strncmp.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strncmp implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strncmp.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strncmp-sve.S"
+# endif
+#endif
diff --git a/string/strnlen.S b/string/strnlen.S
new file mode 100644
index 0000000..eebe777
--- /dev/null
+++ b/string/strnlen.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strnlen implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strnlen.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strnlen-sve.S"
+# endif
+#endif
diff --git a/string/strrchr.S b/string/strrchr.S
new file mode 100644
index 0000000..119b1d5
--- /dev/null
+++ b/string/strrchr.S
@@ -0,0 +1,13 @@
+/*
+ * Selected possible strrchr implementations.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#if __aarch64__
+#include "aarch64/strrchr.S"
+# if __ARM_FEATURE_SVE
+#include "aarch64/strrchr-sve.S"
+# endif
+#endif
diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c
deleted file mode 100644
index d8c02d9..0000000
--- a/string/test/__mtag_tag_region.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * __mtag_tag_region test.
- *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "mte.h"
-#include "stringlib.h"
-#include "stringtest.h"
-
-static void
-mtag_quoteat (const char *prefix, void *p, int len, int at)
-{
- /* Print tag, untag and quote the context. */
- printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
- untag_buffer (p, len, 1);
- p = untag_pointer (p);
- quoteat (prefix, p, len, at);
-}
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun) (void *s, size_t n);
-} funtab[] = {
-// clang-format off
-#if __aarch64__
- F(__mtag_tag_region)
-#endif
- {0, 0}
- // clang-format on
-};
-#undef F
-
-#define A 64
-#define LEN 250000
-static unsigned char *sbuf;
-
-static void *
-alignup (void *p)
-{
- return (void *) (((uintptr_t) p + A - 1) & -A);
-}
-
-static void
-test (const struct fun *fun, int salign, int len)
-{
- unsigned char *src = alignup (sbuf);
- unsigned char *s = src + salign;
- void *p;
- int i;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || salign >= A)
- abort ();
- for (i = 0; i < len + 2 * A; i++)
- src[i] = '?';
- for (i = 0; i < len; i++)
- s[i] = 'a';
-
- src = tag_buffer (src, len + 2 * A, 1);
- s = src + salign;
- /* Use different tag. */
- s = __arm_mte_increment_tag (s, 1);
- p = fun->fun (s, len);
-
- if (p != s)
- ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
-
- for (i = 0; i < salign; i++)
- {
- if (src[i] != '?')
- {
- ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
- mtag_quoteat ("got head", src, len + 2 * A, i);
- return;
- }
- }
-
- for (; i < salign + len; i++)
- {
- if (s[i - salign] != 'a')
- {
- ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
- mtag_quoteat ("got body", src, len + 2 * A, i);
- return;
- }
- }
-
- for (; i < len + 2 * A; i++)
- {
- if (src[i] != '?')
- {
- ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
- mtag_quoteat ("got tail", src, len + 2 * A, i);
- return;
- }
- }
-
- untag_buffer (src, len + 2 * A, 1);
-}
-
-int
-main ()
-{
- if (!mte_enabled ())
- return 0;
-
- sbuf = mte_mmap (LEN + 3 * A);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int s = 0; s < A; s += 16)
- {
- int n;
- for (n = 0; n < 200; n += 16)
- {
- test (funtab + i, s, n);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, s, n);
- }
- }
- printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
-}
-#else
-int
-main ()
-{
- return 0;
-}
-#endif
diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c
deleted file mode 100644
index 221c223..0000000
--- a/string/test/__mtag_tag_zero_region.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * __mtag_tag_zero_region test.
- *
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "mte.h"
-#include "stringlib.h"
-#include "stringtest.h"
-
-static void
-mtag_quoteat (const char *prefix, void *p, int len, int at)
-{
- /* Print tag, untag and quote the context. */
- printf ("location: %p\n", __arm_mte_get_tag ((char *) p + at));
- untag_buffer (p, len, 1);
- p = untag_pointer (p);
- quoteat (prefix, p, len, at);
-}
-
-#define F(x) {#x, x},
-
-static const struct fun
-{
- const char *name;
- void *(*fun) (void *s, size_t n);
-} funtab[] = {
-// clang-format off
-#if __aarch64__
- F(__mtag_tag_zero_region)
-#endif
- {0, 0}
- // clang-format on
-};
-#undef F
-
-#define A 64
-#define LEN 250000
-static unsigned char *sbuf;
-
-static void *
-alignup (void *p)
-{
- return (void *) (((uintptr_t) p + A - 1) & -A);
-}
-
-static void
-test (const struct fun *fun, int salign, int len)
-{
- unsigned char *src = alignup (sbuf);
- unsigned char *s = src + salign;
- void *p;
- int i;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || salign >= A)
- abort ();
- for (i = 0; i < len + 2 * A; i++)
- src[i] = '?';
- for (i = 0; i < len; i++)
- s[i] = 'a' + i % 23;
-
- src = tag_buffer (src, len + 2 * A, 1);
- s = src + salign;
- /* Use different tag. */
- s = __arm_mte_increment_tag (s, 1);
- p = fun->fun (s, len);
-
- if (p != s)
- ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
-
- for (i = 0; i < salign; i++)
- {
- if (src[i] != '?')
- {
- ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
- mtag_quoteat ("got head", src, len + 2 * A, i);
- return;
- }
- }
-
- for (; i < salign + len; i++)
- {
- if (s[i - salign] != 0)
- {
- ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
- mtag_quoteat ("got body", src, len + 2 * A, i);
- return;
- }
- }
-
- for (; i < len + 2 * A; i++)
- {
- if (src[i] != '?')
- {
- ERR ("%s(align %d, %d) failed\n", fun->name, salign, len);
- mtag_quoteat ("got tail", src, len + 2 * A, i);
- return;
- }
- }
-
- untag_buffer (src, len + 2 * A, 1);
-}
-
-int
-main ()
-{
- if (!mte_enabled ())
- return 0;
-
- sbuf = mte_mmap (LEN + 3 * A);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int s = 0; s < A; s += 16)
- {
- int n;
- for (n = 0; n < 200; n += 16)
- {
- test (funtab + i, s, n);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, s, n);
- }
- }
- printf ("%s %s\n", err_count ? "FAIL" : "PASS", funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
-}
-#else
-int
-main ()
-{
- return 0;
-}
-#endif
diff --git a/string/test/memchr.c b/string/test/memchr.c
index 0ff77f5..1ebc6d6 100644
--- a/string/test/memchr.c
+++ b/string/test/memchr.c
@@ -1,7 +1,7 @@
/*
* memchr test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -10,101 +10,84 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun) (const void *s, int c, size_t n);
- int test_mte;
+ const char *name;
+ void *(*fun)(const void *, int c, size_t n);
} funtab[] = {
- // clang-format off
- F(memchr, 0)
+#define F(x) {#x, x},
+F(memchr)
#if __aarch64__
- F(__memchr_aarch64, 0)
- F(__memchr_aarch64_mte, 1)
+F(__memchr_aarch64)
# if __ARM_FEATURE_SVE
- F(__memchr_aarch64_sve, 1)
+F(__memchr_aarch64_sve)
# endif
#elif __arm__
- F(__memchr_arm, 0)
+F(__memchr_arm)
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
+#define A 32
+#define SP 512
+#define LEN 250000
+static unsigned char sbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int align, size_t seekpos, size_t len,
- size_t maxlen)
+static void test(const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup (sbuf);
- char *s = src + align;
- char *f = seekpos < maxlen ? s + seekpos : NULL;
- int seekchar = 1;
- void *p;
+ unsigned char *src = alignup(sbuf);
+ unsigned char *s = src + align;
+ unsigned char *f = len ? s + seekpos : 0;
+ int seekchar = 0x1;
+ int i;
+ void *p;
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || seekpos > LEN || align > ALIGN)
- abort ();
+ if (len > LEN || seekpos >= len || align >= A)
+ abort();
- for (int i = 0; src + i < s; i++)
- src[i] = seekchar;
- for (int i = 0; i <= ALIGN; i++)
- s[len + i] = seekchar;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- s[seekpos] = seekchar;
- s[((len ^ align) & 1) ? seekpos + 1 : len] = seekchar;
+ for (i = 0; i < seekpos; i++)
+ s[i] = 'a' + i%23;
+ s[i++] = seekchar;
+ for (; i < len; i++)
+ s[i] = 'a' + i%23;
- int mte_len = seekpos != -1 ? seekpos + 1 : maxlen;
- s = tag_buffer (s, mte_len, fun->test_mte);
- p = fun->fun (s, seekchar, maxlen);
- untag_buffer (s, mte_len, fun->test_mte);
- p = untag_pointer (p);
+ p = fun->fun(s, seekchar, len);
- if (p != f)
- {
- ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
- seekchar, maxlen, p, f);
- quote ("input", s, len);
- }
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- {
- for (int sp = 0; sp < LEN; sp++)
- test (funtab + i, a, sp, n, n);
- test (funtab + i, a, n, n, SIZE_MAX - a);
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ for (int n = 0; n < 100; n++)
+ for (int sp = 0; sp < n-1; sp++)
+ test(funtab+i, a, sp, n);
+ for (int n = 100; n < LEN; n *= 2) {
+ test(funtab+i, a, n-1, n);
+ test(funtab+i, a, n/2, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/memcmp.c b/string/test/memcmp.c
index 7a7cf9c..114f1d7 100644
--- a/string/test/memcmp.c
+++ b/string/test/memcmp.c
@@ -1,7 +1,7 @@
/*
* memcmp test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,117 +9,88 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- int (*fun) (const void *s1, const void *s2, size_t n);
- int test_mte;
+ const char *name;
+ int (*fun)(const void *s1, const void *s2, size_t n);
} funtab[] = {
- // clang-format off
- F(memcmp, 0)
+#define F(x) {#x, x},
+F(memcmp)
#if __aarch64__
- F(__memcmp_aarch64, 1)
+F(__memcmp_aarch64)
# if __ARM_FEATURE_SVE
- F(__memcmp_aarch64_sve, 1)
+F(__memcmp_aarch64_sve)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
#define A 32
#define LEN 250000
-static unsigned char *s1buf;
-static unsigned char *s2buf;
+static unsigned char s1buf[LEN+2*A];
+static unsigned char s2buf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + A - 1) & -A);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
- int delta)
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
{
- unsigned char *src1 = alignup (s1buf);
- unsigned char *src2 = alignup (s2buf);
- unsigned char *s1 = src1 + s1align;
- unsigned char *s2 = src2 + s2align;
- int r;
+ unsigned char *src1 = alignup(s1buf);
+ unsigned char *src2 = alignup(s2buf);
+ unsigned char *s1 = src1 + s1align;
+ unsigned char *s2 = src2 + s2align;
+ int r;
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || s1align >= A || s2align >= A)
- abort ();
- if (diffpos >= len)
- abort ();
- if ((diffpos < 0) != (delta == 0))
- abort ();
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos && diffpos >= len)
+ abort();
- for (int i = 0; i < len + A; i++)
- src1[i] = src2[i] = '?';
- for (int i = 0; i < len; i++)
- s1[i] = s2[i] = 'a' + i % 23;
- if (delta)
- s1[diffpos] += delta;
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos)
+ s1[diffpos]++;
- s1 = tag_buffer (s1, len, fun->test_mte);
- s2 = tag_buffer (s2, len, fun->test_mte);
- r = fun->fun (s1, s2, len);
- untag_buffer (s1, len, fun->test_mte);
- untag_buffer (s2, len, fun->test_mte);
+ r = fun->fun(s1, s2, len);
- if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
- {
- ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
- s1align, s2align, len, r);
- quoteat ("src1", src1, len + A, diffpos);
- quoteat ("src2", src2, len + A, diffpos);
- }
+ if ((!diffpos && r != 0) || (diffpos && r == 0)) {
+ ERR("%s(align %d, align %d, %d) failed, returned %d\n",
+ fun->name, s1align, s2align, len, r);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
}
-int
-main ()
+int main()
{
- s1buf = mte_mmap (LEN + 2 * A);
- s2buf = mte_mmap (LEN + 2 * A);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++)
- {
- int n;
- test (funtab + i, d, s, 0, -1, 0);
- test (funtab + i, d, s, 1, -1, 0);
- test (funtab + i, d, s, 1, 0, -1);
- test (funtab + i, d, s, 1, 0, 1);
- for (n = 2; n < 100; n++)
- {
- test (funtab + i, d, s, n, -1, 0);
- test (funtab + i, d, s, n, 0, -1);
- test (funtab + i, d, s, n, n - 1, -1);
- test (funtab + i, d, s, n, n / 2, 1);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, d, s, n, -1, 0);
- test (funtab + i, d, s, n, n / 2, -1);
- }
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index ce0ceee..8572452 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,7 +1,7 @@
/*
* memcpy test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,112 +9,90 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun) (void *, const void *, size_t);
- int test_mte;
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
} funtab[] = {
- // clang-format off
- F(memcpy, 0)
+#define F(x) {#x, x},
+F(memcpy)
#if __aarch64__
- F(__memcpy_aarch64, 1)
+F(__memcpy_aarch64)
# if __ARM_NEON
- F(__memcpy_aarch64_simd, 1)
+F(__memcpy_aarch64_simd)
# endif
#elif __arm__
- F(__memcpy_arm, 0)
+F(__memcpy_arm)
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
#define A 32
#define LEN 250000
-static unsigned char *dbuf;
-static unsigned char *sbuf;
-static unsigned char wbuf[LEN + 2 * A];
+static unsigned char dbuf[LEN+2*A];
+static unsigned char sbuf[LEN+2*A];
+static unsigned char wbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + A - 1) & -A);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int dalign, int salign, int len)
+static void test(const struct fun *fun, int dalign, int salign, int len)
{
- unsigned char *src = alignup (sbuf);
- unsigned char *dst = alignup (dbuf);
- unsigned char *want = wbuf;
- unsigned char *s = src + salign;
- unsigned char *d = dst + dalign;
- unsigned char *w = want + dalign;
- void *p;
- int i;
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || dalign >= A || salign >= A)
- abort ();
- for (i = 0; i < len + A; i++)
- {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (i = 0; i < len; i++)
- s[i] = w[i] = 'a' + i % 23;
-
- s = tag_buffer (s, len, fun->test_mte);
- d = tag_buffer (d, len, fun->test_mte);
- p = fun->fun (d, s, len);
- untag_buffer (s, len, fun->test_mte);
- untag_buffer (d, len, fun->test_mte);
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
- if (p != d)
- ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
- for (i = 0; i < len + A; i++)
- {
- if (dst[i] != want[i])
- {
- ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
- len);
- quoteat ("got", dst, len + A, i);
- quoteat ("want", want, len + A, i);
- break;
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
}
- }
}
-int
-main ()
+int main()
{
- dbuf = mte_mmap (LEN + 2 * A);
- sbuf = mte_mmap (LEN + 2 * A);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++)
- {
- int n;
- for (n = 0; n < 100; n++)
- test (funtab + i, d, s, n);
- for (; n < LEN; n *= 2)
- test (funtab + i, d, s, n);
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++)
+ test(funtab+i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, d, s, n);
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
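Every test here rounds its scratch buffer up to an A-byte boundary with the same bit trick, ((uintptr_t)p + A-1) & -A. It relies on A being a power of two: -A in two's complement is a mask with the low log2(A) bits clear, so adding A-1 and masking rounds up without branching. A small self-contained check of the identity:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define A 32    /* must be a power of two for the mask trick to hold */

    static void *alignup(void *p)
    {
        /* Adding A-1 then clearing the low bits rounds up to a multiple of A. */
        return (void *)(((uintptr_t)p + A - 1) & -A);
    }

    int main(void)
    {
        static unsigned char buf[4 * A];
        unsigned char *p = alignup(buf + 1);
        assert((uintptr_t)p % A == 0);              /* aligned */
        assert(p >= buf + 1 && p < buf + 1 + A);    /* rounded up, not down */
        printf("buf=%p aligned=%p\n", (void *)buf, (void *)p);
        return 0;
    }
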
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 689b68c..7891b14 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,7 +1,7 @@
/*
* memmove test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,156 +9,136 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun) (void *, const void *, size_t);
- int test_mte;
+ const char *name;
+ void *(*fun)(void *, const void *, size_t);
} funtab[] = {
- // clang-format off
- F(memmove, 0)
+#define F(x) {#x, x},
+F(memmove)
#if __aarch64__
- F(__memmove_aarch64, 1)
+F(__memmove_aarch64)
# if __ARM_NEON
- F(__memmove_aarch64_simd, 1)
+F(__memmove_aarch64_simd)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
#define A 32
#define LEN 250000
-static unsigned char *dbuf;
-static unsigned char *sbuf;
-static unsigned char wbuf[LEN + 2 * A];
+static unsigned char dbuf[LEN+2*A];
+static unsigned char sbuf[LEN+2*A];
+static unsigned char wbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + A - 1) & -A);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int dalign, int salign, int len)
+static void test(const struct fun *fun, int dalign, int salign, int len)
{
- unsigned char *src = alignup (sbuf);
- unsigned char *dst = alignup (dbuf);
- unsigned char *want = wbuf;
- unsigned char *s = src + salign;
- unsigned char *d = dst + dalign;
- unsigned char *w = want + dalign;
- void *p;
- int i;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || dalign >= A || salign >= A)
- abort ();
- for (i = 0; i < len + A; i++)
- {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (i = 0; i < len; i++)
- s[i] = w[i] = 'a' + i % 23;
-
- p = fun->fun (d, s, len);
- if (p != d)
- ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
- for (i = 0; i < len + A; i++)
- {
- if (dst[i] != want[i])
- {
- ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
- len);
- quoteat ("got", dst, len + A, i);
- quoteat ("want", want, len + A, i);
- break;
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(dbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = want + dalign;
+ void *p;
+ int i;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
+
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
}
- }
}
-static void
-test_overlap (const struct fun *fun, int dalign, int salign, int len)
+static void test_overlap(const struct fun *fun, int dalign, int salign, int len)
{
- unsigned char *src = alignup (sbuf);
- unsigned char *dst = src;
- unsigned char *want = wbuf;
- unsigned char *s = src + salign;
- unsigned char *d = dst + dalign;
- unsigned char *w = wbuf + dalign;
- void *p;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || dalign >= A || salign >= A)
- abort ();
-
- for (int i = 0; i < len + A; i++)
- src[i] = want[i] = '?';
-
- for (int i = 0; i < len; i++)
- s[i] = want[salign + i] = 'a' + i % 23;
- for (int i = 0; i < len; i++)
- w[i] = s[i];
-
- s = tag_buffer (s, len, fun->test_mte);
- d = tag_buffer (d, len, fun->test_mte);
- p = fun->fun (d, s, len);
- untag_buffer (s, len, fun->test_mte);
- untag_buffer (d, len, fun->test_mte);
-
- if (p != d)
- ERR ("%s(%p,..) returned %p\n", fun->name, d, p);
- for (int i = 0; i < len + A; i++)
- {
- if (dst[i] != want[i])
- {
- ERR ("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign,
- len);
- quoteat ("got", dst, len + A, i);
- quoteat ("want", want, len + A, i);
- break;
+ unsigned char *src = alignup(sbuf);
+ unsigned char *dst = alignup(sbuf);
+ unsigned char *want = wbuf;
+ unsigned char *s = src + salign;
+ unsigned char *d = dst + dalign;
+ unsigned char *w = wbuf + dalign;
+ void *p;
+
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+
+ for (int i = 0; i < len+A; i++)
+ src[i] = want[i] = '?';
+
+ for (int i = 0; i < len; i++)
+ s[i] = w[i] = 'a' + i%23;
+
+ /* Copy the potential overlap range. */
+ if (s < d) {
+ for (int i = 0; i < (uintptr_t)d-(uintptr_t)s; i++)
+ want[salign+i] = src[salign+i];
+ } else {
+ for (int i = 0; i < (uintptr_t)s-(uintptr_t)d; i++)
+ want[len + dalign + i] = src[len + dalign + i];
+ }
+
+ p = fun->fun(d, s, len);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (int i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ abort();
+ break;
+ }
}
- }
}
-int
-main ()
+int main()
{
- dbuf = mte_mmap (LEN + 2 * A);
- sbuf = mte_mmap (LEN + 2 * A);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++)
- {
- int n;
- for (n = 0; n < 100; n++)
- {
- test (funtab + i, d, s, n);
- test_overlap (funtab + i, d, s, n);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, d, s, n);
- test_overlap (funtab + i, d, s, n);
- }
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ test_overlap(funtab+0, 2, 1, 1);
+
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n);
+ test_overlap(funtab+i, d, s, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n);
+ test_overlap(funtab+i, d, s, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
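test_overlap builds the expected image of an overlapping copy by hand; the property under test is the C guarantee that memmove behaves as if the source were first copied to a temporary buffer, whichever way the ranges overlap. A minimal illustration of that guarantee in both directions:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Forward overlap: dest sits above src in the same buffer. */
        char a[] = "abcdef";
        memmove(a + 2, a, 4);   /* copies "abcd" over "cdef" */
        printf("%s\n", a);      /* prints "ababcd" */

        /* Backward overlap: dest sits below src. */
        char b[] = "abcdef";
        memmove(b, b + 2, 4);   /* copies "cdef" over "abcd" */
        printf("%s\n", b);      /* prints "cdefef" */
        return 0;
    }
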
diff --git a/string/test/memrchr.c b/string/test/memrchr.c
deleted file mode 100644
index adf96f0..0000000
--- a/string/test/memrchr.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * memrchr test.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include "mte.h"
-#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
-
-static const struct fun
-{
- const char *name;
- void *(*fun) (const void *s, int c, size_t n);
- int test_mte;
-} funtab[] = {
- // clang-format off
- F(memrchr, 0)
-#if __aarch64__
- F(__memrchr_aarch64, 1)
-#endif
- {0, 0, 0}
- // clang-format on
-};
-#undef F
-
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
-
-static void *
-alignup (void *p)
-{
- return (void *) (((uintptr_t) p + ALIGN) & -ALIGN);
-}
-
-static void
-test (const struct fun *fun, int align, size_t seekpos, size_t len,
- size_t maxlen)
-{
- char *src = alignup (sbuf);
- char *s = src + align;
- char *f = seekpos < maxlen ? s + seekpos : NULL;
- int seekchar = 1;
- void *p;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || seekpos > LEN || align > ALIGN)
- abort ();
-
- for (int i = 0; src + i < s; i++)
- src[i] = seekchar;
- for (int i = 0; i <= ALIGN; i++)
- s[len + i] = seekchar;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- s[seekpos] = seekchar;
- s[((len ^ align) & 1) && seekpos < maxlen ? seekpos - 1 : len] = seekchar;
-
- s = tag_buffer (s, maxlen, fun->test_mte);
- p = fun->fun (s, seekchar, maxlen);
- untag_buffer (s, maxlen, fun->test_mte);
- p = untag_pointer (p);
-
- if (p != f)
- {
- ERR ("%s (%p, 0x%02x, %zu) returned %p, expected %p\n", fun->name, s,
- seekchar, maxlen, p, f);
- quote ("input", s, len);
- }
-}
-
-int
-main (void)
-{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- {
- for (int sp = 0; sp < LEN; sp++)
- test (funtab + i, a, sp, n, n);
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
-}
diff --git a/string/test/memset.c b/string/test/memset.c
index f172144..48c10fa 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -1,7 +1,7 @@
/*
* memset test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,121 +9,103 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- void *(*fun) (void *s, int c, size_t n);
- int test_mte;
+ const char *name;
+ void *(*fun)(void *s, int c, size_t n);
} funtab[] = {
- // clang-format off
- F(memset, 0)
+#define F(x) {#x, x},
+F(memset)
#if __aarch64__
- F(__memset_aarch64, 1)
+F(__memset_aarch64)
#elif __arm__
- F(__memset_arm, 0)
+F(__memset_arm)
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
#define A 32
#define LEN 250000
-static unsigned char *sbuf;
+static unsigned char sbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + A - 1) & -A);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int salign, int c, int len)
+static void err(const char *name, unsigned char *src, int salign, int c, int len)
{
- unsigned char *src = alignup (sbuf);
- unsigned char *s = src + salign;
- void *p;
- int i;
+ ERR("%s(align %d, %d, %d) failed\n", name, salign, c, len);
+ ERR("got : %.*s\n", salign+len+1, src);
+}
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || salign >= A)
- abort ();
- for (i = 0; i < len + A; i++)
- src[i] = '?';
- for (i = 0; i < len; i++)
- s[i] = 'a' + i % 23;
+static void test(const struct fun *fun, int salign, int c, int len)
+{
+ unsigned char *src = alignup(sbuf);
+ unsigned char *s = src + salign;
+ void *p;
+ int i;
- s = tag_buffer (s, len, fun->test_mte);
- p = fun->fun (s, c, len);
- untag_buffer (s, len, fun->test_mte);
+ if (len > LEN || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++)
+ src[i] = '?';
+ for (i = 0; i < len; i++)
+ s[i] = 'a' + i%23;
+ for (; i<len%A; i++)
+ s[i] = '*';
- if (p != s)
- ERR ("%s(%p,..) returned %p\n", fun->name, s, p);
+ p = fun->fun(s, c, len);
+ if (p != s)
+ ERR("%s(%p,..) returned %p\n", fun->name, s, p);
- for (i = 0; i < salign; i++)
- {
- if (src[i] != '?')
- {
- ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
- quoteat ("got", src, len + A, i);
- return;
+ for (i = 0; i < salign; i++) {
+ if (src[i] != '?') {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
}
- }
- for (; i < salign + len; i++)
- {
- if (src[i] != (unsigned char) c)
- {
- ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
- quoteat ("got", src, len + A, i);
- return;
+ for (i = salign; i < len; i++) {
+ if (src[i] != (unsigned char)c) {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
}
- }
- for (; i < len + A; i++)
- {
- if (src[i] != '?')
- {
- ERR ("%s(align %d, %d, %d) failed\n", fun->name, salign, c, len);
- quoteat ("got", src, len + A, i);
- return;
+ for (; i < len%A; i++) {
+ if (src[i] != '*') {
+ err(fun->name, src, salign, c, len);
+ return;
+ }
}
- }
}
-int
-main ()
+int main()
{
- sbuf = mte_mmap (LEN + 2 * A);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int s = 0; s < A; s++)
- {
- int n;
- for (n = 0; n < 100; n++)
- {
- test (funtab + i, s, 0, n);
- test (funtab + i, s, 0x25, n);
- test (funtab + i, s, 0xaa25, n);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, s, 0, n);
- test (funtab + i, s, 0x25, n);
- test (funtab + i, s, 0xaa25, n);
- }
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, s, 0, n);
+ test(funtab+i, s, 0x25, n);
+ test(funtab+i, s, 0xaa25, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, s, 0, n);
+ test(funtab+i, s, 0x25, n);
+ test(funtab+i, s, 0xaa25, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
}
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ return r;
}
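The ERR macro shared by these tests leans on the comma operator so that a single expression both latches the failure flag and prints the diagnostic, and main() then folds each table entry's flag into the process exit status. A stripped-down sketch of the reporting scheme; the failing arithmetic check is only a placeholder:

    #include <stdio.h>

    static int test_status;
    /* Comma operator: record the failure, then print, in one expression. */
    #define ERR(...) (test_status = 1, printf(__VA_ARGS__))

    static void check(int got, int want)
    {
        if (got != want)
            ERR("check failed: got %d want %d\n", got, want);
    }

    int main(void)
    {
        int r = 0;
        test_status = 0;
        check(1 + 1, 2);    /* passes */
        check(2 + 2, 5);    /* placeholder failure to exercise ERR */
        printf("%s\n", test_status ? "FAIL" : "PASS");
        if (test_status)
            r = -1;         /* nonzero exit so a harness sees the failure */
        return r;
    }
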
diff --git a/string/test/mte.h b/string/test/mte.h
deleted file mode 100644
index e67cbd9..0000000
--- a/string/test/mte.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Memory tagging testing code.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef __TEST_MTE_H
-#define __TEST_MTE_H
-
-#include <stdlib.h>
-
-#if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST
-#include <arm_acle.h>
-#include <sys/mman.h>
-#include <sys/prctl.h>
-
-// These depend on a not yet merged kernel ABI.
-#define PR_SET_TAGGED_ADDR_CTRL 55
-#define PR_TAGGED_ADDR_ENABLE (1UL << 0)
-#define PR_MTE_TCF_SHIFT 1
-#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT)
-#define PR_MTE_TAG_SHIFT 3
-#define PROT_MTE 0x20
-
-#define MTE_GRANULE_SIZE 16
-
-int
-mte_enabled ()
-{
- static int enabled = -1;
- if (enabled == -1)
- {
- int res = prctl (PR_SET_TAGGED_ADDR_CTRL,
- PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC
- | (0xfffe << PR_MTE_TAG_SHIFT),
- 0, 0, 0);
- enabled = (res == 0);
- }
- return enabled;
-}
-
-static void *
-mte_mmap (size_t size)
-{
- if (mte_enabled ())
- {
- return mmap (NULL, size, PROT_READ | PROT_WRITE | PROT_MTE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- }
- else
- {
- return malloc (size);
- }
-}
-
-void *
-alignup_mte (void *p)
-{
- return (void *) (((uintptr_t) p + MTE_GRANULE_SIZE - 1)
- & ~(MTE_GRANULE_SIZE - 1));
-}
-
-void *
-aligndown_mte (void *p)
-{
- return (void *) ((uintptr_t) p & ~(MTE_GRANULE_SIZE - 1));
-}
-
-void *
-untag_pointer (void *p)
-{
- return (void *) ((unsigned long long) p & (~0ULL >> 8));
-}
-
-void
-tag_buffer_helper (void *p, int len)
-{
- char *ptr = p;
- char *end = alignup_mte (ptr + len);
- ptr = aligndown_mte (p);
- for (; ptr < end; ptr += MTE_GRANULE_SIZE)
- {
- __arm_mte_set_tag (ptr);
- }
-}
-
-void *
-tag_buffer (void *p, int len, int test_mte)
-{
- if (test_mte && mte_enabled ())
- {
- p = __arm_mte_increment_tag (p, 1);
- tag_buffer_helper (p, len);
- }
- return p;
-}
-
-void *
-untag_buffer (void *p, int len, int test_mte)
-{
- p = untag_pointer (p);
- if (test_mte && mte_enabled ())
- {
- tag_buffer_helper (p, len);
- }
- return p;
-}
-
-#else // __ARM_FEATURE_MEMORY_TAGGING
-int
-mte_enabled ()
-{
- return 0;
-}
-static void *
-mte_mmap (size_t size)
-{
- return malloc (size);
-}
-void *
-tag_buffer (void *p, int len, int test_mte)
-{
- (void) len;
- (void) test_mte;
- return p;
-}
-void *
-untag_buffer (void *p, int len, int test_mte)
-{
- (void) len;
- (void) test_mte;
- return p;
-}
-void *
-untag_pointer (void *p)
-{
- return p;
-}
-#endif // __ARM_FEATURE_MEMORY_TAGGING
-
-#endif
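Before this patch, each routine call in the tests was bracketed by the tag_buffer/untag_buffer pair from the mte.h deleted above: on MTE-capable hardware the buffer gets a fresh tag so an out-of-bounds access faults synchronously, and the fallback build reduces to no-ops. A sketch of that calling pattern, using stub no-op versions that mirror the header's non-MTE branch:

    #include <stdio.h>
    #include <string.h>

    /* No-op stubs mirroring the !__ARM_FEATURE_MEMORY_TAGGING branch of the
       deleted mte.h; on MTE hardware these retag the buffer's granules. */
    static void *tag_buffer(void *p, int len, int test_mte)
    {
        (void)len; (void)test_mte;
        return p;
    }
    static void *untag_buffer(void *p, int len, int test_mte)
    {
        (void)len; (void)test_mte;
        return p;
    }

    int main(void)
    {
        char src[16] = "hello", dst[16];
        int len = 6, test_mte = 1;
        char *s = src, *d = dst;

        /* The bracketing the deleted tests used around each routine: */
        s = tag_buffer(s, len, test_mte);
        d = tag_buffer(d, len, test_mte);
        void *p = memcpy(d, s, len);
        untag_buffer(s, len, test_mte);
        untag_buffer(d, len, test_mte);

        printf("%s %s\n", (char *)p == d ? "ok" : "bad", dst);
        return 0;
    }
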
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
deleted file mode 100644
index 1827e68..0000000
--- a/string/test/stpcpy.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * stpcpy test.
- *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "mte.h"
-#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
-
-static const struct fun
-{
- const char *name;
- char *(*fun) (char *dest, const char *src);
- int test_mte;
-} funtab[] = {
- // clang-format off
- F(stpcpy, 0)
-#if __aarch64__
- F(__stpcpy_aarch64, 0)
- F(__stpcpy_aarch64_mte, 1)
-# if __ARM_FEATURE_SVE
- F(__stpcpy_aarch64_sve, 1)
-# endif
-#endif
- {0, 0, 0}
- // clang-format on
-};
-#undef F
-
-#define ALIGN 32
-#define LEN 512
-static char *dbuf;
-static char *sbuf;
-static char wbuf[LEN + 3 * ALIGN];
-
-static void *
-alignup (void *p)
-{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
-}
-
-static void
-test (const struct fun *fun, int dalign, int salign, int len)
-{
- char *src = alignup (sbuf);
- char *dst = alignup (dbuf);
- char *want = wbuf;
- char *s = src + salign;
- char *d = dst + dalign;
- char *w = want + dalign;
- void *p;
- int i;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
- abort ();
- for (i = 0; i < len + ALIGN; i++)
- {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (int i = 0; src + i < s; i++)
- src[i] = 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (len + salign) & 1 ? 1 : 0;
- for (i = 0; i < len; i++)
- s[i] = w[i] = 'a' + (i & 31);
- s[len] = w[len] = '\0';
-
- s = tag_buffer (s, len + 1, fun->test_mte);
- d = tag_buffer (d, len + 1, fun->test_mte);
- p = fun->fun (d, s);
- untag_buffer (s, len + 1, fun->test_mte);
- untag_buffer (d, len + 1, fun->test_mte);
-
- if (p != d + len)
- ERR ("%s (%p,..) returned %p expected %p\n", fun->name, d, p, d + len);
-
- for (i = 0; i < len + ALIGN; i++)
- {
- if (dst[i] != want[i])
- {
- ERR ("%s (align %d, align %d, %d) failed\n",
- fun->name, dalign, salign, len);
- quoteat ("got", dst, len + ALIGN, i);
- quoteat ("want", want, len + ALIGN, i);
- break;
- }
- }
-}
-
-int
-main (void)
-{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- dbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < ALIGN; d++)
- for (int s = 0; s < ALIGN; s++)
- for (int n = 0; n < LEN; n++)
- test (funtab + i, d, s, n);
-
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
-}
diff --git a/string/test/strchr.c b/string/test/strchr.c
index f3ae982..a625567 100644
--- a/string/test/strchr.c
+++ b/string/test/strchr.c
@@ -1,7 +1,7 @@
/*
* strchr test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -10,112 +10,88 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun) (const char *s, int c);
- int test_mte;
+ const char *name;
+ char *(*fun)(const char *s, int c);
} funtab[] = {
- // clang-format off
- F(strchr, 0)
+#define F(x) {#x, x},
+F(strchr)
#if __aarch64__
- F(__strchr_aarch64, 0)
- F(__strchr_aarch64_mte, 1)
+F(__strchr_aarch64)
# if __ARM_FEATURE_SVE
- F(__strchr_aarch64_sve, 1)
+F(__strchr_aarch64_sve)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int align, int seekpos, int len)
+static void test(const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup (sbuf);
- char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : 0;
- int seekchar = 0x1;
- void *p;
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || seekpos >= len || align >= ALIGN)
- abort ();
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
- for (int i = 0; src + i < s; i++)
- src[i] = (i + len) & 1 ? seekchar : 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (i + len) & 1 ? seekchar : 0;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- if (seekpos != -1)
- s[seekpos] = seekchar;
- if (seekpos != -1 && (len + align) & 1)
- s[seekpos + 1] = seekchar;
- s[len] = '\0';
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ s[len - 1] = '\0';
- s = tag_buffer (s, len + 1, fun->test_mte);
- p = fun->fun (s, seekchar);
- untag_buffer (s, len + 1, fun->test_mte);
- p = untag_pointer (p);
+ p = fun->fun(s, seekchar);
- if (p != f)
- {
- ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
- fun->name, s, seekchar, len, p, f, seekpos);
- quote ("input", s, len);
- }
-
- s = tag_buffer (s, len + 1, fun->test_mte);
- p = fun->fun (s, 0);
- untag_buffer (s, len + 1, fun->test_mte);
-
- if (p != s + len)
- {
- ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
- fun->name, s, 0, len, p, f, len);
- quote ("input", s, len);
- }
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- {
- for (int sp = 0; sp < n; sp++)
- test (funtab + i, a, sp, n);
- test (funtab + i, a, -1, n);
- }
-
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c
index 6c30ab2..814dd1e 100644
--- a/string/test/strchrnul.c
+++ b/string/test/strchrnul.c
@@ -1,126 +1,99 @@
/*
* strchrnul test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#ifndef _GNU_SOURCE
#define _GNU_SOURCE
-#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun) (const char *s, int c);
- int test_mte;
+ const char *name;
+ char *(*fun)(const char *s, int c);
} funtab[] = {
- // clang-format off
- F(strchrnul, 0)
+#define F(x) {#x, x},
+F(strchrnul)
#if __aarch64__
- F(__strchrnul_aarch64, 0)
- F(__strchrnul_aarch64_mte, 1)
+F(__strchrnul_aarch64)
# if __ARM_FEATURE_SVE
- F(__strchrnul_aarch64_sve, 1)
+F(__strchrnul_aarch64_sve)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-static void *
-alignup (void *p)
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int align, int seekpos, int len)
+static void test(const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup (sbuf);
- char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : s + len;
- int seekchar = 0x1;
- void *p;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || seekpos >= len || align >= ALIGN)
- abort ();
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : s + len - 1;
+ int seekchar = 0x1;
+ void *p;
- for (int i = 0; src + i < s; i++)
- src[i] = (i + len) & 1 ? seekchar : 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (i + len) & 1 ? seekchar : 0;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- if (seekpos != -1)
- s[seekpos] = seekchar;
- if (seekpos != -1 && (len + align) & 1)
- s[seekpos + 1] = seekchar;
- s[len] = '\0';
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
- int mte_len = seekpos != -1 ? seekpos + 1 : len + 1;
- s = tag_buffer (s, mte_len, fun->test_mte);
- p = fun->fun (s, seekchar);
- untag_buffer (s, mte_len, fun->test_mte);
- p = untag_pointer (p);
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos] = seekchar;
+ s[len - 1] = '\0';
- if (p != f)
- {
- ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
- fun->name, s, seekchar, len, p, f, seekpos);
- quote ("input", s, len);
- }
+ p = fun->fun(s, seekchar);
- s = tag_buffer (s, len + 1, fun->test_mte);
- p = fun->fun (s, 0);
- untag_buffer (s, len + 1, fun->test_mte);
-
- if (p != s + len)
- {
- ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
- fun->name, s, 0, len, p, f, len);
- quote ("input", s, len);
- }
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- {
- for (int sp = 0; sp < n; sp++)
- test (funtab + i, a, sp, n);
- test (funtab + i, a, -1, n);
- }
-
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strcmp.c b/string/test/strcmp.c
index d57b54e..91fa9dd 100644
--- a/string/test/strcmp.c
+++ b/string/test/strcmp.c
@@ -1,7 +1,7 @@
/*
* strcmp test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,124 +9,95 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- int (*fun) (const char *s1, const char *s2);
- int test_mte;
+ const char *name;
+ int (*fun)(const char *s1, const char *s2);
} funtab[] = {
- // clang-format off
- F(strcmp, 0)
+#define F(x) {#x, x},
+F(strcmp)
#if __aarch64__
- F(__strcmp_aarch64, 0)
- F(__strcmp_aarch64_mte, 1)
+F(__strcmp_aarch64)
# if __ARM_FEATURE_SVE
- F(__strcmp_aarch64_sve, 1)
+F(__strcmp_aarch64_sve)
# endif
#elif __arm__
# if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1
- F(__strcmp_arm, 0)
+F(__strcmp_arm)
# elif __ARM_ARCH == 6 && __ARM_ARCH_6M__ >= 1
- F(__strcmp_armv6m, 0)
+F(__strcmp_armv6m)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
#define A 32
#define LEN 250000
-static char *s1buf;
-static char *s2buf;
+static char s1buf[LEN+2*A];
+static char s2buf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + A - 1) & -A);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int s1align, int s2align, int len, int diffpos,
- int delta)
+static void test(const struct fun *fun, int s1align, int s2align, int len, int diffpos)
{
- char *src1 = alignup (s1buf);
- char *src2 = alignup (s2buf);
- char *s1 = src1 + s1align;
- char *s2 = src2 + s2align;
- int r;
+ char *src1 = alignup(s1buf);
+ char *src2 = alignup(s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || s1align >= A || s2align >= A)
- abort ();
- if (diffpos >= len)
- abort ();
- if ((diffpos < 0) != (delta == 0))
- abort ();
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos > 1 && diffpos >= len-1)
+ abort();
- for (int i = 0; i < len + A; i++)
- src1[i] = src2[i] = '?';
- for (int i = 0; i < len; i++)
- s1[i] = s2[i] = 'a' + i % 23;
- if (delta)
- s1[diffpos] += delta;
- s1[len] = s2[len] = '\0';
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len-1; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos > 1)
+ s1[diffpos]++;
+ s1[len] = s2[len] = '\0';
- s1 = tag_buffer (s1, len + 1, fun->test_mte);
- s2 = tag_buffer (s2, len + 1, fun->test_mte);
- r = fun->fun (s1, s2);
- untag_buffer (s1, len + 1, fun->test_mte);
- untag_buffer (s2, len + 1, fun->test_mte);
+ r = fun->fun(s1, s2);
- if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
- {
- ERR ("%s(align %d, align %d, %d) failed, returned %d\n", fun->name,
- s1align, s2align, len, r);
- quoteat ("src1", src1, len + A, diffpos);
- quoteat ("src2", src2, len + A, diffpos);
- }
+ if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ ERR("%s(align %d, align %d, %d) failed, returned %d\n",
+ fun->name, s1align, s2align, len, r);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
}
-int
-main ()
+int main()
{
- s1buf = mte_mmap (LEN + 2 * A + 1);
- s2buf = mte_mmap (LEN + 2 * A + 1);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++)
- {
- int n;
- test (funtab + i, d, s, 0, -1, 0);
- test (funtab + i, d, s, 1, -1, 0);
- test (funtab + i, d, s, 1, 0, 1);
- test (funtab + i, d, s, 1, 0, -1);
- for (n = 2; n < 100; n++)
- {
- test (funtab + i, d, s, n, -1, 0);
- test (funtab + i, d, s, n, n - 1, -1);
- test (funtab + i, d, s, n, n / 2, 1);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, d, s, n, -1, 0);
- test (funtab + i, d, s, n, n / 2, -1);
- }
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0);
+ test(funtab+i, d, s, n, n / 2);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
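Both the old and new strcmp tests check only the sign of the result against the planted difference, since the strcmp contract specifies sign rather than magnitude. A compact restatement of that check (sign_matches is an illustrative helper, not part of the test code):

    #include <stdio.h>
    #include <string.h>

    /* strcmp's contract fixes only the sign of the result, so the tests
       compare it against the sign of the planted byte difference. */
    static int sign_matches(int r, int delta)
    {
        return (delta == 0 && r == 0)
               || (delta > 0 && r > 0)
               || (delta < 0 && r < 0);
    }

    int main(void)
    {
        printf("%d\n", sign_matches(strcmp("abc", "abc"), 0));  /* 1 */
        printf("%d\n", sign_matches(strcmp("abd", "abc"), 1));  /* 1 */
        printf("%d\n", sign_matches(strcmp("abb", "abc"), -1)); /* 1 */
        return 0;
    }
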
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index e84cace..4882c9f 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -1,7 +1,7 @@
/*
* strcpy test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,115 +9,91 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun) (char *dest, const char *src);
- int test_mte;
+ const char *name;
+ char *(*fun)(char *dest, const char *src);
} funtab[] = {
- // clang-format off
- F(strcpy, 0)
+#define F(x) {#x, x},
+F(strcpy)
#if __aarch64__
- F(__strcpy_aarch64, 0)
- F(__strcpy_aarch64_mte, 1)
+F(__strcpy_aarch64)
# if __ARM_FEATURE_SVE
- F(__strcpy_aarch64_sve, 1)
+F(__strcpy_aarch64_sve)
# endif
#elif __arm__ && defined (__thumb2__) && !defined (__thumb__)
- F(__strcpy_arm, 0)
+F(__strcpy_arm)
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
-#define ALIGN 32
-#define LEN 512
-static char *dbuf;
-static char *sbuf;
-static char wbuf[LEN + 3 * ALIGN];
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-static void *
-alignup (void *p)
+#define A 32
+#define LEN 250000
+static char dbuf[LEN+2*A];
+static char sbuf[LEN+2*A];
+static char wbuf[LEN+2*A];
+
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int dalign, int salign, int len)
+static void test(const struct fun *fun, int dalign, int salign, int len)
{
- char *src = alignup (sbuf);
- char *dst = alignup (dbuf);
- char *want = wbuf;
- char *s = src + salign;
- char *d = dst + dalign;
- char *w = want + dalign;
- void *p;
- int i;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || dalign >= ALIGN || salign >= ALIGN)
- abort ();
- for (i = 0; i < len + ALIGN; i++)
- {
- src[i] = '?';
- want[i] = dst[i] = '*';
- }
- for (int i = 0; src + i < s; i++)
- src[i] = 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (len + salign) & 1 ? 1 : 0;
- for (i = 0; i < len; i++)
- s[i] = w[i] = 'a' + (i & 31);
- s[len] = w[len] = '\0';
+ char *src = alignup(sbuf);
+ char *dst = alignup(dbuf);
+ char *want = wbuf;
+ char *s = src + salign;
+ char *d = dst + dalign;
+ char *w = want + dalign;
+ void *p;
+ int i;
- s = tag_buffer (s, len + 1, fun->test_mte);
- d = tag_buffer (d, len + 1, fun->test_mte);
- p = fun->fun (d, s);
- untag_buffer (s, len + 1, fun->test_mte);
- untag_buffer (d, len + 1, fun->test_mte);
-
- if (p != d)
- ERR ("%s (%p,..) returned %p\n", fun->name, d, p);
+ if (len > LEN || dalign >= A || salign >= A)
+ abort();
+ for (i = 0; i < len+A; i++) {
+ src[i] = '?';
+ want[i] = dst[i] = '*';
+ }
+ for (i = 0; i < len-1; i++)
+ s[i] = w[i] = 'a' + i%23;
+ s[i] = w[i] = '\0';
- for (i = 0; i < len + ALIGN; i++)
- {
- if (dst[i] != want[i])
- {
- ERR ("%s (align %d, align %d, %d) failed\n",
- fun->name, dalign, salign, len);
- quoteat ("got", dst, len + ALIGN, i);
- quoteat ("want", want, len + ALIGN, i);
- break;
+ p = fun->fun(d, s);
+ if (p != d)
+ ERR("%s(%p,..) returned %p\n", fun->name, d, p);
+ for (i = 0; i < len+A; i++) {
+ if (dst[i] != want[i]) {
+ ERR("%s(align %d, align %d, %d) failed\n", fun->name, dalign, salign, len);
+ ERR("got : %.*s\n", dalign+len+1, dst);
+ ERR("want: %.*s\n", dalign+len+1, want);
+ break;
+ }
}
- }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- dbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < ALIGN; d++)
- for (int s = 0; s < ALIGN; s++)
- for (int n = 0; n < LEN; n++)
- test (funtab + i, d, s, n);
-
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++)
+ test(funtab+i, d, s, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, d, s, n);
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/stringtest.h b/string/test/stringtest.h
deleted file mode 100644
index fe855fc..0000000
--- a/string/test/stringtest.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Common string test code.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include <ctype.h>
-#include <stdio.h>
-
-/* Accounting errors for a test case. */
-static int err_count;
-#define ERR_LIMIT 10
-#define ERR(...) (err_count++, printf (__VA_ARGS__))
-
-static inline void
-quotechar (unsigned char c)
-{
- if (isprint (c))
- putchar (c);
- else
- printf ("\\x%02x", c);
-}
-
-/* quoted print around at or the entire string if at < 0. */
-static void
-quoteat (const char *prefix, const void *p, int len, int at)
-{
- static const int CTXLEN = 15;
- int i;
- const char *pre = "\"";
- const char *post = "\"";
- const char *s = p;
- if (at > CTXLEN)
- {
- s += at - CTXLEN;
- len -= at - CTXLEN;
- pre = "...\"";
- }
- if (at >= 0 && len > 2 * CTXLEN + 1)
- {
- len = 2 * CTXLEN + 1;
- post = "\"...";
- }
- printf ("%4s: %s", prefix, pre);
- for (i = 0; i < len; i++)
- quotechar (s[i]);
- printf ("%s\n", post);
-}
-
-static inline void
-quote (const char *prefix, const void *p, int len)
-{
- quoteat (prefix, p, len, -1);
-}
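The quoteat helper deleted above is what kept failures in large buffers readable: it prints a quoted window of at most 15 bytes of context on each side of the failing offset, escaping unprintable bytes as \xNN. The same logic as a standalone program, with a planted mismatch byte to point at:

    #include <ctype.h>
    #include <stdio.h>

    /* Same logic as the deleted quoteat: quoted context window around
       offset `at`, with unprintable bytes escaped. */
    static void quoteat(const char *prefix, const void *p, int len, int at)
    {
        static const int CTXLEN = 15;
        const char *pre = "\"", *post = "\"", *s = p;
        if (at > CTXLEN) {
            s += at - CTXLEN;
            len -= at - CTXLEN;
            pre = "...\"";
        }
        if (at >= 0 && len > 2 * CTXLEN + 1) {
            len = 2 * CTXLEN + 1;
            post = "\"...";
        }
        printf("%4s: %s", prefix, pre);
        for (int i = 0; i < len; i++) {
            unsigned char c = s[i];
            if (isprint(c))
                putchar(c);
            else
                printf("\\x%02x", c);
        }
        printf("%s\n", post);
    }

    int main(void)
    {
        char buf[64];
        for (int i = 0; i < 64; i++)
            buf[i] = 'a' + i % 23;
        buf[40] = 1;    /* plant a mismatch byte to point at */
        quoteat("got", buf, 64, 40);
        return 0;
    }
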
diff --git a/string/test/strlen.c b/string/test/strlen.c
index 6278380..ff8e328 100644
--- a/string/test/strlen.c
+++ b/string/test/strlen.c
@@ -1,7 +1,7 @@
/*
* strlen test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,95 +9,82 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/mman.h>
#include <limits.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- size_t (*fun) (const char *s);
- int test_mte;
+ const char *name;
+ size_t (*fun)(const char *s);
} funtab[] = {
- // clang-format off
- F(strlen, 0)
+#define F(x) {#x, x},
+F(strlen)
#if __aarch64__
- F(__strlen_aarch64, 0)
- F(__strlen_aarch64_mte, 1)
+F(__strlen_aarch64)
# if __ARM_FEATURE_SVE
- F(__strlen_aarch64_sve, 1)
+F(__strlen_aarch64_sve)
# endif
#elif __arm__
# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
- F(__strlen_armv6t2, 0)
+F(__strlen_armv6t2)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-static void *
-alignup (void *p)
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
+
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int align, int len)
+static void test(const struct fun *fun, int align, int len)
{
- char *src = alignup (sbuf);
- char *s = src + align;
- size_t r;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || align >= ALIGN)
- abort ();
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ size_t r;
- for (int i = 0; src + i < s; i++)
- src[i] = 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (len + align) & 1 ? 1 : 0;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- s[len] = '\0';
+ if (len > LEN || align >= A)
+ abort();
- s = tag_buffer (s, len + 1, fun->test_mte);
- r = fun->fun (s);
- untag_buffer (s, len + 1, fun->test_mte);
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ s[len - 1] = '\0';
- if (r != len)
- {
- ERR ("%s (%p) returned %zu expected %d\n", fun->name, s, r, len);
- quote ("input", src, len);
- }
+ r = fun->fun(s);
+ if (r != len-1) {
+ ERR("%s(%p) returned %zu\n", fun->name, s, r);
+ ERR("input: %.*s\n", align+len+1, src);
+ ERR("expected: %d\n", len);
+ abort();
+ }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- test (funtab + i, a, n);
-
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++)
+ test(funtab+i, a, n);
+ for (; n < LEN; n *= 2)
+ test(funtab+i, a, n);
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strncmp.c b/string/test/strncmp.c
index 018a8a4..43f941d 100644
--- a/string/test/strncmp.c
+++ b/string/test/strncmp.c
@@ -1,7 +1,7 @@
/*
* strncmp test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -9,131 +9,95 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- int (*fun) (const char *, const char *, size_t);
- int test_mte;
+ const char *name;
+ int (*fun)(const char *, const char *, size_t);
} funtab[] = {
- // clang-format off
- F(strncmp, 0)
+#define F(x) {#x, x},
+F(strncmp)
#if __aarch64__
- F(__strncmp_aarch64, 0)
- F(__strncmp_aarch64_mte, 1)
+F(__strncmp_aarch64)
# if __ARM_FEATURE_SVE
- F(__strncmp_aarch64_sve, 1)
+F(__strncmp_aarch64_sve)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
#define A 32
#define LEN 250000
-static char *s1buf;
-static char *s2buf;
+static char s1buf[LEN+2*A];
+static char s2buf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + A - 1) & -A);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos,
- int len, int delta)
+static void test(const struct fun *fun, int s1align, int s2align, int maxlen, int diffpos, int len)
{
- char *src1 = alignup (s1buf);
- char *src2 = alignup (s2buf);
- char *s1 = src1 + s1align;
- char *s2 = src2 + s2align;
- int r;
+ char *src1 = alignup(s1buf);
+ char *src2 = alignup(s2buf);
+ char *s1 = src1 + s1align;
+ char *s2 = src2 + s2align;
+ int r;
+
+ if (len > LEN || s1align >= A || s2align >= A)
+ abort();
+ if (diffpos > 1 && diffpos >= len-1)
+ abort();
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || s1align >= A || s2align >= A)
- abort ();
- if (diffpos >= len)
- abort ();
- if ((diffpos < 0) != (delta == 0))
- abort ();
+ for (int i = 0; i < len+A; i++)
+ src1[i] = src2[i] = '?';
+ for (int i = 0; i < len-1; i++)
+ s1[i] = s2[i] = 'a' + i%23;
+ if (diffpos > 1)
+ s1[diffpos]++;
+ s1[len] = s2[len] = '\0';
- for (int i = 0; i < len + A; i++)
- src1[i] = src2[i] = '?';
- for (int i = 0; i < len; i++)
- s1[i] = s2[i] = 'a' + i % 23;
- if (delta)
- s1[diffpos] += delta;
- s1[len] = s2[len] = '\0';
+ r = fun->fun(s1, s2, maxlen);
- size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
- s1 = tag_buffer (s1, mte_len, fun->test_mte);
- s2 = tag_buffer (s2, mte_len, fun->test_mte);
- r = fun->fun (s1, s2, maxlen);
- untag_buffer (s1, mte_len, fun->test_mte);
- untag_buffer (s2, mte_len, fun->test_mte);
+ diffpos = maxlen <= diffpos ? 0 : diffpos;
- if (diffpos >= maxlen)
- {
- diffpos = -1;
- delta = 0;
- }
- if ((delta == 0 && r != 0) || (delta > 0 && r <= 0) || (delta < 0 && r >= 0))
- {
- ERR (
- "%s(align %d, align %d, %d) (len=%d, diffpos=%d) failed, returned %d\n",
- fun->name, s1align, s2align, maxlen, len, diffpos, r);
- quoteat ("src1", src1, len + A, diffpos);
- quoteat ("src2", src2, len + A, diffpos);
- }
+ if (((diffpos <= 1) && r != 0) || (diffpos > 1 && r == 0)) {
+ ERR("%s(align %d, align %d, %d (%d)) failed, returned %d (%d)\n",
+ fun->name, s1align, s2align, maxlen, len, r, diffpos);
+ ERR("src1: %.*s\n", s1align+len+1, src1);
+ ERR("src2: %.*s\n", s2align+len+1, src2);
+ }
}
-int
-main ()
+int main()
{
- s1buf = mte_mmap (LEN + 2 * A + 1);
- s2buf = mte_mmap (LEN + 2 * A + 1);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int d = 0; d < A; d++)
- for (int s = 0; s < A; s++)
- {
- int n;
- test (funtab + i, d, s, 0, -1, 0, 0);
- test (funtab + i, d, s, 1, -1, 0, 0);
- test (funtab + i, d, s, 0, -1, 1, 0);
- test (funtab + i, d, s, 1, -1, 1, 0);
- test (funtab + i, d, s, 2, -1, 1, 0);
- test (funtab + i, d, s, 1, 0, 1, 1);
- test (funtab + i, d, s, 1, 0, 1, -1);
- for (n = 2; n < 100; n++)
- {
- test (funtab + i, d, s, n, -1, n, 0);
- test (funtab + i, d, s, n, n / 2, n, 1);
- test (funtab + i, d, s, n / 2, -1, n, 0);
- test (funtab + i, d, s, n / 2, n / 2, n, -1);
- }
- for (; n < LEN; n *= 2)
- {
- test (funtab + i, d, s, n, -1, n, 0);
- test (funtab + i, d, s, n, n / 2, n, -1);
- test (funtab + i, d, s, n / 2, -1, n, 0);
- test (funtab + i, d, s, n / 2, n / 2, n, 1);
- }
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int d = 0; d < A; d++)
+ for (int s = 0; s < A; s++) {
+ int n;
+ for (n = 0; n < 100; n++) {
+ test(funtab+i, d, s, n, 0, n);
+ test(funtab+i, d, s, n, n/2, n);
+ test(funtab+i, d, s, n/2, 0, n);
+ test(funtab+i, d, s, n/2, n/2, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, d, s, n, 0, n);
+ test(funtab+i, d, s, n, n/2, n);
+ test(funtab+i, d, s, n/2, 0, n);
+ test(funtab+i, d, s, n/2, n/2, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/test/strnlen.c b/string/test/strnlen.c
index 0dea00e..db41f2a 100644
--- a/string/test/strnlen.c
+++ b/string/test/strnlen.c
@@ -1,109 +1,93 @@
/*
* strnlen test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
+#define _POSIX_C_SOURCE 200809L
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- size_t (*fun) (const char *s, size_t m);
- int test_mte;
+ const char *name;
+ size_t (*fun)(const char *s, size_t m);
} funtab[] = {
- // clang-format off
- F(strnlen, 0)
+#define F(x) {#x, x},
+F(strnlen)
#if __aarch64__
- F(__strnlen_aarch64, 1)
+F(__strnlen_aarch64)
# if __ARM_FEATURE_SVE
- F(__strnlen_aarch64_sve, 1)
+F(__strnlen_aarch64_sve)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int align, size_t maxlen, size_t len)
+static void test(const struct fun *fun, int align, int maxlen, int len)
{
- char *src = alignup (sbuf);
- char *s = src + align;
- size_t r;
- size_t e = maxlen < len ? maxlen : len;
-
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || align >= ALIGN)
- abort ();
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ size_t r;
+ size_t e = maxlen < len ? maxlen : len - 1;
- for (int i = 0; src + i < s; i++)
- src[i] = 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (len + align) & 1 ? 1 : 0;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- s[len] = 0;
- if ((len + align) & 1)
- s[e + 1] = 0;
+ if (len > LEN || align >= A)
+ abort();
- size_t mte_len = maxlen < len + 1 ? maxlen : len + 1;
- s = tag_buffer (s, mte_len, fun->test_mte);
- r = fun->fun (s, maxlen);
- untag_buffer (s, mte_len, fun->test_mte);
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ s[len - 1] = '\0';
- if (r != e)
- {
- ERR ("%s (%p, %zu) len %zu returned %zu, expected %zu\n",
- fun->name, s, maxlen, len, r, e);
- quote ("input", s, len);
- }
+ r = fun->fun(s, maxlen);
+ if (r != e) {
+ ERR("%s(%p) returned %zu\n", fun->name, s, r);
+ ERR("input: %.*s\n", align+len+1, src);
+ ERR("expected: %d\n", len);
+ abort();
+ }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- {
- for (int maxlen = 0; maxlen < LEN; maxlen++)
- test (funtab + i, a, maxlen, n);
- test (funtab + i, a, SIZE_MAX - a, n);
- }
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++)
+ for (int maxlen = 0; maxlen < 100; maxlen++)
+ test(funtab+i, a, maxlen, n);
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, n*2, n);
+ test(funtab+i, a, n, n);
+ test(funtab+i, a, n/2, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
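Both versions of the strnlen test compute the expected value as the smaller of maxlen and the string length, which is exactly the function's clamping contract. A two-line demonstration (the _POSIX_C_SOURCE define matches the one the new test adds):

    #define _POSIX_C_SOURCE 200809L
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *s = "hello";            /* strlen(s) == 5 */
        printf("%zu\n", strnlen(s, 3));     /* 3: clamped at maxlen */
        printf("%zu\n", strnlen(s, 9));     /* 5: terminating NUL found first */
        return 0;
    }
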
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index fedbdc5..532fa51 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -1,7 +1,7 @@
/*
* strrchr test.
*
- * Copyright (c) 2019-2021, Arm Limited.
+ * Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -10,112 +10,88 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
-#include "mte.h"
#include "stringlib.h"
-#include "stringtest.h"
-
-#define F(x, mte) {#x, x, mte},
static const struct fun
{
- const char *name;
- char *(*fun) (const char *s, int c);
- int test_mte;
+ const char *name;
+ char *(*fun)(const char *s, int c);
} funtab[] = {
- // clang-format off
- F(strrchr, 0)
+#define F(x) {#x, x},
+F(strrchr)
#if __aarch64__
- F(__strrchr_aarch64, 0)
- F(__strrchr_aarch64_mte, 1)
+F(__strrchr_aarch64)
# if __ARM_FEATURE_SVE
- F(__strrchr_aarch64_sve, 1)
+F(__strrchr_aarch64_sve)
# endif
#endif
- {0, 0, 0}
- // clang-format on
-};
#undef F
+ {0, 0}
+};
+
+static int test_status;
+#define ERR(...) (test_status=1, printf(__VA_ARGS__))
-#define ALIGN 32
-#define LEN 512
-static char *sbuf;
+#define A 32
+#define SP 512
+#define LEN 250000
+static char sbuf[LEN+2*A];
-static void *
-alignup (void *p)
+static void *alignup(void *p)
{
- return (void *) (((uintptr_t) p + ALIGN - 1) & -ALIGN);
+ return (void*)(((uintptr_t)p + A-1) & -A);
}
-static void
-test (const struct fun *fun, int align, int seekpos, int len)
+static void test(const struct fun *fun, int align, int seekpos, int len)
{
- char *src = alignup (sbuf);
- char *s = src + align;
- char *f = seekpos != -1 ? s + seekpos : 0;
- int seekchar = 0x1;
- void *p;
+ char *src = alignup(sbuf);
+ char *s = src + align;
+ char *f = seekpos != -1 ? s + seekpos : 0;
+ int seekchar = 0x1;
+ void *p;
- if (err_count >= ERR_LIMIT)
- return;
- if (len > LEN || seekpos >= len || align >= ALIGN)
- abort ();
+ if (len > LEN || seekpos >= len - 1 || align >= A)
+ abort();
+ if (seekchar >= 'a' && seekchar <= 'a' + 23)
+ abort();
- for (int i = 0; src + i < s; i++)
- src[i] = (i + len) & 1 ? seekchar : 0;
- for (int i = 1; i <= ALIGN; i++)
- s[len + i] = (i + len) & 1 ? seekchar : 0;
- for (int i = 0; i < len; i++)
- s[i] = 'a' + (i & 31);
- if (seekpos != -1)
- s[seekpos / 2] = s[seekpos] = seekchar;
- if (seekpos > 0 && (len + align) & 1)
- s[seekpos - 1] = seekchar;
- s[len] = '\0';
+ for (int i = 0; i < len + A; i++)
+ src[i] = '?';
+ for (int i = 0; i < len - 2; i++)
+ s[i] = 'a' + i%23;
+ if (seekpos != -1)
+ s[seekpos/2] = s[seekpos] = seekchar;
+ s[len - 1] = '\0';
- s = tag_buffer (s, len + 1, fun->test_mte);
- p = fun->fun (s, seekchar);
- untag_buffer (s, len + 1, fun->test_mte);
- p = untag_pointer (p);
+ p = fun->fun(s, seekchar);
- if (p != f)
- {
- ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
- fun->name, s, seekchar, len, p, f, seekpos);
- quote ("input", s, len);
- }
-
- s = tag_buffer (s, len + 1, fun->test_mte);
- p = fun->fun (s, 0);
- untag_buffer (s, len + 1, fun->test_mte);
-
- if (p != s + len)
- {
- ERR ("%s (%p, 0x%02x) len %d returned %p, expected %p pos %d\n",
- fun->name, s, 0, len, p, s + len, len);
- quote ("input", s, len);
- }
+ if (p != f) {
+ ERR("%s(%p,0x%02x,%d) returned %p\n", fun->name, s, seekchar, len, p);
+ ERR("expected: %p\n", f);
+ abort();
+ }
}
-int
-main (void)
+int main()
{
- sbuf = mte_mmap (LEN + 3 * ALIGN);
- int r = 0;
- for (int i = 0; funtab[i].name; i++)
- {
- err_count = 0;
- for (int a = 0; a < ALIGN; a++)
- for (int n = 0; n < LEN; n++)
- {
- for (int sp = 0; sp < n; sp++)
- test (funtab + i, a, sp, n);
- test (funtab + i, a, -1, n);
- }
-
- char *pass = funtab[i].test_mte && mte_enabled () ? "MTE PASS" : "PASS";
- printf ("%s %s\n", err_count ? "FAIL" : pass, funtab[i].name);
- if (err_count)
- r = -1;
- }
- return r;
+ int r = 0;
+ for (int i=0; funtab[i].name; i++) {
+ test_status = 0;
+ for (int a = 0; a < A; a++) {
+ int n;
+ for (n = 1; n < 100; n++) {
+ for (int sp = 0; sp < n - 1; sp++)
+ test(funtab+i, a, sp, n);
+ test(funtab+i, a, -1, n);
+ }
+ for (; n < LEN; n *= 2) {
+ test(funtab+i, a, -1, n);
+ test(funtab+i, a, n / 2, n);
+ }
+ }
+ printf("%s %s\n", test_status ? "FAIL" : "PASS", funtab[i].name);
+ if (test_status)
+ r = -1;
+ }
+ return r;
}
diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S
deleted file mode 100644
index 26ade0a..0000000
--- a/string/x86_64/check-arch.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * check ARCH setting.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if !__x86_64__
-# error ARCH setting does not match the compiler.
-#endif