aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElliott Hughes <enh@google.com>2016-07-14 19:59:02 -0700
committerElliott Hughes <enh@google.com>2016-07-14 19:59:02 -0700
commit0653ca33e7ffbb54f09f97c7924bac17aca8d770 (patch)
tree6087083c35131860fa72456a56e20c03ecd8f881
parent55acd713d02f61c984e10ff2d4cef775a7fb597c (diff)
downloadjpeg-0653ca33e7ffbb54f09f97c7924bac17aca8d770.tar.gz
Fix AArch64 ABI conformance issue in SIMD code.
In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's register is undefined, so it was incorrect to use 64-bit instructions to transfer a JDIMENSION argument in the 64-bit NEON SIMD functions. The code worked thus far only because the existing compiler optimizers weren't smart enough to do anything else with the register in question, so the upper 32 bits happened to be all zeroes. The latest builds of Clang/LLVM have a smarter optimizer, and under certain circumstances, it will attempt to load-combine adjacent 32-bit integers from one of the libjpeg structures into a single 64-bit integer and pass that 64-bit integer as a 32-bit argument to one of the SIMD functions (which is allowed by the ABI, since the upper 32 bits of the 32-bit argument's register are undefined.) This caused the libjpeg-turbo regression tests to crash. This patch tries to use the Wn registers whenever possible. Otherwise, it uses a zero-extend instruction to avoid using the upper 32 bits of the 64-bit registers, which are not guaranteed to be valid for 32-bit arguments. Based on sebpop@1fbae13 Closes #91. Refer also to android-ndk/ndk#110 and https://llvm.org/bugs/show_bug.cgi?id=28393 (cherrypick from external/libjpeg-turbo of c8282be605a139b269f2b8f4b4fb6f4118e740ab.) Change-Id: I935a3696106e7ffcf9b1a6c12cb7f31d95b4ccc8
-rw-r--r--jsimd_arm64_neon.S36
1 files changed, 28 insertions, 8 deletions
diff --git a/jsimd_arm64_neon.S b/jsimd_arm64_neon.S
index 099d4b5..26a8b11 100644
--- a/jsimd_arm64_neon.S
+++ b/jsimd_arm64_neon.S
@@ -237,6 +237,11 @@ asm_function jsimd_idct_islow_neon
TMP3 .req x2
TMP4 .req x15
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
+ instruction ensures that those bits are set to zero. */
+ uxtw x3, w3
+
ROW0L .req v16
ROW0R .req v17
ROW1L .req v18
@@ -794,6 +799,11 @@ asm_function jsimd_idct_ifast_neon
TMP4 .req x22
TMP5 .req x23
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
+ instruction ensures that those bits are set to zero. */
+ uxtw x3, w3
+
/* Load and dequantize coefficients into NEON registers
* with the following allocation:
* 0 1 2 3 | 4 5 6 7
@@ -1167,6 +1177,11 @@ asm_function jsimd_idct_4x4_neon
TMP3 .req x2
TMP4 .req x15
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
+ instruction ensures that those bits are set to zero. */
+ uxtw x3, w3
+
/* Save all used NEON registers */
sub sp, sp, 272
str x15, [sp], 16
@@ -1362,6 +1377,12 @@ asm_function jsimd_idct_2x2_neon
TMP1 .req x0
TMP2 .req x15
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
+ instruction ensures that those bits are set to zero. */
+ uxtw x3, w3
+
+
/* vpush {v8.4h - v15.4h} ; not available */
sub sp, sp, 208
str x15, [sp], 16
@@ -1709,11 +1730,11 @@ Ljsimd_ycc_\colorid\()_neon_consts:
.short -128, -128, -128, -128
asm_function jsimd_ycc_\colorid\()_convert_neon
- OUTPUT_WIDTH .req x0
+ OUTPUT_WIDTH .req w0
INPUT_BUF .req x1
- INPUT_ROW .req x2
+ INPUT_ROW .req w2
OUTPUT_BUF .req x3
- NUM_ROWS .req x4
+ NUM_ROWS .req w4
INPUT_BUF0 .req x5
INPUT_BUF1 .req x6
@@ -1723,7 +1744,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
Y .req x8
U .req x9
V .req x10
- N .req x15
+ N .req w15
sub sp, sp, 336
str x15, [sp], 16
@@ -1760,11 +1781,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
cmp NUM_ROWS, #1
b.lt 9f
0:
- lsl x16, INPUT_ROW, #3
- ldr Y, [INPUT_BUF0, x16]
- ldr U, [INPUT_BUF1, x16]
+ ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
+ ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
mov N, OUTPUT_WIDTH
- ldr V, [INPUT_BUF2, x16]
+ ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
add INPUT_ROW, INPUT_ROW, #1
ldr RGB, [OUTPUT_BUF], #8