diff options
Diffstat (limited to 'simd')
-rw-r--r-- | simd/arm/aarch32/jsimd.c | 27 | ||||
-rw-r--r-- | simd/arm/aarch64/jchuff-neon.c | 236 | ||||
-rw-r--r-- | simd/arm/aarch64/jsimd.c | 42 | ||||
-rw-r--r-- | simd/arm/jcphuff-neon.c | 187 | ||||
-rw-r--r-- | simd/arm/jdcolor-neon.c | 1 | ||||
-rw-r--r-- | simd/arm/jdmerge-neon.c | 1 | ||||
-rw-r--r-- | simd/arm/jidctint-neon.c | 1 | ||||
-rw-r--r-- | simd/arm/jquanti-neon.c | 5 | ||||
-rw-r--r-- | simd/i386/jsimd.c | 107 | ||||
-rw-r--r-- | simd/jsimd.h | 12 | ||||
-rw-r--r-- | simd/x86_64/jchuff-sse2.asm | 3 | ||||
-rw-r--r-- | simd/x86_64/jsimd.c | 74 |
12 files changed, 393 insertions, 303 deletions
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c index fac55dfb..04d64526 100644 --- a/simd/arm/aarch32/jsimd.c +++ b/simd/arm/aarch32/jsimd.c @@ -3,8 +3,8 @@ * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). - * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2019, Google LLC. * Copyright (C) 2020, Arm Limited. * @@ -25,12 +25,10 @@ #include "../../../jsimddct.h" #include "../../jsimd.h" -#include <stdio.h> -#include <string.h> #include <ctype.h> -static unsigned int simd_support = ~0; -static unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_support = ~0; +static THREAD_LOCAL unsigned int simd_huffman = 1; #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) @@ -98,14 +96,12 @@ parse_proc_cpuinfo(int bufsize) /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ LOCAL(void) init_simd(void) { #ifndef NO_GETENV - char *env = NULL; + char env[2] = { 0 }; #endif #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)) int bufsize = 1024; /* an initial guess for the line buffer size limit */ @@ -131,14 +127,11 @@ init_simd(void) #ifndef NO_GETENV /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCENEON"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1")) simd_support = JSIMD_NEON; - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) simd_support = 0; - env = getenv("JSIMD_NOHUFFENC"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) simd_huffman = 0; #endif } @@ -950,7 +943,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -975,7 +968,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_neon(block, jpeg_natural_order_start, Sl, diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c index f13fd1b5..607a1160 100644 --- a/simd/arm/aarch64/jchuff-neon.c +++ b/simd/arm/aarch64/jchuff-neon.c @@ -2,7 +2,7 @@ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon) * * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. - * Copyright (C) 2020, D. R. Commander. All Rights Reserved. + * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -59,6 +59,17 @@ ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = { 14, 15, 30, 31, 44, 45, 46, 47 }; +/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned + * address warning because the macro sometimes writes a 64-bit value to a + * non-64-bit-aligned address. That behavior is technically undefined per + * the C specification, but it is supported by the AArch64 architecture and + * compilers. + */ +#if defined(__has_feature) +#if __has_feature(undefined_behavior_sanitizer) +__attribute__((no_sanitize("alignment"))) +#endif +#endif JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val, c_derived_tbl *dctbl, @@ -158,73 +169,43 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, 7), row6, 5); /* DCT block is now in zig-zag order; start Huffman encoding process. */ - int16x8_t abs_row0 = vabsq_s16(row0); - int16x8_t abs_row1 = vabsq_s16(row1); - int16x8_t abs_row2 = vabsq_s16(row2); - int16x8_t abs_row3 = vabsq_s16(row3); - int16x8_t abs_row4 = vabsq_s16(row4); - int16x8_t abs_row5 = vabsq_s16(row5); - int16x8_t abs_row6 = vabsq_s16(row6); - int16x8_t abs_row7 = vabsq_s16(row7); - - /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */ - uint16x8_t row0_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row0, vshrq_n_s16(row0, 15))); - uint16x8_t row1_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row1, vshrq_n_s16(row1, 15))); - uint16x8_t row2_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row2, vshrq_n_s16(row2, 15))); - uint16x8_t row3_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row3, vshrq_n_s16(row3, 15))); - uint16x8_t row4_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row4, vshrq_n_s16(row4, 15))); - uint16x8_t row5_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row5, vshrq_n_s16(row5, 15))); - uint16x8_t row6_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row6, vshrq_n_s16(row6, 15))); - uint16x8_t row7_diff = - vreinterpretq_u16_s16(veorq_s16(abs_row7, vshrq_n_s16(row7, 15))); /* Construct bitmap to accelerate encoding of AC coefficients. A set bit * means that the corresponding coefficient != 0. */ - uint8x8_t abs_row0_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row0), - vdupq_n_u16(0))); - uint8x8_t abs_row1_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row1), - vdupq_n_u16(0))); - uint8x8_t abs_row2_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row2), - vdupq_n_u16(0))); - uint8x8_t abs_row3_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row3), - vdupq_n_u16(0))); - uint8x8_t abs_row4_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row4), - vdupq_n_u16(0))); - uint8x8_t abs_row5_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row5), - vdupq_n_u16(0))); - uint8x8_t abs_row6_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row6), - vdupq_n_u16(0))); - uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7), - vdupq_n_u16(0))); + uint16x8_t row0_ne_0 = vtstq_s16(row0, row0); + uint16x8_t row1_ne_0 = vtstq_s16(row1, row1); + uint16x8_t row2_ne_0 = vtstq_s16(row2, row2); + uint16x8_t row3_ne_0 = vtstq_s16(row3, row3); + uint16x8_t row4_ne_0 = vtstq_s16(row4, row4); + uint16x8_t row5_ne_0 = vtstq_s16(row5, row5); + uint16x8_t row6_ne_0 = vtstq_s16(row6, row6); + uint16x8_t row7_ne_0 = vtstq_s16(row7, row7); + + uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0), + vreinterpretq_u8_u16(row0_ne_0)); + uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0), + vreinterpretq_u8_u16(row2_ne_0)); + uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0), + vreinterpretq_u8_u16(row4_ne_0)); + uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0), + vreinterpretq_u8_u16(row6_ne_0)); /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */ - const uint8x8_t bitmap_mask = - vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080)); + const uint8x16_t bitmap_mask = + vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080)); - abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask); - abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask); - abs_row2_gt0 = vand_u8(abs_row2_gt0, bitmap_mask); - abs_row3_gt0 = vand_u8(abs_row3_gt0, bitmap_mask); - abs_row4_gt0 = vand_u8(abs_row4_gt0, bitmap_mask); - abs_row5_gt0 = vand_u8(abs_row5_gt0, bitmap_mask); - abs_row6_gt0 = vand_u8(abs_row6_gt0, bitmap_mask); - abs_row7_gt0 = vand_u8(abs_row7_gt0, bitmap_mask); + uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask); + uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask); + uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask); + uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask); - uint8x8_t bitmap_rows_10 = vpadd_u8(abs_row1_gt0, abs_row0_gt0); - uint8x8_t bitmap_rows_32 = vpadd_u8(abs_row3_gt0, abs_row2_gt0); - uint8x8_t bitmap_rows_54 = vpadd_u8(abs_row5_gt0, abs_row4_gt0); - uint8x8_t bitmap_rows_76 = vpadd_u8(abs_row7_gt0, abs_row6_gt0); - uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10); - uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54); - uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210); + uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10); + uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54); + uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654, + bitmap_rows_3210); + uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210), + vget_high_u8(bitmap_rows_76543210)); /* Shift left to remove DC bit. */ bitmap_all = @@ -241,16 +222,16 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, /* Encode DC coefficient. */ + /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */ + int16x8_t abs_row0 = vabsq_s16(row0); + int16x8_t row0_lz = vclzq_s16(abs_row0); + uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz)); + uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask); /* Find nbits required to specify sign and amplitude of coefficient. */ -#if defined(_MSC_VER) && !defined(__clang__) - unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0)); -#else - unsigned int lz; - __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0))); -#endif - unsigned int nbits = 32 - lz; + unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0); + unsigned int nbits = 16 - lz; /* Emit Huffman-coded symbol and additional diff bits. */ - unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz; + unsigned int diff = vgetq_lane_u16(row0_diff, 0); PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff) /* Encode AC coefficients. */ @@ -263,13 +244,20 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, /* The most efficient method of computing nbits and diff depends on the * number of non-zero coefficients. If the bitmap is not too sparse (> 8 - * non-zero AC coefficients), it is beneficial to use Neon; else we compute - * nbits and diff on demand using scalar code. + * non-zero AC coefficients), it is beneficial to do all of the work using + * Neon; else we do some of the work using Neon and the rest on demand using + * scalar code. */ if (non_zero_coefficients > 8) { uint8_t block_nbits[DCTSIZE2]; - int16x8_t row0_lz = vclzq_s16(abs_row0); + int16x8_t abs_row1 = vabsq_s16(row1); + int16x8_t abs_row2 = vabsq_s16(row2); + int16x8_t abs_row3 = vabsq_s16(row3); + int16x8_t abs_row4 = vabsq_s16(row4); + int16x8_t abs_row5 = vabsq_s16(row5); + int16x8_t abs_row6 = vabsq_s16(row6); + int16x8_t abs_row7 = vabsq_s16(row7); int16x8_t row1_lz = vclzq_s16(abs_row1); int16x8_t row2_lz = vclzq_s16(abs_row2); int16x8_t row3_lz = vclzq_s16(abs_row3); @@ -277,49 +265,48 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, int16x8_t row5_lz = vclzq_s16(abs_row5); int16x8_t row6_lz = vclzq_s16(abs_row6); int16x8_t row7_lz = vclzq_s16(abs_row7); + /* Narrow leading zero count to 8 bits. */ + uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz), + vreinterpretq_u8_s16(row1_lz)); + uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz), + vreinterpretq_u8_s16(row3_lz)); + uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz), + vreinterpretq_u8_s16(row5_lz)); + uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz), + vreinterpretq_u8_s16(row7_lz)); /* Compute nbits needed to specify magnitude of each coefficient. */ - uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row0_lz))); - uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row1_lz))); - uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row2_lz))); - uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row3_lz))); - uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row4_lz))); - uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row5_lz))); - uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row6_lz))); - uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16), - vmovn_u16(vreinterpretq_u16_s16(row7_lz))); + uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz); + uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz); + uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz); + uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz); /* Store nbits. */ - vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits); - vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits); - vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits); - vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits); - vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits); - vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits); - vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits); - vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits); + vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits); + vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits); + vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits); + vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits); /* Mask bits not required to specify sign and amplitude of diff. */ - row0_diff = vshlq_u16(row0_diff, row0_lz); - row1_diff = vshlq_u16(row1_diff, row1_lz); - row2_diff = vshlq_u16(row2_diff, row2_lz); - row3_diff = vshlq_u16(row3_diff, row3_lz); - row4_diff = vshlq_u16(row4_diff, row4_lz); - row5_diff = vshlq_u16(row5_diff, row5_lz); - row6_diff = vshlq_u16(row6_diff, row6_lz); - row7_diff = vshlq_u16(row7_diff, row7_lz); - row0_diff = vshlq_u16(row0_diff, vnegq_s16(row0_lz)); - row1_diff = vshlq_u16(row1_diff, vnegq_s16(row1_lz)); - row2_diff = vshlq_u16(row2_diff, vnegq_s16(row2_lz)); - row3_diff = vshlq_u16(row3_diff, vnegq_s16(row3_lz)); - row4_diff = vshlq_u16(row4_diff, vnegq_s16(row4_lz)); - row5_diff = vshlq_u16(row5_diff, vnegq_s16(row5_lz)); - row6_diff = vshlq_u16(row6_diff, vnegq_s16(row6_lz)); - row7_diff = vshlq_u16(row7_diff, vnegq_s16(row7_lz)); + uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz)); + uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz)); + uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz)); + uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz)); + uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz)); + uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz)); + uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz)); + /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */ + uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), + row1_mask); + uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), + row2_mask); + uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), + row3_mask); + uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), + row4_mask); + uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), + row5_mask); + uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), + row6_mask); + uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), + row7_mask); /* Store diff bits. */ vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff); vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff); @@ -349,7 +336,14 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, } } else if (bitmap != 0) { uint16_t block_abs[DCTSIZE2]; - /* Store absolute value of coefficients. */ + /* Compute and store absolute value of coefficients. */ + int16x8_t abs_row1 = vabsq_s16(row1); + int16x8_t abs_row2 = vabsq_s16(row2); + int16x8_t abs_row3 = vabsq_s16(row3); + int16x8_t abs_row4 = vabsq_s16(row4); + int16x8_t abs_row5 = vabsq_s16(row5); + int16x8_t abs_row6 = vabsq_s16(row6); + int16x8_t abs_row7 = vabsq_s16(row7); vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0)); vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1)); vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2)); @@ -358,7 +352,21 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5)); vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6)); vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7)); - /* Store diff bits. */ + /* Compute diff bits (without nbits mask) and store. */ + uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), + vcltzq_s16(row1)); + uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), + vcltzq_s16(row2)); + uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), + vcltzq_s16(row3)); + uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), + vcltzq_s16(row4)); + uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), + vcltzq_s16(row5)); + uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), + vcltzq_s16(row6)); + uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), + vcltzq_s16(row7)); vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff); vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff); vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff); @@ -375,7 +383,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer, bitmap <<= r; lz = BUILTIN_CLZ(block_abs[i]); nbits = 32 - lz; - diff = (unsigned int)(block_diff[i] << lz) >> lz; + diff = ((unsigned int)block_diff[i] << lz) >> lz; while (r > 15) { /* If run length > 15, emit special run-length-16 codes. */ PUT_BITS(code_0xf0, size_0xf0) diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c index 8570b82c..358e1597 100644 --- a/simd/arm/aarch64/jsimd.c +++ b/simd/arm/aarch64/jsimd.c @@ -3,8 +3,8 @@ * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies). - * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2020, Arm Limited. * * Based on the x86 SIMD extension for IJG JPEG library, @@ -23,20 +23,17 @@ #include "../../../jdct.h" #include "../../../jsimddct.h" #include "../../jsimd.h" -#include "jconfigint.h" -#include <stdio.h> -#include <string.h> #include <ctype.h> #define JSIMD_FASTLD3 1 #define JSIMD_FASTST3 2 #define JSIMD_FASTTBL 4 -static unsigned int simd_support = ~0; -static unsigned int simd_huffman = 1; -static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | - JSIMD_FASTTBL; +static THREAD_LOCAL unsigned int simd_support = ~0; +static THREAD_LOCAL unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 | + JSIMD_FASTST3 | JSIMD_FASTTBL; #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) @@ -111,8 +108,6 @@ parse_proc_cpuinfo(int bufsize) /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ /* @@ -125,7 +120,7 @@ LOCAL(void) init_simd(void) { #ifndef NO_GETENV - char *env = NULL; + char env[2] = { 0 }; #endif #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) int bufsize = 1024; /* an initial guess for the line buffer size limit */ @@ -147,24 +142,19 @@ init_simd(void) #ifndef NO_GETENV /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCENEON"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1")) simd_support = JSIMD_NEON; - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) simd_support = 0; - env = getenv("JSIMD_NOHUFFENC"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) simd_huffman = 0; - env = getenv("JSIMD_FASTLD3"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1")) simd_features |= JSIMD_FASTLD3; - if ((env != NULL) && (strcmp(env, "0") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0")) simd_features &= ~JSIMD_FASTLD3; - env = getenv("JSIMD_FASTST3"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1")) simd_features |= JSIMD_FASTST3; - if ((env != NULL) && (strcmp(env, "0") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0")) simd_features &= ~JSIMD_FASTST3; #endif } @@ -1028,7 +1018,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -1055,7 +1045,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_neon(block, jpeg_natural_order_start, diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c index b91c5db4..51db3c5f 100644 --- a/simd/arm/jcphuff-neon.c +++ b/simd/arm/jcphuff-neon.c @@ -2,6 +2,8 @@ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon) * * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. + * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved. + * Copyright (C) 2022, D. R. Commander. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -21,7 +23,6 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" #include "../../jinclude.h" #include "../../jpeglib.h" #include "../../jsimd.h" @@ -41,10 +42,10 @@ void jsimd_encode_mcu_AC_first_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits) + UJCOEF *values, size_t *zerobits) { - JCOEF *values_ptr = values; - JCOEF *diff_values_ptr = values + DCTSIZE2; + UJCOEF *values_ptr = values; + UJCOEF *diff_values_ptr = values + DCTSIZE2; /* Rows of coefficients to zero (since they haven't been processed) */ int i, rows_to_zero = 8; @@ -68,23 +69,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7); /* Isolate sign of coefficients. */ - int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); - int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); + uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); - int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); + uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs1); - vst1q_s16(values_ptr + DCTSIZE, coefs2); - vst1q_s16(diff_values_ptr, diff1); - vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + vst1q_u16(values_ptr, abs_coefs1); + vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); + vst1q_u16(diff_values_ptr, diff1); + vst1q_u16(diff_values_ptr + DCTSIZE, diff2); values_ptr += 16; diff_values_ptr += 16; jpeg_natural_order_start += 16; @@ -130,23 +131,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon } /* Isolate sign of coefficients. */ - int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15); - int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15); + uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)); + uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1); - int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2); + uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1); + uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs1); - vst1q_s16(values_ptr + DCTSIZE, coefs2); - vst1q_s16(diff_values_ptr, diff1); - vst1q_s16(diff_values_ptr + DCTSIZE, diff2); + vst1q_u16(values_ptr, abs_coefs1); + vst1q_u16(values_ptr + DCTSIZE, abs_coefs2); + vst1q_u16(diff_values_ptr, diff1); + vst1q_u16(diff_values_ptr + DCTSIZE, diff2); values_ptr += 16; diff_values_ptr += 16; rows_to_zero -= 2; @@ -184,17 +185,17 @@ void jsimd_encode_mcu_AC_first_prepare_neon } /* Isolate sign of coefficients. */ - int16x8_t sign_coefs = vshrq_n_s16(coefs, 15); + uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs = vabsq_s16(coefs); - coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); + uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); + abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); /* Compute diff values. */ - int16x8_t diff = veorq_s16(coefs, sign_coefs); + uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs); /* Store transformed coefficients and diff values. */ - vst1q_s16(values_ptr, coefs); - vst1q_s16(diff_values_ptr, diff); + vst1q_u16(values_ptr, abs_coefs); + vst1q_u16(diff_values_ptr, diff); values_ptr += 8; diff_values_ptr += 8; rows_to_zero--; @@ -202,8 +203,8 @@ void jsimd_encode_mcu_AC_first_prepare_neon /* Zero remaining memory in the values and diff_values blocks. */ for (i = 0; i < rows_to_zero; i++) { - vst1q_s16(values_ptr, vdupq_n_s16(0)); - vst1q_s16(diff_values_ptr, vdupq_n_s16(0)); + vst1q_u16(values_ptr, vdupq_n_u16(0)); + vst1q_u16(diff_values_ptr, vdupq_n_u16(0)); values_ptr += 8; diff_values_ptr += 8; } @@ -211,23 +212,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon /* Construct zerobits bitmap. A set bit means that the corresponding * coefficient != 0. */ - int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE); - int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE); - int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE); - int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE); - int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE); - int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE); - int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE); - int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE); - - uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0))); - uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0))); - uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0))); - uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0))); - uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0))); - uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0))); - uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0))); - uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0))); + uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE); + uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE); + uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE); + uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE); + uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE); + uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE); + uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE); + uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE); + + uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0))); + uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0))); + uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0))); + uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0))); + uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0))); + uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0))); + uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0))); + uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0))); /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = @@ -274,7 +275,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon int jsimd_encode_mcu_AC_refine_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits) + UJCOEF *absvalues, size_t *bits) { /* Temporary storage buffers for data used to compute the signbits bitmap and * the end-of-block (EOB) position @@ -282,7 +283,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon uint8_t coef_sign_bits[64]; uint8_t coef_eq1_bits[64]; - JCOEF *absvalues_ptr = absvalues; + UJCOEF *absvalues_ptr = absvalues; uint8_t *coef_sign_bits_ptr = coef_sign_bits; uint8_t *eq1_bits_ptr = coef_eq1_bits; @@ -316,18 +317,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs1); - vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs1); + vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); - uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq11); vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); @@ -385,18 +386,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs1 = vabsq_s16(coefs1); - int16x8_t abs_coefs2 = vabsq_s16(coefs2); - coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al)); - coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs1); - vst1q_s16(absvalues_ptr + DCTSIZE, coefs2); + uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1)); + uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2)); + abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al)); + abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs1); + vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1))); - uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1))); + uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1))); + uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq11); vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12); @@ -444,14 +445,14 @@ int jsimd_encode_mcu_AC_refine_prepare_neon vst1_u8(coef_sign_bits_ptr, sign_coefs); /* Compute absolute value of coefficients and apply point transform Al. */ - int16x8_t abs_coefs = vabsq_s16(coefs); - coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al)); - vst1q_s16(absvalues_ptr, coefs); + uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs)); + abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al)); + vst1q_u16(absvalues_ptr, abs_coefs); /* Test whether transformed coefficient values == 1 (used to find EOB * position.) */ - uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1))); + uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1))); vst1_u8(eq1_bits_ptr, coefs_eq1); absvalues_ptr += 8; @@ -462,7 +463,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon /* Zero remaining memory in blocks. */ for (i = 0; i < rows_to_zero; i++) { - vst1q_s16(absvalues_ptr, vdupq_n_s16(0)); + vst1q_u16(absvalues_ptr, vdupq_n_u16(0)); vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0)); vst1_u8(eq1_bits_ptr, vdup_n_u8(0)); absvalues_ptr += 8; @@ -471,23 +472,23 @@ int jsimd_encode_mcu_AC_refine_prepare_neon } /* Construct zerobits bitmap. */ - int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE); - int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE); - int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE); - int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE); - int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE); - int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE); - int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE); - int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE); - - uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0))); - uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0))); - uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0))); - uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0))); - uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0))); - uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0))); - uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0))); - uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0))); + uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE); + uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE); + uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE); + uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE); + uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE); + uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE); + uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE); + uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE); + + uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0))); + uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0))); + uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0))); + uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0))); + uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0))); + uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0))); + uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0))); + uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0))); /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */ const uint8x8_t bitmap_mask = diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c index ea4668f1..28dbc572 100644 --- a/simd/arm/jdcolor-neon.c +++ b/simd/arm/jdcolor-neon.c @@ -21,7 +21,6 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" #include "../../jinclude.h" #include "../../jpeglib.h" #include "../../jsimd.h" diff --git a/simd/arm/jdmerge-neon.c b/simd/arm/jdmerge-neon.c index e4f91fdc..18fb9d8a 100644 --- a/simd/arm/jdmerge-neon.c +++ b/simd/arm/jdmerge-neon.c @@ -21,7 +21,6 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" #include "../../jinclude.h" #include "../../jpeglib.h" #include "../../jsimd.h" diff --git a/simd/arm/jidctint-neon.c b/simd/arm/jidctint-neon.c index 043b652e..d25112ef 100644 --- a/simd/arm/jidctint-neon.c +++ b/simd/arm/jidctint-neon.c @@ -22,7 +22,6 @@ */ #define JPEG_INTERNALS -#include "jconfigint.h" #include "../../jinclude.h" #include "../../jpeglib.h" #include "../../jsimd.h" diff --git a/simd/arm/jquanti-neon.c b/simd/arm/jquanti-neon.c index a7eb6f19..d5d95d89 100644 --- a/simd/arm/jquanti-neon.c +++ b/simd/arm/jquanti-neon.c @@ -1,7 +1,7 @@ /* * jquanti-neon.c - sample data conversion and quantization (Arm Neon) * - * Copyright (C) 2020, Arm Limited. All Rights Reserved. + * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages @@ -100,6 +100,9 @@ void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2; int i; +#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64)) +#pragma unroll +#endif for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) { /* Load DCT coefficients. */ int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE); diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c index 563949a0..b429b0a5 100644 --- a/simd/i386/jsimd.c +++ b/simd/i386/jsimd.c @@ -2,8 +2,8 @@ * jsimd_i386.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2023, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -21,7 +21,6 @@ #include "../../jdct.h" #include "../../jsimddct.h" #include "../jsimd.h" -#include "jconfigint.h" /* * In the PIC cases, we have no guarantee that constants will keep @@ -32,19 +31,17 @@ #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ #define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ -static unsigned int simd_support = (unsigned int)(~0); -static unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0); +static THREAD_LOCAL unsigned int simd_huffman = 1; /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ LOCAL(void) init_simd(void) { #ifndef NO_GETENV - char *env = NULL; + char env[2] = { 0 }; #endif if (simd_support != ~0U) @@ -54,26 +51,19 @@ init_simd(void) #ifndef NO_GETENV /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCEMMX"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1")) simd_support &= JSIMD_MMX; - env = getenv("JSIMD_FORCE3DNOW"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1")) simd_support &= JSIMD_3DNOW | JSIMD_MMX; - env = getenv("JSIMD_FORCESSE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1")) simd_support &= JSIMD_SSE | JSIMD_MMX; - env = getenv("JSIMD_FORCESSE2"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1")) simd_support &= JSIMD_SSE2; - env = getenv("JSIMD_FORCEAVX2"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1")) simd_support &= JSIMD_AVX2; - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) simd_support = 0; - env = getenv("JSIMD_NOHUFFENC"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) simd_huffman = 0; #endif } @@ -168,6 +158,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_ycc_convert_avx2; @@ -227,6 +220,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_gray_convert_avx2; @@ -286,6 +282,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_ycc_extrgb_convert_avx2; @@ -389,6 +388,9 @@ GLOBAL(void) jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -409,6 +411,9 @@ GLOBAL(void) jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -471,6 +476,9 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -486,6 +494,9 @@ GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -547,6 +558,9 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -565,6 +579,9 @@ GLOBAL(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -633,6 +650,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; @@ -691,6 +711,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; @@ -795,6 +818,9 @@ GLOBAL(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_convsamp_avx2(sample_data, start_col, workspace); else if (simd_support & JSIMD_SSE2) @@ -807,6 +833,9 @@ GLOBAL(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_SSE2) jsimd_convsamp_float_sse2(sample_data, start_col, workspace); else if (simd_support & JSIMD_SSE) @@ -877,6 +906,9 @@ jsimd_can_fdct_float(void) GLOBAL(void) jsimd_fdct_islow(DCTELEM *data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_fdct_islow_avx2(data); else if (simd_support & JSIMD_SSE2) @@ -888,6 +920,9 @@ jsimd_fdct_islow(DCTELEM *data) GLOBAL(void) jsimd_fdct_ifast(DCTELEM *data) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) jsimd_fdct_ifast_sse2(data); else @@ -897,6 +932,9 @@ jsimd_fdct_ifast(DCTELEM *data) GLOBAL(void) jsimd_fdct_float(FAST_FLOAT *data) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse)) jsimd_fdct_float_sse(data); else if (simd_support & JSIMD_3DNOW) @@ -952,6 +990,9 @@ jsimd_can_quantize_float(void) GLOBAL(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_quantize_avx2(coef_block, divisors, workspace); else if (simd_support & JSIMD_SSE2) @@ -964,6 +1005,9 @@ GLOBAL(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_SSE2) jsimd_quantize_float_sse2(coef_block, divisors, workspace); else if (simd_support & JSIMD_SSE) @@ -1027,6 +1071,9 @@ jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1039,6 +1086,9 @@ jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1133,6 +1183,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1149,6 +1202,9 @@ jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1162,6 +1218,9 @@ jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1219,7 +1278,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -1245,7 +1304,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_sse2(block, jpeg_natural_order_start, diff --git a/simd/jsimd.h b/simd/jsimd.h index 64747c63..a28754ad 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -2,10 +2,10 @@ * simd/jsimd.h * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander. + * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander. * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. * Copyright (C) 2014, Linaro Limited. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing. * Copyright (C) 2020, Arm Limited. * @@ -1243,16 +1243,16 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl /* Progressive Huffman encoding */ EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits); + UJCOEF *values, size_t *zerobits); EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *values, size_t *zerobits); + UJCOEF *values, size_t *zerobits); EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2 (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits); + UJCOEF *absvalues, size_t *bits); EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al, - JCOEF *absvalues, size_t *bits); + UJCOEF *absvalues, size_t *bits); diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm index 00720283..9ea6df94 100644 --- a/simd/x86_64/jchuff-sse2.asm +++ b/simd/x86_64/jchuff-sse2.asm @@ -1,7 +1,7 @@ ; ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2) ; -; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander. +; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander. ; Copyright (C) 2015, Matthieu Darbois. ; Copyright (C) 2018, Matthias Räncker. ; @@ -83,6 +83,7 @@ times 1 << 11 db 12 times 1 << 12 db 13 times 1 << 13 db 14 times 1 << 14 db 15 +times 1 << 15 db 16 alignz 32 diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c index eb766799..3f5ee77e 100644 --- a/simd/x86_64/jsimd.c +++ b/simd/x86_64/jsimd.c @@ -2,8 +2,8 @@ * jsimd_x86_64.c * * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB - * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander. - * Copyright (C) 2015-2016, 2018, Matthieu Darbois. + * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2023, D. R. Commander. + * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois. * * Based on the x86 SIMD extension for IJG JPEG library, * Copyright (C) 1999-2006, MIYASAKA Masaru. @@ -21,7 +21,6 @@ #include "../../jdct.h" #include "../../jsimddct.h" #include "../jsimd.h" -#include "jconfigint.h" /* * In the PIC cases, we have no guarantee that constants will keep @@ -32,19 +31,17 @@ #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ #define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */ -static unsigned int simd_support = (unsigned int)(~0); -static unsigned int simd_huffman = 1; +static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0); +static THREAD_LOCAL unsigned int simd_huffman = 1; /* * Check what SIMD accelerations are supported. - * - * FIXME: This code is racy under a multi-threaded environment. */ LOCAL(void) init_simd(void) { #ifndef NO_GETENV - char *env = NULL; + char env[2] = { 0 }; #endif if (simd_support != ~0U) @@ -54,17 +51,13 @@ init_simd(void) #ifndef NO_GETENV /* Force different settings through environment variables */ - env = getenv("JSIMD_FORCESSE2"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1")) simd_support &= JSIMD_SSE2; - env = getenv("JSIMD_FORCEAVX2"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1")) simd_support &= JSIMD_AVX2; - env = getenv("JSIMD_FORCENONE"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1")) simd_support = 0; - env = getenv("JSIMD_NOHUFFENC"); - if ((env != NULL) && (strcmp(env, "1") == 0)) + if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1")) simd_huffman = 0; #endif } @@ -152,6 +145,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_ycc_convert_avx2; @@ -201,6 +197,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->in_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_extrgb_gray_convert_avx2; @@ -250,6 +249,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_ycc_extrgb_convert_avx2; @@ -340,6 +342,9 @@ GLOBAL(void) jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -356,6 +361,9 @@ GLOBAL(void) jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY output_data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor, compptr->v_samp_factor, @@ -410,6 +418,9 @@ GLOBAL(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -422,6 +433,9 @@ GLOBAL(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width, input_data, output_data_ptr); @@ -476,6 +490,9 @@ GLOBAL(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -490,6 +507,9 @@ GLOBAL(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr, JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor, compptr->downsampled_width, input_data, @@ -549,6 +569,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2; @@ -597,6 +620,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf, void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); + if (simd_support == ~0U) + init_simd(); + switch (cinfo->out_color_space) { case JCS_EXT_RGB: avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2; @@ -686,6 +712,9 @@ GLOBAL(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_convsamp_avx2(sample_data, start_col, workspace); else @@ -755,6 +784,9 @@ jsimd_can_fdct_float(void) GLOBAL(void) jsimd_fdct_islow(DCTELEM *data) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_fdct_islow_avx2(data); else @@ -816,6 +848,9 @@ jsimd_can_quantize_float(void) GLOBAL(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_quantize_avx2(coef_block, divisors, workspace); else @@ -970,6 +1005,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { + if (simd_support == ~0U) + init_simd(); + if (simd_support & JSIMD_AVX2) jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf, output_col); @@ -1040,7 +1078,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void) GLOBAL(void) jsimd_encode_mcu_AC_first_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *values, size_t *zerobits) + int Al, UJCOEF *values, size_t *zerobits) { jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start, Sl, Al, values, zerobits); @@ -1064,7 +1102,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void) GLOBAL(int) jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, - int Al, JCOEF *absvalues, size_t *bits) + int Al, UJCOEF *absvalues, size_t *bits) { return jsimd_encode_mcu_AC_refine_prepare_sse2(block, jpeg_natural_order_start, |