Diffstat (limited to 'simd')
-rw-r--r--  simd/arm/aarch32/jsimd.c        |  27
-rw-r--r--  simd/arm/aarch64/jchuff-neon.c  | 236
-rw-r--r--  simd/arm/aarch64/jsimd.c        |  42
-rw-r--r--  simd/arm/jcphuff-neon.c         | 187
-rw-r--r--  simd/arm/jdcolor-neon.c         |   1
-rw-r--r--  simd/arm/jdmerge-neon.c         |   1
-rw-r--r--  simd/arm/jidctint-neon.c        |   1
-rw-r--r--  simd/arm/jquanti-neon.c         |   5
-rw-r--r--  simd/i386/jsimd.c               | 107
-rw-r--r--  simd/jsimd.h                    |  12
-rw-r--r--  simd/x86_64/jchuff-sse2.asm     |   3
-rw-r--r--  simd/x86_64/jsimd.c             |  74
12 files changed, 393 insertions(+), 303 deletions(-)
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index fac55dfb..04d64526 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -3,8 +3,8 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
* Copyright (C) 2019, Google LLC.
* Copyright (C) 2020, Arm Limited.
*
@@ -25,12 +25,10 @@
#include "../../../jsimddct.h"
#include "../../jsimd.h"
-#include <stdio.h>
-#include <string.h>
#include <ctype.h>
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
@@ -98,14 +96,12 @@ parse_proc_cpuinfo(int bufsize)
/*
* Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
*/
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
- char *env = NULL;
+ char env[2] = { 0 };
#endif
#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
int bufsize = 1024; /* an initial guess for the line buffer size limit */
@@ -131,14 +127,11 @@ init_simd(void)
#ifndef NO_GETENV
/* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENEON");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
simd_support = JSIMD_NEON;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
simd_huffman = 0;
#endif
}
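The two changes above are what make SIMD feature detection thread-safe: the cached flags become per-thread (THREAD_LOCAL is the project's portability macro for thread-local storage), and environment overrides are copied into a local two-byte buffer through GETENV_S, the project's bounded getenv() wrapper that returns 0 on success, instead of holding a pointer returned by getenv(). A minimal sketch of the pattern, with a hypothetical apply_env_overrides() helper:

#include <string.h>

/* Sketch only: THREAD_LOCAL and GETENV_S are assumed to be the project
 * macros used above.  The buffer only needs to hold "0" or "1" plus a
 * terminator, hence size 2; a longer value either fails the call or fails
 * the strcmp(), so it is ignored either way.
 */
static THREAD_LOCAL unsigned int simd_support = ~0u;

static void apply_env_overrides(void)
{
  char env[2] = { 0 };

  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
    simd_support = 0;
}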
@@ -950,7 +943,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
+ int Al, UJCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
@@ -975,7 +968,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
+ int Al, UJCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_neon(block,
jpeg_natural_order_start, Sl,
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index f13fd1b5..607a1160 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -2,7 +2,7 @@
* jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
- * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -59,6 +59,17 @@ ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
14, 15, 30, 31, 44, 45, 46, 47
};
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address. That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
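The attribute block added above is the standard way to exempt a single function from UBSan's alignment check under Clang while compiling to nothing elsewhere: __has_feature is a Clang extension, so the outer #if keeps the attribute away from compilers that would not understand it. The same guard in isolation, around a hypothetical helper:

#include <stdint.h>

/* Sketch only: performs the kind of intentionally misaligned 64-bit store
 * that FLUSH() performs.  Undefined per the C standard, but supported by
 * the AArch64 architecture, which is why only the sanitizer check is
 * disabled rather than the code changed.
 */
#if defined(__has_feature)
#if __has_feature(undefined_behavior_sanitizer)
__attribute__((no_sanitize("alignment")))
#endif
#endif
static void store_u64(uint8_t *p, uint64_t v)
{
  *(uint64_t *)(void *)p = v;
}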
@@ -158,73 +169,43 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
7), row6, 5);
/* DCT block is now in zig-zag order; start Huffman encoding process. */
- int16x8_t abs_row0 = vabsq_s16(row0);
- int16x8_t abs_row1 = vabsq_s16(row1);
- int16x8_t abs_row2 = vabsq_s16(row2);
- int16x8_t abs_row3 = vabsq_s16(row3);
- int16x8_t abs_row4 = vabsq_s16(row4);
- int16x8_t abs_row5 = vabsq_s16(row5);
- int16x8_t abs_row6 = vabsq_s16(row6);
- int16x8_t abs_row7 = vabsq_s16(row7);
-
- /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
- uint16x8_t row0_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row0, vshrq_n_s16(row0, 15)));
- uint16x8_t row1_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row1, vshrq_n_s16(row1, 15)));
- uint16x8_t row2_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row2, vshrq_n_s16(row2, 15)));
- uint16x8_t row3_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row3, vshrq_n_s16(row3, 15)));
- uint16x8_t row4_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row4, vshrq_n_s16(row4, 15)));
- uint16x8_t row5_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row5, vshrq_n_s16(row5, 15)));
- uint16x8_t row6_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row6, vshrq_n_s16(row6, 15)));
- uint16x8_t row7_diff =
- vreinterpretq_u16_s16(veorq_s16(abs_row7, vshrq_n_s16(row7, 15)));
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
* means that the corresponding coefficient != 0.
*/
- uint8x8_t abs_row0_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row0),
- vdupq_n_u16(0)));
- uint8x8_t abs_row1_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row1),
- vdupq_n_u16(0)));
- uint8x8_t abs_row2_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row2),
- vdupq_n_u16(0)));
- uint8x8_t abs_row3_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row3),
- vdupq_n_u16(0)));
- uint8x8_t abs_row4_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row4),
- vdupq_n_u16(0)));
- uint8x8_t abs_row5_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row5),
- vdupq_n_u16(0)));
- uint8x8_t abs_row6_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row6),
- vdupq_n_u16(0)));
- uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
- vdupq_n_u16(0)));
+ uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+ uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+ uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+ uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+ uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+ uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+ uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+ uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+ uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+ vreinterpretq_u8_u16(row0_ne_0));
+ uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+ vreinterpretq_u8_u16(row2_ne_0));
+ uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+ vreinterpretq_u8_u16(row4_ne_0));
+ uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+ vreinterpretq_u8_u16(row6_ne_0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
- const uint8x8_t bitmap_mask =
- vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+ const uint8x16_t bitmap_mask =
+ vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
- abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
- abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
- abs_row2_gt0 = vand_u8(abs_row2_gt0, bitmap_mask);
- abs_row3_gt0 = vand_u8(abs_row3_gt0, bitmap_mask);
- abs_row4_gt0 = vand_u8(abs_row4_gt0, bitmap_mask);
- abs_row5_gt0 = vand_u8(abs_row5_gt0, bitmap_mask);
- abs_row6_gt0 = vand_u8(abs_row6_gt0, bitmap_mask);
- abs_row7_gt0 = vand_u8(abs_row7_gt0, bitmap_mask);
+ uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+ uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
- uint8x8_t bitmap_rows_10 = vpadd_u8(abs_row1_gt0, abs_row0_gt0);
- uint8x8_t bitmap_rows_32 = vpadd_u8(abs_row3_gt0, abs_row2_gt0);
- uint8x8_t bitmap_rows_54 = vpadd_u8(abs_row5_gt0, abs_row4_gt0);
- uint8x8_t bitmap_rows_76 = vpadd_u8(abs_row7_gt0, abs_row6_gt0);
- uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
- uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
- uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+ uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+ uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+ uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+ bitmap_rows_3210);
+ uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+ vget_high_u8(bitmap_rows_76543210));
/* Shift left to remove DC bit. */
bitmap_all =
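vtstq_s16(x, x) sets a lane to all-ones exactly where x != 0, so the rewritten code gets each row's non-zero mask in a single instruction instead of a compare against a zero vector, and the vuzp1q/vpaddq sequence then packs the eight row masks into one 64-bit bitmap. A scalar sketch of the value being built (hypothetical helper; bit 63 corresponds to coefficient 0 of the zig-zag-ordered block, which is why the code shifts left afterward to drop the DC bit):

#include <stdint.h>

static uint64_t build_nonzero_bitmap(const int16_t zigzag_block[64])
{
  uint64_t bitmap = 0;
  int i;

  for (i = 0; i < 64; i++) {
    bitmap <<= 1;
    bitmap |= (zigzag_block[i] != 0);  /* coefficient 0 ends up at bit 63 */
  }
  return bitmap;
}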
@@ -241,16 +222,16 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
/* Encode DC coefficient. */
+ /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+ int16x8_t abs_row0 = vabsq_s16(row0);
+ int16x8_t row0_lz = vclzq_s16(abs_row0);
+ uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+ uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
/* Find nbits required to specify sign and amplitude of coefficient. */
-#if defined(_MSC_VER) && !defined(__clang__)
- unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
-#else
- unsigned int lz;
- __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
-#endif
- unsigned int nbits = 32 - lz;
+ unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+ unsigned int nbits = 16 - lz;
/* Emit Huffman-coded symbol and additional diff bits. */
- unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
+ unsigned int diff = vgetq_lane_u16(row0_diff, 0);
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
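The rewritten DC path derives everything from the 16-bit leading-zero count: nbits = 16 - clz16(abs(coeff)), and the diff bits are abs(coeff) XORed with a mask that is all-ones in the low nbits for negative coefficients (equivalent to abs(coeff) - 1 there, as the comment notes). A scalar equivalent, assuming GCC/Clang's __builtin_clz():

#include <stdint.h>

/* Sketch only: scalar version of the Neon DC encoding above. */
static void encode_dc_bits(int16_t coeff, unsigned int *nbits,
                           unsigned int *diff)
{
  unsigned int absval = (uint16_t)(coeff < 0 ? -coeff : coeff);
  unsigned int lz = absval ? __builtin_clz(absval) - 16 : 16;  /* clz16 */

  *nbits = 16 - lz;
  /* Negative coefficient: flip the low nbits of the magnitude, which is
   * the same as subtracting 1 in an nbits-wide field.
   */
  *diff = (coeff < 0) ? absval ^ ((1u << *nbits) - 1) : absval;
}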
@@ -263,13 +244,20 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
/* The most efficient method of computing nbits and diff depends on the
* number of non-zero coefficients. If the bitmap is not too sparse (> 8
- * non-zero AC coefficients), it is beneficial to use Neon; else we compute
- * nbits and diff on demand using scalar code.
+ * non-zero AC coefficients), it is beneficial to do all of the work using
+ * Neon; else we do some of the work using Neon and the rest on demand using
+ * scalar code.
*/
if (non_zero_coefficients > 8) {
uint8_t block_nbits[DCTSIZE2];
- int16x8_t row0_lz = vclzq_s16(abs_row0);
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
@@ -277,49 +265,48 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
+ /* Narrow leading zero count to 8 bits. */
+ uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+ vreinterpretq_u8_s16(row1_lz));
+ uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+ vreinterpretq_u8_s16(row3_lz));
+ uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+ vreinterpretq_u8_s16(row5_lz));
+ uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+ vreinterpretq_u8_s16(row7_lz));
/* Compute nbits needed to specify magnitude of each coefficient. */
- uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
- uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
- uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
- uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
- uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
- uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
- uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
- uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
- vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+ uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+ uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+ uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+ uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
/* Store nbits. */
- vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
- vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
- vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
- vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
- vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
- vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
- vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
- vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+ vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+ vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+ vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+ vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
/* Mask bits not required to specify sign and amplitude of diff. */
- row0_diff = vshlq_u16(row0_diff, row0_lz);
- row1_diff = vshlq_u16(row1_diff, row1_lz);
- row2_diff = vshlq_u16(row2_diff, row2_lz);
- row3_diff = vshlq_u16(row3_diff, row3_lz);
- row4_diff = vshlq_u16(row4_diff, row4_lz);
- row5_diff = vshlq_u16(row5_diff, row5_lz);
- row6_diff = vshlq_u16(row6_diff, row6_lz);
- row7_diff = vshlq_u16(row7_diff, row7_lz);
- row0_diff = vshlq_u16(row0_diff, vnegq_s16(row0_lz));
- row1_diff = vshlq_u16(row1_diff, vnegq_s16(row1_lz));
- row2_diff = vshlq_u16(row2_diff, vnegq_s16(row2_lz));
- row3_diff = vshlq_u16(row3_diff, vnegq_s16(row3_lz));
- row4_diff = vshlq_u16(row4_diff, vnegq_s16(row4_lz));
- row5_diff = vshlq_u16(row5_diff, vnegq_s16(row5_lz));
- row6_diff = vshlq_u16(row6_diff, vnegq_s16(row6_lz));
- row7_diff = vshlq_u16(row7_diff, vnegq_s16(row7_lz));
+ uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+ uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+ uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+ uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+ uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+ uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+ uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+ /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ row1_mask);
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ row2_mask);
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ row3_mask);
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ row4_mask);
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ row5_mask);
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ row6_mask);
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ row7_mask);
/* Store diff bits. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
@@ -349,7 +336,14 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
}
} else if (bitmap != 0) {
uint16_t block_abs[DCTSIZE2];
- /* Store absolute value of coefficients. */
+ /* Compute and store absolute value of coefficients. */
+ int16x8_t abs_row1 = vabsq_s16(row1);
+ int16x8_t abs_row2 = vabsq_s16(row2);
+ int16x8_t abs_row3 = vabsq_s16(row3);
+ int16x8_t abs_row4 = vabsq_s16(row4);
+ int16x8_t abs_row5 = vabsq_s16(row5);
+ int16x8_t abs_row6 = vabsq_s16(row6);
+ int16x8_t abs_row7 = vabsq_s16(row7);
vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
@@ -358,7 +352,21 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
- /* Store diff bits. */
+ /* Compute diff bits (without nbits mask) and store. */
+ uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+ vcltzq_s16(row1));
+ uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+ vcltzq_s16(row2));
+ uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+ vcltzq_s16(row3));
+ uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+ vcltzq_s16(row4));
+ uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+ vcltzq_s16(row5));
+ uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+ vcltzq_s16(row6));
+ uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+ vcltzq_s16(row7));
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
@@ -375,7 +383,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
bitmap <<= r;
lz = BUILTIN_CLZ(block_abs[i]);
nbits = 32 - lz;
- diff = (unsigned int)(block_diff[i] << lz) >> lz;
+ diff = ((unsigned int)block_diff[i] << lz) >> lz;
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
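The one-line change to diff above fixes an integer-promotion hazard: block_diff[i] is a uint16_t, which promotes to signed int before the shift, and since lz here is at least 16 (it is the 32-bit leading-zero count of a 16-bit magnitude), the old form could shift a set bit into the sign bit, which is undefined behavior. Casting before shifting keeps the whole expression unsigned:

#include <stdint.h>

/* Sketch only: the masking expression in isolation. */
static unsigned int mask_diff(uint16_t block_diff_i, unsigned int lz)
{
  /* Old: (unsigned int)(block_diff_i << lz) >> lz -- the uint16_t operand
   * promotes to signed int, so << lz with lz >= 16 can overflow into the
   * sign bit.  Casting first makes both shifts operate on unsigned int.
   */
  return ((unsigned int)block_diff_i << lz) >> lz;
}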
diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c
index 8570b82c..358e1597 100644
--- a/simd/arm/aarch64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -3,8 +3,8 @@
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
* Copyright (C) 2020, Arm Limited.
*
* Based on the x86 SIMD extension for IJG JPEG library,
@@ -23,20 +23,17 @@
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
-#include "jconfigint.h"
-#include <stdio.h>
-#include <string.h>
#include <ctype.h>
#define JSIMD_FASTLD3 1
#define JSIMD_FASTST3 2
#define JSIMD_FASTTBL 4
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
- JSIMD_FASTTBL;
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 |
+ JSIMD_FASTST3 | JSIMD_FASTTBL;
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
@@ -111,8 +108,6 @@ parse_proc_cpuinfo(int bufsize)
/*
* Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
*/
/*
@@ -125,7 +120,7 @@ LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
- char *env = NULL;
+ char env[2] = { 0 };
#endif
#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
int bufsize = 1024; /* an initial guess for the line buffer size limit */
@@ -147,24 +142,19 @@ init_simd(void)
#ifndef NO_GETENV
/* Force different settings through environment variables */
- env = getenv("JSIMD_FORCENEON");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
simd_support = JSIMD_NEON;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
simd_huffman = 0;
- env = getenv("JSIMD_FASTLD3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
simd_features |= JSIMD_FASTLD3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
simd_features &= ~JSIMD_FASTLD3;
- env = getenv("JSIMD_FASTST3");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
simd_features |= JSIMD_FASTST3;
- if ((env != NULL) && (strcmp(env, "0") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
simd_features &= ~JSIMD_FASTST3;
#endif
}
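JSIMD_FASTLD3 and JSIMD_FASTST3 are tri-state overrides: unset leaves the detected default alone, "1" forces the fast path on, and "0" forces it off, which is why each variable is now read twice. A compact sketch of the same handling, with a hypothetical env_tristate() helper built on the same GETENV_S wrapper:

#include <string.h>

/* Sketch only: returns 1 or 0 if the variable is set to "1" or "0",
 * otherwise -1 (unset or unrecognized).
 */
static int env_tristate(const char *name)
{
  char env[2] = { 0 };

  if (GETENV_S(env, 2, name))
    return -1;
  if (!strcmp(env, "1"))
    return 1;
  if (!strcmp(env, "0"))
    return 0;
  return -1;
}

With such a helper, the block above reduces to setting or clearing JSIMD_FASTLD3/JSIMD_FASTST3 in simd_features when the return value is 1 or 0.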
@@ -1028,7 +1018,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
+ int Al, UJCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
@@ -1055,7 +1045,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
+ int Al, UJCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_neon(block,
jpeg_natural_order_start,
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index b91c5db4..51db3c5f 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -2,6 +2,8 @@
* jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2022, Matthieu Darbois. All Rights Reserved.
+ * Copyright (C) 2022, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -21,7 +23,6 @@
*/
#define JPEG_INTERNALS
-#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
@@ -41,10 +42,10 @@
void jsimd_encode_mcu_AC_first_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *values, size_t *zerobits)
+ UJCOEF *values, size_t *zerobits)
{
- JCOEF *values_ptr = values;
- JCOEF *diff_values_ptr = values + DCTSIZE2;
+ UJCOEF *values_ptr = values;
+ UJCOEF *diff_values_ptr = values + DCTSIZE2;
/* Rows of coefficients to zero (since they haven't been processed) */
int i, rows_to_zero = 8;
@@ -68,23 +69,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
/* Isolate sign of coefficients. */
- int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
- int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+ uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
/* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
/* Compute diff values. */
- int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
- int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+ uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+ uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
/* Store transformed coefficients and diff values. */
- vst1q_s16(values_ptr, coefs1);
- vst1q_s16(values_ptr + DCTSIZE, coefs2);
- vst1q_s16(diff_values_ptr, diff1);
- vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ vst1q_u16(values_ptr, abs_coefs1);
+ vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+ vst1q_u16(diff_values_ptr, diff1);
+ vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
values_ptr += 16;
diff_values_ptr += 16;
jpeg_natural_order_start += 16;
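This first AC-scan pass produces two arrays per block: the point-transformed magnitudes (abs(coeff) >> Al, per the successive-approximation spec) and the matching diff bits (magnitude XOR sign mask). Per coefficient, the scalar equivalent is roughly (a sketch with hypothetical names; arithmetic right shift assumed, as on all mainstream compilers):

#include <stdint.h>

static void prepare_coef(int16_t coeff, int Al, uint16_t *value,
                         uint16_t *diff)
{
  uint16_t sign = (uint16_t)(coeff >> 15);           /* 0x0000 or 0xFFFF */
  uint16_t absval = (uint16_t)(coeff < 0 ? -coeff : coeff);

  *value = absval >> Al;   /* point-transformed magnitude */
  *diff = *value ^ sign;   /* low nbits equal *value - 1 when negative */
}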
@@ -130,23 +131,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
}
/* Isolate sign of coefficients. */
- int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
- int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+ uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+ uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
/* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
/* Compute diff values. */
- int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
- int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+ uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+ uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
/* Store transformed coefficients and diff values. */
- vst1q_s16(values_ptr, coefs1);
- vst1q_s16(values_ptr + DCTSIZE, coefs2);
- vst1q_s16(diff_values_ptr, diff1);
- vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+ vst1q_u16(values_ptr, abs_coefs1);
+ vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+ vst1q_u16(diff_values_ptr, diff1);
+ vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
values_ptr += 16;
diff_values_ptr += 16;
rows_to_zero -= 2;
@@ -184,17 +185,17 @@ void jsimd_encode_mcu_AC_first_prepare_neon
}
/* Isolate sign of coefficients. */
- int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+ uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
/* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs = vabsq_s16(coefs);
- coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+ uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+ abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
/* Compute diff values. */
- int16x8_t diff = veorq_s16(coefs, sign_coefs);
+ uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
/* Store transformed coefficients and diff values. */
- vst1q_s16(values_ptr, coefs);
- vst1q_s16(diff_values_ptr, diff);
+ vst1q_u16(values_ptr, abs_coefs);
+ vst1q_u16(diff_values_ptr, diff);
values_ptr += 8;
diff_values_ptr += 8;
rows_to_zero--;
@@ -202,8 +203,8 @@ void jsimd_encode_mcu_AC_first_prepare_neon
/* Zero remaining memory in the values and diff_values blocks. */
for (i = 0; i < rows_to_zero; i++) {
- vst1q_s16(values_ptr, vdupq_n_s16(0));
- vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+ vst1q_u16(values_ptr, vdupq_n_u16(0));
+ vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
values_ptr += 8;
diff_values_ptr += 8;
}
@@ -211,23 +212,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
/* Construct zerobits bitmap. A set bit means that the corresponding
* coefficient != 0.
*/
- int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
- int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
- int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
- int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
- int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
- int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
- int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
- int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
-
- uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
- uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
- uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
- uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
- uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
- uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
- uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
- uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+ uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
+ uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
+ uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
+ uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
+ uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
+ uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
+ uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
+ uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
+
+ uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
+ uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
+ uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
+ uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
+ uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
+ uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
+ uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
+ uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
const uint8x8_t bitmap_mask =
@@ -274,7 +275,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon
int jsimd_encode_mcu_AC_refine_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *absvalues, size_t *bits)
+ UJCOEF *absvalues, size_t *bits)
{
/* Temporary storage buffers for data used to compute the signbits bitmap and
* the end-of-block (EOB) position
@@ -282,7 +283,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
uint8_t coef_sign_bits[64];
uint8_t coef_eq1_bits[64];
- JCOEF *absvalues_ptr = absvalues;
+ UJCOEF *absvalues_ptr = absvalues;
uint8_t *coef_sign_bits_ptr = coef_sign_bits;
uint8_t *eq1_bits_ptr = coef_eq1_bits;
@@ -316,18 +317,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
/* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
- vst1q_s16(absvalues_ptr, coefs1);
- vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_u16(absvalues_ptr, abs_coefs1);
+ vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
- uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
- uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq11);
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
@@ -385,18 +386,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
/* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs1 = vabsq_s16(coefs1);
- int16x8_t abs_coefs2 = vabsq_s16(coefs2);
- coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
- coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
- vst1q_s16(absvalues_ptr, coefs1);
- vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+ uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+ uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+ abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+ abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+ vst1q_u16(absvalues_ptr, abs_coefs1);
+ vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
- uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
- uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+ uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq11);
vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
@@ -444,14 +445,14 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
vst1_u8(coef_sign_bits_ptr, sign_coefs);
/* Compute absolute value of coefficients and apply point transform Al. */
- int16x8_t abs_coefs = vabsq_s16(coefs);
- coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
- vst1q_s16(absvalues_ptr, coefs);
+ uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+ abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+ vst1q_u16(absvalues_ptr, abs_coefs);
/* Test whether transformed coefficient values == 1 (used to find EOB
* position.)
*/
- uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+ uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
vst1_u8(eq1_bits_ptr, coefs_eq1);
absvalues_ptr += 8;
@@ -462,7 +463,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
/* Zero remaining memory in blocks. */
for (i = 0; i < rows_to_zero; i++) {
- vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+ vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
absvalues_ptr += 8;
@@ -471,23 +472,23 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
}
/* Construct zerobits bitmap. */
- int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
- int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
- int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
- int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
- int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
- int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
- int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
- int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
-
- uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
- uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
- uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
- uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
- uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
- uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
- uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
- uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+ uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
+ uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
+ uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
+ uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
+ uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
+ uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
+ uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
+ uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
+
+ uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
+ uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
+ uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
+ uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
+ uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
+ uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
+ uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
+ uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
/* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
const uint8x8_t bitmap_mask =
diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
index ea4668f1..28dbc572 100644
--- a/simd/arm/jdcolor-neon.c
+++ b/simd/arm/jdcolor-neon.c
@@ -21,7 +21,6 @@
*/
#define JPEG_INTERNALS
-#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
diff --git a/simd/arm/jdmerge-neon.c b/simd/arm/jdmerge-neon.c
index e4f91fdc..18fb9d8a 100644
--- a/simd/arm/jdmerge-neon.c
+++ b/simd/arm/jdmerge-neon.c
@@ -21,7 +21,6 @@
*/
#define JPEG_INTERNALS
-#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
diff --git a/simd/arm/jidctint-neon.c b/simd/arm/jidctint-neon.c
index 043b652e..d25112ef 100644
--- a/simd/arm/jidctint-neon.c
+++ b/simd/arm/jidctint-neon.c
@@ -22,7 +22,6 @@
*/
#define JPEG_INTERNALS
-#include "jconfigint.h"
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
diff --git a/simd/arm/jquanti-neon.c b/simd/arm/jquanti-neon.c
index a7eb6f19..d5d95d89 100644
--- a/simd/arm/jquanti-neon.c
+++ b/simd/arm/jquanti-neon.c
@@ -1,7 +1,7 @@
/*
* jquanti-neon.c - sample data conversion and quantization (Arm Neon)
*
- * Copyright (C) 2020, Arm Limited. All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
@@ -100,6 +100,9 @@ void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
int i;
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
/* Load DCT coefficients. */
int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
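#pragma unroll is a Clang extension, so the new guard restricts it to Clang targeting AArch64 (covering both __aarch64__ and MSVC-style _M_ARM64 builds), where fully unrolling this two-iteration loop helps; other compilers never see the pragma. The pattern in isolation, around a hypothetical loop:

/* Sketch only: a compile-time-guarded unroll hint with no portability
 * cost for other compilers.
 */
static void double_rows(short *rows, int n)
{
#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
#pragma unroll
#endif
  for (int i = 0; i < n; i++)
    rows[i] = (short)(rows[i] << 1);
}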
diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c
index 563949a0..b429b0a5 100644
--- a/simd/i386/jsimd.c
+++ b/simd/i386/jsimd.c
@@ -2,8 +2,8 @@
* jsimd_i386.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +21,6 @@
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
-#include "jconfigint.h"
/*
* In the PIC cases, we have no guarantee that constants will keep
@@ -32,19 +31,17 @@
#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
-static unsigned int simd_support = (unsigned int)(~0);
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
/*
* Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
*/
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
- char *env = NULL;
+ char env[2] = { 0 };
#endif
if (simd_support != ~0U)
@@ -54,26 +51,19 @@ init_simd(void)
#ifndef NO_GETENV
/* Force different settings through environment variables */
- env = getenv("JSIMD_FORCEMMX");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
simd_support &= JSIMD_MMX;
- env = getenv("JSIMD_FORCE3DNOW");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
simd_support &= JSIMD_3DNOW | JSIMD_MMX;
- env = getenv("JSIMD_FORCESSE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
simd_support &= JSIMD_SSE | JSIMD_MMX;
- env = getenv("JSIMD_FORCESSE2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
simd_support &= JSIMD_SSE2;
- env = getenv("JSIMD_FORCEAVX2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
simd_support &= JSIMD_AVX2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
simd_huffman = 0;
#endif
}
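Note the asymmetry in the overrides above: the JSIMD_FORCE* variables AND into simd_support rather than assign it, so forcing a level can only disable capabilities the CPU actually reports, never enable ones it lacks (JSIMD_FORCENONE, which assigns 0, is the exception). In miniature, assuming the JSIMD_* flag values from jsimd.h:

/* Sketch only: JSIMD_FORCESSE on a CPU that reports MMX/SSE/SSE2. */
static const unsigned int detected = JSIMD_MMX | JSIMD_SSE | JSIMD_SSE2;
static const unsigned int forced = detected & (JSIMD_SSE | JSIMD_MMX);
/* forced == JSIMD_MMX | JSIMD_SSE; SSE2 is dropped, AVX2 was never set. */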
@@ -168,6 +158,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_extrgb_ycc_convert_avx2;
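Because simd_support is now thread-local, a thread can reach any of these dispatch functions before init_simd() has run on that thread, so every entry point gains the lazy-initialization guard shown above. The pattern in isolation, with a hypothetical entry point:

static THREAD_LOCAL unsigned int simd_support = ~0u;  /* ~0 = not yet probed */

static void init_simd(void);  /* probes the CPU and clears the sentinel */

void jsimd_example_entry(void)
{
  if (simd_support == ~0u)
    init_simd();  /* first call on this thread populates the flags */

  /* ... dispatch on simd_support as above ... */
}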
@@ -227,6 +220,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_extrgb_gray_convert_avx2;
@@ -286,6 +282,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_ycc_extrgb_convert_avx2;
@@ -389,6 +388,9 @@ GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor,
@@ -409,6 +411,9 @@ GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor,
@@ -471,6 +476,9 @@ GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
@@ -486,6 +494,9 @@ GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
@@ -547,6 +558,9 @@ GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
@@ -565,6 +579,9 @@ GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
@@ -633,6 +650,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
@@ -691,6 +711,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
@@ -795,6 +818,9 @@ GLOBAL(void)
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_convsamp_avx2(sample_data, start_col, workspace);
else if (simd_support & JSIMD_SSE2)
@@ -807,6 +833,9 @@ GLOBAL(void)
jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
FAST_FLOAT *workspace)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_SSE2)
jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
else if (simd_support & JSIMD_SSE)
@@ -877,6 +906,9 @@ jsimd_can_fdct_float(void)
GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_fdct_islow_avx2(data);
else if (simd_support & JSIMD_SSE2)
@@ -888,6 +920,9 @@ jsimd_fdct_islow(DCTELEM *data)
GLOBAL(void)
jsimd_fdct_ifast(DCTELEM *data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
jsimd_fdct_ifast_sse2(data);
else
@@ -897,6 +932,9 @@ jsimd_fdct_ifast(DCTELEM *data)
GLOBAL(void)
jsimd_fdct_float(FAST_FLOAT *data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
jsimd_fdct_float_sse(data);
else if (simd_support & JSIMD_3DNOW)
@@ -952,6 +990,9 @@ jsimd_can_quantize_float(void)
GLOBAL(void)
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_quantize_avx2(coef_block, divisors, workspace);
else if (simd_support & JSIMD_SSE2)
@@ -964,6 +1005,9 @@ GLOBAL(void)
jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
FAST_FLOAT *workspace)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_SSE2)
jsimd_quantize_float_sse2(coef_block, divisors, workspace);
else if (simd_support & JSIMD_SSE)
@@ -1027,6 +1071,9 @@ jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
output_col);
@@ -1039,6 +1086,9 @@ jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
output_col);
@@ -1133,6 +1183,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
output_col);
@@ -1149,6 +1202,9 @@ jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
output_col);
@@ -1162,6 +1218,9 @@ jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
output_col);
@@ -1219,7 +1278,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
+ int Al, UJCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
@@ -1245,7 +1304,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
+ int Al, UJCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
jpeg_natural_order_start,
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 64747c63..a28754ad 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,10 +2,10 @@
* simd/jsimd.h
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander.
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
* Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
* Copyright (C) 2020, Arm Limited.
*
@@ -1243,16 +1243,16 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
/* Progressive Huffman encoding */
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *values, size_t *zerobits);
+ UJCOEF *values, size_t *zerobits);
EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *values, size_t *zerobits);
+ UJCOEF *values, size_t *zerobits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *absvalues, size_t *bits);
+ UJCOEF *absvalues, size_t *bits);
EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
(const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
- JCOEF *absvalues, size_t *bits);
+ UJCOEF *absvalues, size_t *bits);
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index 00720283..9ea6df94 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -1,7 +1,7 @@
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
-; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
@@ -83,6 +83,7 @@ times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15
+times 1 << 15 db 16
alignz 32
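The table being extended here maps a coefficient magnitude to the number of bits needed to represent it: each "times 1 << (n-1) db n" row emits 2^(n-1) entries of value n, and the new row covers magnitudes 0x8000-0xFFFF (16 bits), which the encoder can now index. An equivalent construction in C, with a hypothetical table name:

#include <stdint.h>

static uint8_t nbits_table[65536];

static void init_nbits_table(void)
{
  int nbits, v = 1;

  nbits_table[0] = 0;
  for (nbits = 1; nbits <= 16; nbits++) {
    int count = 1 << (nbits - 1);   /* matches "times 1 << (n-1) db n" */
    while (count--)
      nbits_table[v++] = (uint8_t)nbits;
  }
}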
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
index eb766799..3f5ee77e 100644
--- a/simd/x86_64/jsimd.c
+++ b/simd/x86_64/jsimd.c
@@ -2,8 +2,8 @@
* jsimd_x86_64.c
*
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
*
* Based on the x86 SIMD extension for IJG JPEG library,
* Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +21,6 @@
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
-#include "jconfigint.h"
/*
* In the PIC cases, we have no guarantee that constants will keep
@@ -32,19 +31,17 @@
#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
#define IS_ALIGNED_AVX(ptr) (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
-static unsigned int simd_support = (unsigned int)(~0);
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
/*
* Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
*/
LOCAL(void)
init_simd(void)
{
#ifndef NO_GETENV
- char *env = NULL;
+ char env[2] = { 0 };
#endif
if (simd_support != ~0U)
@@ -54,17 +51,13 @@ init_simd(void)
#ifndef NO_GETENV
/* Force different settings through environment variables */
- env = getenv("JSIMD_FORCESSE2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
simd_support &= JSIMD_SSE2;
- env = getenv("JSIMD_FORCEAVX2");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
simd_support &= JSIMD_AVX2;
- env = getenv("JSIMD_FORCENONE");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
simd_support = 0;
- env = getenv("JSIMD_NOHUFFENC");
- if ((env != NULL) && (strcmp(env, "1") == 0))
+ if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
simd_huffman = 0;
#endif
}
@@ -152,6 +145,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_extrgb_ycc_convert_avx2;
@@ -201,6 +197,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->in_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_extrgb_gray_convert_avx2;
@@ -250,6 +249,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_ycc_extrgb_convert_avx2;
@@ -340,6 +342,9 @@ GLOBAL(void)
jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor,
@@ -356,6 +361,9 @@ GLOBAL(void)
jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
compptr->v_samp_factor,
@@ -410,6 +418,9 @@ GLOBAL(void)
jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
@@ -422,6 +433,9 @@ GLOBAL(void)
jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
input_data, output_data_ptr);
@@ -476,6 +490,9 @@ GLOBAL(void)
jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
@@ -490,6 +507,9 @@ GLOBAL(void)
jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
compptr->downsampled_width, input_data,
@@ -549,6 +569,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
@@ -597,6 +620,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+ if (simd_support == ~0U)
+ init_simd();
+
switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
@@ -686,6 +712,9 @@ GLOBAL(void)
jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
DCTELEM *workspace)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_convsamp_avx2(sample_data, start_col, workspace);
else
@@ -755,6 +784,9 @@ jsimd_can_fdct_float(void)
GLOBAL(void)
jsimd_fdct_islow(DCTELEM *data)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_fdct_islow_avx2(data);
else
@@ -816,6 +848,9 @@ jsimd_can_quantize_float(void)
GLOBAL(void)
jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_quantize_avx2(coef_block, divisors, workspace);
else
@@ -970,6 +1005,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
JCOEFPTR coef_block, JSAMPARRAY output_buf,
JDIMENSION output_col)
{
+ if (simd_support == ~0U)
+ init_simd();
+
if (simd_support & JSIMD_AVX2)
jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
output_col);
@@ -1040,7 +1078,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
GLOBAL(void)
jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *values, size_t *zerobits)
+ int Al, UJCOEF *values, size_t *zerobits)
{
jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
Sl, Al, values, zerobits);
@@ -1064,7 +1102,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
GLOBAL(int)
jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
const int *jpeg_natural_order_start, int Sl,
- int Al, JCOEF *absvalues, size_t *bits)
+ int Al, UJCOEF *absvalues, size_t *bits)
{
return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
jpeg_natural_order_start,