12 files changed, 393 insertions, 303 deletions
diff --git a/simd/arm/aarch32/jsimd.c b/simd/arm/aarch32/jsimd.c
index fac55dfb..04d64526 100644
--- a/simd/arm/aarch32/jsimd.c
+++ b/simd/arm/aarch32/jsimd.c
@@ -3,8 +3,8 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2019, Google LLC.
  * Copyright (C) 2020, Arm Limited.
  *
@@ -25,12 +25,10 @@
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
 
 #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
 
@@ -98,14 +96,12 @@ parse_proc_cpuinfo(int bufsize)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
 {
 #ifndef NO_GETENV
-  char *env = NULL;
+  char env[2] = { 0 };
 #endif
 #if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
   int bufsize = 1024; /* an initial guess for the line buffer size limit */
@@ -131,14 +127,11 @@ init_simd(void)
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENEON");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
     simd_support = JSIMD_NEON;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
     simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
     simd_huffman = 0;
 #endif
 }
@@ -950,7 +943,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -975,7 +968,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_neon(block,
                                                  jpeg_natural_order_start, Sl,
diff --git a/simd/arm/aarch64/jchuff-neon.c b/simd/arm/aarch64/jchuff-neon.c
index f13fd1b5..607a1160 100644
--- a/simd/arm/aarch64/jchuff-neon.c
+++ b/simd/arm/aarch64/jchuff-neon.c
@@ -2,7 +2,7 @@
  * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
  *
  * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
- * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -59,6 +59,17 @@ ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
    14,  15,  30,  31,  44,  45,  46,  47
 };
 
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address.  That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
 JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
                                          JCOEFPTR block, int last_dc_val,
                                          c_derived_tbl *dctbl,
@@ -158,73 +169,43 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
                                   7), row6, 5);
 
   /* DCT block is now in zig-zag order; start Huffman encoding process. */
-  int16x8_t abs_row0 = vabsq_s16(row0);
-  int16x8_t abs_row1 = vabsq_s16(row1);
-  int16x8_t abs_row2 = vabsq_s16(row2);
-  int16x8_t abs_row3 = vabsq_s16(row3);
-  int16x8_t abs_row4 = vabsq_s16(row4);
-  int16x8_t abs_row5 = vabsq_s16(row5);
-  int16x8_t abs_row6 = vabsq_s16(row6);
-  int16x8_t abs_row7 = vabsq_s16(row7);
-
-  /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
-  uint16x8_t row0_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row0, vshrq_n_s16(row0, 15)));
-  uint16x8_t row1_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row1, vshrq_n_s16(row1, 15)));
-  uint16x8_t row2_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row2, vshrq_n_s16(row2, 15)));
-  uint16x8_t row3_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row3, vshrq_n_s16(row3, 15)));
-  uint16x8_t row4_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row4, vshrq_n_s16(row4, 15)));
-  uint16x8_t row5_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row5, vshrq_n_s16(row5, 15)));
-  uint16x8_t row6_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row6, vshrq_n_s16(row6, 15)));
-  uint16x8_t row7_diff =
-    vreinterpretq_u16_s16(veorq_s16(abs_row7, vshrq_n_s16(row7, 15)));
 
   /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
    * means that the corresponding coefficient != 0.
    */
-  uint8x8_t abs_row0_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row0),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row1_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row1),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row2_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row2),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row3_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row3),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row4_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row4),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row5_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row5),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row6_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row6),
-                                               vdupq_n_u16(0)));
-  uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
-                                               vdupq_n_u16(0)));
+  uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+  uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+  uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+  uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+  uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+  uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+  uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+  uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+  uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+                                    vreinterpretq_u8_u16(row0_ne_0));
+  uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+                                    vreinterpretq_u8_u16(row2_ne_0));
+  uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+                                    vreinterpretq_u8_u16(row4_ne_0));
+  uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+                                    vreinterpretq_u8_u16(row6_ne_0));
 
   /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
-  const uint8x8_t bitmap_mask =
-    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+  const uint8x16_t bitmap_mask =
+    vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
 
-  abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
-  abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
-  abs_row2_gt0 = vand_u8(abs_row2_gt0, bitmap_mask);
-  abs_row3_gt0 = vand_u8(abs_row3_gt0, bitmap_mask);
-  abs_row4_gt0 = vand_u8(abs_row4_gt0, bitmap_mask);
-  abs_row5_gt0 = vand_u8(abs_row5_gt0, bitmap_mask);
-  abs_row6_gt0 = vand_u8(abs_row6_gt0, bitmap_mask);
-  abs_row7_gt0 = vand_u8(abs_row7_gt0, bitmap_mask);
+  uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
 
-  uint8x8_t bitmap_rows_10 = vpadd_u8(abs_row1_gt0, abs_row0_gt0);
-  uint8x8_t bitmap_rows_32 = vpadd_u8(abs_row3_gt0, abs_row2_gt0);
-  uint8x8_t bitmap_rows_54 = vpadd_u8(abs_row5_gt0, abs_row4_gt0);
-  uint8x8_t bitmap_rows_76 = vpadd_u8(abs_row7_gt0, abs_row6_gt0);
-  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
-  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
-  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+  uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+                                              bitmap_rows_3210);
+  uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+                                  vget_high_u8(bitmap_rows_76543210));
 
   /* Shift left to remove DC bit. */
   bitmap_all =
@@ -241,16 +222,16 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 
   /* Encode DC coefficient. */
 
+  /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
   /* Find nbits required to specify sign and amplitude of coefficient. */
-#if defined(_MSC_VER) && !defined(__clang__)
-  unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
-#else
-  unsigned int lz;
-  __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
-#endif
-  unsigned int nbits = 32 - lz;
+  unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+  unsigned int nbits = 16 - lz;
   /* Emit Huffman-coded symbol and additional diff bits. */
-  unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
+  unsigned int diff = vgetq_lane_u16(row0_diff, 0);
   PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
 
   /* Encode AC coefficients. */
@@ -263,13 +244,20 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
 
   /* The most efficient method of computing nbits and diff depends on the
    * number of non-zero coefficients.  If the bitmap is not too sparse (> 8
-   * non-zero AC coefficients), it is beneficial to use Neon; else we compute
-   * nbits and diff on demand using scalar code.
+   * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; else we do some of the work using Neon and the rest on demand using
+   * scalar code.
    */
   if (non_zero_coefficients > 8) {
     uint8_t block_nbits[DCTSIZE2];
 
-    int16x8_t row0_lz = vclzq_s16(abs_row0);
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
     int16x8_t row1_lz = vclzq_s16(abs_row1);
     int16x8_t row2_lz = vclzq_s16(abs_row2);
     int16x8_t row3_lz = vclzq_s16(abs_row3);
@@ -277,49 +265,48 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
     int16x8_t row5_lz = vclzq_s16(abs_row5);
     int16x8_t row6_lz = vclzq_s16(abs_row6);
     int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Narrow leading zero count to 8 bits. */
+    uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+                                    vreinterpretq_u8_s16(row1_lz));
+    uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+                                    vreinterpretq_u8_s16(row3_lz));
+    uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+                                    vreinterpretq_u8_s16(row5_lz));
+    uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+                                    vreinterpretq_u8_s16(row7_lz));
     /* Compute nbits needed to specify magnitude of each coefficient. */
-    uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
-    uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
-    uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
-    uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
-    uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
-    uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
-    uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
-    uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
-                                   vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+    uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+    uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+    uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+    uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
     /* Store nbits. */
-    vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
-    vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
-    vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
-    vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
-    vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
-    vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
-    vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
-    vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+    vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+    vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+    vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+    vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
     /* Mask bits not required to specify sign and amplitude of diff. */
-    row0_diff = vshlq_u16(row0_diff, row0_lz);
-    row1_diff = vshlq_u16(row1_diff, row1_lz);
-    row2_diff = vshlq_u16(row2_diff, row2_lz);
-    row3_diff = vshlq_u16(row3_diff, row3_lz);
-    row4_diff = vshlq_u16(row4_diff, row4_lz);
-    row5_diff = vshlq_u16(row5_diff, row5_lz);
-    row6_diff = vshlq_u16(row6_diff, row6_lz);
-    row7_diff = vshlq_u16(row7_diff, row7_lz);
-    row0_diff = vshlq_u16(row0_diff, vnegq_s16(row0_lz));
-    row1_diff = vshlq_u16(row1_diff, vnegq_s16(row1_lz));
-    row2_diff = vshlq_u16(row2_diff, vnegq_s16(row2_lz));
-    row3_diff = vshlq_u16(row3_diff, vnegq_s16(row3_lz));
-    row4_diff = vshlq_u16(row4_diff, vnegq_s16(row4_lz));
-    row5_diff = vshlq_u16(row5_diff, vnegq_s16(row5_lz));
-    row6_diff = vshlq_u16(row6_diff, vnegq_s16(row6_lz));
-    row7_diff = vshlq_u16(row7_diff, vnegq_s16(row7_lz));
+    uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+    uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+    uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+    uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+    uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+    uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+    uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+    /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     row1_mask);
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     row2_mask);
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     row3_mask);
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     row4_mask);
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     row5_mask);
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     row6_mask);
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     row7_mask);
     /* Store diff bits. */
     vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
     vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
@@ -349,7 +336,14 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
     }
   } else if (bitmap != 0) {
     uint16_t block_abs[DCTSIZE2];
-    /* Store absolute value of coefficients. */
+    /* Compute and store absolute value of coefficients. */
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
     vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
     vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
     vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
@@ -358,7 +352,21 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
     vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
     vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
     vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
-    /* Store diff bits. */
+    /* Compute diff bits (without nbits mask) and store. */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     vcltzq_s16(row1));
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     vcltzq_s16(row2));
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     vcltzq_s16(row3));
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     vcltzq_s16(row4));
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     vcltzq_s16(row5));
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     vcltzq_s16(row6));
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     vcltzq_s16(row7));
     vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
     vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
     vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
@@ -375,7 +383,7 @@ JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
       bitmap <<= r;
       lz = BUILTIN_CLZ(block_abs[i]);
       nbits = 32 - lz;
-      diff = (unsigned int)(block_diff[i] << lz) >> lz;
+      diff = ((unsigned int)block_diff[i] << lz) >> lz;
       while (r > 15) {
         /* If run length > 15, emit special run-length-16 codes. */
         PUT_BITS(code_0xf0, size_0xf0)
diff --git a/simd/arm/aarch64/jsimd.c b/simd/arm/aarch64/jsimd.c
index 8570b82c..358e1597 100644
--- a/simd/arm/aarch64/jsimd.c
+++ b/simd/arm/aarch64/jsimd.c
@@ -3,8 +3,8 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
@@ -23,20 +23,17 @@
 #include "../../../jdct.h"
 #include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "jconfigint.h"
 
-#include <stdio.h>
-#include <string.h>
 #include <ctype.h>
 
 #define JSIMD_FASTLD3  1
 #define JSIMD_FASTST3  2
 #define JSIMD_FASTTBL  4
 
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
-                                    JSIMD_FASTTBL;
+static THREAD_LOCAL unsigned int simd_support = ~0;
+static THREAD_LOCAL unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_features = JSIMD_FASTLD3 |
+                                                 JSIMD_FASTST3 | JSIMD_FASTTBL;
 
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
 
@@ -111,8 +108,6 @@ parse_proc_cpuinfo(int bufsize)
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 
 /*
@@ -125,7 +120,7 @@ LOCAL(void)
 init_simd(void)
 {
 #ifndef NO_GETENV
-  char *env = NULL;
+  char env[2] = { 0 };
 #endif
 #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
   int bufsize = 1024; /* an initial guess for the line buffer size limit */
@@ -147,24 +142,19 @@ init_simd(void)
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENEON");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
     simd_support = JSIMD_NEON;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
     simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
     simd_huffman = 0;
-  env = getenv("JSIMD_FASTLD3");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
     simd_features |= JSIMD_FASTLD3;
-  if ((env != NULL) && (strcmp(env, "0") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
     simd_features &= ~JSIMD_FASTLD3;
-  env = getenv("JSIMD_FASTST3");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
     simd_features |= JSIMD_FASTST3;
-  if ((env != NULL) && (strcmp(env, "0") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
     simd_features &= ~JSIMD_FASTST3;
 #endif
 }
@@ -1028,7 +1018,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -1055,7 +1045,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_neon(block,
                                                  jpeg_natural_order_start,
diff --git a/simd/arm/jcphuff-neon.c b/simd/arm/jcphuff-neon.c
index b91c5db4..51db3c5f 100644
--- a/simd/arm/jcphuff-neon.c
+++ b/simd/arm/jcphuff-neon.c
@@ -2,6 +2,8 @@
  * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
  *
  * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
+ * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -21,7 +23,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
@@ -41,10 +42,10 @@
 
 void jsimd_encode_mcu_AC_first_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits)
+   UJCOEF *values, size_t *zerobits)
 {
-  JCOEF *values_ptr = values;
-  JCOEF *diff_values_ptr = values + DCTSIZE2;
+  UJCOEF *values_ptr = values;
+  UJCOEF *diff_values_ptr = values + DCTSIZE2;
 
   /* Rows of coefficients to zero (since they haven't been processed) */
   int i, rows_to_zero = 8;
@@ -68,23 +69,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
     coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
 
     /* Isolate sign of coefficients. */
-    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
-    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 
     /* Compute diff values. */
-    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
-    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
 
     /* Store transformed coefficients and diff values. */
-    vst1q_s16(values_ptr, coefs1);
-    vst1q_s16(values_ptr + DCTSIZE, coefs2);
-    vst1q_s16(diff_values_ptr, diff1);
-    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
     values_ptr += 16;
     diff_values_ptr += 16;
     jpeg_natural_order_start += 16;
@@ -130,23 +131,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
     }
 
     /* Isolate sign of coefficients. */
-    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
-    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
+    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
 
     /* Compute diff values. */
-    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
-    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
+    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);
 
     /* Store transformed coefficients and diff values. */
-    vst1q_s16(values_ptr, coefs1);
-    vst1q_s16(values_ptr + DCTSIZE, coefs2);
-    vst1q_s16(diff_values_ptr, diff1);
-    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    vst1q_u16(values_ptr, abs_coefs1);
+    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
+    vst1q_u16(diff_values_ptr, diff1);
+    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
     values_ptr += 16;
     diff_values_ptr += 16;
     rows_to_zero -= 2;
@@ -184,17 +185,17 @@ void jsimd_encode_mcu_AC_first_prepare_neon
     }
 
     /* Isolate sign of coefficients. */
-    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs = vabsq_s16(coefs);
-    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
 
     /* Compute diff values. */
-    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+    uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);
 
     /* Store transformed coefficients and diff values. */
-    vst1q_s16(values_ptr, coefs);
-    vst1q_s16(diff_values_ptr, diff);
+    vst1q_u16(values_ptr, abs_coefs);
+    vst1q_u16(diff_values_ptr, diff);
     values_ptr += 8;
     diff_values_ptr += 8;
     rows_to_zero--;
@@ -202,8 +203,8 @@ void jsimd_encode_mcu_AC_first_prepare_neon
 
   /* Zero remaining memory in the values and diff_values blocks. */
   for (i = 0; i < rows_to_zero; i++) {
-    vst1q_s16(values_ptr, vdupq_n_s16(0));
-    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    vst1q_u16(values_ptr, vdupq_n_u16(0));
+    vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
     values_ptr += 8;
     diff_values_ptr += 8;
   }
@@ -211,23 +212,23 @@ void jsimd_encode_mcu_AC_first_prepare_neon
   /* Construct zerobits bitmap.  A set bit means that the corresponding
    * coefficient != 0.
    */
-  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
-  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
-  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
-  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
-  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
-  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
-  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
-  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
-
-  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
-  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
-  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
-  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
-  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
-  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
-  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
-  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+  uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
+  uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
+  uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
+  uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
+  uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
+  uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
+  uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
+  uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));
 
   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
@@ -274,7 +275,7 @@ void jsimd_encode_mcu_AC_first_prepare_neon
 
 int jsimd_encode_mcu_AC_refine_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits)
+   UJCOEF *absvalues, size_t *bits)
 {
   /* Temporary storage buffers for data used to compute the signbits bitmap and
    * the end-of-block (EOB) position
@@ -282,7 +283,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   uint8_t coef_sign_bits[64];
   uint8_t coef_eq1_bits[64];
 
-  JCOEF *absvalues_ptr = absvalues;
+  UJCOEF *absvalues_ptr = absvalues;
   uint8_t *coef_sign_bits_ptr = coef_sign_bits;
   uint8_t *eq1_bits_ptr = coef_eq1_bits;
 
@@ -316,18 +317,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
-    vst1q_s16(absvalues_ptr, coefs1);
-    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
 
     /* Test whether transformed coefficient values == 1 (used to find EOB
      * position.)
      */
-    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
-    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
     vst1_u8(eq1_bits_ptr, coefs_eq11);
     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 
@@ -385,18 +386,18 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
 
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
-    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
-    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
-    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
-    vst1q_s16(absvalues_ptr, coefs1);
-    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
+    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
+    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
+    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs1);
+    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);
 
     /* Test whether transformed coefficient values == 1 (used to find EOB
      * position.)
      */
-    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
-    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
     vst1_u8(eq1_bits_ptr, coefs_eq11);
     vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
 
@@ -444,14 +445,14 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
     vst1_u8(coef_sign_bits_ptr, sign_coefs);
 
     /* Compute absolute value of coefficients and apply point transform Al. */
-    int16x8_t abs_coefs = vabsq_s16(coefs);
-    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
-    vst1q_s16(absvalues_ptr, coefs);
+    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
+    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_u16(absvalues_ptr, abs_coefs);
 
     /* Test whether transformed coefficient values == 1 (used to find EOB
      * position.)
      */
-    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
     vst1_u8(eq1_bits_ptr, coefs_eq1);
 
     absvalues_ptr += 8;
@@ -462,7 +463,7 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
 
   /* Zero remaining memory in blocks. */
   for (i = 0; i < rows_to_zero; i++) {
-    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
     vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
     vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
     absvalues_ptr += 8;
@@ -471,23 +472,23 @@ int jsimd_encode_mcu_AC_refine_prepare_neon
   }
 
   /* Construct zerobits bitmap. */
-  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
-  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
-  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
-  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
-  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
-  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
-  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
-  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
-
-  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
-  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
-  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
-  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
-  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
-  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
-  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
-  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+  uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
+  uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
+  uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
+  uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
+  uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
+  uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
+  uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
+  uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));
 
   /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
   const uint8x8_t bitmap_mask =
diff --git a/simd/arm/jdcolor-neon.c b/simd/arm/jdcolor-neon.c
index ea4668f1..28dbc572 100644
--- a/simd/arm/jdcolor-neon.c
+++ b/simd/arm/jdcolor-neon.c
@@ -21,7 +21,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jdmerge-neon.c b/simd/arm/jdmerge-neon.c
index e4f91fdc..18fb9d8a 100644
--- a/simd/arm/jdmerge-neon.c
+++ b/simd/arm/jdmerge-neon.c
@@ -21,7 +21,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jidctint-neon.c b/simd/arm/jidctint-neon.c
index 043b652e..d25112ef 100644
--- a/simd/arm/jidctint-neon.c
+++ b/simd/arm/jidctint-neon.c
@@ -22,7 +22,6 @@
  */
 
 #define JPEG_INTERNALS
-#include "jconfigint.h"
 #include "../../jinclude.h"
 #include "../../jpeglib.h"
 #include "../../jsimd.h"
diff --git a/simd/arm/jquanti-neon.c b/simd/arm/jquanti-neon.c
index a7eb6f19..d5d95d89 100644
--- a/simd/arm/jquanti-neon.c
+++ b/simd/arm/jquanti-neon.c
@@ -1,7 +1,7 @@
 /*
  * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
  *
- * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -100,6 +100,9 @@ void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
   DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
   int i;
 
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
   for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
     /* Load DCT coefficients. */
     int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
diff --git a/simd/i386/jsimd.c b/simd/i386/jsimd.c
index 563949a0..b429b0a5 100644
--- a/simd/i386/jsimd.c
+++ b/simd/i386/jsimd.c
@@ -2,8 +2,8 @@
  * jsimd_i386.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +21,6 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
-#include "jconfigint.h"
 
 /*
  * In the PIC cases, we have no guarantee that constants will keep
@@ -32,19 +31,17 @@
 #define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 #define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
 
-static unsigned int simd_support = (unsigned int)(~0);
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
 {
 #ifndef NO_GETENV
-  char *env = NULL;
+  char env[2] = { 0 };
 #endif
 
   if (simd_support != ~0U)
@@ -54,26 +51,19 @@ init_simd(void)
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEMMX");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
     simd_support &= JSIMD_MMX;
-  env = getenv("JSIMD_FORCE3DNOW");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
     simd_support &= JSIMD_3DNOW | JSIMD_MMX;
-  env = getenv("JSIMD_FORCESSE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
     simd_support &= JSIMD_SSE | JSIMD_MMX;
-  env = getenv("JSIMD_FORCESSE2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
     simd_support &= JSIMD_SSE2;
-  env = getenv("JSIMD_FORCEAVX2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
     simd_support &= JSIMD_AVX2;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
     simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
     simd_huffman = 0;
 #endif
 }
@@ -168,6 +158,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_ycc_convert_avx2;
@@ -227,6 +220,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_gray_convert_avx2;
@@ -286,6 +282,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_ycc_extrgb_convert_avx2;
@@ -389,6 +388,9 @@ GLOBAL(void)
 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -409,6 +411,9 @@ GLOBAL(void)
 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -471,6 +476,9 @@ GLOBAL(void)
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -486,6 +494,9 @@ GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -547,6 +558,9 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -565,6 +579,9 @@ GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -633,6 +650,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
@@ -691,6 +711,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
@@ -795,6 +818,9 @@ GLOBAL(void)
 jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
                DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_convsamp_avx2(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_SSE2)
@@ -807,6 +833,9 @@ GLOBAL(void)
 jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
                      FAST_FLOAT *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_SSE)
@@ -877,6 +906,9 @@ jsimd_can_fdct_float(void)
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_fdct_islow_avx2(data);
   else if (simd_support & JSIMD_SSE2)
@@ -888,6 +920,9 @@ jsimd_fdct_islow(DCTELEM *data)
 GLOBAL(void)
 jsimd_fdct_ifast(DCTELEM *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_ifast_sse2(data);
   else
@@ -897,6 +932,9 @@ jsimd_fdct_ifast(DCTELEM *data)
 GLOBAL(void)
 jsimd_fdct_float(FAST_FLOAT *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
     jsimd_fdct_float_sse(data);
   else if (simd_support & JSIMD_3DNOW)
@@ -952,6 +990,9 @@ jsimd_can_quantize_float(void)
 GLOBAL(void)
 jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_quantize_avx2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_SSE2)
@@ -964,6 +1005,9 @@ GLOBAL(void)
 jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
                      FAST_FLOAT *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_float_sse2(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_SSE)
@@ -1027,6 +1071,9 @@ jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
                         output_col);
@@ -1039,6 +1086,9 @@ jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                JCOEFPTR coef_block, JSAMPARRAY output_buf,
                JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
                         output_col);
@@ -1133,6 +1183,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1149,6 +1202,9 @@ jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
     jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1162,6 +1218,9 @@ jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
     jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1219,7 +1278,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -1245,7 +1304,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
                                                  jpeg_natural_order_start,
diff --git a/simd/jsimd.h b/simd/jsimd.h
index 64747c63..a28754ad 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -2,10 +2,10 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, 2022, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  * Copyright (C) 2020, Arm Limited.
  *
@@ -1243,16 +1243,16 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
 /* Progressive Huffman encoding */
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits);
+   UJCOEF *values, size_t *zerobits);
 
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *values, size_t *zerobits);
+   UJCOEF *values, size_t *zerobits);
 
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits);
+   UJCOEF *absvalues, size_t *bits);
 
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
-   JCOEF *absvalues, size_t *bits);
+   UJCOEF *absvalues, size_t *bits);
diff --git a/simd/x86_64/jchuff-sse2.asm b/simd/x86_64/jchuff-sse2.asm
index 00720283..9ea6df94 100644
--- a/simd/x86_64/jchuff-sse2.asm
+++ b/simd/x86_64/jchuff-sse2.asm
@@ -1,7 +1,7 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
 ; Copyright (C) 2018, Matthias Räncker.
 ;
@@ -83,6 +83,7 @@ times 1 << 11 db 12
 times 1 << 12 db 13
 times 1 << 13 db 14
 times 1 << 14 db 15
+times 1 << 15 db 16
 
     alignz      32
 
diff --git a/simd/x86_64/jsimd.c b/simd/x86_64/jsimd.c
index eb766799..3f5ee77e 100644
--- a/simd/x86_64/jsimd.c
+++ b/simd/x86_64/jsimd.c
@@ -2,8 +2,8 @@
  * jsimd_x86_64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022-2023, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, 2022, Matthieu Darbois.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -21,7 +21,6 @@
 #include "../../jdct.h"
 #include "../../jsimddct.h"
 #include "../jsimd.h"
-#include "jconfigint.h"
 
 /*
  * In the PIC cases, we have no guarantee that constants will keep
@@ -32,19 +31,17 @@
 #define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
 #define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
 
-static unsigned int simd_support = (unsigned int)(~0);
-static unsigned int simd_huffman = 1;
+static THREAD_LOCAL unsigned int simd_support = (unsigned int)(~0);
+static THREAD_LOCAL unsigned int simd_huffman = 1;
 
 /*
  * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
  */
 LOCAL(void)
 init_simd(void)
 {
 #ifndef NO_GETENV
-  char *env = NULL;
+  char env[2] = { 0 };
 #endif
 
   if (simd_support != ~0U)
@@ -54,17 +51,13 @@ init_simd(void)
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCESSE2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
     simd_support &= JSIMD_SSE2;
-  env = getenv("JSIMD_FORCEAVX2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
     simd_support &= JSIMD_AVX2;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
     simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
     simd_huffman = 0;
 #endif
 }
@@ -152,6 +145,9 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_ycc_convert_avx2;
@@ -201,6 +197,9 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
   void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_extrgb_gray_convert_avx2;
@@ -250,6 +249,9 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_ycc_extrgb_convert_avx2;
@@ -340,6 +342,9 @@ GLOBAL(void)
 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -356,6 +361,9 @@ GLOBAL(void)
 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
                                compptr->v_samp_factor,
@@ -410,6 +418,9 @@ GLOBAL(void)
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -422,6 +433,9 @@ GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
                              input_data, output_data_ptr);
@@ -476,6 +490,9 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -490,6 +507,9 @@ GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
                                    compptr->downsampled_width, input_data,
@@ -549,6 +569,9 @@ jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
@@ -597,6 +620,9 @@ jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
   void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
 
+  if (simd_support == ~0U)
+    init_simd();
+
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
     avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
@@ -686,6 +712,9 @@ GLOBAL(void)
 jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
                DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_convsamp_avx2(sample_data, start_col, workspace);
   else
@@ -755,6 +784,9 @@ jsimd_can_fdct_float(void)
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_fdct_islow_avx2(data);
   else
@@ -816,6 +848,9 @@ jsimd_can_quantize_float(void)
 GLOBAL(void)
 jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_quantize_avx2(coef_block, divisors, workspace);
   else
@@ -970,6 +1005,9 @@ jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  if (simd_support == ~0U)
+    init_simd();
+
   if (simd_support & JSIMD_AVX2)
     jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
                           output_col);
@@ -1040,7 +1078,7 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
 GLOBAL(void)
 jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
-                                  int Al, JCOEF *values, size_t *zerobits)
+                                  int Al, UJCOEF *values, size_t *zerobits)
 {
   jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
                                          Sl, Al, values, zerobits);
@@ -1064,7 +1102,7 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
 GLOBAL(int)
 jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
-                                   int Al, JCOEF *absvalues, size_t *bits)
+                                   int Al, UJCOEF *absvalues, size_t *bits)
 {
   return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
                                                  jpeg_natural_order_start,