59 files changed, 1441 insertions, 3642 deletions
diff --git a/Android.bp b/Android.bp index 60cadc91263..b9e94aa2dc7 100644 --- a/Android.bp +++ b/Android.bp @@ -14,43 +14,10 @@ // limitations under the License. // -package { - default_applicable_licenses: ["external_scudo_license"], -} - -// Added automatically by a large-scale-change that took the approach of -// 'apply every license found to every target'. While this makes sure we respect -// every license restriction, it may not be entirely correct. -// -// e.g. GPL in an MIT project might only apply to the contrib/ directory. -// -// Please consider splitting the single license below into multiple licenses, -// taking care not to lose any license_kind information, and overriding the -// default license using the 'licenses: [...]' property on targets as needed. -// -// For unused files, consider creating a 'filegroup' with "//visibility:private" -// to attach the license to, and including a comment whether the files may be -// used in the current project. -// http://go/android-license-faq -license { - name: "external_scudo_license", - visibility: [":__subpackages__"], - license_kinds: [ - "SPDX-license-identifier-Apache-2.0", - "SPDX-license-identifier-BSD", - "SPDX-license-identifier-MIT", - "SPDX-license-identifier-NCSA", - ], - license_text: [ - "LICENSE.TXT", - ], -} - cc_defaults { name: "libscudo_defaults", native_coverage: false, ramdisk_available: true, - vendor_ramdisk_available: true, recovery_available: true, host_supported: true, native_bridge_supported: true, @@ -79,20 +46,12 @@ cc_defaults { // Android assumes that allocations of multiples of 16 bytes // will be aligned to at least 16 bytes. "-DSCUDO_MIN_ALIGNMENT_LOG=4", - - // Allow scudo to use android_unsafe_frame_pointer_chase(), which is - // normally a private function. - "-DHAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE", ], cppflags: [ "-nostdinc++", "-fno-exceptions", ], - include_dirs: [ - "external/scudo/standalone/include", - ], - system_shared_libs: [], srcs: [ @@ -129,32 +88,22 @@ cc_defaults { linux_glibc: { enabled: true, }, - android: { - header_libs: ["bionic_libc_platform_headers"], - }, - linux_bionic: { - header_libs: ["bionic_libc_platform_headers"], - }, - native_bridge: { - cflags: ["-DSCUDO_DISABLE_TBI"], - }, }, - header_libs: ["libc_headers"], + header_libs: [ + "bionic_libc_platform_headers", + ], + product_variables: { + experimental_mte: { + cflags: ["-DANDROID_EXPERIMENTAL_MTE"], + }, + }, } cc_library_static { name: "libscudo", defaults: ["libscudo_defaults"], - cflags: [ - "-D_BIONIC=1", - "-DSCUDO_HAS_PLATFORM_TLS_SLOT", - ], - visibility: [ - "//bionic:__subpackages__", - "//frameworks/libs/native_bridge_support/libc:__subpackages__", - "//system/core/debuggerd:__subpackages__", - ], + cflags: ["-D_BIONIC=1"], } cc_library_static { @@ -164,9 +113,7 @@ cc_library_static { cc_test { name: "scudo_unit_tests", - // Temporarily disabled on host due to a 15-20s per-test timeout, - // which is currently exceeded by ScudoCombinedTest.BasicCombined. 
- host_supported: false, + host_supported: true, srcs: [ "standalone/tests/atomic_test.cpp", "standalone/tests/bytemap_test.cpp", @@ -191,39 +138,20 @@ cc_test { ], static_libs: ["libscudo_for_testing"], include_dirs: [ + "external", "external/scudo/standalone", - "external/scudo/standalone/include", ], cflags: [ "-Wno-unused-parameter", "-fno-emulated-tls", ], - target: { - android: { - header_libs: ["bionic_libc_platform_headers"], - }, - linux_bionic: { - header_libs: ["bionic_libc_platform_headers"], + header_libs: [ + "bionic_libc_platform_headers", + ], + product_variables: { + experimental_mte: { + cflags: ["-DANDROID_EXPERIMENTAL_MTE"], }, }, test_suites: ["general-tests"], - bootstrap: true, -} - -cc_fuzz { - name: "scudo_get_error_info_fuzzer", - host_supported: true, - compile_multilib: "64", - static_libs: ["libscudo"], - include_dirs: [ - "external/scudo/standalone", - "external/scudo/standalone/include", - ], - cflags: [ - "-Wno-unneeded-internal-declaration", - ], - srcs: ["standalone/fuzz/get_error_info_fuzzer.cpp"], - fuzz_config: { - componentid: 87896 - }, } @@ -17,6 +17,5 @@ third_party { value: "https://github.com/llvm/llvm-project.git" } version: "161cca266a9d0b6deb5f1fd2de8ad543649a7fa1" - license_type: NOTICE last_upgrade_date { year: 2019 month: 9 day: 10 } } diff --git a/NOTICE b/NOTICE new file mode 120000 index 00000000000..7a694c9699a --- /dev/null +++ b/NOTICE @@ -0,0 +1 @@ +LICENSE
\ No newline at end of file diff --git a/copybara/copy.bara.sky b/copybara/copy.bara.sky index 54ca37c0dcc..4d22c479513 100644 --- a/copybara/copy.bara.sky +++ b/copybara/copy.bara.sky @@ -2,7 +2,7 @@ core.workflow( name = "default", origin = git.origin( url = "https://github.com/llvm/llvm-project.git", - ref = "main", + ref = "master", ), origin_files = glob( [ @@ -26,12 +26,16 @@ core.workflow( "**/Android.bp" ], ), + mode = "SQUASH", authoring = authoring.pass_thru( "Dynamic Tools Team <dynamic-tools@google.com>" ), - mode = 'ITERATIVE', transformations = [ core.move("compiler-rt/lib/scudo/standalone/", "standalone"), core.move("compiler-rt/LICENSE.TXT", "LICENSE.TXT"), + metadata.squash_notes( + prefix = "Imported Scudo Standalone changes:\n\n", + oldest_first = True, + ), ], ) diff --git a/standalone/allocator_config.h b/standalone/allocator_config.h index 8e103f28b1a..ad2a17ef701 100644 --- a/standalone/allocator_config.h +++ b/standalone/allocator_config.h @@ -21,138 +21,59 @@ namespace scudo { -// The combined allocator uses a structure as a template argument that -// specifies the configuration options for the various subcomponents of the -// allocator. -// -// struct ExampleConfig { -// // SizeClassMap to use with the Primary. -// using SizeClassMap = DefaultSizeClassMap; -// // Indicates possible support for Memory Tagging. -// static const bool MaySupportMemoryTagging = false; -// // Defines the Primary allocator to use. -// typedef SizeClassAllocator64<ExampleConfig> Primary; -// // Log2 of the size of a size class region, as used by the Primary. -// static const uptr PrimaryRegionSizeLog = 30U; -// // Defines the type and scale of a compact pointer. A compact pointer can -// // be understood as the offset of a pointer within the region it belongs -// // to, in increments of a power-of-2 scale. -// // eg: Ptr = Base + (CompactPtr << Scale). -// typedef u32 PrimaryCompactPtrT; -// static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; -// // Defines the minimal & maximal release interval that can be set. -// static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; -// static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; -// // Defines the type of cache used by the Secondary. Some additional -// // configuration entries can be necessary depending on the Cache. -// typedef MapAllocatorNoCache SecondaryCache; -// // Thread-Specific Data Registry used, shared or exclusive. -// template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>; -// }; - // Default configurations for various platforms.
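[Editor's note: the removed ExampleConfig comment above documents the compact-pointer relation Ptr = Base + (CompactPtr << Scale). A minimal standalone sketch of that encoding; the compact/decompact names and the base value are illustrative, not scudo's:

#include <cassert>
#include <cstdint>

using uptr = uintptr_t;
using u32 = uint32_t;

constexpr uptr Scale = 4; // SCUDO_MIN_ALIGNMENT_LOG on most targets

// Compress a pointer to a 32-bit offset from its region base, counted in
// (1 << Scale)-byte units; this halves the footprint of free-list entries.
u32 compact(uptr Base, uptr Ptr) {
  return static_cast<u32>((Ptr - Base) >> Scale);
}

// Invert the mapping: Ptr = Base + (CompactPtr << Scale).
uptr decompact(uptr Base, u32 CompactPtr) {
  return Base + (static_cast<uptr>(CompactPtr) << Scale);
}

int main() {
  const uptr Base = 0x10000000;
  const uptr Ptr = Base + (123u << Scale);
  assert(decompact(Base, compact(Base, Ptr)) == Ptr);
  return 0;
}

This only works because every block in a region is aligned to 1 << Scale, so no offset bits are lost.]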
struct DefaultConfig { using SizeClassMap = DefaultSizeClassMap; - static const bool MaySupportMemoryTagging = false; - #if SCUDO_CAN_USE_PRIMARY64 - typedef SizeClassAllocator64<DefaultConfig> Primary; - static const uptr PrimaryRegionSizeLog = 32U; - typedef uptr PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = 0; + // 1GB Regions + typedef SizeClassAllocator64<SizeClassMap, 30U> Primary; #else - typedef SizeClassAllocator32<DefaultConfig> Primary; - static const uptr PrimaryRegionSizeLog = 19U; - typedef uptr PrimaryCompactPtrT; + // 512KB regions + typedef SizeClassAllocator32<SizeClassMap, 19U> Primary; #endif - static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - - typedef MapAllocatorCache<DefaultConfig> SecondaryCache; - static const u32 SecondaryCacheEntriesArraySize = 32U; - static const u32 SecondaryCacheQuarantineSize = 0U; - static const u32 SecondaryCacheDefaultMaxEntriesCount = 32U; - static const uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 19; - static const s32 SecondaryCacheMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = INT32_MAX; - + typedef MapAllocator<MapAllocatorCache<>> Secondary; template <class A> using TSDRegistryT = TSDRegistryExT<A>; // Exclusive }; struct AndroidConfig { using SizeClassMap = AndroidSizeClassMap; - static const bool MaySupportMemoryTagging = true; - #if SCUDO_CAN_USE_PRIMARY64 - typedef SizeClassAllocator64<AndroidConfig> Primary; - static const uptr PrimaryRegionSizeLog = 28U; - typedef u32 PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + // 256MB regions + typedef SizeClassAllocator64<SizeClassMap, 28U, 1000, 1000, + /*MaySupportMemoryTagging=*/true> + Primary; #else - typedef SizeClassAllocator32<AndroidConfig> Primary; - static const uptr PrimaryRegionSizeLog = 18U; - typedef uptr PrimaryCompactPtrT; + // 256KB regions + typedef SizeClassAllocator32<SizeClassMap, 18U, 1000, 1000> Primary; #endif - static const s32 PrimaryMinReleaseToOsIntervalMs = 1000; - static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000; - - typedef MapAllocatorCache<AndroidConfig> SecondaryCache; - static const u32 SecondaryCacheEntriesArraySize = 256U; - static const u32 SecondaryCacheQuarantineSize = 32U; - static const u32 SecondaryCacheDefaultMaxEntriesCount = 32U; - static const uptr SecondaryCacheDefaultMaxEntrySize = 2UL << 20; - static const s32 SecondaryCacheMinReleaseToOsIntervalMs = 0; - static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = 1000; - + // Cache blocks up to 2MB + typedef MapAllocator<MapAllocatorCache<32U, 2UL << 20, 0, 1000>> Secondary; template <class A> - using TSDRegistryT = TSDRegistrySharedT<A, 8U, 2U>; // Shared, max 8 TSDs. + using TSDRegistryT = TSDRegistrySharedT<A, 2U>; // Shared, max 2 TSDs. 
}; struct AndroidSvelteConfig { using SizeClassMap = SvelteSizeClassMap; - static const bool MaySupportMemoryTagging = false; - #if SCUDO_CAN_USE_PRIMARY64 - typedef SizeClassAllocator64<AndroidSvelteConfig> Primary; - static const uptr PrimaryRegionSizeLog = 27U; - typedef u32 PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; + // 128MB regions + typedef SizeClassAllocator64<SizeClassMap, 27U, 1000, 1000> Primary; #else - typedef SizeClassAllocator32<AndroidSvelteConfig> Primary; - static const uptr PrimaryRegionSizeLog = 16U; - typedef uptr PrimaryCompactPtrT; + // 64KB regions + typedef SizeClassAllocator32<SizeClassMap, 16U, 1000, 1000> Primary; #endif - static const s32 PrimaryMinReleaseToOsIntervalMs = 1000; - static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000; - - typedef MapAllocatorCache<AndroidSvelteConfig> SecondaryCache; - static const u32 SecondaryCacheEntriesArraySize = 16U; - static const u32 SecondaryCacheQuarantineSize = 32U; - static const u32 SecondaryCacheDefaultMaxEntriesCount = 4U; - static const uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 18; - static const s32 SecondaryCacheMinReleaseToOsIntervalMs = 0; - static const s32 SecondaryCacheMaxReleaseToOsIntervalMs = 0; - + typedef MapAllocator<MapAllocatorCache<4U, 1UL << 18, 0, 0>> Secondary; template <class A> - using TSDRegistryT = TSDRegistrySharedT<A, 2U, 1U>; // Shared, max 2 TSDs. + using TSDRegistryT = TSDRegistrySharedT<A, 1U>; // Shared, only 1 TSD. }; #if SCUDO_CAN_USE_PRIMARY64 struct FuchsiaConfig { - using SizeClassMap = DefaultSizeClassMap; - static const bool MaySupportMemoryTagging = false; - - typedef SizeClassAllocator64<FuchsiaConfig> Primary; - static const uptr PrimaryRegionSizeLog = 30U; - typedef u32 PrimaryCompactPtrT; - static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG; - static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - - typedef MapAllocatorNoCache SecondaryCache; + // 1GB Regions + typedef SizeClassAllocator64<DefaultSizeClassMap, 30U> Primary; + typedef MapAllocator<MapAllocatorNoCache> Secondary; template <class A> - using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>; // Shared, max 8 TSDs. + using TSDRegistryT = TSDRegistrySharedT<A, 8U>; // Shared, max 8 TSDs. }; #endif diff --git a/standalone/atomic_helpers.h b/standalone/atomic_helpers.h index d88f5d7be64..6c84ba86ed3 100644 --- a/standalone/atomic_helpers.h +++ b/standalone/atomic_helpers.h @@ -51,7 +51,7 @@ struct atomic_u32 { struct atomic_u64 { typedef u64 Type; // On 32-bit platforms u64 is not necessarily aligned on 8 bytes. 
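[Editor's note: the comment just above is why atomic_u64 carries an explicit alignment attribute (ALIGNED(8) is scudo's macro; standard alignas(8) is used in this sketch): on ILP32 ABIs such as x86 System V, a bare uint64_t struct member may only be 4-byte aligned, which defeats lock-free 8-byte atomics on some targets. A self-contained check:

#include <cstdint>

struct atomic_u64_plain {
  volatile uint64_t Val; // may be 4-byte aligned on some 32-bit ABIs
};

struct atomic_u64_aligned {
  alignas(8) volatile uint64_t Val; // naturally aligned on every target
};

static_assert(alignof(atomic_u64_aligned) >= 8,
              "64-bit atomics must be 8-byte aligned to stay lock-free");

int main() { return 0; }]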
- alignas(8) volatile Type ValDoNotUse; + ALIGNED(8) volatile Type ValDoNotUse; }; struct atomic_uptr { @@ -90,20 +90,6 @@ inline typename T::Type atomic_fetch_sub(volatile T *A, typename T::Type V, } template <typename T> -inline typename T::Type atomic_fetch_and(volatile T *A, typename T::Type V, - memory_order MO) { - DCHECK(!(reinterpret_cast<uptr>(A) % sizeof(*A))); - return __atomic_fetch_and(&A->ValDoNotUse, V, MO); -} - -template <typename T> -inline typename T::Type atomic_fetch_or(volatile T *A, typename T::Type V, - memory_order MO) { - DCHECK(!(reinterpret_cast<uptr>(A) % sizeof(*A))); - return __atomic_fetch_or(&A->ValDoNotUse, V, MO); -} - -template <typename T> inline typename T::Type atomic_exchange(volatile T *A, typename T::Type V, memory_order MO) { DCHECK(!(reinterpret_cast<uptr>(A) % sizeof(*A))); @@ -120,6 +106,14 @@ inline bool atomic_compare_exchange_strong(volatile T *A, typename T::Type *Cmp, __ATOMIC_RELAXED); } +template <typename T> +inline bool atomic_compare_exchange_weak(volatile T *A, typename T::Type *Cmp, + typename T::Type Xchg, + memory_order MO) { + return __atomic_compare_exchange(&A->ValDoNotUse, Cmp, &Xchg, true, MO, + __ATOMIC_RELAXED); +} + // Clutter-reducing helpers. template <typename T> diff --git a/standalone/benchmarks/malloc_benchmark.cpp b/standalone/benchmarks/malloc_benchmark.cpp index 661fff45a8d..ce48dc02f7a 100644 --- a/standalone/benchmarks/malloc_benchmark.cpp +++ b/standalone/benchmarks/malloc_benchmark.cpp @@ -13,22 +13,15 @@ #include "benchmark/benchmark.h" #include <memory> -#include <vector> - -void *CurrentAllocator; -template <typename Config> void PostInitCallback() { - reinterpret_cast<scudo::Allocator<Config> *>(CurrentAllocator)->initGwpAsan(); -} template <typename Config> static void BM_malloc_free(benchmark::State &State) { - using AllocatorT = scudo::Allocator<Config, PostInitCallback<Config>>; + using AllocatorT = scudo::Allocator<Config>; auto Deleter = [](AllocatorT *A) { A->unmapTestOnly(); delete A; }; std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, Deleter); - CurrentAllocator = Allocator.get(); Allocator->reset(); const size_t NBytes = State.range(0); @@ -62,19 +55,18 @@ BENCHMARK_TEMPLATE(BM_malloc_free, scudo::FuchsiaConfig) template <typename Config> static void BM_malloc_free_loop(benchmark::State &State) { - using AllocatorT = scudo::Allocator<Config, PostInitCallback<Config>>; + using AllocatorT = scudo::Allocator<Config>; auto Deleter = [](AllocatorT *A) { A->unmapTestOnly(); delete A; }; std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, Deleter); - CurrentAllocator = Allocator.get(); Allocator->reset(); const size_t NumIters = State.range(0); size_t PageSize = scudo::getPageSizeCached(); - std::vector<void *> Ptrs(NumIters); + void *Ptrs[NumIters]; for (auto _ : State) { size_t SizeLog2 = 0; diff --git a/standalone/checksum.cpp b/standalone/checksum.cpp index 05d4ba54bfc..5de049a0931 100644 --- a/standalone/checksum.cpp +++ b/standalone/checksum.cpp @@ -31,13 +31,6 @@ Checksum HashAlgorithm = {Checksum::BSD}; #define bit_SSE4_2 bit_SSE42 // clang and gcc have different defines. #endif -#ifndef signature_HYGON_ebx // They are not defined in gcc. -// HYGON: "HygonGenuine". 
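[Editor's note: the atomic_compare_exchange_weak() helper added in the atomic_helpers.h hunk above is meant for retry loops, since the weak form may fail spuriously even when the expected value matches. A standalone sketch using the same GCC/Clang __atomic builtins the helper wraps; the counter and function names are illustrative:

#include <cstdint>

static volatile uint32_t Counter = 0;

uint32_t incrementIfBelow(uint32_t Limit) {
  uint32_t Cur = __atomic_load_n(&Counter, __ATOMIC_RELAXED);
  do {
    if (Cur >= Limit)
      return Cur; // give up without writing
    const uint32_t Next = Cur + 1;
    // On failure the builtin refreshes Cur with the current value, so the
    // loop retries from fresh state; weak CAS may also fail spuriously.
    if (__atomic_compare_exchange_n(&Counter, &Cur, Next, /*weak=*/true,
                                    __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
      return Next;
  } while (true);
}

int main() { return incrementIfBelow(10) == 1 ? 0 : 1; }]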
-#define signature_HYGON_ebx 0x6f677948 -#define signature_HYGON_edx 0x6e65476e -#define signature_HYGON_ecx 0x656e6975 -#endif - bool hasHardwareCRC32() { u32 Eax, Ebx = 0, Ecx = 0, Edx = 0; __get_cpuid(0, &Eax, &Ebx, &Ecx, &Edx); @@ -46,10 +39,7 @@ bool hasHardwareCRC32() { (Ecx == signature_INTEL_ecx); const bool IsAMD = (Ebx == signature_AMD_ebx) && (Edx == signature_AMD_edx) && (Ecx == signature_AMD_ecx); - const bool IsHygon = (Ebx == signature_HYGON_ebx) && - (Edx == signature_HYGON_edx) && - (Ecx == signature_HYGON_ecx); - if (!IsIntel && !IsAMD && !IsHygon) + if (!IsIntel && !IsAMD) return false; __get_cpuid(1, &Eax, &Ebx, &Ecx, &Edx); return !!(Ecx & bit_SSE4_2); diff --git a/standalone/chunk.h b/standalone/chunk.h index 69b8e1b12a9..f4d68b3ac6c 100644 --- a/standalone/chunk.h +++ b/standalone/chunk.h @@ -65,8 +65,7 @@ typedef u64 PackedHeader; struct UnpackedHeader { uptr ClassId : 8; u8 State : 2; - // Origin if State == Allocated, or WasZeroed otherwise. - u8 OriginOrWasZeroed : 2; + u8 Origin : 2; uptr SizeOrUnusedBytes : 20; uptr Offset : 16; uptr Checksum : 16; diff --git a/standalone/combined.h b/standalone/combined.h index 8080d677d7b..3ed34c21aa5 100644 --- a/standalone/combined.h +++ b/standalone/combined.h @@ -13,18 +13,15 @@ #include "common.h" #include "flags.h" #include "flags_parser.h" +#include "interface.h" #include "local_cache.h" #include "memtag.h" -#include "options.h" #include "quarantine.h" #include "report.h" #include "secondary.h" -#include "stack_depot.h" #include "string_utils.h" #include "tsd.h" -#include "scudo/interface.h" - #ifdef GWP_ASAN_HOOKS #include "gwp_asan/guarded_pool_allocator.h" #include "gwp_asan/optional/backtrace.h" @@ -33,15 +30,10 @@ extern "C" inline void EmptyCallback() {} -#ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE -// This function is not part of the NDK so it does not appear in any public -// header files. We only declare/use it when targeting the platform. -extern "C" size_t android_unsafe_frame_pointer_chase(scudo::uptr *buf, - size_t num_entries); -#endif - namespace scudo { +enum class Option { ReleaseInterval }; + template <class Params, void (*PostInitCallback)(void) = EmptyCallback> class Allocator { public: @@ -51,7 +43,8 @@ public: typedef typename Params::template TSDRegistryT<ThisT> TSDRegistryT; void callPostInitCallback() { - pthread_once(&PostInitNonce, PostInitCallback); + static pthread_once_t OnceControl = PTHREAD_ONCE_INIT; + pthread_once(&OnceControl, PostInitCallback); } struct QuarantineCallback { @@ -70,10 +63,12 @@ public: NewHeader.State = Chunk::State::Available; Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header); - if (allocatorSupportsMemoryTagging<Params>()) - Ptr = untagPointer(Ptr); void *BlockBegin = Allocator::getBlockBegin(Ptr, &NewHeader); - Cache.deallocate(NewHeader.ClassId, BlockBegin); + const uptr ClassId = NewHeader.ClassId; + if (LIKELY(ClassId)) + Cache.deallocate(ClassId, BlockBegin); + else + Allocator.Secondary.deallocate(BlockBegin); } // We take a shortcut when allocating a quarantine batch by working with the @@ -95,12 +90,6 @@ public: Header.State = Chunk::State::Allocated; Chunk::storeHeader(Allocator.Cookie, Ptr, &Header); - // Reset tag to 0 as this chunk may have been previously used for a tagged - // user allocation. 
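[Editor's note: in the chunk.h hunk above, every UnpackedHeader field width is chosen so the whole header packs into one 64-bit PackedHeader, which is what lets scudo load, checksum, and compare-exchange headers atomically. A compilable illustration of that invariant; field widths are taken from the hunk, the all-uint64_t base type is a simplification:

#include <cstdint>

typedef uint64_t PackedHeader;

struct UnpackedHeader {
  uint64_t ClassId : 8;
  uint64_t State : 2;
  uint64_t Origin : 2; // upstream later renamed this OriginOrWasZeroed
  uint64_t SizeOrUnusedBytes : 20;
  uint64_t Offset : 16;
  uint64_t Checksum : 16;
};

// 8 + 2 + 2 + 20 + 16 + 16 = 64 bits: the header fits one machine word.
static_assert(sizeof(UnpackedHeader) == sizeof(PackedHeader),
              "header must pack into a single 64-bit word");

int main() { return 0; }]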
- if (UNLIKELY(useMemoryTagging<Params>(Allocator.Primary.Options.load()))) - storeTags(reinterpret_cast<uptr>(Ptr), - reinterpret_cast<uptr>(Ptr) + sizeof(QuarantineBatch)); - return Ptr; } @@ -148,22 +137,11 @@ public: reportUnrecognizedFlags(); // Store some flags locally. - if (getFlags()->may_return_null) - Primary.Options.set(OptionBit::MayReturnNull); - if (getFlags()->zero_contents) - Primary.Options.setFillContentsMode(ZeroFill); - else if (getFlags()->pattern_fill_contents) - Primary.Options.setFillContentsMode(PatternOrZeroFill); - if (getFlags()->dealloc_type_mismatch) - Primary.Options.set(OptionBit::DeallocTypeMismatch); - if (getFlags()->delete_size_mismatch) - Primary.Options.set(OptionBit::DeleteSizeMismatch); - if (allocatorSupportsMemoryTagging<Params>() && - systemSupportsMemoryTagging()) - Primary.Options.set(OptionBit::UseMemoryTagging); - Primary.Options.set(OptionBit::UseOddEvenTags); - - QuarantineMaxChunkSize = + Options.MayReturnNull = getFlags()->may_return_null; + Options.ZeroContents = getFlags()->zero_contents; + Options.DeallocTypeMismatch = getFlags()->dealloc_type_mismatch; + Options.DeleteSizeMismatch = getFlags()->delete_size_mismatch; + Options.QuarantineMaxChunkSize = static_cast<u32>(getFlags()->quarantine_max_chunk_size); Stats.initLinkerInitialized(); @@ -182,6 +160,11 @@ public: #ifdef GWP_ASAN_HOOKS gwp_asan::options::Options Opt; Opt.Enabled = getFlags()->GWP_ASAN_Enabled; + // Bear in mind - Scudo has its own alignment guarantees that are strictly + // enforced. Scudo exposes the same allocation function for everything from + // malloc() to posix_memalign, so in general this flag goes unused, as Scudo + // will always ask GWP-ASan for an aligned amount of bytes. + Opt.PerfectlyRightAlign = getFlags()->GWP_ASAN_PerfectlyRightAlign; Opt.MaxSimultaneousAllocations = getFlags()->GWP_ASAN_MaxSimultaneousAllocations; Opt.SampleRate = getFlags()->GWP_ASAN_SampleRate; @@ -190,26 +173,16 @@ public: // Allocator::disable calling GWPASan.disable). Disable GWP-ASan's atfork // handler. 
Opt.InstallForkHandlers = false; - Opt.Backtrace = gwp_asan::backtrace::getBacktraceFunction(); + Opt.Backtrace = gwp_asan::options::getBacktraceFunction(); GuardedAlloc.init(Opt); if (Opt.InstallSignalHandlers) - gwp_asan::segv_handler::installSignalHandlers( - &GuardedAlloc, Printf, - gwp_asan::backtrace::getPrintBacktraceFunction(), - gwp_asan::backtrace::getSegvBacktraceFunction()); - - GuardedAllocSlotSize = - GuardedAlloc.getAllocatorState()->maximumAllocationSize(); - Stats.add(StatFree, static_cast<uptr>(Opt.MaxSimultaneousAllocations) * - GuardedAllocSlotSize); + gwp_asan::crash_handler::installSignalHandlers( + &GuardedAlloc, Printf, gwp_asan::options::getPrintBacktraceFunction(), + Opt.Backtrace); #endif // GWP_ASAN_HOOKS } - ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) { - TSDRegistry.initThreadMaybe(this, MinimalInit); - } - void reset() { memset(this, 0, sizeof(*this)); } void unmapTestOnly() { @@ -217,7 +190,7 @@ public: Primary.unmapTestOnly(); #ifdef GWP_ASAN_HOOKS if (getFlags()->GWP_ASAN_InstallSignalHandlers) - gwp_asan::segv_handler::uninstallSignalHandlers(); + gwp_asan::crash_handler::uninstallSignalHandlers(); GuardedAlloc.uninitTestOnly(); #endif // GWP_ASAN_HOOKS } @@ -240,53 +213,11 @@ public: TSD->Cache.destroy(&Stats); } - ALWAYS_INLINE void *getHeaderTaggedPointer(void *Ptr) { - if (!allocatorSupportsMemoryTagging<Params>()) - return Ptr; - auto UntaggedPtr = untagPointer(Ptr); - if (UntaggedPtr != Ptr) - return UntaggedPtr; - // Secondary, or pointer allocated while memory tagging is unsupported or - // disabled. The tag mismatch is okay in the latter case because tags will - // not be checked. - return addHeaderTag(Ptr); - } - - ALWAYS_INLINE uptr addHeaderTag(uptr Ptr) { - if (!allocatorSupportsMemoryTagging<Params>()) - return Ptr; - return addFixedTag(Ptr, 2); - } - - ALWAYS_INLINE void *addHeaderTag(void *Ptr) { - return reinterpret_cast<void *>(addHeaderTag(reinterpret_cast<uptr>(Ptr))); - } - - NOINLINE u32 collectStackTrace() { -#ifdef HAVE_ANDROID_UNSAFE_FRAME_POINTER_CHASE - // Discard collectStackTrace() frame and allocator function frame. - constexpr uptr DiscardFrames = 2; - uptr Stack[MaxTraceSize + DiscardFrames]; - uptr Size = - android_unsafe_frame_pointer_chase(Stack, MaxTraceSize + DiscardFrames); - Size = Min<uptr>(Size, MaxTraceSize + DiscardFrames); - return Depot.insert(Stack + Min<uptr>(DiscardFrames, Size), Stack + Size); -#else - return 0; -#endif - } - - uptr computeOddEvenMaskForPointerMaybe(Options Options, uptr Ptr, - uptr ClassId) { - if (!Options.get(OptionBit::UseOddEvenTags)) - return 0; - - // If a chunk's tag is odd, we want the tags of the surrounding blocks to be - // even, and vice versa. Blocks are laid out Size bytes apart, and adding - // Size to Ptr will flip the least significant set bit of Size in Ptr, so - // that bit will have the pattern 010101... for consecutive blocks, which we - // can use to determine which tag mask to use. 
- return 0x5555U << ((Ptr >> SizeClassMap::getSizeLSBByClassId(ClassId)) & 1); + ALWAYS_INLINE void *untagPointerMaybe(void *Ptr) { + if (Primary.SupportsMemoryTagging) + return reinterpret_cast<void *>( + untagPointer(reinterpret_cast<uptr>(Ptr))); + return Ptr; } NOINLINE void *allocate(uptr Size, Chunk::Origin Origin, @@ -294,33 +225,22 @@ public: bool ZeroContents = false) { initThreadMaybe(); - const Options Options = Primary.Options.load(); - if (UNLIKELY(Alignment > MaxAlignment)) { - if (Options.get(OptionBit::MayReturnNull)) - return nullptr; - reportAlignmentTooBig(Alignment, MaxAlignment); - } - if (Alignment < MinAlignment) - Alignment = MinAlignment; - #ifdef GWP_ASAN_HOOKS if (UNLIKELY(GuardedAlloc.shouldSample())) { - if (void *Ptr = GuardedAlloc.allocate(Size, Alignment)) { - if (UNLIKELY(&__scudo_allocate_hook)) - __scudo_allocate_hook(Ptr, Size); - Stats.lock(); - Stats.add(StatAllocated, GuardedAllocSlotSize); - Stats.sub(StatFree, GuardedAllocSlotSize); - Stats.unlock(); + if (void *Ptr = GuardedAlloc.allocate(roundUpTo(Size, Alignment))) return Ptr; - } } #endif // GWP_ASAN_HOOKS - const FillContentsMode FillContents = ZeroContents ? ZeroFill - : TSDRegistry.getDisableMemInit() - ? NoFill - : Options.getFillContentsMode(); + ZeroContents |= static_cast<bool>(Options.ZeroContents); + + if (UNLIKELY(Alignment > MaxAlignment)) { + if (Options.MayReturnNull) + return nullptr; + reportAlignmentTooBig(Alignment, MaxAlignment); + } + if (Alignment < MinAlignment) + Alignment = MinAlignment; // If the requested size happens to be 0 (more common than you might think), // allocate MinAlignment bytes on top of the header. Then add the extra @@ -334,7 +254,7 @@ public: // Takes care of extravagantly large sizes as well as integer overflows. static_assert(MaxAllowedMallocSize < UINTPTR_MAX - MaxAlignment, ""); if (UNLIKELY(Size >= MaxAllowedMallocSize)) { - if (Options.get(OptionBit::MayReturnNull)) + if (Options.MayReturnNull) return nullptr; reportAllocationSizeTooBig(Size, NeededSize, MaxAllowedMallocSize); } @@ -342,7 +262,7 @@ public: void *Block = nullptr; uptr ClassId = 0; - uptr SecondaryBlockEnd = 0; + uptr SecondaryBlockEnd; if (LIKELY(PrimaryT::canAllocate(NeededSize))) { ClassId = SizeClassMap::getClassIdBySize(NeededSize); DCHECK_NE(ClassId, 0U); @@ -354,20 +274,25 @@ public: // larger class until it fits. If it fails to fit in the largest class, // fallback to the Secondary. if (UNLIKELY(!Block)) { - while (ClassId < SizeClassMap::LargestClassId && !Block) + while (ClassId < SizeClassMap::LargestClassId) { Block = TSD->Cache.allocate(++ClassId); - if (!Block) + if (LIKELY(Block)) { + break; + } + } + if (UNLIKELY(!Block)) { ClassId = 0; + } } if (UnlockRequired) TSD->unlock(); } if (UNLIKELY(ClassId == 0)) - Block = Secondary.allocate(Options, Size, Alignment, &SecondaryBlockEnd, - FillContents); + Block = Secondary.allocate(NeededSize, Alignment, &SecondaryBlockEnd, + ZeroContents); if (UNLIKELY(!Block)) { - if (Options.get(OptionBit::MayReturnNull)) + if (Options.MayReturnNull) return nullptr; reportOutOfMemory(NeededSize); } @@ -378,7 +303,7 @@ public: void *Ptr = reinterpret_cast<void *>(UserPtr); void *TaggedPtr = Ptr; - if (LIKELY(ClassId)) { + if (ClassId) { // We only need to zero or tag the contents for Primary backed // allocations. 
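[Editor's note: a worked example of the odd/even mask expression removed at the top of the hunk above. For a size class whose block size has its lowest set bit at position 4 (e.g. 48 = 0b110000), bit 4 of the block address alternates between neighbouring blocks, so they always receive complementary exclusion masks. Names below are illustrative:

#include <cassert>
#include <cstdint>

using uptr = uintptr_t;

uptr oddEvenMask(uptr Ptr, uptr SizeLSB) {
  // 0x5555 has the even bit positions set (excludes even tags);
  // 0xAAAA (= 0x5555 << 1) excludes the odd ones.
  return 0x5555U << ((Ptr >> SizeLSB) & 1);
}

int main() {
  const uptr SizeLSB = 4; // log2 of the lowest set bit of a 48-byte class
  const uptr BlockA = 0x1000, BlockB = BlockA + 48;
  // Adjacent blocks always get complementary masks, so their random tags
  // can never collide and a linear overflow trips the tag check.
  assert(oddEvenMask(BlockA, SizeLSB) != oddEvenMask(BlockB, SizeLSB));
  return 0;
}]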
We only set tags for primary allocations in order to avoid // faulting potentially large numbers of pages for large secondary @@ -390,11 +315,10 @@ public: // // When memory tagging is enabled, zeroing the contents is done as part of // setting the tag. - if (UNLIKELY(useMemoryTagging<Params>(Options))) { + if (UNLIKELY(useMemoryTagging())) { uptr PrevUserPtr; Chunk::UnpackedHeader Header; - const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId); - const uptr BlockEnd = BlockUptr + BlockSize; + const uptr BlockEnd = BlockUptr + PrimaryT::getSizeByClassId(ClassId); // If possible, try to reuse the UAF tag that was set by deallocate(). // For simplicity, only reuse tags if we have the same start address as // the previous allocation. This handles the majority of cases since @@ -437,44 +361,14 @@ public: if (NextPage < PrevEnd && loadTag(NextPage) != NextPage) PrevEnd = NextPage; TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr); - resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, Size, BlockEnd); - if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) { - // If an allocation needs to be zeroed (i.e. calloc) we can normally - // avoid zeroing the memory now since we can rely on memory having - // been zeroed on free, as this is normally done while setting the - // UAF tag. But if tagging was disabled per-thread when the memory - // was freed, it would not have been retagged and thus zeroed, and - // therefore it needs to be zeroed now. - memset(TaggedPtr, 0, - Min(Size, roundUpTo(PrevEnd - TaggedUserPtr, - archMemoryTagGranuleSize()))); - } else if (Size) { - // Clear any stack metadata that may have previously been stored in - // the chunk data. - memset(TaggedPtr, 0, archMemoryTagGranuleSize()); - } + resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, BlockEnd); } else { - const uptr OddEvenMask = - computeOddEvenMaskForPointerMaybe(Options, BlockUptr, ClassId); - TaggedPtr = prepareTaggedChunk(Ptr, Size, OddEvenMask, BlockEnd); + TaggedPtr = prepareTaggedChunk(Ptr, Size, BlockEnd); } - storePrimaryAllocationStackMaybe(Options, Ptr); - } else { - Block = addHeaderTag(Block); - Ptr = addHeaderTag(Ptr); - if (UNLIKELY(FillContents != NoFill)) { - // This condition is not necessarily unlikely, but since memset is - // costly, we might as well mark it as such. - memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte, - PrimaryT::getSizeByClassId(ClassId)); - } - } - } else { - Block = addHeaderTag(Block); - Ptr = addHeaderTag(Ptr); - if (UNLIKELY(useMemoryTagging<Params>(Options))) { - storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr)); - storeSecondaryAllocationStackMaybe(Options, Ptr, Size); + } else if (UNLIKELY(ZeroContents)) { + // This condition is not necessarily unlikely, but since memset is + // costly, we might as well mark it as such. + memset(Block, 0, PrimaryT::getSizeByClassId(ClassId)); } } @@ -492,13 +386,13 @@ public: } Header.ClassId = ClassId & Chunk::ClassIdMask; Header.State = Chunk::State::Allocated; - Header.OriginOrWasZeroed = Origin & Chunk::OriginMask; + Header.Origin = Origin & Chunk::OriginMask; Header.SizeOrUnusedBytes = (ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size)) & Chunk::SizeOrUnusedBytesMask; Chunk::storeHeader(Cookie, Ptr, &Header); - if (UNLIKELY(&__scudo_allocate_hook)) + if (&__scudo_allocate_hook) __scudo_allocate_hook(TaggedPtr, Size); return TaggedPtr; @@ -514,67 +408,58 @@ public: // being destroyed properly. Any other heap operation will do a full init. 
initThreadMaybe(/*MinimalInit=*/true); - if (UNLIKELY(&__scudo_deallocate_hook)) - __scudo_deallocate_hook(Ptr); - - if (UNLIKELY(!Ptr)) - return; - #ifdef GWP_ASAN_HOOKS if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) { GuardedAlloc.deallocate(Ptr); - Stats.lock(); - Stats.add(StatFree, GuardedAllocSlotSize); - Stats.sub(StatAllocated, GuardedAllocSlotSize); - Stats.unlock(); return; } #endif // GWP_ASAN_HOOKS + if (&__scudo_deallocate_hook) + __scudo_deallocate_hook(Ptr); + + if (UNLIKELY(!Ptr)) + return; if (UNLIKELY(!isAligned(reinterpret_cast<uptr>(Ptr), MinAlignment))) reportMisalignedPointer(AllocatorAction::Deallocating, Ptr); - void *TaggedPtr = Ptr; - Ptr = getHeaderTaggedPointer(Ptr); + Ptr = untagPointerMaybe(Ptr); Chunk::UnpackedHeader Header; Chunk::loadHeader(Cookie, Ptr, &Header); if (UNLIKELY(Header.State != Chunk::State::Allocated)) reportInvalidChunkState(AllocatorAction::Deallocating, Ptr); - - const Options Options = Primary.Options.load(); - if (Options.get(OptionBit::DeallocTypeMismatch)) { - if (UNLIKELY(Header.OriginOrWasZeroed != Origin)) { + if (Options.DeallocTypeMismatch) { + if (Header.Origin != Origin) { // With the exception of memalign'd chunks, that can be still be free'd. - if (Header.OriginOrWasZeroed != Chunk::Origin::Memalign || - Origin != Chunk::Origin::Malloc) + if (UNLIKELY(Header.Origin != Chunk::Origin::Memalign || + Origin != Chunk::Origin::Malloc)) reportDeallocTypeMismatch(AllocatorAction::Deallocating, Ptr, - Header.OriginOrWasZeroed, Origin); + Header.Origin, Origin); } } const uptr Size = getSize(Ptr, &Header); - if (DeleteSize && Options.get(OptionBit::DeleteSizeMismatch)) { + if (DeleteSize && Options.DeleteSizeMismatch) { if (UNLIKELY(DeleteSize != Size)) reportDeleteSizeMismatch(Ptr, DeleteSize, Size); } - quarantineOrDeallocateChunk(Options, TaggedPtr, &Header, Size); + quarantineOrDeallocateChunk(Ptr, &Header, Size); } void *reallocate(void *OldPtr, uptr NewSize, uptr Alignment = MinAlignment) { initThreadMaybe(); - const Options Options = Primary.Options.load(); if (UNLIKELY(NewSize >= MaxAllowedMallocSize)) { - if (Options.get(OptionBit::MayReturnNull)) + if (Options.MayReturnNull) return nullptr; reportAllocationSizeTooBig(NewSize, 0, MaxAllowedMallocSize); } void *OldTaggedPtr = OldPtr; - OldPtr = getHeaderTaggedPointer(OldPtr); + OldPtr = untagPointerMaybe(OldPtr); // The following cases are handled by the C wrappers. DCHECK_NE(OldPtr, nullptr); @@ -587,10 +472,6 @@ public: if (NewPtr) memcpy(NewPtr, OldPtr, (NewSize < OldSize) ? NewSize : OldSize); GuardedAlloc.deallocate(OldPtr); - Stats.lock(); - Stats.add(StatFree, GuardedAllocSlotSize); - Stats.sub(StatAllocated, GuardedAllocSlotSize); - Stats.unlock(); return NewPtr; } #endif // GWP_ASAN_HOOKS @@ -607,14 +488,13 @@ public: // Pointer has to be allocated with a malloc-type function. Some // applications think that it is OK to realloc a memalign'ed pointer, which // will trigger this check. It really isn't. 
- if (Options.get(OptionBit::DeallocTypeMismatch)) { - if (UNLIKELY(OldHeader.OriginOrWasZeroed != Chunk::Origin::Malloc)) + if (Options.DeallocTypeMismatch) { + if (UNLIKELY(OldHeader.Origin != Chunk::Origin::Malloc)) reportDeallocTypeMismatch(AllocatorAction::Reallocating, OldPtr, - OldHeader.OriginOrWasZeroed, - Chunk::Origin::Malloc); + OldHeader.Origin, Chunk::Origin::Malloc); } - void *BlockBegin = getBlockBegin(OldTaggedPtr, &OldHeader); + void *BlockBegin = getBlockBegin(OldPtr, &OldHeader); uptr BlockEnd; uptr OldSize; const uptr ClassId = OldHeader.ClassId; @@ -624,31 +504,24 @@ public: OldSize = OldHeader.SizeOrUnusedBytes; } else { BlockEnd = SecondaryT::getBlockEnd(BlockBegin); - OldSize = BlockEnd - (reinterpret_cast<uptr>(OldTaggedPtr) + - OldHeader.SizeOrUnusedBytes); + OldSize = BlockEnd - + (reinterpret_cast<uptr>(OldPtr) + OldHeader.SizeOrUnusedBytes); } // If the new chunk still fits in the previously allocated block (with a // reasonable delta), we just keep the old block, and update the chunk // header to reflect the size change. - if (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize <= BlockEnd) { + if (reinterpret_cast<uptr>(OldPtr) + NewSize <= BlockEnd) { if (NewSize > OldSize || (OldSize - NewSize) < getPageSizeCached()) { Chunk::UnpackedHeader NewHeader = OldHeader; NewHeader.SizeOrUnusedBytes = (ClassId ? NewSize - : BlockEnd - - (reinterpret_cast<uptr>(OldTaggedPtr) + NewSize)) & + : BlockEnd - (reinterpret_cast<uptr>(OldPtr) + NewSize)) & Chunk::SizeOrUnusedBytesMask; Chunk::compareExchangeHeader(Cookie, OldPtr, &NewHeader, &OldHeader); - if (UNLIKELY(useMemoryTagging<Params>(Options))) { - if (ClassId) { - resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize, - reinterpret_cast<uptr>(OldTaggedPtr) + NewSize, - NewSize, BlockEnd); - storePrimaryAllocationStackMaybe(Options, OldPtr); - } else { - storeSecondaryAllocationStackMaybe(Options, OldPtr, NewSize); - } - } + if (UNLIKELY(ClassId && useMemoryTagging())) + resizeTaggedChunk(reinterpret_cast<uptr>(OldTaggedPtr) + OldSize, + reinterpret_cast<uptr>(OldTaggedPtr) + NewSize, + BlockEnd); return OldTaggedPtr; } } @@ -658,9 +531,10 @@ public: // allow for potential further in-place realloc. The gains of such a trick // are currently unclear. void *NewPtr = allocate(NewSize, Chunk::Origin::Malloc, Alignment); - if (LIKELY(NewPtr)) { + if (NewPtr) { + const uptr OldSize = getSize(OldPtr, &OldHeader); memcpy(NewPtr, OldTaggedPtr, Min(NewSize, OldSize)); - quarantineOrDeallocateChunk(Options, OldTaggedPtr, &OldHeader, OldSize); + quarantineOrDeallocateChunk(OldPtr, &OldHeader, OldSize); } return NewPtr; } @@ -733,31 +607,15 @@ public: initThreadMaybe(); const uptr From = Base; const uptr To = Base + Size; - bool MayHaveTaggedPrimary = allocatorSupportsMemoryTagging<Params>() && - systemSupportsMemoryTagging(); - auto Lambda = [this, From, To, MayHaveTaggedPrimary, Callback, - Arg](uptr Block) { + auto Lambda = [this, From, To, Callback, Arg](uptr Block) { if (Block < From || Block >= To) return; uptr Chunk; Chunk::UnpackedHeader Header; - if (MayHaveTaggedPrimary) { - // A chunk header can either have a zero tag (tagged primary) or the - // header tag (secondary, or untagged primary). We don't know which so - // try both. 
- ScopedDisableMemoryTagChecks x; - if (!getChunkFromBlock(Block, &Chunk, &Header) && - !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) - return; - } else { - if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) - return; - } - if (Header.State == Chunk::State::Allocated) { + if (getChunkFromBlock(Block, &Chunk, &Header) && + Header.State == Chunk::State::Allocated) { uptr TaggedChunk = Chunk; - if (allocatorSupportsMemoryTagging<Params>()) - TaggedChunk = untagPointer(TaggedChunk); - if (useMemoryTagging<Params>(Primary.Options.load())) + if (useMemoryTagging()) TaggedChunk = loadTag(Chunk); Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), Arg); @@ -772,32 +630,14 @@ public: bool canReturnNull() { initThreadMaybe(); - return Primary.Options.load().get(OptionBit::MayReturnNull); + return Options.MayReturnNull; } bool setOption(Option O, sptr Value) { - initThreadMaybe(); - if (O == Option::MemtagTuning) { - // Enabling odd/even tags involves a tradeoff between use-after-free - // detection and buffer overflow detection. Odd/even tags make it more - // likely for buffer overflows to be detected by increasing the size of - // the guaranteed "red zone" around the allocation, but on the other hand - // use-after-free is less likely to be detected because the tag space for - // any particular chunk is cut in half. Therefore we use this tuning - // setting to control whether odd/even tags are enabled. - if (Value == M_MEMTAG_TUNING_BUFFER_OVERFLOW) - Primary.Options.set(OptionBit::UseOddEvenTags); - else if (Value == M_MEMTAG_TUNING_UAF) - Primary.Options.clear(OptionBit::UseOddEvenTags); + if (O == Option::ReleaseInterval) { + Primary.setReleaseToOsIntervalMs(static_cast<s32>(Value)); + Secondary.setReleaseToOsIntervalMs(static_cast<s32>(Value)); return true; - } else { - // We leave it to the various sub-components to decide whether or not they - // want to handle the option, but we do not want to short-circuit - // execution if one of the setOption was to return false. - const bool PrimaryResult = Primary.setOption(O, Value); - const bool SecondaryResult = Secondary.setOption(O, Value); - const bool RegistryResult = TSDRegistry.setOption(O, Value); - return PrimaryResult && SecondaryResult && RegistryResult; } return false; } @@ -817,7 +657,7 @@ public: return GuardedAlloc.getSize(Ptr); #endif // GWP_ASAN_HOOKS - Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); + Ptr = untagPointerMaybe(const_cast<void *>(Ptr)); Chunk::UnpackedHeader Header; Chunk::loadHeader(Cookie, Ptr, &Header); // Getting the usable size of a chunk only makes sense if it's allocated. @@ -842,114 +682,18 @@ public: #endif // GWP_ASAN_HOOKS if (!Ptr || !isAligned(reinterpret_cast<uptr>(Ptr), MinAlignment)) return false; - Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); + Ptr = untagPointerMaybe(const_cast<void *>(Ptr)); Chunk::UnpackedHeader Header; return Chunk::isValid(Cookie, Ptr, &Header) && Header.State == Chunk::State::Allocated; } - bool useMemoryTaggingTestOnly() const { - return useMemoryTagging<Params>(Primary.Options.load()); - } - void disableMemoryTagging() { - // If we haven't been initialized yet, we need to initialize now in order to - // prevent a future call to initThreadMaybe() from enabling memory tagging - // based on feature detection. But don't call initThreadMaybe() because it - // may end up calling the allocator (via pthread_atfork, via the post-init - // callback), which may cause mappings to be created with memory tagging - // enabled. 
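[Editor's note: on the untagPointer()/addHeaderTag() calls threading through the hunks above: with AArch64 top-byte-ignore, the tag lives in bits 56-63 and is ignored for addressing, so tagging and stripping are plain bit operations. A 64-bit-only sketch; the 56-bit split and helper names mirror scudo's memtag.h but this is illustrative, not the real implementation:

#include <cassert>
#include <cstdint>

using uptr = uintptr_t;

constexpr uptr TagShift = 56;
constexpr uptr TagMask = 0xffULL << TagShift;

uptr untagPointer(uptr Ptr) { return Ptr & ~TagMask; }
uptr addFixedTag(uptr Ptr, uptr Tag) {
  return untagPointer(Ptr) | (Tag << TagShift);
}
uptr extractTag(uptr Ptr) { return (Ptr & TagMask) >> TagShift; }

int main() {
  // The diff above uses addFixedTag(Ptr, 2) for header-tagged pointers.
  const uptr P = addFixedTag(0x7000dead0000ULL, 2);
  assert(extractTag(P) == 2);
  assert(untagPointer(P) == 0x7000dead0000ULL);
  return 0;
}]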
- TSDRegistry.initOnceMaybe(this); - if (allocatorSupportsMemoryTagging<Params>()) { - Secondary.disableMemoryTagging(); - Primary.Options.clear(OptionBit::UseMemoryTagging); - } - } - - void setTrackAllocationStacks(bool Track) { - initThreadMaybe(); - if (Track) - Primary.Options.set(OptionBit::TrackAllocationStacks); - else - Primary.Options.clear(OptionBit::TrackAllocationStacks); - } - - void setFillContents(FillContentsMode FillContents) { - initThreadMaybe(); - Primary.Options.setFillContentsMode(FillContents); - } - - void setAddLargeAllocationSlack(bool AddSlack) { - initThreadMaybe(); - if (AddSlack) - Primary.Options.set(OptionBit::AddLargeAllocationSlack); - else - Primary.Options.clear(OptionBit::AddLargeAllocationSlack); - } - - const char *getStackDepotAddress() const { - return reinterpret_cast<const char *>(&Depot); - } - - const char *getRegionInfoArrayAddress() const { - return Primary.getRegionInfoArrayAddress(); - } - - static uptr getRegionInfoArraySize() { - return PrimaryT::getRegionInfoArraySize(); - } - - const char *getRingBufferAddress() const { - return reinterpret_cast<const char *>(&RingBuffer); - } - - static uptr getRingBufferSize() { return sizeof(RingBuffer); } - - static const uptr MaxTraceSize = 64; - - static void collectTraceMaybe(const StackDepot *Depot, - uintptr_t (&Trace)[MaxTraceSize], u32 Hash) { - uptr RingPos, Size; - if (!Depot->find(Hash, &RingPos, &Size)) - return; - for (unsigned I = 0; I != Size && I != MaxTraceSize; ++I) - Trace[I] = (*Depot)[RingPos + I]; - } - - static void getErrorInfo(struct scudo_error_info *ErrorInfo, - uintptr_t FaultAddr, const char *DepotPtr, - const char *RegionInfoPtr, const char *RingBufferPtr, - const char *Memory, const char *MemoryTags, - uintptr_t MemoryAddr, size_t MemorySize) { - *ErrorInfo = {}; - if (!allocatorSupportsMemoryTagging<Params>() || - MemoryAddr + MemorySize < MemoryAddr) - return; + bool useMemoryTagging() { return Primary.useMemoryTagging(); } - auto *Depot = reinterpret_cast<const StackDepot *>(DepotPtr); - size_t NextErrorReport = 0; - - // Check for OOB in the current block and the two surrounding blocks. Beyond - // that, UAF is more likely. - if (extractTag(FaultAddr) != 0) - getInlineErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot, - RegionInfoPtr, Memory, MemoryTags, MemoryAddr, - MemorySize, 0, 2); - - // Check the ring buffer. For primary allocations this will only find UAF; - // for secondary allocations we can find either UAF or OOB. - getRingBufferErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot, - RingBufferPtr); - - // Check for OOB in the 28 blocks surrounding the 3 we checked earlier. - // Beyond that we are likely to hit false positives. 
- if (extractTag(FaultAddr) != 0) - getInlineErrorInfo(ErrorInfo, NextErrorReport, FaultAddr, Depot, - RegionInfoPtr, Memory, MemoryTags, MemoryAddr, - MemorySize, 2, 16); - } + void disableMemoryTagging() { Primary.disableMemoryTagging(); } private: - using SecondaryT = MapAllocator<Params>; + using SecondaryT = typename Params::Secondary; typedef typename PrimaryT::SizeClassMap SizeClassMap; static const uptr MinAlignmentLog = SCUDO_MIN_ALIGNMENT_LOG; @@ -961,59 +705,32 @@ private: static_assert(MinAlignment >= sizeof(Chunk::PackedHeader), "Minimal alignment must at least cover a chunk header."); - static_assert(!allocatorSupportsMemoryTagging<Params>() || + static_assert(!PrimaryT::SupportsMemoryTagging || MinAlignment >= archMemoryTagGranuleSize(), ""); static const u32 BlockMarker = 0x44554353U; - // These are indexes into an "array" of 32-bit values that store information - // inline with a chunk that is relevant to diagnosing memory tag faults, where - // 0 corresponds to the address of the user memory. This means that only - // negative indexes may be used. The smallest index that may be used is -2, - // which corresponds to 8 bytes before the user memory, because the chunk - // header size is 8 bytes and in allocators that support memory tagging the - // minimum alignment is at least the tag granule size (16 on aarch64). - static const sptr MemTagAllocationTraceIndex = -2; - static const sptr MemTagAllocationTidIndex = -1; - - u32 Cookie = 0; - u32 QuarantineMaxChunkSize = 0; - GlobalStats Stats; + TSDRegistryT TSDRegistry; PrimaryT Primary; SecondaryT Secondary; QuarantineT Quarantine; - TSDRegistryT TSDRegistry; - pthread_once_t PostInitNonce = PTHREAD_ONCE_INIT; + + u32 Cookie; + + struct { + u8 MayReturnNull : 1; // may_return_null + u8 ZeroContents : 1; // zero_contents + u8 DeallocTypeMismatch : 1; // dealloc_type_mismatch + u8 DeleteSizeMismatch : 1; // delete_size_mismatch + u32 QuarantineMaxChunkSize; // quarantine_max_chunk_size + } Options; #ifdef GWP_ASAN_HOOKS gwp_asan::GuardedPoolAllocator GuardedAlloc; - uptr GuardedAllocSlotSize = 0; #endif // GWP_ASAN_HOOKS - StackDepot Depot; - - struct AllocationRingBuffer { - struct Entry { - atomic_uptr Ptr; - atomic_uptr AllocationSize; - atomic_u32 AllocationTrace; - atomic_u32 AllocationTid; - atomic_u32 DeallocationTrace; - atomic_u32 DeallocationTid; - }; - - atomic_uptr Pos; -#ifdef SCUDO_FUZZ - static const uptr NumEntries = 2; -#else - static const uptr NumEntries = 32768; -#endif - Entry Entries[NumEntries]; - }; - AllocationRingBuffer RingBuffer = {}; - // The following might get optimized out by the compiler. NOINLINE void performSanityChecks() { // Verify that the header offset field can hold the maximum offset. 
In the @@ -1061,50 +778,30 @@ private: const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes; if (LIKELY(Header->ClassId)) return SizeOrUnusedBytes; - if (allocatorSupportsMemoryTagging<Params>()) - Ptr = untagPointer(const_cast<void *>(Ptr)); return SecondaryT::getBlockEnd(getBlockBegin(Ptr, Header)) - reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes; } - void quarantineOrDeallocateChunk(Options Options, void *TaggedPtr, - Chunk::UnpackedHeader *Header, uptr Size) { - void *Ptr = getHeaderTaggedPointer(TaggedPtr); + ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) { + TSDRegistry.initThreadMaybe(this, MinimalInit); + } + + void quarantineOrDeallocateChunk(void *Ptr, Chunk::UnpackedHeader *Header, + uptr Size) { Chunk::UnpackedHeader NewHeader = *Header; + if (UNLIKELY(NewHeader.ClassId && useMemoryTagging())) { + uptr TaggedBegin, TaggedEnd; + setRandomTag(Ptr, Size, &TaggedBegin, &TaggedEnd); + } // If the quarantine is disabled, the actual size of a chunk is 0 or larger // than the maximum allowed, we return a chunk directly to the backend. - // This purposefully underflows for Size == 0. - const bool BypassQuarantine = !Quarantine.getCacheSize() || - ((Size - 1) >= QuarantineMaxChunkSize) || - !NewHeader.ClassId; - if (BypassQuarantine) - NewHeader.State = Chunk::State::Available; - else - NewHeader.State = Chunk::State::Quarantined; - NewHeader.OriginOrWasZeroed = useMemoryTagging<Params>(Options) && - NewHeader.ClassId && - !TSDRegistry.getDisableMemInit(); - Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header); - - if (UNLIKELY(useMemoryTagging<Params>(Options))) { - u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr)); - storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size); - if (NewHeader.ClassId) { - if (!TSDRegistry.getDisableMemInit()) { - uptr TaggedBegin, TaggedEnd; - const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe( - Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, &NewHeader)), - NewHeader.ClassId); - // Exclude the previous tag so that immediate use after free is - // detected 100% of the time. - setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin, - &TaggedEnd); - } - } - } + // Logical Or can be short-circuited, which introduces unnecessary + // conditional jumps, so use bitwise Or and let the compiler be clever. 
+ const bool BypassQuarantine = !Quarantine.getCacheSize() | !Size | + (Size > Options.QuarantineMaxChunkSize); if (BypassQuarantine) { - if (allocatorSupportsMemoryTagging<Params>()) - Ptr = untagPointer(Ptr); + NewHeader.State = Chunk::State::Available; + Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header); void *BlockBegin = getBlockBegin(Ptr, &NewHeader); const uptr ClassId = NewHeader.ClassId; if (LIKELY(ClassId)) { @@ -1114,12 +811,11 @@ private: if (UnlockRequired) TSD->unlock(); } else { - if (UNLIKELY(useMemoryTagging<Params>(Options))) - storeTags(reinterpret_cast<uptr>(BlockBegin), - reinterpret_cast<uptr>(Ptr)); - Secondary.deallocate(Options, BlockBegin); + Secondary.deallocate(BlockBegin); } } else { + NewHeader.State = Chunk::State::Quarantined; + Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header); bool UnlockRequired; auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired); Quarantine.put(&TSD->QuarantineCache, @@ -1131,291 +827,11 @@ private: bool getChunkFromBlock(uptr Block, uptr *Chunk, Chunk::UnpackedHeader *Header) { - *Chunk = - Block + getChunkOffsetFromBlock(reinterpret_cast<const char *>(Block)); - return Chunk::isValid(Cookie, reinterpret_cast<void *>(*Chunk), Header); - } - - static uptr getChunkOffsetFromBlock(const char *Block) { u32 Offset = 0; - if (reinterpret_cast<const u32 *>(Block)[0] == BlockMarker) - Offset = reinterpret_cast<const u32 *>(Block)[1]; - return Offset + Chunk::getHeaderSize(); - } - - // Set the tag of the granule past the end of the allocation to 0, to catch - // linear overflows even if a previous larger allocation used the same block - // and tag. Only do this if the granule past the end is in our block, because - // this would otherwise lead to a SEGV if the allocation covers the entire - // block and our block is at the end of a mapping. The tag of the next block's - // header granule will be set to 0, so it will serve the purpose of catching - // linear overflows in this case. - // - // For allocations of size 0 we do not end up storing the address tag to the - // memory tag space, which getInlineErrorInfo() normally relies on to match - // address tags against chunks. To allow matching in this case we store the - // address tag in the first byte of the chunk. - void storeEndMarker(uptr End, uptr Size, uptr BlockEnd) { - uptr UntaggedEnd = untagPointer(End); - if (UntaggedEnd != BlockEnd) { - storeTag(UntaggedEnd); - if (Size == 0) - *reinterpret_cast<u8 *>(UntaggedEnd) = extractTag(End); - } - } - - void *prepareTaggedChunk(void *Ptr, uptr Size, uptr ExcludeMask, - uptr BlockEnd) { - // Prepare the granule before the chunk to store the chunk header by setting - // its tag to 0. Normally its tag will already be 0, but in the case where a - // chunk holding a low alignment allocation is reused for a higher alignment - // allocation, the chunk may already have a non-zero tag from the previous - // allocation. - storeTag(reinterpret_cast<uptr>(Ptr) - archMemoryTagGranuleSize()); - - uptr TaggedBegin, TaggedEnd; - setRandomTag(Ptr, Size, ExcludeMask, &TaggedBegin, &TaggedEnd); - - storeEndMarker(TaggedEnd, Size, BlockEnd); - return reinterpret_cast<void *>(TaggedBegin); - } - - void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr NewSize, - uptr BlockEnd) { - uptr RoundOldPtr = roundUpTo(OldPtr, archMemoryTagGranuleSize()); - uptr RoundNewPtr; - if (RoundOldPtr >= NewPtr) { - // If the allocation is shrinking we just need to set the tag past the end - // of the allocation to 0. 
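[Editor's note: two micro-optimizations meet in the BypassQuarantine logic above. The restored ('+') code ORs side-effect-free bools with bitwise | to avoid short-circuit branches, while the removed ('-') code used the deliberate underflow (Size - 1) >= QuarantineMaxChunkSize so Size == 0 also bypasses. A sketch contrasting the predicate styles; function names are mine:

#include <cstdint>

bool bypassShortCircuit(uint32_t CacheSize, uintptr_t Size, uint32_t MaxSize) {
  return !CacheSize || !Size || Size > MaxSize; // may emit conditional jumps
}

bool bypassBranchless(uint32_t CacheSize, uintptr_t Size, uint32_t MaxSize) {
  // All clauses are cheap and side-effect free, so evaluate them
  // unconditionally and OR the resulting bits.
  return !CacheSize | !Size | (Size > MaxSize);
}

bool bypassUnderflow(uint32_t CacheSize, uintptr_t Size, uint32_t MaxSize) {
  // (Size - 1) wraps to UINTPTR_MAX when Size == 0, folding the zero-size
  // test into the size-limit test.
  return !CacheSize || (Size - 1) >= MaxSize;
}

int main() {
  for (uintptr_t Sz = 0; Sz < 4; ++Sz)
    if (bypassShortCircuit(1, Sz, 2) != bypassBranchless(1, Sz, 2) ||
        bypassShortCircuit(1, Sz, 2) != bypassUnderflow(1, Sz, 2))
      return 1;
  return 0;
}]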
See explanation in storeEndMarker() above. - RoundNewPtr = roundUpTo(NewPtr, archMemoryTagGranuleSize()); - } else { - // Set the memory tag of the region - // [RoundOldPtr, roundUpTo(NewPtr, archMemoryTagGranuleSize())) - // to the pointer tag stored in OldPtr. - RoundNewPtr = storeTags(RoundOldPtr, NewPtr); - } - storeEndMarker(RoundNewPtr, NewSize, BlockEnd); - } - - void storePrimaryAllocationStackMaybe(Options Options, void *Ptr) { - if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) - return; - auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); - Ptr32[MemTagAllocationTraceIndex] = collectStackTrace(); - Ptr32[MemTagAllocationTidIndex] = getThreadID(); - } - - void storeRingBufferEntry(void *Ptr, u32 AllocationTrace, u32 AllocationTid, - uptr AllocationSize, u32 DeallocationTrace, - u32 DeallocationTid) { - uptr Pos = atomic_fetch_add(&RingBuffer.Pos, 1, memory_order_relaxed); - typename AllocationRingBuffer::Entry *Entry = - &RingBuffer.Entries[Pos % AllocationRingBuffer::NumEntries]; - - // First invalidate our entry so that we don't attempt to interpret a - // partially written state in getSecondaryErrorInfo(). The fences below - // ensure that the compiler does not move the stores to Ptr in between the - // stores to the other fields. - atomic_store_relaxed(&Entry->Ptr, 0); - - __atomic_signal_fence(__ATOMIC_SEQ_CST); - atomic_store_relaxed(&Entry->AllocationTrace, AllocationTrace); - atomic_store_relaxed(&Entry->AllocationTid, AllocationTid); - atomic_store_relaxed(&Entry->AllocationSize, AllocationSize); - atomic_store_relaxed(&Entry->DeallocationTrace, DeallocationTrace); - atomic_store_relaxed(&Entry->DeallocationTid, DeallocationTid); - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - atomic_store_relaxed(&Entry->Ptr, reinterpret_cast<uptr>(Ptr)); - } - - void storeSecondaryAllocationStackMaybe(Options Options, void *Ptr, - uptr Size) { - if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) - return; - - u32 Trace = collectStackTrace(); - u32 Tid = getThreadID(); - - auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); - Ptr32[MemTagAllocationTraceIndex] = Trace; - Ptr32[MemTagAllocationTidIndex] = Tid; - - storeRingBufferEntry(untagPointer(Ptr), Trace, Tid, Size, 0, 0); - } - - void storeDeallocationStackMaybe(Options Options, void *Ptr, u8 PrevTag, - uptr Size) { - if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) - return; - - auto *Ptr32 = reinterpret_cast<u32 *>(Ptr); - u32 AllocationTrace = Ptr32[MemTagAllocationTraceIndex]; - u32 AllocationTid = Ptr32[MemTagAllocationTidIndex]; - - u32 DeallocationTrace = collectStackTrace(); - u32 DeallocationTid = getThreadID(); - - storeRingBufferEntry(addFixedTag(untagPointer(Ptr), PrevTag), - AllocationTrace, AllocationTid, Size, - DeallocationTrace, DeallocationTid); - } - - static const size_t NumErrorReports = - sizeof(((scudo_error_info *)0)->reports) / - sizeof(((scudo_error_info *)0)->reports[0]); - - static void getInlineErrorInfo(struct scudo_error_info *ErrorInfo, - size_t &NextErrorReport, uintptr_t FaultAddr, - const StackDepot *Depot, - const char *RegionInfoPtr, const char *Memory, - const char *MemoryTags, uintptr_t MemoryAddr, - size_t MemorySize, size_t MinDistance, - size_t MaxDistance) { - uptr UntaggedFaultAddr = untagPointer(FaultAddr); - u8 FaultAddrTag = extractTag(FaultAddr); - BlockInfo Info = - PrimaryT::findNearestBlock(RegionInfoPtr, UntaggedFaultAddr); - - auto GetGranule = [&](uptr Addr, const char **Data, uint8_t *Tag) -> bool { - if (Addr < MemoryAddr || Addr + archMemoryTagGranuleSize() 
< Addr || - Addr + archMemoryTagGranuleSize() > MemoryAddr + MemorySize) - return false; - *Data = &Memory[Addr - MemoryAddr]; - *Tag = static_cast<u8>( - MemoryTags[(Addr - MemoryAddr) / archMemoryTagGranuleSize()]); - return true; - }; - - auto ReadBlock = [&](uptr Addr, uptr *ChunkAddr, - Chunk::UnpackedHeader *Header, const u32 **Data, - u8 *Tag) { - const char *BlockBegin; - u8 BlockBeginTag; - if (!GetGranule(Addr, &BlockBegin, &BlockBeginTag)) - return false; - uptr ChunkOffset = getChunkOffsetFromBlock(BlockBegin); - *ChunkAddr = Addr + ChunkOffset; - - const char *ChunkBegin; - if (!GetGranule(*ChunkAddr, &ChunkBegin, Tag)) - return false; - *Header = *reinterpret_cast<const Chunk::UnpackedHeader *>( - ChunkBegin - Chunk::getHeaderSize()); - *Data = reinterpret_cast<const u32 *>(ChunkBegin); - - // Allocations of size 0 will have stashed the tag in the first byte of - // the chunk, see storeEndMarker(). - if (Header->SizeOrUnusedBytes == 0) - *Tag = static_cast<u8>(*ChunkBegin); - - return true; - }; - - if (NextErrorReport == NumErrorReports) - return; - - auto CheckOOB = [&](uptr BlockAddr) { - if (BlockAddr < Info.RegionBegin || BlockAddr >= Info.RegionEnd) - return false; - - uptr ChunkAddr; - Chunk::UnpackedHeader Header; - const u32 *Data; - uint8_t Tag; - if (!ReadBlock(BlockAddr, &ChunkAddr, &Header, &Data, &Tag) || - Header.State != Chunk::State::Allocated || Tag != FaultAddrTag) - return false; - - auto *R = &ErrorInfo->reports[NextErrorReport++]; - R->error_type = - UntaggedFaultAddr < ChunkAddr ? BUFFER_UNDERFLOW : BUFFER_OVERFLOW; - R->allocation_address = ChunkAddr; - R->allocation_size = Header.SizeOrUnusedBytes; - collectTraceMaybe(Depot, R->allocation_trace, - Data[MemTagAllocationTraceIndex]); - R->allocation_tid = Data[MemTagAllocationTidIndex]; - return NextErrorReport == NumErrorReports; - }; - - if (MinDistance == 0 && CheckOOB(Info.BlockBegin)) - return; - - for (size_t I = Max<size_t>(MinDistance, 1); I != MaxDistance; ++I) - if (CheckOOB(Info.BlockBegin + I * Info.BlockSize) || - CheckOOB(Info.BlockBegin - I * Info.BlockSize)) - return; - } - - static void getRingBufferErrorInfo(struct scudo_error_info *ErrorInfo, - size_t &NextErrorReport, - uintptr_t FaultAddr, - const StackDepot *Depot, - const char *RingBufferPtr) { - auto *RingBuffer = - reinterpret_cast<const AllocationRingBuffer *>(RingBufferPtr); - uptr Pos = atomic_load_relaxed(&RingBuffer->Pos); - - for (uptr I = Pos - 1; I != Pos - 1 - AllocationRingBuffer::NumEntries && - NextErrorReport != NumErrorReports; - --I) { - auto *Entry = &RingBuffer->Entries[I % AllocationRingBuffer::NumEntries]; - uptr EntryPtr = atomic_load_relaxed(&Entry->Ptr); - if (!EntryPtr) - continue; - - uptr UntaggedEntryPtr = untagPointer(EntryPtr); - uptr EntrySize = atomic_load_relaxed(&Entry->AllocationSize); - u32 AllocationTrace = atomic_load_relaxed(&Entry->AllocationTrace); - u32 AllocationTid = atomic_load_relaxed(&Entry->AllocationTid); - u32 DeallocationTrace = atomic_load_relaxed(&Entry->DeallocationTrace); - u32 DeallocationTid = atomic_load_relaxed(&Entry->DeallocationTid); - - if (DeallocationTid) { - // For UAF we only consider in-bounds fault addresses because - // out-of-bounds UAF is rare and attempting to detect it is very likely - // to result in false positives. - if (FaultAddr < EntryPtr || FaultAddr >= EntryPtr + EntrySize) - continue; - } else { - // Ring buffer OOB is only possible with secondary allocations. 
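[Editor's note: on the removed storeRingBufferEntry() in the hunk above: an entry is published by zeroing Ptr, writing the payload between signal fences, then storing Ptr last, so a reader that sees Ptr == 0 skips the entry and never interprets a half-written record (matching the `if (!EntryPtr) continue;` in getRingBufferErrorInfo above). A standalone sketch of that ordering, with the struct trimmed down from the original:

#include <cstdint>

struct Entry {
  volatile uintptr_t Ptr;
  volatile uint32_t AllocationTrace;
  volatile uint32_t AllocationTid;
};

void publish(Entry *E, uintptr_t Ptr, uint32_t Trace, uint32_t Tid) {
  // Invalidate first: a reader that sees Ptr == 0 skips the entry.
  __atomic_store_n(&E->Ptr, (uintptr_t)0, __ATOMIC_RELAXED);
  // The fences stop the compiler from moving the Ptr stores across the
  // payload stores (in-process, signal-safe ordering only).
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
  __atomic_store_n(&E->AllocationTrace, Trace, __ATOMIC_RELAXED);
  __atomic_store_n(&E->AllocationTid, Tid, __ATOMIC_RELAXED);
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
  // Publish last: only now can a reader match this entry.
  __atomic_store_n(&E->Ptr, Ptr, __ATOMIC_RELAXED);
}

int main() {
  Entry E = {};
  publish(&E, 0x1000, 42, 7);
  return E.Ptr == 0x1000 ? 0 : 1;
}]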
In this - // case we are guaranteed a guard region of at least a page on either - // side of the allocation (guard page on the right, guard page + tagged - // region on the left), so ignore any faults outside of that range. - if (FaultAddr < EntryPtr - getPageSizeCached() || - FaultAddr >= EntryPtr + EntrySize + getPageSizeCached()) - continue; - - // For UAF the ring buffer will contain two entries, one for the - // allocation and another for the deallocation. Don't report buffer - // overflow/underflow using the allocation entry if we have already - // collected a report from the deallocation entry. - bool Found = false; - for (uptr J = 0; J != NextErrorReport; ++J) { - if (ErrorInfo->reports[J].allocation_address == UntaggedEntryPtr) { - Found = true; - break; - } - } - if (Found) - continue; - } - - auto *R = &ErrorInfo->reports[NextErrorReport++]; - if (DeallocationTid) - R->error_type = USE_AFTER_FREE; - else if (FaultAddr < EntryPtr) - R->error_type = BUFFER_UNDERFLOW; - else - R->error_type = BUFFER_OVERFLOW; - - R->allocation_address = UntaggedEntryPtr; - R->allocation_size = EntrySize; - collectTraceMaybe(Depot, R->allocation_trace, AllocationTrace); - R->allocation_tid = AllocationTid; - collectTraceMaybe(Depot, R->deallocation_trace, DeallocationTrace); - R->deallocation_tid = DeallocationTid; - } + if (reinterpret_cast<u32 *>(Block)[0] == BlockMarker) + Offset = reinterpret_cast<u32 *>(Block)[1]; + *Chunk = Block + Offset + Chunk::getHeaderSize(); + return Chunk::isValid(Cookie, reinterpret_cast<void *>(*Chunk), Header); } uptr getStats(ScopedString *Str) { diff --git a/standalone/common.cpp b/standalone/common.cpp index 666f95400c7..d93bfc59b3c 100644 --- a/standalone/common.cpp +++ b/standalone/common.cpp @@ -8,7 +8,6 @@ #include "common.h" #include "atomic_helpers.h" -#include "string_utils.h" namespace scudo { @@ -22,16 +21,11 @@ uptr getPageSizeSlow() { } // Fatal internal map() or unmap() error (potentially OOM related). -void NORETURN dieOnMapUnmapError(uptr SizeIfOOM) { - char Error[128] = "Scudo ERROR: internal map or unmap failure\n"; - if (SizeIfOOM) { - formatString( - Error, sizeof(Error), - "Scudo ERROR: internal map failure (NO MEMORY) requesting %zuKB\n", - SizeIfOOM >> 10); - } - outputRaw(Error); - setAbortMessage(Error); +void NORETURN dieOnMapUnmapError(bool OutOfMemory) { + outputRaw("Scudo ERROR: internal map or unmap failure"); + if (OutOfMemory) + outputRaw(" (OOM)"); + outputRaw("\n"); die(); } diff --git a/standalone/common.h b/standalone/common.h index 3f27a3d3e1b..e026e34c004 100644 --- a/standalone/common.h +++ b/standalone/common.h @@ -133,8 +133,6 @@ const char *getEnv(const char *Name); u64 getMonotonicTime(); -u32 getThreadID(); - // Our randomness gathering function is limited to 256 bytes to ensure we get // as many bytes as requested, and avoid interruptions (on Linux). constexpr uptr MaxRandomLength = 256U; @@ -165,46 +163,16 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags = 0, void unmap(void *Addr, uptr Size, uptr Flags = 0, MapPlatformData *Data = nullptr); -void setMemoryPermission(uptr Addr, uptr Size, uptr Flags, - MapPlatformData *Data = nullptr); - void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size, MapPlatformData *Data = nullptr); -// Internal map & unmap fatal error. This must not call map(). SizeIfOOM shall -// hold the requested size on an out-of-memory error, 0 otherwise. -void NORETURN dieOnMapUnmapError(uptr SizeIfOOM = 0); +// Internal map & unmap fatal error. This must not call map(). 
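(An annotation on the ring-buffer machinery deleted above: storeRingBufferEntry() publishes entries with an invalidate-then-publish protocol so that getRingBufferErrorInfo(), possibly running in a different process, never interprets a half-written slot. A minimal sketch of that protocol with simplified types; only the store order and the fence placement come from the deleted code:)

    #include <atomic>
    #include <cstdint>

    struct Entry {
      std::atomic<uintptr_t> Ptr;
      std::atomic<uint32_t> AllocationTrace, AllocationTid;
    };

    void storeEntry(Entry *E, uintptr_t P, uint32_t Trace, uint32_t Tid) {
      // Invalidate first: readers skip entries whose Ptr is 0, so a torn
      // update is never mistaken for a valid record.
      E->Ptr.store(0, std::memory_order_relaxed);
      std::atomic_signal_fence(std::memory_order_seq_cst);
      E->AllocationTrace.store(Trace, std::memory_order_relaxed);
      E->AllocationTid.store(Tid, std::memory_order_relaxed);
      std::atomic_signal_fence(std::memory_order_seq_cst);
      // Publish last; only now may the entry be consumed.
      E->Ptr.store(P, std::memory_order_relaxed);
    }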
+void NORETURN dieOnMapUnmapError(bool OutOfMemory = false); // Logging related functions. void setAbortMessage(const char *Message); -struct BlockInfo { - uptr BlockBegin; - uptr BlockSize; - uptr RegionBegin; - uptr RegionEnd; -}; - -enum class Option : u8 { - ReleaseInterval, // Release to OS interval in milliseconds. - MemtagTuning, // Whether to tune tagging for UAF or overflow. - ThreadDisableMemInit, // Whether to disable automatic heap initialization and, - // where possible, memory tagging, on this thread. - MaxCacheEntriesCount, // Maximum number of blocks that can be cached. - MaxCacheEntrySize, // Maximum size of a block that can be cached. - MaxTSDsCount, // Number of usable TSDs for the shared registry. -}; - -constexpr unsigned char PatternFillByte = 0xAB; - -enum FillContentsMode { - NoFill = 0, - ZeroFill = 1, - PatternOrZeroFill = 2 // Pattern fill unless the memory is known to be - // zero-initialized already. -}; - } // namespace scudo #endif // SCUDO_COMMON_H_ diff --git a/standalone/flags.cpp b/standalone/flags.cpp index de5153b288b..dd9f050a2d2 100644 --- a/standalone/flags.cpp +++ b/standalone/flags.cpp @@ -9,8 +9,7 @@ #include "flags.h" #include "common.h" #include "flags_parser.h" - -#include "scudo/interface.h" +#include "interface.h" namespace scudo { diff --git a/standalone/flags.inc b/standalone/flags.inc index b5cab473416..342af1c79ad 100644 --- a/standalone/flags.inc +++ b/standalone/flags.inc @@ -34,9 +34,6 @@ SCUDO_FLAG(bool, delete_size_mismatch, true, SCUDO_FLAG(bool, zero_contents, false, "Zero chunk contents on allocation.") -SCUDO_FLAG(bool, pattern_fill_contents, false, - "Pattern fill chunk contents on allocation.") - SCUDO_FLAG(int, rss_limit_mb, -1, "Enforce an upper limit (in megabytes) to the process RSS. The " "allocator will terminate or return NULL when allocations are " diff --git a/standalone/flags_parser.h b/standalone/flags_parser.h index ba832adbd90..32511f768c6 100644 --- a/standalone/flags_parser.h +++ b/standalone/flags_parser.h @@ -29,7 +29,7 @@ public: void printFlagDescriptions(); private: - static const u32 MaxFlags = 20; + static const u32 MaxFlags = 16; struct Flag { const char *Name; const char *Desc; diff --git a/standalone/fuchsia.cpp b/standalone/fuchsia.cpp index 3b473bc9e22..b3d72de158c 100644 --- a/standalone/fuchsia.cpp +++ b/standalone/fuchsia.cpp @@ -15,6 +15,7 @@ #include "string_utils.h" #include <lib/sync/mutex.h> // for sync_mutex_t +#include <limits.h> // for PAGE_SIZE #include <stdlib.h> // for getenv() #include <zircon/compiler.h> #include <zircon/sanitizer.h> @@ -22,7 +23,7 @@ namespace scudo { -uptr getPageSize() { return _zx_system_get_page_size(); } +uptr getPageSize() { return PAGE_SIZE; } void NORETURN die() { __builtin_trap(); } @@ -41,7 +42,7 @@ static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) { Size, &Data->Vmar, &Data->VmarBase); if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY); return nullptr; } return reinterpret_cast<void *>(Data->VmarBase); @@ -49,7 +50,7 @@ static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) { void *map(void *Addr, uptr Size, const char *Name, uptr Flags, MapPlatformData *Data) { - DCHECK_EQ(Size % getPageSizeCached(), 0); + DCHECK_EQ(Size % PAGE_SIZE, 0); const bool AllowNoMem = !!(Flags & MAP_ALLOWNOMEM); // For MAP_NOACCESS, just allocate a Vmar and return. 
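The Fuchsia hunks above and the Linux ones below all funnel mapping failures through the same convention: return null only for an out-of-memory failure the caller explicitly tolerates, die otherwise. A hedged sketch of that shape on Linux; mapOrNull is a hypothetical helper, not part of the patch:

    #include <errno.h>
    #include <stddef.h>
    #include <sys/mman.h>

    // Assumed to exist per the hunks above; dies, noting whether this was OOM.
    void dieOnMapUnmapError(bool OutOfMemory);

    void *mapOrNull(size_t Size, bool AllowNoMem /* MAP_ALLOWNOMEM */) {
      void *P = mmap(nullptr, Size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (P == MAP_FAILED) {
        // Fatal unless this is a tolerated out-of-memory condition.
        if (!AllowNoMem || errno != ENOMEM)
          dieOnMapUnmapError(errno == ENOMEM);
        return nullptr;
      }
      return P;
    }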
@@ -71,7 +72,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, Status = _zx_vmo_set_size(Vmo, VmoSize + Size); if (Status != ZX_OK) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY); return nullptr; } } else { @@ -79,7 +80,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, Status = _zx_vmo_create(Size, ZX_VMO_RESIZABLE, &Vmo); if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY); return nullptr; } _zx_object_set_property(Vmo, ZX_PROP_NAME, Name, strlen(Name)); @@ -96,16 +97,14 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags, // No need to track the Vmo if we don't intend on resizing it. Close it. if (Flags & MAP_RESIZABLE) { DCHECK(Data); - if (Data->Vmo == ZX_HANDLE_INVALID) - Data->Vmo = Vmo; - else - DCHECK_EQ(Data->Vmo, Vmo); + DCHECK_EQ(Data->Vmo, ZX_HANDLE_INVALID); + Data->Vmo = Vmo; } else { CHECK_EQ(_zx_handle_close(Vmo), ZX_OK); } if (UNLIKELY(Status != ZX_OK)) { if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem) - dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY ? Size : 0); + dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY); return nullptr; } if (Data) @@ -136,16 +135,6 @@ void unmap(void *Addr, uptr Size, uptr Flags, MapPlatformData *Data) { } } -void setMemoryPermission(UNUSED uptr Addr, UNUSED uptr Size, UNUSED uptr Flags, - UNUSED MapPlatformData *Data) { - const zx_vm_option_t Prot = - (Flags & MAP_NOACCESS) ? 0 : (ZX_VM_PERM_READ | ZX_VM_PERM_WRITE); - DCHECK(Data); - DCHECK_NE(Data->Vmar, ZX_HANDLE_INVALID); - if (_zx_vmar_protect(Data->Vmar, Prot, Addr, Size) != ZX_OK) - dieOnMapUnmapError(); -} - void releasePagesToOS(UNUSED uptr BaseAddress, uptr Offset, uptr Size, MapPlatformData *Data) { DCHECK(Data); @@ -181,8 +170,6 @@ u64 getMonotonicTime() { return _zx_clock_get_monotonic(); } u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); } -u32 getThreadID() { return 0; } - bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { static_assert(MaxRandomLength <= ZX_CPRNG_DRAW_MAX_LEN, ""); if (UNLIKELY(!Buffer || !Length || Length > MaxRandomLength)) diff --git a/standalone/fuzz/get_error_info_fuzzer.cpp b/standalone/fuzz/get_error_info_fuzzer.cpp deleted file mode 100644 index 078e44b0dfc..00000000000 --- a/standalone/fuzz/get_error_info_fuzzer.cpp +++ /dev/null @@ -1,60 +0,0 @@ -//===-- get_error_info_fuzzer.cpp -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define SCUDO_FUZZ -#include "allocator_config.h" -#include "combined.h" - -#include <fuzzer/FuzzedDataProvider.h> - -#include <string> -#include <vector> - -extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { - using AllocatorT = scudo::Allocator<scudo::AndroidConfig>; - FuzzedDataProvider FDP(Data, Size); - - uintptr_t FaultAddr = FDP.ConsumeIntegral<uintptr_t>(); - uintptr_t MemoryAddr = FDP.ConsumeIntegral<uintptr_t>(); - - std::string MemoryAndTags = - FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); - const char *Memory = MemoryAndTags.c_str(); - // Assume 16-byte alignment. 
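The arithmetic on the removed line below encodes MTE's data-to-tag ratio: every 16-byte granule carries exactly one tag byte, so a combined buffer of N bytes holds floor(N / 17) granules of data followed by their tags. A worked check:

    // N = 170: (170 / 17) * 16 = 160 data bytes; the 10 bytes remaining at
    // Memory + 160 are exactly the 160 / 16 = 10 tag bytes required.
    size_t N = 170;
    size_t MemorySize = (N / 17) * 16; // 160
    size_t TagBytes = N - MemorySize;  // 10 == MemorySize / 16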
- size_t MemorySize = (MemoryAndTags.length() / 17) * 16; - const char *MemoryTags = Memory + MemorySize; - - std::string StackDepotBytes = - FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); - std::vector<char> StackDepot(sizeof(scudo::StackDepot), 0); - for (size_t i = 0; i < StackDepotBytes.length() && i < StackDepot.size(); - ++i) { - StackDepot[i] = StackDepotBytes[i]; - } - - std::string RegionInfoBytes = - FDP.ConsumeRandomLengthString(FDP.remaining_bytes()); - std::vector<char> RegionInfo(AllocatorT::getRegionInfoArraySize(), 0); - for (size_t i = 0; i < RegionInfoBytes.length() && i < RegionInfo.size(); - ++i) { - RegionInfo[i] = RegionInfoBytes[i]; - } - - std::string RingBufferBytes = FDP.ConsumeRemainingBytesAsString(); - std::vector<char> RingBuffer(AllocatorT::getRingBufferSize(), 0); - for (size_t i = 0; i < RingBufferBytes.length() && i < RingBuffer.size(); - ++i) { - RingBuffer[i] = RingBufferBytes[i]; - } - - scudo_error_info ErrorInfo; - AllocatorT::getErrorInfo(&ErrorInfo, FaultAddr, StackDepot.data(), - RegionInfo.data(), RingBuffer.data(), Memory, - MemoryTags, MemoryAddr, MemorySize); - return 0; -} diff --git a/standalone/include/scudo/interface.h b/standalone/include/scudo/interface.h deleted file mode 100644 index 9b9a84623c5..00000000000 --- a/standalone/include/scudo/interface.h +++ /dev/null @@ -1,160 +0,0 @@ -//===-- scudo/interface.h ---------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef SCUDO_INTERFACE_H_ -#define SCUDO_INTERFACE_H_ - -#include <stddef.h> -#include <stdint.h> - -extern "C" { - -__attribute__((weak)) const char *__scudo_default_options(); - -// Post-allocation & pre-deallocation hooks. -// They must be thread-safe and not use heap related functions. -__attribute__((weak)) void __scudo_allocate_hook(void *ptr, size_t size); -__attribute__((weak)) void __scudo_deallocate_hook(void *ptr); - -void __scudo_print_stats(void); - -typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); - -// Determine the likely cause of a tag check fault or other memory protection -// error on a system with memory tagging support. The results are returned via -// the error_info data structure. Up to three possible causes are returned in -// the reports array, in decreasing order of probability. The remaining elements -// of reports are zero-initialized. -// -// This function may be called from a different process from the one that -// crashed. In this case, various data structures must be copied from the -// crashing process to the process that analyzes the crash. -// -// This interface is not guaranteed to be stable and may change at any time. -// Furthermore, the version of scudo in the crashing process must be the same as -// the version in the process that analyzes the crash. -// -// fault_addr is the fault address. On aarch64 this is available in the system -// register FAR_ELx, or siginfo.si_addr in Linux 5.11 or above. This address -// must include the pointer tag; this is available if SA_EXPOSE_TAGBITS was set -// in sigaction.sa_flags when the signal handler was registered. Note that the -// kernel strips the tag from the field sigcontext.fault_address, so this -// address is not suitable to be passed as fault_addr. 
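(The parameter documentation continues below; as a bridge, here is a hedged sketch of a caller on the analyzing side. Every *Copy buffer and the report() helper are hypothetical stand-ins for the debugger's plumbing; only the __scudo_get_error_info() signature and the UNKNOWN enumerator come from this header:)

    void analyzeFault(uintptr_t FaultAddr, const char *StackDepotCopy,
                      const char *RegionInfoCopy, const char *RingBufferCopy,
                      const char *MemoryCopy, const char *MemoryTagsCopy,
                      uintptr_t MemoryAddr, size_t MemorySize) {
      scudo_error_info Info = {};
      __scudo_get_error_info(&Info, FaultAddr, StackDepotCopy, RegionInfoCopy,
                             RingBufferCopy, MemoryCopy, MemoryTagsCopy,
                             MemoryAddr, MemorySize);
      // Reports are ordered by decreasing probability; unused slots stay zero.
      for (const auto &R : Info.reports)
        if (R.error_type != UNKNOWN)
          report(R); // hypothetical reporting hook
    }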
-// -// stack_depot is a pointer to the stack depot data structure, which may be -// obtained by calling the function __scudo_get_stack_depot_addr() in the -// crashing process. The size of the stack depot is available by calling the -// function __scudo_get_stack_depot_size(). -// -// region_info is a pointer to the region info data structure, which may be -// obtained by calling the function __scudo_get_region_info_addr() in the -// crashing process. The size of the region info is available by calling the -// function __scudo_get_region_info_size(). -// -// memory is a pointer to a region of memory surrounding the fault address. -// The more memory available via this pointer, the more likely it is that the -// function will be able to analyze a crash correctly. It is recommended to -// provide an amount of memory equal to 16 * the primary allocator's largest -// size class either side of the fault address. -// -// memory_tags is a pointer to an array of memory tags for the memory accessed -// via memory. Each byte of this array corresponds to a region of memory of size -// equal to the architecturally defined memory tag granule size (16 on aarch64). -// -// memory_addr is the start address of memory in the crashing process's address -// space. -// -// memory_size is the size of the memory region referred to by the memory -// pointer. -void __scudo_get_error_info(struct scudo_error_info *error_info, - uintptr_t fault_addr, const char *stack_depot, - const char *region_info, const char *ring_buffer, - const char *memory, const char *memory_tags, - uintptr_t memory_addr, size_t memory_size); - -enum scudo_error_type { - UNKNOWN, - USE_AFTER_FREE, - BUFFER_OVERFLOW, - BUFFER_UNDERFLOW, -}; - -struct scudo_error_report { - enum scudo_error_type error_type; - - uintptr_t allocation_address; - uintptr_t allocation_size; - - uint32_t allocation_tid; - uintptr_t allocation_trace[64]; - - uint32_t deallocation_tid; - uintptr_t deallocation_trace[64]; -}; - -struct scudo_error_info { - struct scudo_error_report reports[3]; -}; - -const char *__scudo_get_stack_depot_addr(); -size_t __scudo_get_stack_depot_size(); - -const char *__scudo_get_region_info_addr(); -size_t __scudo_get_region_info_size(); - -const char *__scudo_get_ring_buffer_addr(); -size_t __scudo_get_ring_buffer_size(); - -#ifndef M_DECAY_TIME -#define M_DECAY_TIME -100 -#endif - -#ifndef M_PURGE -#define M_PURGE -101 -#endif - -// Tune the allocator's choice of memory tags to make it more likely that -// a certain class of memory errors will be detected. The value argument should -// be one of the M_MEMTAG_TUNING_* constants below. -#ifndef M_MEMTAG_TUNING -#define M_MEMTAG_TUNING -102 -#endif - -// Per-thread memory initialization tuning. The value argument should be one of: -// 1: Disable automatic heap initialization and, where possible, memory tagging, -// on this thread. -// 0: Normal behavior. -#ifndef M_THREAD_DISABLE_MEM_INIT -#define M_THREAD_DISABLE_MEM_INIT -103 -#endif - -#ifndef M_CACHE_COUNT_MAX -#define M_CACHE_COUNT_MAX -200 -#endif - -#ifndef M_CACHE_SIZE_MAX -#define M_CACHE_SIZE_MAX -201 -#endif - -#ifndef M_TSDS_COUNT_MAX -#define M_TSDS_COUNT_MAX -202 -#endif - -// Tune for buffer overflows. -#ifndef M_MEMTAG_TUNING_BUFFER_OVERFLOW -#define M_MEMTAG_TUNING_BUFFER_OVERFLOW 0 -#endif - -// Tune for use-after-free. 
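These M_* constants are intended for Bionic's mallopt(); a brief usage sketch, assuming a Bionic libc (the UAF constant is defined just below):

    #include <malloc.h> // Bionic exposes mallopt() here

    void tuneForUaf() {
      // Bias tag selection toward catching use-after-free over overflows.
      mallopt(M_MEMTAG_TUNING, M_MEMTAG_TUNING_UAF);
      // Opt the calling thread out of automatic heap init and, where
      // possible, memory tagging.
      mallopt(M_THREAD_DISABLE_MEM_INIT, 1);
    }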
-#ifndef M_MEMTAG_TUNING_UAF -#define M_MEMTAG_TUNING_UAF 1 -#endif - -} // extern "C" - -#endif // SCUDO_INTERFACE_H_ diff --git a/standalone/interface.h b/standalone/interface.h new file mode 100644 index 00000000000..e2639823f42 --- /dev/null +++ b/standalone/interface.h @@ -0,0 +1,29 @@ +//===-- interface.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SCUDO_INTERFACE_H_ +#define SCUDO_INTERFACE_H_ + +#include "internal_defs.h" + +extern "C" { + +WEAK INTERFACE const char *__scudo_default_options(); + +// Post-allocation & pre-deallocation hooks. +// They must be thread-safe and not use heap related functions. +WEAK INTERFACE void __scudo_allocate_hook(void *ptr, size_t size); +WEAK INTERFACE void __scudo_deallocate_hook(void *ptr); + +WEAK INTERFACE void __scudo_print_stats(void); + +typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); + +} // extern "C" + +#endif // SCUDO_INTERFACE_H_ diff --git a/standalone/internal_defs.h b/standalone/internal_defs.h index bbf7631be18..c61f8e6c71b 100644 --- a/standalone/internal_defs.h +++ b/standalone/internal_defs.h @@ -33,9 +33,13 @@ #define WEAK __attribute__((weak)) #define ALWAYS_INLINE inline __attribute__((always_inline)) #define ALIAS(X) __attribute__((alias(X))) +// Please only use the ALIGNED macro before the type. Using ALIGNED after the +// variable declaration is not portable. +#define ALIGNED(X) __attribute__((aligned(X))) #define FORMAT(F, A) __attribute__((format(printf, F, A))) #define NOINLINE __attribute__((noinline)) #define NORETURN __attribute__((noreturn)) +#define THREADLOCAL __thread #define LIKELY(X) __builtin_expect(!!(X), 1) #define UNLIKELY(X) __builtin_expect(!!(X), 0) #if defined(__i386__) || defined(__x86_64__) @@ -48,34 +52,6 @@ #define USED __attribute__((used)) #define NOEXCEPT noexcept -// This check is only available on Clang. This is essentially an alias of -// C++20's 'constinit' specifier which will take care of this when (if?) we can -// ask all libc's that use Scudo to compile us with C++20. Dynamic -// initialization is bad; Scudo is designed to be lazily initialized on the -// first call to malloc/free (and friends), and this generally happens in the -// loader somewhere in libdl's init. After the loader is done, control is -// transferred to libc's initialization, and the dynamic initializers are run. -// If there's a dynamic initializer for Scudo, then it will clobber the -// already-initialized Scudo, and re-initialize all its members back to default -// values, causing various explosions. Unfortunately, marking -// scudo::Allocator<>'s constructor as 'constexpr' isn't sufficient to prevent -// dynamic initialization, as default initialization is fine under 'constexpr' -// (but not 'constinit'). Clang at -O0, and gcc at all opt levels will emit a -// dynamic initializer for any constant-initialized variables if there is a mix -// of default-initialized and constant-initialized variables. -// -// If you're looking at this because your build failed, you probably introduced -// a new member to scudo::Allocator<> (possibly transiently) that didn't have an -// initializer. The fix is easy - just add one. 
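The guarded attribute that follows can be exercised in isolation; a minimal repro, assuming Clang (the Toy struct is a stand-in for scudo::Allocator<>):

    // Compile with clang++: the attribute turns the silent dynamic
    // initializer described above into a hard build error.
    struct Toy {
      int A = 0; // constant-initialized
      int B;     // default-initialized: the mix forces a dynamic initializer
    };

    // error: variable does not have a constant initializer
    __attribute__((__require_constant_initialization__)) static Toy Instance;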
-#if defined(__has_attribute) -#if __has_attribute(require_constant_initialization) -#define SCUDO_REQUIRE_CONSTANT_INITIALIZATION \ - __attribute__((__require_constant_initialization__)) -#else -#define SCUDO_REQUIRE_CONSTANT_INITIALIZATION -#endif -#endif - namespace scudo { typedef unsigned long uptr; @@ -134,27 +110,13 @@ void NORETURN reportCheckFailed(const char *File, int Line, #define DCHECK_GT(A, B) CHECK_GT(A, B) #define DCHECK_GE(A, B) CHECK_GE(A, B) #else -#define DCHECK(A) \ - do { \ - } while (false) -#define DCHECK_EQ(A, B) \ - do { \ - } while (false) -#define DCHECK_NE(A, B) \ - do { \ - } while (false) -#define DCHECK_LT(A, B) \ - do { \ - } while (false) -#define DCHECK_LE(A, B) \ - do { \ - } while (false) -#define DCHECK_GT(A, B) \ - do { \ - } while (false) -#define DCHECK_GE(A, B) \ - do { \ - } while (false) +#define DCHECK(A) +#define DCHECK_EQ(A, B) +#define DCHECK_NE(A, B) +#define DCHECK_LT(A, B) +#define DCHECK_LE(A, B) +#define DCHECK_GT(A, B) +#define DCHECK_GE(A, B) #endif // The superfluous die() call effectively makes this macro NORETURN. diff --git a/standalone/linux.cpp b/standalone/linux.cpp index 301bdcd34da..0ab96836fc4 100644 --- a/standalone/linux.cpp +++ b/standalone/linux.cpp @@ -10,7 +10,6 @@ #if SCUDO_LINUX -#include "atomic_helpers.h" #include "common.h" #include "linux.h" #include "mutex.h" @@ -36,6 +35,10 @@ #define ANDROID_PR_SET_VMA_ANON_NAME 0 #endif +#ifdef ANDROID_EXPERIMENTAL_MTE +#include <bionic/mte_kernel.h> +#endif + namespace scudo { uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); } @@ -51,14 +54,11 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, MmapProt = PROT_NONE; } else { MmapProt = PROT_READ | PROT_WRITE; - } -#if defined(__aarch64__) -#ifndef PROT_MTE -#define PROT_MTE 0x20 -#endif - if (Flags & MAP_MEMTAG) - MmapProt |= PROT_MTE; +#if defined(__aarch64__) && defined(ANDROID_EXPERIMENTAL_MTE) + if (Flags & MAP_MEMTAG) + MmapProt |= PROT_MTE; #endif + } if (Addr) { // Currently no scenario for a noaccess mapping with a fixed address. DCHECK_EQ(Flags & MAP_NOACCESS, 0); @@ -67,11 +67,11 @@ void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags, void *P = mmap(Addr, Size, MmapProt, MmapFlags, -1, 0); if (P == MAP_FAILED) { if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM) - dieOnMapUnmapError(errno == ENOMEM ? Size : 0); + dieOnMapUnmapError(errno == ENOMEM); return nullptr; } #if SCUDO_ANDROID - if (Name) + if (!(Flags & MAP_NOACCESS)) prctl(ANDROID_PR_SET_VMA, ANDROID_PR_SET_VMA_ANON_NAME, P, Size, Name); #endif return P; @@ -83,48 +83,9 @@ void unmap(void *Addr, uptr Size, UNUSED uptr Flags, dieOnMapUnmapError(); } -void setMemoryPermission(uptr Addr, uptr Size, uptr Flags, - UNUSED MapPlatformData *Data) { - int Prot = (Flags & MAP_NOACCESS) ? PROT_NONE : (PROT_READ | PROT_WRITE); - if (mprotect(reinterpret_cast<void *>(Addr), Size, Prot) != 0) - dieOnMapUnmapError(); -} - -static bool madviseNeedsMemset() { - const uptr Size = getPageSizeCached(); - char *P = reinterpret_cast<char *>(mmap(0, Size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); - if (!P) - dieOnMapUnmapError(errno == ENOMEM ? 
Size : 0); - *P = 1; - while (madvise(P, Size, MADV_DONTNEED) == -1 && errno == EAGAIN) { - } - const bool R = (*P != 0); - if (munmap(P, Size) != 0) - dieOnMapUnmapError(); - return R; -} - -static bool madviseNeedsMemsetCached() { - static atomic_u8 Cache; - enum State : u8 { Unknown = 0, Yes = 1, No = 2 }; - State NeedsMemset = static_cast<State>(atomic_load_relaxed(&Cache)); - if (NeedsMemset == Unknown) { - NeedsMemset = madviseNeedsMemset() ? Yes : No; - atomic_store_relaxed(&Cache, NeedsMemset); - } - return NeedsMemset == Yes; -} - void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size, UNUSED MapPlatformData *Data) { void *Addr = reinterpret_cast<void *>(BaseAddress + Offset); - if (madviseNeedsMemsetCached()) { - // Workaround for QEMU-user ignoring MADV_DONTNEED. - // https://github.com/qemu/qemu/blob/b1cffefa1b163bce9aebc3416f562c1d3886eeaa/linux-user/syscall.c#L11941 - // https://bugs.launchpad.net/qemu/+bug/1926521 - memset(Addr, 0, Size); - } while (madvise(Addr, Size, MADV_DONTNEED) == -1 && errno == EAGAIN) { } } @@ -178,14 +139,6 @@ u32 getNumberOfCPUs() { return static_cast<u32>(CPU_COUNT(&CPUs)); } -u32 getThreadID() { -#if SCUDO_ANDROID - return static_cast<u32>(gettid()); -#else - return static_cast<u32>(syscall(SYS_gettid)); -#endif -} - // Blocking is possibly unused if the getrandom block is not compiled in. bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { if (!Buffer || !Length || Length > MaxRandomLength) @@ -237,7 +190,7 @@ void outputRaw(const char *Buffer) { } async_safe_write_log(AndroidLogInfo, "scudo", Buffer); } else { - (void)write(2, Buffer, strlen(Buffer)); + write(2, Buffer, strlen(Buffer)); } } diff --git a/standalone/linux.h b/standalone/linux.h index 72acb6da83a..c8e41484c85 100644 --- a/standalone/linux.h +++ b/standalone/linux.h @@ -18,6 +18,51 @@ namespace scudo { // MapPlatformData is unused on Linux, define it as a minimally sized structure. struct MapPlatformData {}; +#if SCUDO_ANDROID + +#if defined(__aarch64__) +#define __get_tls() \ + ({ \ + void **__v; \ + __asm__("mrs %0, tpidr_el0" : "=r"(__v)); \ + __v; \ + }) +#elif defined(__arm__) +#define __get_tls() \ + ({ \ + void **__v; \ + __asm__("mrc p15, 0, %0, c13, c0, 3" : "=r"(__v)); \ + __v; \ + }) +#elif defined(__i386__) +#define __get_tls() \ + ({ \ + void **__v; \ + __asm__("movl %%gs:0, %0" : "=r"(__v)); \ + __v; \ + }) +#elif defined(__x86_64__) +#define __get_tls() \ + ({ \ + void **__v; \ + __asm__("mov %%fs:0, %0" : "=r"(__v)); \ + __v; \ + }) +#else +#error "Unsupported architecture." +#endif + +// The Android Bionic team has allocated a TLS slot for sanitizers starting +// with Q, given that Android currently doesn't support ELF TLS. It is used to +// store sanitizer thread specific data. 
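A sketch of how a per-thread structure can be parked in that slot, using the getAndroidTlsPtr() accessor declared just below; ThreadState and the two helpers are illustrative, not scudo's actual TSD types:

    struct ThreadState {
      int DisableMemInit; // illustrative field
    };

    inline void setThreadState(ThreadState *S) {
      // Stash the pointer in the sanitizer TLS slot for this thread.
      *scudo::getAndroidTlsPtr() = reinterpret_cast<scudo::uptr>(S);
    }
    inline ThreadState *getThreadState() {
      return reinterpret_cast<ThreadState *>(*scudo::getAndroidTlsPtr());
    }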
+static const int TLS_SLOT_SANITIZER = 6; + +ALWAYS_INLINE uptr *getAndroidTlsPtr() { + return reinterpret_cast<uptr *>(&__get_tls()[TLS_SLOT_SANITIZER]); +} + +#endif // SCUDO_ANDROID + } // namespace scudo #endif // SCUDO_LINUX diff --git a/standalone/list.h b/standalone/list.h index 1ac93c2f65d..c3b898a328c 100644 --- a/standalone/list.h +++ b/standalone/list.h @@ -57,9 +57,9 @@ template <class T> struct IntrusiveList { void checkConsistency() const; protected: - uptr Size = 0; - T *First = nullptr; - T *Last = nullptr; + uptr Size; + T *First; + T *Last; }; template <class T> void IntrusiveList<T>::checkConsistency() const { diff --git a/standalone/local_cache.h b/standalone/local_cache.h index 50039379fa0..089aeb93962 100644 --- a/standalone/local_cache.h +++ b/standalone/local_cache.h @@ -17,25 +17,24 @@ namespace scudo { template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { typedef typename SizeClassAllocator::SizeClassMap SizeClassMap; - typedef typename SizeClassAllocator::CompactPtrT CompactPtrT; struct TransferBatch { static const u32 MaxNumCached = SizeClassMap::MaxNumCachedHint; - void setFromArray(CompactPtrT *Array, u32 N) { + void setFromArray(void **Array, u32 N) { DCHECK_LE(N, MaxNumCached); Count = N; - memcpy(Batch, Array, sizeof(Batch[0]) * Count); + memcpy(Batch, Array, sizeof(void *) * Count); } void clear() { Count = 0; } - void add(CompactPtrT P) { + void add(void *P) { DCHECK_LT(Count, MaxNumCached); Batch[Count++] = P; } - void copyToArray(CompactPtrT *Array) const { - memcpy(Array, Batch, sizeof(Batch[0]) * Count); + void copyToArray(void **Array) const { + memcpy(Array, Batch, sizeof(void *) * Count); } u32 getCount() const { return Count; } - CompactPtrT get(u32 I) const { + void *get(u32 I) const { DCHECK_LE(I, Count); return Batch[I]; } @@ -46,7 +45,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { private: u32 Count; - CompactPtrT Batch[MaxNumCached]; + void *Batch[MaxNumCached]; }; void initLinkerInitialized(GlobalStats *S, SizeClassAllocator *A) { @@ -79,10 +78,13 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { // Count, while Chunks might be further off (depending on Count). That keeps // the memory accesses in close quarters. const uptr ClassSize = C->ClassSize; - CompactPtrT CompactP = C->Chunks[--C->Count]; + void *P = C->Chunks[--C->Count]; + // The jury is still out as to whether any kind of PREFETCH here increases + // performance. It definitely decreases performance on Android though. + // if (!SCUDO_ANDROID) PREFETCH(P); Stats.add(StatAllocated, ClassSize); Stats.sub(StatFree, ClassSize); - return Allocator->decompactPtr(ClassId, CompactP); + return P; } void deallocate(uptr ClassId, void *P) { @@ -95,35 +97,22 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { drain(C, ClassId); // See comment in allocate() about memory accesses. const uptr ClassSize = C->ClassSize; - C->Chunks[C->Count++] = - Allocator->compactPtr(ClassId, reinterpret_cast<uptr>(P)); + C->Chunks[C->Count++] = P; Stats.sub(StatAllocated, ClassSize); Stats.add(StatFree, ClassSize); } - bool isEmpty() const { - for (uptr I = 0; I < NumClasses; ++I) - if (PerClassArray[I].Count) - return false; - return true; - } - void drain() { - // Drain BatchClassId last as createBatch can refill it. 
- for (uptr I = 0; I < NumClasses; ++I) { - if (I == BatchClassId) - continue; - while (PerClassArray[I].Count > 0) - drain(&PerClassArray[I], I); + for (uptr I = 0; I < NumClasses; I++) { + PerClass *C = &PerClassArray[I]; + while (C->Count > 0) + drain(C, I); } - while (PerClassArray[BatchClassId].Count > 0) - drain(&PerClassArray[BatchClassId], BatchClassId); - DCHECK(isEmpty()); } TransferBatch *createBatch(uptr ClassId, void *B) { - if (ClassId != BatchClassId) - B = allocate(BatchClassId); + if (ClassId != SizeClassMap::BatchClassId) + B = allocate(SizeClassMap::BatchClassId); return reinterpret_cast<TransferBatch *>(B); } @@ -131,17 +120,15 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache { private: static const uptr NumClasses = SizeClassMap::NumClasses; - static const uptr BatchClassId = SizeClassMap::BatchClassId; struct PerClass { u32 Count; u32 MaxCount; - // Note: ClassSize is zero for the transfer batch. uptr ClassSize; - CompactPtrT Chunks[2 * TransferBatch::MaxNumCached]; + void *Chunks[2 * TransferBatch::MaxNumCached]; }; - PerClass PerClassArray[NumClasses] = {}; + PerClass PerClassArray[NumClasses]; LocalStats Stats; - SizeClassAllocator *Allocator = nullptr; + SizeClassAllocator *Allocator; ALWAYS_INLINE void initCacheMaybe(PerClass *C) { if (LIKELY(C->MaxCount)) @@ -155,19 +142,13 @@ private: PerClass *P = &PerClassArray[I]; const uptr Size = SizeClassAllocator::getSizeByClassId(I); P->MaxCount = 2 * TransferBatch::getMaxCached(Size); - if (I != BatchClassId) { - P->ClassSize = Size; - } else { - // ClassSize in this struct is only used for malloc/free stats, which - // should only track user allocations, not internal movements. - P->ClassSize = 0; - } + P->ClassSize = Size; } } void destroyBatch(uptr ClassId, void *B) { - if (ClassId != BatchClassId) - deallocate(BatchClassId, B); + if (ClassId != SizeClassMap::BatchClassId) + deallocate(SizeClassMap::BatchClassId, B); } NOINLINE bool refill(PerClass *C, uptr ClassId) { @@ -185,10 +166,10 @@ private: NOINLINE void drain(PerClass *C, uptr ClassId) { const u32 Count = Min(C->MaxCount / 2, C->Count); - TransferBatch *B = - createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0])); + TransferBatch *B = createBatch(ClassId, C->Chunks[0]); if (UNLIKELY(!B)) - reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId)); + reportOutOfMemory( + SizeClassAllocator::getSizeByClassId(SizeClassMap::BatchClassId)); B->setFromArray(&C->Chunks[0], Count); C->Count -= Count; for (uptr I = 0; I < C->Count; I++) diff --git a/standalone/memtag.h b/standalone/memtag.h index 4bdce16faea..76271333754 100644 --- a/standalone/memtag.h +++ b/standalone/memtag.h @@ -14,228 +14,163 @@ #if SCUDO_LINUX #include <sys/auxv.h> #include <sys/prctl.h> +#if defined(ANDROID_EXPERIMENTAL_MTE) +#include <bionic/mte_kernel.h> +#endif #endif namespace scudo { -#if defined(__aarch64__) || defined(SCUDO_FUZZ) +#if defined(__aarch64__) -// We assume that Top-Byte Ignore is enabled if the architecture supports memory -// tagging. Not all operating systems enable TBI, so we only claim architectural -// support for memory tagging if the operating system enables TBI. 
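Under TBI the tag occupies bits 56 to 59 of the pointer, which is all the arithmetic untagPointer() and extractTag() perform; a self-contained check:

    #include <cassert>
    #include <cstdint>

    int main() {
      // A 4-bit tag of 0xA in bits 56..59, as under AArch64 TBI.
      uint64_t Untagged = 0x0000007fb4001000ULL;
      uint64_t Tagged = Untagged | (0xAULL << 56);
      assert((Tagged & ((1ULL << 56) - 1)) == Untagged); // untagPointer()
      assert(((Tagged >> 56) & 0xf) == 0xA);             // extractTag()
      return 0;
    }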
-#if SCUDO_LINUX && !defined(SCUDO_DISABLE_TBI) inline constexpr bool archSupportsMemoryTagging() { return true; } -#else -inline constexpr bool archSupportsMemoryTagging() { return false; } -#endif - inline constexpr uptr archMemoryTagGranuleSize() { return 16; } -inline uptr untagPointer(uptr Ptr) { return Ptr & ((1ULL << 56) - 1); } - -inline uint8_t extractTag(uptr Ptr) { return (Ptr >> 56) & 0xf; } - -#else - -inline constexpr bool archSupportsMemoryTagging() { return false; } - -inline uptr archMemoryTagGranuleSize() { - UNREACHABLE("memory tagging not supported"); -} - -inline uptr untagPointer(uptr Ptr) { - (void)Ptr; - UNREACHABLE("memory tagging not supported"); -} - -inline uint8_t extractTag(uptr Ptr) { - (void)Ptr; - UNREACHABLE("memory tagging not supported"); -} - -#endif - -#if defined(__aarch64__) - -#if SCUDO_LINUX - inline bool systemSupportsMemoryTagging() { -#ifndef HWCAP2_MTE -#define HWCAP2_MTE (1 << 18) -#endif +#if defined(ANDROID_EXPERIMENTAL_MTE) return getauxval(AT_HWCAP2) & HWCAP2_MTE; +#else + return false; +#endif } inline bool systemDetectsMemoryTagFaultsTestOnly() { -#ifndef PR_GET_TAGGED_ADDR_CTRL -#define PR_GET_TAGGED_ADDR_CTRL 56 -#endif -#ifndef PR_MTE_TCF_SHIFT -#define PR_MTE_TCF_SHIFT 1 -#endif -#ifndef PR_MTE_TCF_NONE -#define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) -#endif -#ifndef PR_MTE_TCF_MASK -#define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) +#if defined(ANDROID_EXPERIMENTAL_MTE) + return (prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0) & PR_MTE_TCF_MASK) != + PR_MTE_TCF_NONE; +#else + return false; #endif - return (static_cast<unsigned long>( - prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0)) & - PR_MTE_TCF_MASK) != PR_MTE_TCF_NONE; } -#else // !SCUDO_LINUX - -inline bool systemSupportsMemoryTagging() { return false; } - -inline bool systemDetectsMemoryTagFaultsTestOnly() { return false; } - -#endif // SCUDO_LINUX - inline void disableMemoryTagChecksTestOnly() { - __asm__ __volatile__( - R"( - .arch_extension memtag - msr tco, #1 - )"); + __asm__ __volatile__(".arch_extension mte; msr tco, #1"); } inline void enableMemoryTagChecksTestOnly() { - __asm__ __volatile__( - R"( - .arch_extension memtag - msr tco, #0 - )"); + __asm__ __volatile__(".arch_extension mte; msr tco, #0"); } -class ScopedDisableMemoryTagChecks { - size_t PrevTCO; - -public: - ScopedDisableMemoryTagChecks() { - __asm__ __volatile__( - R"( - .arch_extension memtag - mrs %0, tco - msr tco, #1 - )" - : "=r"(PrevTCO)); - } - - ~ScopedDisableMemoryTagChecks() { - __asm__ __volatile__( - R"( - .arch_extension memtag - msr tco, %0 - )" - : - : "r"(PrevTCO)); - } -}; +inline uptr untagPointer(uptr Ptr) { return Ptr & ((1ULL << 56) - 1); } -inline uptr selectRandomTag(uptr Ptr, uptr ExcludeMask) { - uptr TaggedPtr; +inline void setRandomTag(void *Ptr, uptr Size, uptr *TaggedBegin, + uptr *TaggedEnd) { + void *End; __asm__ __volatile__( R"( - .arch_extension memtag - irg %[TaggedPtr], %[Ptr], %[ExcludeMask] - )" - : [TaggedPtr] "=r"(TaggedPtr) - : [Ptr] "r"(Ptr), [ExcludeMask] "r"(ExcludeMask)); - return TaggedPtr; -} + .arch_extension mte -inline uptr addFixedTag(uptr Ptr, uptr Tag) { return Ptr | (Tag << 56); } + // Set a random tag for Ptr in TaggedPtr. This needs to happen even if + // Size = 0 so that TaggedPtr ends up pointing at a valid address. + irg %[TaggedPtr], %[Ptr] + mov %[Cur], %[TaggedPtr] + + // Skip the loop if Size = 0. We don't want to do any tagging in this case. 
+ cbz %[Size], 2f + + // Set the memory tag of the region + // [TaggedPtr, TaggedPtr + roundUpTo(Size, 16)) + // to the pointer tag stored in TaggedPtr. + add %[End], %[TaggedPtr], %[Size] -inline uptr storeTags(uptr Begin, uptr End) { - DCHECK(Begin % 16 == 0); - uptr LineSize, Next, Tmp; - __asm__ __volatile__( - R"( - .arch_extension memtag - - // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2 - // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0 - // indicates that the DC instructions are unavailable. - DCZID .req %[Tmp] - mrs DCZID, dczid_el0 - tbnz DCZID, #4, 3f - and DCZID, DCZID, #15 - mov %[LineSize], #4 - lsl %[LineSize], %[LineSize], DCZID - .unreq DCZID - - // Our main loop doesn't handle the case where we don't need to perform any - // DC GZVA operations. If the size of our tagged region is less than - // twice the cache line size, bail out to the slow path since it's not - // guaranteed that we'll be able to do a DC GZVA. - Size .req %[Tmp] - sub Size, %[End], %[Cur] - cmp Size, %[LineSize], lsl #1 - b.lt 3f - .unreq Size - - LineMask .req %[Tmp] - sub LineMask, %[LineSize], #1 - - // STZG until the start of the next cache line. - orr %[Next], %[Cur], LineMask 1: stzg %[Cur], [%[Cur]], #16 - cmp %[Cur], %[Next] + cmp %[Cur], %[End] b.lt 1b - // DC GZVA cache lines until we have no more full cache lines. - bic %[Next], %[End], LineMask - .unreq LineMask 2: - dc gzva, %[Cur] - add %[Cur], %[Cur], %[LineSize] - cmp %[Cur], %[Next] - b.lt 2b - - // STZG until the end of the tagged region. This loop is also used to handle - // slow path cases. - 3: - cmp %[Cur], %[End] - b.ge 4f - stzg %[Cur], [%[Cur]], #16 - b 3b - - 4: )" - : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), - [Tmp] "=&r"(Tmp) - : [End] "r"(End) + : [ TaggedPtr ] "=&r"(*TaggedBegin), [ Cur ] "=&r"(*TaggedEnd), + [ End ] "=&r"(End) + : [ Ptr ] "r"(Ptr), [ Size ] "r"(Size) : "memory"); - return Begin; } -inline void storeTag(uptr Ptr) { - __asm__ __volatile__(R"( - .arch_extension memtag - stg %0, [%0] - )" +inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr BlockEnd) { + // Prepare the granule before the chunk to store the chunk header by setting + // its tag to 0. Normally its tag will already be 0, but in the case where a + // chunk holding a low alignment allocation is reused for a higher alignment + // allocation, the chunk may already have a non-zero tag from the previous + // allocation. + __asm__ __volatile__(".arch_extension mte; stg %0, [%0, #-16]" : : "r"(Ptr) : "memory"); + + uptr TaggedBegin, TaggedEnd; + setRandomTag(Ptr, Size, &TaggedBegin, &TaggedEnd); + + // Finally, set the tag of the granule past the end of the allocation to 0, + // to catch linear overflows even if a previous larger allocation used the + // same block and tag. Only do this if the granule past the end is in our + // block, because this would otherwise lead to a SEGV if the allocation + // covers the entire block and our block is at the end of a mapping. The tag + // of the next block's header granule will be set to 0, so it will serve the + // purpose of catching linear overflows in this case. 
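A granule-level model of the layout this function establishes may help; the sizes and the 0x7 tag are hypothetical, and the std::vector stands in for the hardware's tag storage:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      // 16-byte granules, a 40-byte chunk whose random tag came out as 0x7,
      // inside a 128-byte block. One tag per granule; header is granule 0.
      const uint64_t Granule = 16, Size = 40, BlockEnd = 128;
      std::vector<uint8_t> Tags(BlockEnd / Granule, 0);
      uint64_t Begin = Granule; // header granule re-tagged to 0 by the stg
      uint64_t End = Begin + (Size + Granule - 1) / Granule * Granule;
      for (uint64_t A = Begin; A != End; A += Granule)
        Tags[A / Granule] = 0x7; // setRandomTag over the user region
      if (End != BlockEnd)
        Tags[End / Granule] = 0; // trailing guard granule (this branch)
      // A linear overflow past End lands on a granule whose tag mismatches.
      assert(Tags[End / Granule] != Tags[(End - 1) / Granule]);
      return 0;
    }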
+ uptr UntaggedEnd = untagPointer(TaggedEnd); + if (UntaggedEnd != BlockEnd) + __asm__ __volatile__(".arch_extension mte; stg %0, [%0]" + : + : "r"(UntaggedEnd) + : "memory"); + return reinterpret_cast<void *>(TaggedBegin); +} + +inline void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr BlockEnd) { + uptr RoundOldPtr = roundUpTo(OldPtr, 16); + if (RoundOldPtr >= NewPtr) { + // If the allocation is shrinking we just need to set the tag past the end + // of the allocation to 0. See explanation in prepareTaggedChunk above. + uptr RoundNewPtr = untagPointer(roundUpTo(NewPtr, 16)); + if (RoundNewPtr != BlockEnd) + __asm__ __volatile__(".arch_extension mte; stg %0, [%0]" + : + : "r"(RoundNewPtr) + : "memory"); + return; + } + + __asm__ __volatile__(R"( + .arch_extension mte + + // Set the memory tag of the region + // [roundUpTo(OldPtr, 16), roundUpTo(NewPtr, 16)) + // to the pointer tag stored in OldPtr. + 1: + stzg %[Cur], [%[Cur]], #16 + cmp %[Cur], %[End] + b.lt 1b + + // Finally, set the tag of the granule past the end of the allocation to 0. + and %[Cur], %[Cur], #(1 << 56) - 1 + cmp %[Cur], %[BlockEnd] + b.eq 2f + stg %[Cur], [%[Cur]] + + 2: + )" + : [ Cur ] "+&r"(RoundOldPtr), [ End ] "+&r"(NewPtr) + : [ BlockEnd ] "r"(BlockEnd) + : "memory"); +} + +inline uptr tagPointer(uptr UntaggedPtr, uptr Tag) { + return UntaggedPtr | (Tag & (0xfUL << 56)); } inline uptr loadTag(uptr Ptr) { uptr TaggedPtr = Ptr; - __asm__ __volatile__( - R"( - .arch_extension memtag - ldg %0, [%0] - )" - : "+r"(TaggedPtr) - : - : "memory"); + __asm__ __volatile__(".arch_extension mte; ldg %0, [%0]" + : "+r"(TaggedPtr) + : + : "memory"); return TaggedPtr; } #else +inline constexpr bool archSupportsMemoryTagging() { return false; } + inline bool systemSupportsMemoryTagging() { UNREACHABLE("memory tagging not supported"); } @@ -244,6 +179,10 @@ inline bool systemDetectsMemoryTagFaultsTestOnly() { UNREACHABLE("memory tagging not supported"); } +inline uptr archMemoryTagGranuleSize() { + UNREACHABLE("memory tagging not supported"); +} + inline void disableMemoryTagChecksTestOnly() { UNREACHABLE("memory tagging not supported"); } @@ -252,30 +191,31 @@ inline void enableMemoryTagChecksTestOnly() { UNREACHABLE("memory tagging not supported"); } -struct ScopedDisableMemoryTagChecks { - ScopedDisableMemoryTagChecks() {} -}; - -inline uptr selectRandomTag(uptr Ptr, uptr ExcludeMask) { +inline uptr untagPointer(uptr Ptr) { (void)Ptr; - (void)ExcludeMask; UNREACHABLE("memory tagging not supported"); } -inline uptr addFixedTag(uptr Ptr, uptr Tag) { +inline void setRandomTag(void *Ptr, uptr Size, uptr *TaggedBegin, + uptr *TaggedEnd) { (void)Ptr; - (void)Tag; + (void)Size; + (void)TaggedBegin; + (void)TaggedEnd; UNREACHABLE("memory tagging not supported"); } -inline uptr storeTags(uptr Begin, uptr End) { - (void)Begin; - (void)End; +inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr BlockEnd) { + (void)Ptr; + (void)Size; + (void)BlockEnd; UNREACHABLE("memory tagging not supported"); } -inline void storeTag(uptr Ptr) { - (void)Ptr; +inline void resizeTaggedChunk(uptr OldPtr, uptr NewPtr, uptr BlockEnd) { + (void)OldPtr; + (void)NewPtr; + (void)BlockEnd; UNREACHABLE("memory tagging not supported"); } @@ -286,30 +226,6 @@ inline uptr loadTag(uptr Ptr) { #endif -inline void setRandomTag(void *Ptr, uptr Size, uptr ExcludeMask, - uptr *TaggedBegin, uptr *TaggedEnd) { - *TaggedBegin = selectRandomTag(reinterpret_cast<uptr>(Ptr), ExcludeMask); - *TaggedEnd = storeTags(*TaggedBegin, *TaggedBegin + Size); -} - -inline void 
*untagPointer(void *Ptr) { - return reinterpret_cast<void *>(untagPointer(reinterpret_cast<uptr>(Ptr))); -} - -inline void *loadTag(void *Ptr) { - return reinterpret_cast<void *>(loadTag(reinterpret_cast<uptr>(Ptr))); -} - -inline void *addFixedTag(void *Ptr, uptr Tag) { - return reinterpret_cast<void *>( - addFixedTag(reinterpret_cast<uptr>(Ptr), Tag)); -} - -template <typename Config> -inline constexpr bool allocatorSupportsMemoryTagging() { - return archSupportsMemoryTagging() && Config::MaySupportMemoryTagging; -} - } // namespace scudo #endif diff --git a/standalone/mutex.h b/standalone/mutex.h index a654d35c5a7..b26b2df0662 100644 --- a/standalone/mutex.h +++ b/standalone/mutex.h @@ -22,7 +22,7 @@ namespace scudo { class HybridMutex { public: - void init() { M = {}; } + void init() { memset(this, 0, sizeof(*this)); } bool tryLock(); NOINLINE void lock() { if (LIKELY(tryLock())) @@ -48,9 +48,9 @@ private: static constexpr u8 NumberOfYields = 8U; #if SCUDO_LINUX - atomic_u32 M = {}; + atomic_u32 M; #elif SCUDO_FUCHSIA - sync_mutex_t M = {}; + sync_mutex_t M; #endif void lockSlow(); diff --git a/standalone/options.h b/standalone/options.h deleted file mode 100644 index 4e678651333..00000000000 --- a/standalone/options.h +++ /dev/null @@ -1,74 +0,0 @@ -//===-- options.h -----------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef SCUDO_OPTIONS_H_ -#define SCUDO_OPTIONS_H_ - -#include "atomic_helpers.h" -#include "common.h" -#include "memtag.h" - -namespace scudo { - -enum class OptionBit { - MayReturnNull, - FillContents0of2, - FillContents1of2, - DeallocTypeMismatch, - DeleteSizeMismatch, - TrackAllocationStacks, - UseOddEvenTags, - UseMemoryTagging, - AddLargeAllocationSlack, -}; - -struct Options { - u32 Val; - - bool get(OptionBit Opt) const { return Val & (1U << static_cast<u32>(Opt)); } - - FillContentsMode getFillContentsMode() const { - return static_cast<FillContentsMode>( - (Val >> static_cast<u32>(OptionBit::FillContents0of2)) & 3); - } -}; - -template <typename Config> bool useMemoryTagging(Options Options) { - return allocatorSupportsMemoryTagging<Config>() && - Options.get(OptionBit::UseMemoryTagging); -} - -struct AtomicOptions { - atomic_u32 Val = {}; - - Options load() const { return Options{atomic_load_relaxed(&Val)}; } - - void clear(OptionBit Opt) { - atomic_fetch_and(&Val, ~(1U << static_cast<u32>(Opt)), - memory_order_relaxed); - } - - void set(OptionBit Opt) { - atomic_fetch_or(&Val, 1U << static_cast<u32>(Opt), memory_order_relaxed); - } - - void setFillContentsMode(FillContentsMode FillContents) { - u32 Opts = atomic_load_relaxed(&Val), NewOpts; - do { - NewOpts = Opts; - NewOpts &= ~(3U << static_cast<u32>(OptionBit::FillContents0of2)); - NewOpts |= static_cast<u32>(FillContents) - << static_cast<u32>(OptionBit::FillContents0of2); - } while (!atomic_compare_exchange_strong(&Val, &Opts, NewOpts, - memory_order_relaxed)); - } -}; - -} // namespace scudo - -#endif // SCUDO_OPTIONS_H_ diff --git a/standalone/primary32.h b/standalone/primary32.h index 33d81754fb5..7d061e2cbcc 100644 --- a/standalone/primary32.h +++ b/standalone/primary32.h @@ -13,7 +13,6 @@ #include "common.h" #include "list.h" #include "local_cache.h" -#include "options.h" #include "release.h" 
#include "report.h" #include "stats.h" @@ -39,18 +38,23 @@ namespace scudo { // Memory used by this allocator is never unmapped but can be partially // reclaimed if the platform allows for it. -template <typename Config> class SizeClassAllocator32 { +template <class SizeClassMapT, uptr RegionSizeLog, + s32 MinReleaseToOsIntervalMs = INT32_MIN, + s32 MaxReleaseToOsIntervalMs = INT32_MAX> +class SizeClassAllocator32 { public: - typedef typename Config::PrimaryCompactPtrT CompactPtrT; - typedef typename Config::SizeClassMap SizeClassMap; + typedef SizeClassMapT SizeClassMap; // The bytemap can only track UINT8_MAX - 1 classes. static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), ""); // Regions should be large enough to hold the largest Block. - static_assert((1UL << Config::PrimaryRegionSizeLog) >= SizeClassMap::MaxSize, - ""); - typedef SizeClassAllocator32<Config> ThisT; + static_assert((1UL << RegionSizeLog) >= SizeClassMap::MaxSize, ""); + typedef SizeClassAllocator32<SizeClassMapT, RegionSizeLog, + MinReleaseToOsIntervalMs, + MaxReleaseToOsIntervalMs> + ThisT; typedef SizeClassAllocatorLocalCache<ThisT> CacheT; typedef typename CacheT::TransferBatch TransferBatch; + static const bool SupportsMemoryTagging = false; static uptr getSizeByClassId(uptr ClassId) { return (ClassId == SizeClassMap::BatchClassId) @@ -65,20 +69,24 @@ public: reportError("SizeClassAllocator32 is not supported on Fuchsia"); PossibleRegions.initLinkerInitialized(); + MinRegionIndex = NumRegions; // MaxRegionIndex is already initialized to 0. u32 Seed; const u64 Time = getMonotonicTime(); - if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))) + if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))) Seed = static_cast<u32>( Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6)); + const uptr PageSize = getPageSizeCached(); for (uptr I = 0; I < NumClasses; I++) { SizeClassInfo *Sci = getSizeClassInfo(I); Sci->RandState = getRandomU32(&Seed); - // Sci->MaxRegionIndex is already initialized to 0. - Sci->MinRegionIndex = NumRegions; - Sci->ReleaseInfo.LastReleaseAtNs = Time; + // See comment in the 64-bit primary about releasing smaller size classes. 
+ Sci->CanRelease = (I != SizeClassMap::BatchClassId) && + (getSizeByClassId(I) >= (PageSize / 32)); + if (Sci->CanRelease) + Sci->ReleaseInfo.LastReleaseAtNs = Time; } - setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -89,28 +97,12 @@ public: while (NumberOfStashedRegions > 0) unmap(reinterpret_cast<void *>(RegionsStash[--NumberOfStashedRegions]), RegionSize); - uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0; - for (uptr I = 0; I < NumClasses; I++) { - SizeClassInfo *Sci = getSizeClassInfo(I); - if (Sci->MinRegionIndex < MinRegionIndex) - MinRegionIndex = Sci->MinRegionIndex; - if (Sci->MaxRegionIndex > MaxRegionIndex) - MaxRegionIndex = Sci->MaxRegionIndex; - } - for (uptr I = MinRegionIndex; I < MaxRegionIndex; I++) + for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) if (PossibleRegions[I]) unmap(reinterpret_cast<void *>(I * RegionSize), RegionSize); PossibleRegions.unmapTestOnly(); } - CompactPtrT compactPtr(UNUSED uptr ClassId, uptr Ptr) const { - return static_cast<CompactPtrT>(Ptr); - } - - void *decompactPtr(UNUSED uptr ClassId, CompactPtrT CompactPtr) const { - return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr)); - } - TransferBatch *popBatch(CacheT *C, uptr ClassId) { DCHECK_LT(ClassId, NumClasses); SizeClassInfo *Sci = getSizeClassInfo(ClassId); @@ -135,7 +127,7 @@ public: ScopedLock L(Sci->Mutex); Sci->FreeList.push_front(B); Sci->Stats.PushedBlocks += B->getCount(); - if (ClassId != SizeClassMap::BatchClassId) + if (Sci->CanRelease) releaseToOSMaybe(Sci, ClassId); } @@ -163,14 +155,6 @@ public: } template <typename F> void iterateOverBlocks(F Callback) { - uptr MinRegionIndex = NumRegions, MaxRegionIndex = 0; - for (uptr I = 0; I < NumClasses; I++) { - SizeClassInfo *Sci = getSizeClassInfo(I); - if (Sci->MinRegionIndex < MinRegionIndex) - MinRegionIndex = Sci->MinRegionIndex; - if (Sci->MaxRegionIndex > MaxRegionIndex) - MaxRegionIndex = Sci->MaxRegionIndex; - } for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) if (PossibleRegions[I] && (PossibleRegions[I] - 1U) != SizeClassMap::BatchClassId) { @@ -200,23 +184,18 @@ public: getStats(Str, I, 0); } - bool setOption(Option O, sptr Value) { - if (O == Option::ReleaseInterval) { - const s32 Interval = Max( - Min(static_cast<s32>(Value), Config::PrimaryMaxReleaseToOsIntervalMs), - Config::PrimaryMinReleaseToOsIntervalMs); - atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval); - return true; + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; } - // Not supported by the Primary, but not an error either. 
- return true; + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); } uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { - if (I == SizeClassMap::BatchClassId) - continue; SizeClassInfo *Sci = getSizeClassInfo(I); ScopedLock L(Sci->Mutex); TotalReleasedBytes += releaseToOSMaybe(Sci, I, /*Force=*/true); @@ -224,21 +203,13 @@ public: return TotalReleasedBytes; } - const char *getRegionInfoArrayAddress() const { return nullptr; } - static uptr getRegionInfoArraySize() { return 0; } - - static BlockInfo findNearestBlock(UNUSED const char *RegionInfoData, - UNUSED uptr Ptr) { - return {}; - } - - AtomicOptions Options; + bool useMemoryTagging() { return false; } + void disableMemoryTagging() {} private: static const uptr NumClasses = SizeClassMap::NumClasses; - static const uptr RegionSize = 1UL << Config::PrimaryRegionSizeLog; - static const uptr NumRegions = - SCUDO_MMAP_RANGE_SIZE >> Config::PrimaryRegionSizeLog; + static const uptr RegionSize = 1UL << RegionSizeLog; + static const uptr NumRegions = SCUDO_MMAP_RANGE_SIZE >> RegionSizeLog; static const u32 MaxNumBatches = SCUDO_ANDROID ? 4U : 8U; typedef FlatByteMap<NumRegions> ByteMap; @@ -254,24 +225,21 @@ private: u64 LastReleaseAtNs; }; - struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo { + struct ALIGNED(SCUDO_CACHE_LINE_SIZE) SizeClassInfo { HybridMutex Mutex; SinglyLinkedList<TransferBatch> FreeList; uptr CurrentRegion; uptr CurrentRegionAllocated; SizeClassStats Stats; + bool CanRelease; u32 RandState; uptr AllocatedUser; - // Lowest & highest region index allocated for this size class, to avoid - // looping through the whole NumRegions. - uptr MinRegionIndex; - uptr MaxRegionIndex; ReleaseToOsInfo ReleaseInfo; }; static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); uptr computeRegionId(uptr Mem) { - const uptr Id = Mem >> Config::PrimaryRegionSizeLog; + const uptr Id = Mem >> RegionSizeLog; CHECK_LT(Id, NumRegions); return Id; } @@ -280,7 +248,7 @@ private: uptr MapSize = 2 * RegionSize; const uptr MapBase = reinterpret_cast<uptr>( map(nullptr, MapSize, "scudo:primary", MAP_ALLOWNOMEM)); - if (!MapBase) + if (UNLIKELY(!MapBase)) return 0; const uptr MapEnd = MapBase + MapSize; uptr Region = MapBase; @@ -301,7 +269,7 @@ private: return Region; } - uptr allocateRegion(SizeClassInfo *Sci, uptr ClassId) { + uptr allocateRegion(uptr ClassId) { DCHECK_LT(ClassId, NumClasses); uptr Region = 0; { @@ -312,12 +280,11 @@ private: if (!Region) Region = allocateRegionSlow(); if (LIKELY(Region)) { - // Sci->Mutex is held by the caller, updating the Min/Max is safe. 
const uptr RegionIndex = computeRegionId(Region); - if (RegionIndex < Sci->MinRegionIndex) - Sci->MinRegionIndex = RegionIndex; - if (RegionIndex > Sci->MaxRegionIndex) - Sci->MaxRegionIndex = RegionIndex; + if (RegionIndex < MinRegionIndex) + MinRegionIndex = RegionIndex; + if (RegionIndex > MaxRegionIndex) + MaxRegionIndex = RegionIndex; PossibleRegions.set(RegionIndex, static_cast<u8>(ClassId + 1U)); } return Region; @@ -328,6 +295,29 @@ private: return &SizeClassInfoArray[ClassId]; } + bool populateBatches(CacheT *C, SizeClassInfo *Sci, uptr ClassId, + TransferBatch **CurrentBatch, u32 MaxCount, + void **PointersArray, u32 Count) { + if (ClassId != SizeClassMap::BatchClassId) + shuffle(PointersArray, Count, &Sci->RandState); + TransferBatch *B = *CurrentBatch; + for (uptr I = 0; I < Count; I++) { + if (B && B->getCount() == MaxCount) { + Sci->FreeList.push_back(B); + B = nullptr; + } + if (!B) { + B = C->createBatch(ClassId, PointersArray[I]); + if (UNLIKELY(!B)) + return false; + B->clear(); + } + B->add(PointersArray[I]); + } + *CurrentBatch = B; + return true; + } + NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) { uptr Region; @@ -342,7 +332,7 @@ private: Offset = Sci->CurrentRegionAllocated; } else { DCHECK_EQ(Sci->CurrentRegionAllocated, 0U); - Region = allocateRegion(Sci, ClassId); + Region = allocateRegion(ClassId); if (UNLIKELY(!Region)) return nullptr; C->getStats().add(StatMapped, RegionSize); @@ -363,36 +353,38 @@ private: static_cast<u32>((RegionSize - Offset) / Size)); DCHECK_GT(NumberOfBlocks, 0U); + TransferBatch *B = nullptr; constexpr u32 ShuffleArraySize = MaxNumBatches * TransferBatch::MaxNumCached; // Fill the transfer batches and put them in the size-class freelist. We // need to randomize the blocks for security purposes, so we first fill a // local array that we then shuffle before populating the batches. - CompactPtrT ShuffleArray[ShuffleArraySize]; - DCHECK_LE(NumberOfBlocks, ShuffleArraySize); - - uptr P = Region + Offset; - for (u32 I = 0; I < NumberOfBlocks; I++, P += Size) - ShuffleArray[I] = reinterpret_cast<CompactPtrT>(P); - // No need to shuffle the batches size class. 
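The shuffle() call below randomizes block order inside a region so that consecutive mallocs do not return predictable consecutive addresses; it is roughly a seeded Fisher-Yates pass, sketched here with a placeholder PRNG in place of scudo's getRandomU32():

    #include <cstdint>

    // Placeholder PRNG, illustration only; scudo seeds from Sci->RandState.
    inline uint32_t nextRandom(uint32_t *State) {
      *State = *State * 1664525u + 1013904223u;
      return *State;
    }

    template <typename T> void shuffleArray(T *A, uint32_t N, uint32_t *State) {
      if (N <= 1)
        return;
      for (uint32_t I = N - 1; I > 0; I--) { // classic Fisher-Yates walk
        uint32_t J = nextRandom(State) % (I + 1);
        T Tmp = A[I];
        A[I] = A[J];
        A[J] = Tmp;
      }
    }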
- if (ClassId != SizeClassMap::BatchClassId) - shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState); - for (u32 I = 0; I < NumberOfBlocks;) { - TransferBatch *B = - C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I])); - if (UNLIKELY(!B)) + void *ShuffleArray[ShuffleArraySize]; + u32 Count = 0; + const uptr AllocatedUser = Size * NumberOfBlocks; + for (uptr I = Region + Offset; I < Region + Offset + AllocatedUser; + I += Size) { + ShuffleArray[Count++] = reinterpret_cast<void *>(I); + if (Count == ShuffleArraySize) { + if (UNLIKELY(!populateBatches(C, Sci, ClassId, &B, MaxCount, + ShuffleArray, Count))) + return nullptr; + Count = 0; + } + } + if (Count) { + if (UNLIKELY(!populateBatches(C, Sci, ClassId, &B, MaxCount, ShuffleArray, + Count))) return nullptr; - const u32 N = Min(MaxCount, NumberOfBlocks - I); - B->setFromArray(&ShuffleArray[I], N); - Sci->FreeList.push_back(B); - I += N; } - TransferBatch *B = Sci->FreeList.front(); - Sci->FreeList.pop_front(); DCHECK(B); + if (!Sci->FreeList.empty()) { + Sci->FreeList.push_back(B); + B = Sci->FreeList.front(); + Sci->FreeList.pop_front(); + } DCHECK_GT(B->getCount(), 0); - const uptr AllocatedUser = Size * NumberOfBlocks; C->getStats().add(StatFree, AllocatedUser); DCHECK_LE(Sci->CurrentRegionAllocated + AllocatedUser, RegionSize); // If there is not enough room in the region currently associated to fit @@ -422,12 +414,16 @@ private: AvailableChunks, Rss >> 10, Sci->ReleaseInfo.RangesReleased); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + NOINLINE uptr releaseToOSMaybe(SizeClassInfo *Sci, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); const uptr PageSize = getPageSizeCached(); - DCHECK_GE(Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks); + CHECK_GE(Sci->Stats.PoppedBlocks, Sci->Stats.PushedBlocks); const uptr BytesInFreeList = Sci->AllocatedUser - (Sci->Stats.PoppedBlocks - Sci->Stats.PushedBlocks) * BlockSize; @@ -445,14 +441,14 @@ private: if (BlockSize < PageSize / 16U) { if (!Force && BytesPushed < Sci->AllocatedUser / 16U) return 0; - // We want 8x% to 9x% free bytes (the larger the block, the lower the %). + // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
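
Concretely, since this branch is only reached when BlockSize < PageSize / 16 (under 256 bytes with 4 KiB pages), the integer-arithmetic threshold in the check below works out to:

    // Required free ratio = (100 - 1 - BlockSize / 16) percent of AllocatedUser:
    //   BlockSize =  16  ->  100 - 1 -  1 = 98% free
    //   BlockSize =  64  ->  100 - 1 -  4 = 95% free
    //   BlockSize = 128  ->  100 - 1 -  8 = 91% free
    //   BlockSize = 240  ->  100 - 1 - 15 = 84% free
    // Hence "8x% to 9x%": the larger the block, the lower the required share.
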
if ((BytesInFreeList * 100U) / Sci->AllocatedUser < (100U - 1U - BlockSize / 16U)) return 0; } if (!Force) { - const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs); + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Sci->ReleaseInfo.LastReleaseAtNs + @@ -462,44 +458,54 @@ private: } } - const uptr First = Sci->MinRegionIndex; - const uptr Last = Sci->MaxRegionIndex; - DCHECK_NE(Last, 0U); - DCHECK_LE(First, Last); + DCHECK_GT(MinRegionIndex, 0U); + uptr First = 0; + for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) { + if (PossibleRegions[I] - 1U == ClassId) { + First = I; + break; + } + } + uptr Last = 0; + for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) { + if (PossibleRegions[I] - 1U == ClassId) { + Last = I; + break; + } + } uptr TotalReleasedBytes = 0; - const uptr Base = First * RegionSize; - const uptr NumberOfRegions = Last - First + 1U; - ReleaseRecorder Recorder(Base); - auto SkipRegion = [this, First, ClassId](uptr RegionIndex) { - return (PossibleRegions[First + RegionIndex] - 1U) != ClassId; - }; - auto DecompactPtr = [](CompactPtrT CompactPtr) { - return reinterpret_cast<uptr>(CompactPtr); - }; - releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize, - &Recorder, DecompactPtr, SkipRegion); - if (Recorder.getReleasedRangesCount() > 0) { - Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; - Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); - Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); - TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; + if (First != 0U && Last != 0U) { + const uptr Base = First * RegionSize; + const uptr NumberOfRegions = Last - First + 1U; + ReleaseRecorder Recorder(Base); + releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions, + BlockSize, &Recorder); + if (Recorder.getReleasedRangesCount() > 0) { + Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks; + Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount(); + Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes(); + TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes; + } } Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime(); - return TotalReleasedBytes; } - SizeClassInfo SizeClassInfoArray[NumClasses] = {}; + SizeClassInfo SizeClassInfoArray[NumClasses]; // Track the regions in use, 0 is unused, otherwise store ClassId + 1. - ByteMap PossibleRegions = {}; - atomic_s32 ReleaseToOsIntervalMs = {}; + ByteMap PossibleRegions; + // Keep track of the lowest & highest regions allocated to avoid looping + // through the whole NumRegions. + uptr MinRegionIndex; + uptr MaxRegionIndex; + atomic_s32 ReleaseToOsIntervalMs; // Unless several threads request regions simultaneously from different size // classes, the stash rarely contains more than 1 entry. 
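
For context on the stash fields that follow: carving an aligned region out of an oversized mapping can leave a spare, already-aligned region behind, and parking it in a small stash instead of unmapping it lets a later allocateRegion() call skip a syscall. A hedged sketch of the push/pop pattern; in the real code both sides run under RegionsStashMutex, which this sketch leaves out:

    #include <cstdint>

    using uptr = uintptr_t;

    constexpr uptr MaxStashedRegions = 4;
    uptr RegionsStash[MaxStashedRegions];
    uptr NumberOfStashedRegions = 0;

    // Producer: keep a spare aligned region for later instead of unmapping it.
    bool stashRegion(uptr Region) {
      if (NumberOfStashedRegions == MaxStashedRegions)
        return false;                   // stash full, caller unmaps instead
      RegionsStash[NumberOfStashedRegions++] = Region;
      return true;
    }

    // Consumer: allocateRegion() checks the stash before mapping fresh memory.
    uptr popStashedRegion() {
      return NumberOfStashedRegions ? RegionsStash[--NumberOfStashedRegions] : 0;
    }
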
static constexpr uptr MaxStashedRegions = 4; HybridMutex RegionsStashMutex; - uptr NumberOfStashedRegions = 0; - uptr RegionsStash[MaxStashedRegions] = {}; + uptr NumberOfStashedRegions; + uptr RegionsStash[MaxStashedRegions]; }; } // namespace scudo diff --git a/standalone/primary64.h b/standalone/primary64.h index 94375fceee1..7bdb7ae6e49 100644 --- a/standalone/primary64.h +++ b/standalone/primary64.h @@ -14,7 +14,6 @@ #include "list.h" #include "local_cache.h" #include "memtag.h" -#include "options.h" #include "release.h" #include "stats.h" #include "string_utils.h" @@ -40,18 +39,25 @@ namespace scudo { // The memory used by this allocator is never unmapped, but can be partially // released if the platform allows for it. -template <typename Config> class SizeClassAllocator64 { +template <class SizeClassMapT, uptr RegionSizeLog, + s32 MinReleaseToOsIntervalMs = INT32_MIN, + s32 MaxReleaseToOsIntervalMs = INT32_MAX, + bool MaySupportMemoryTagging = false> +class SizeClassAllocator64 { public: - typedef typename Config::PrimaryCompactPtrT CompactPtrT; - static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale; - typedef typename Config::SizeClassMap SizeClassMap; - typedef SizeClassAllocator64<Config> ThisT; + typedef SizeClassMapT SizeClassMap; + typedef SizeClassAllocator64< + SizeClassMap, RegionSizeLog, MinReleaseToOsIntervalMs, + MaxReleaseToOsIntervalMs, MaySupportMemoryTagging> + ThisT; typedef SizeClassAllocatorLocalCache<ThisT> CacheT; typedef typename CacheT::TransferBatch TransferBatch; + static const bool SupportsMemoryTagging = + MaySupportMemoryTagging && archSupportsMemoryTagging(); static uptr getSizeByClassId(uptr ClassId) { return (ClassId == SizeClassMap::BatchClassId) - ? roundUpTo(sizeof(TransferBatch), 1U << CompactPtrScale) + ? sizeof(TransferBatch) : SizeClassMap::getSizeByClassId(ClassId); } @@ -60,11 +66,11 @@ public: void initLinkerInitialized(s32 ReleaseToOsInterval) { // Reserve the space required for the Primary. PrimaryBase = reinterpret_cast<uptr>( - map(nullptr, PrimarySize, nullptr, MAP_NOACCESS, &Data)); + map(nullptr, PrimarySize, "scudo:primary", MAP_NOACCESS, &Data)); u32 Seed; const u64 Time = getMonotonicTime(); - if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))) + if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))) Seed = static_cast<u32>(Time ^ (PrimaryBase >> 12)); const uptr PageSize = getPageSizeCached(); for (uptr I = 0; I < NumClasses; I++) { @@ -73,9 +79,22 @@ public: Region->RegionBeg = getRegionBaseByClassId(I) + (getRandomModN(&Seed, 16) + 1) * PageSize; Region->RandState = getRandomU32(&Seed); - Region->ReleaseInfo.LastReleaseAtNs = Time; + // Releasing smaller size classes doesn't necessarily yield a + // meaningful RSS impact: there are more blocks per page, they are + // randomized around, and thus pages are less likely to be entirely empty. + // On top of this, attempting to release those requires more iterations and + // memory accesses, which ends up being fairly costly. The current lower + // limit is mostly arbitrary and based on empirical observations.
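
To put numbers on that lower limit: assuming the common 4 KiB page, PageSize / 32 is 128 bytes, so the CanRelease flag set just below excludes the batch class and every class smaller than 128 bytes. A quick check:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t PageSize = 4096;   // assumed; the real value is queried once
      const uint64_t ClassSizes[] = {16, 64, 128, 1024, 16384};
      for (uint64_t Size : ClassSizes)
        std::printf("class size %5llu -> CanRelease=%d\n",
                    (unsigned long long)Size, int(Size >= PageSize / 32));
      // Prints 0 for 16 and 64, and 1 for 128, 1024 and 16384.
      return 0;
    }
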
+ // TODO(kostyak): make the lower limit a runtime option + Region->CanRelease = (I != SizeClassMap::BatchClassId) && + (getSizeByClassId(I) >= (PageSize / 32)); + if (Region->CanRelease) + Region->ReleaseInfo.LastReleaseAtNs = Time; } - setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); + setReleaseToOsIntervalMs(ReleaseToOsInterval); + + if (SupportsMemoryTagging) + UseMemoryTagging = systemSupportsMemoryTagging(); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); @@ -109,7 +128,7 @@ public: ScopedLock L(Region->Mutex); Region->FreeList.push_front(B); Region->Stats.PushedBlocks += B->getCount(); - if (ClassId != SizeClassMap::BatchClassId) + if (Region->CanRelease) releaseToOSMaybe(Region, ClassId); } @@ -166,23 +185,18 @@ public: getStats(Str, I, 0); } - bool setOption(Option O, sptr Value) { - if (O == Option::ReleaseInterval) { - const s32 Interval = Max( - Min(static_cast<s32>(Value), Config::PrimaryMaxReleaseToOsIntervalMs), - Config::PrimaryMinReleaseToOsIntervalMs); - atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval); - return true; + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; } - // Not supported by the Primary, but not an error either. - return true; + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); } uptr releaseToOS() { uptr TotalReleasedBytes = 0; for (uptr I = 0; I < NumClasses; I++) { - if (I == SizeClassMap::BatchClassId) - continue; RegionInfo *Region = getRegionInfo(I); ScopedLock L(Region->Mutex); TotalReleasedBytes += releaseToOSMaybe(Region, I, /*Force=*/true); @@ -190,78 +204,13 @@ public: return TotalReleasedBytes; } - const char *getRegionInfoArrayAddress() const { - return reinterpret_cast<const char *>(RegionInfoArray); + bool useMemoryTagging() const { + return SupportsMemoryTagging && UseMemoryTagging; } - - static uptr getRegionInfoArraySize() { return sizeof(RegionInfoArray); } - - uptr getCompactPtrBaseByClassId(uptr ClassId) { - // If we are not compacting pointers, base everything off of 0. 
- if (sizeof(CompactPtrT) == sizeof(uptr) && CompactPtrScale == 0) - return 0; - return getRegionInfo(ClassId)->RegionBeg; - } - - CompactPtrT compactPtr(uptr ClassId, uptr Ptr) { - DCHECK_LE(ClassId, SizeClassMap::LargestClassId); - return compactPtrInternal(getCompactPtrBaseByClassId(ClassId), Ptr); - } - - void *decompactPtr(uptr ClassId, CompactPtrT CompactPtr) { - DCHECK_LE(ClassId, SizeClassMap::LargestClassId); - return reinterpret_cast<void *>( - decompactPtrInternal(getCompactPtrBaseByClassId(ClassId), CompactPtr)); - } - - static BlockInfo findNearestBlock(const char *RegionInfoData, uptr Ptr) { - const RegionInfo *RegionInfoArray = - reinterpret_cast<const RegionInfo *>(RegionInfoData); - uptr ClassId; - uptr MinDistance = -1UL; - for (uptr I = 0; I != NumClasses; ++I) { - if (I == SizeClassMap::BatchClassId) - continue; - uptr Begin = RegionInfoArray[I].RegionBeg; - uptr End = Begin + RegionInfoArray[I].AllocatedUser; - if (Begin > End || End - Begin < SizeClassMap::getSizeByClassId(I)) - continue; - uptr RegionDistance; - if (Begin <= Ptr) { - if (Ptr < End) - RegionDistance = 0; - else - RegionDistance = Ptr - End; - } else { - RegionDistance = Begin - Ptr; - } - - if (RegionDistance < MinDistance) { - MinDistance = RegionDistance; - ClassId = I; - } - } - - BlockInfo B = {}; - if (MinDistance <= 8192) { - B.RegionBegin = RegionInfoArray[ClassId].RegionBeg; - B.RegionEnd = B.RegionBegin + RegionInfoArray[ClassId].AllocatedUser; - B.BlockSize = SizeClassMap::getSizeByClassId(ClassId); - B.BlockBegin = - B.RegionBegin + uptr(sptr(Ptr - B.RegionBegin) / sptr(B.BlockSize) * - sptr(B.BlockSize)); - while (B.BlockBegin < B.RegionBegin) - B.BlockBegin += B.BlockSize; - while (B.RegionEnd < B.BlockBegin + B.BlockSize) - B.BlockBegin -= B.BlockSize; - } - return B; - } - - AtomicOptions Options; + void disableMemoryTagging() { UseMemoryTagging = false; } private: - static const uptr RegionSize = 1UL << Config::PrimaryRegionSizeLog; + static const uptr RegionSize = 1UL << RegionSizeLog; static const uptr NumClasses = SizeClassMap::NumClasses; static const uptr PrimarySize = RegionSize * NumClasses; @@ -282,28 +231,26 @@ private: u64 LastReleaseAtNs; }; - struct UnpaddedRegionInfo { + struct ALIGNED(SCUDO_CACHE_LINE_SIZE) RegionInfo { HybridMutex Mutex; SinglyLinkedList<TransferBatch> FreeList; - uptr RegionBeg = 0; - RegionStats Stats = {}; - u32 RandState = 0; - uptr MappedUser = 0; // Bytes mapped for user memory. - uptr AllocatedUser = 0; // Bytes allocated for user memory. - MapPlatformData Data = {}; - ReleaseToOsInfo ReleaseInfo = {}; - bool Exhausted = false; - }; - struct RegionInfo : UnpaddedRegionInfo { - char Padding[SCUDO_CACHE_LINE_SIZE - - (sizeof(UnpaddedRegionInfo) % SCUDO_CACHE_LINE_SIZE)] = {}; + RegionStats Stats; + bool CanRelease; + bool Exhausted; + u32 RandState; + uptr RegionBeg; + uptr MappedUser; // Bytes mapped for user memory. + uptr AllocatedUser; // Bytes allocated for user memory. 
+ MapPlatformData Data; + ReleaseToOsInfo ReleaseInfo; }; static_assert(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); - uptr PrimaryBase = 0; - MapPlatformData Data = {}; - atomic_s32 ReleaseToOsIntervalMs = {}; - alignas(SCUDO_CACHE_LINE_SIZE) RegionInfo RegionInfoArray[NumClasses]; + uptr PrimaryBase; + MapPlatformData Data; + atomic_s32 ReleaseToOsIntervalMs; + bool UseMemoryTagging; + RegionInfo RegionInfoArray[NumClasses]; RegionInfo *getRegionInfo(uptr ClassId) { DCHECK_LT(ClassId, NumClasses); @@ -311,15 +258,31 @@ private: } uptr getRegionBaseByClassId(uptr ClassId) const { - return PrimaryBase + (ClassId << Config::PrimaryRegionSizeLog); - } - - static CompactPtrT compactPtrInternal(uptr Base, uptr Ptr) { - return static_cast<CompactPtrT>((Ptr - Base) >> CompactPtrScale); + return PrimaryBase + (ClassId << RegionSizeLog); } - static uptr decompactPtrInternal(uptr Base, CompactPtrT CompactPtr) { - return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale); + bool populateBatches(CacheT *C, RegionInfo *Region, uptr ClassId, + TransferBatch **CurrentBatch, u32 MaxCount, + void **PointersArray, u32 Count) { + // No need to shuffle the batches size class. + if (ClassId != SizeClassMap::BatchClassId) + shuffle(PointersArray, Count, &Region->RandState); + TransferBatch *B = *CurrentBatch; + for (uptr I = 0; I < Count; I++) { + if (B && B->getCount() == MaxCount) { + Region->FreeList.push_back(B); + B = nullptr; + } + if (!B) { + B = C->createBatch(ClassId, PointersArray[I]); + if (UNLIKELY(!B)) + return false; + B->clear(); + } + B->add(PointersArray[I]); + } + *CurrentBatch = B; + return true; } NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId, @@ -333,32 +296,31 @@ private: // Map more space for blocks, if necessary. if (TotalUserBytes > MappedUser) { // Do the mmap for the user memory. - const uptr MapSize = + const uptr UserMapSize = roundUpTo(TotalUserBytes - MappedUser, MapSizeIncrement); const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId); - if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) { + if (UNLIKELY(RegionBase + MappedUser + UserMapSize > RegionSize)) { if (!Region->Exhausted) { Region->Exhausted = true; ScopedString Str(1024); getStats(&Str); Str.append( - "Scudo OOM: The process has exhausted %zuM for size class %zu.\n", + "Scudo OOM: The process has exhausted %zuM for size class %zu.\n", RegionSize >> 20, Size); Str.output(); } return nullptr; } - if (MappedUser == 0) + if (UNLIKELY(MappedUser == 0)) Region->Data = Data; - if (UNLIKELY(!map( - reinterpret_cast<void *>(RegionBeg + MappedUser), MapSize, - "scudo:primary", - MAP_ALLOWNOMEM | MAP_RESIZABLE | - (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0), - &Region->Data))) + if (UNLIKELY(!map(reinterpret_cast<void *>(RegionBeg + MappedUser), + UserMapSize, "scudo:primary", + MAP_ALLOWNOMEM | MAP_RESIZABLE | + (useMemoryTagging() ?
MAP_MEMTAG : 0), + &Region->Data))) return nullptr; - Region->MappedUser += MapSize; - C->getStats().add(StatMapped, MapSize); + Region->MappedUser += UserMapSize; + C->getStats().add(StatMapped, UserMapSize); } const u32 NumberOfBlocks = Min( @@ -366,37 +328,38 @@ private: static_cast<u32>((Region->MappedUser - Region->AllocatedUser) / Size)); DCHECK_GT(NumberOfBlocks, 0); + TransferBatch *B = nullptr; constexpr u32 ShuffleArraySize = MaxNumBatches * TransferBatch::MaxNumCached; - CompactPtrT ShuffleArray[ShuffleArraySize]; - DCHECK_LE(NumberOfBlocks, ShuffleArraySize); - - const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId); - uptr P = RegionBeg + Region->AllocatedUser; - for (u32 I = 0; I < NumberOfBlocks; I++, P += Size) - ShuffleArray[I] = compactPtrInternal(CompactPtrBase, P); - // No need to shuffle the batches size class. - if (ClassId != SizeClassMap::BatchClassId) - shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState); - for (u32 I = 0; I < NumberOfBlocks;) { - TransferBatch *B = - C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal( - CompactPtrBase, ShuffleArray[I]))); - if (UNLIKELY(!B)) + void *ShuffleArray[ShuffleArraySize]; + u32 Count = 0; + const uptr P = RegionBeg + Region->AllocatedUser; + const uptr AllocatedUser = Size * NumberOfBlocks; + for (uptr I = P; I < P + AllocatedUser; I += Size) { + ShuffleArray[Count++] = reinterpret_cast<void *>(I); + if (Count == ShuffleArraySize) { + if (UNLIKELY(!populateBatches(C, Region, ClassId, &B, MaxCount, + ShuffleArray, Count))) + return nullptr; + Count = 0; + } + } + if (Count) { + if (UNLIKELY(!populateBatches(C, Region, ClassId, &B, MaxCount, + ShuffleArray, Count))) return nullptr; - const u32 N = Min(MaxCount, NumberOfBlocks - I); - B->setFromArray(&ShuffleArray[I], N); - Region->FreeList.push_back(B); - I += N; } - TransferBatch *B = Region->FreeList.front(); - Region->FreeList.pop_front(); DCHECK(B); + if (!Region->FreeList.empty()) { + Region->FreeList.push_back(B); + B = Region->FreeList.front(); + Region->FreeList.pop_front(); + } DCHECK_GT(B->getCount(), 0); - const uptr AllocatedUser = Size * NumberOfBlocks; C->getStats().add(StatFree, AllocatedUser); Region->AllocatedUser += AllocatedUser; + Region->Exhausted = false; return B; } @@ -418,12 +381,16 @@ private: getRegionBaseByClassId(ClassId)); } + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); + } + NOINLINE uptr releaseToOSMaybe(RegionInfo *Region, uptr ClassId, bool Force = false) { const uptr BlockSize = getSizeByClassId(ClassId); const uptr PageSize = getPageSizeCached(); - DCHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks); + CHECK_GE(Region->Stats.PoppedBlocks, Region->Stats.PushedBlocks); const uptr BytesInFreeList = Region->AllocatedUser - (Region->Stats.PoppedBlocks - Region->Stats.PushedBlocks) * BlockSize; @@ -441,14 +408,14 @@ private: if (BlockSize < PageSize / 16U) { if (!Force && BytesPushed < Region->AllocatedUser / 16U) return 0; - // We want 8x% to 9x% free bytes (the larger the block, the lower the %). + // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
if ((BytesInFreeList * 100U) / Region->AllocatedUser < (100U - 1U - BlockSize / 16U)) return 0; } if (!Force) { - const s32 IntervalMs = atomic_load_relaxed(&ReleaseToOsIntervalMs); + const s32 IntervalMs = getReleaseToOsIntervalMs(); if (IntervalMs < 0) return 0; if (Region->ReleaseInfo.LastReleaseAtNs + @@ -459,13 +426,8 @@ private: } ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data); - const uptr CompactPtrBase = getCompactPtrBaseByClassId(ClassId); - auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) { - return decompactPtrInternal(CompactPtrBase, CompactPtr); - }; - auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; }; - releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U, - BlockSize, &Recorder, DecompactPtr, SkipRegion); + releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg, + Region->AllocatedUser, 1U, BlockSize, &Recorder); if (Recorder.getReleasedRangesCount() > 0) { Region->ReleaseInfo.PushedBlocksAtLastRelease = diff --git a/standalone/quarantine.h b/standalone/quarantine.h index 8d4b38e21fc..406a0e23804 100644 --- a/standalone/quarantine.h +++ b/standalone/quarantine.h @@ -161,7 +161,7 @@ public: private: SinglyLinkedList<QuarantineBatch> List; - atomic_uptr Size = {}; + atomic_uptr Size; void addToSize(uptr add) { atomic_store_relaxed(&Size, getSize() + add); } void subFromSize(uptr sub) { atomic_store_relaxed(&Size, getSize() - sub); } @@ -187,12 +187,7 @@ public: Cache.initLinkerInitialized(); } void init(uptr Size, uptr CacheSize) { - CacheMutex.init(); - Cache.init(); - RecycleMutex.init(); - MinSize = {}; - MaxSize = {}; - MaxCacheSize = {}; + memset(this, 0, sizeof(*this)); initLinkerInitialized(Size, CacheSize); } @@ -246,9 +241,9 @@ private: alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex CacheMutex; CacheT Cache; alignas(SCUDO_CACHE_LINE_SIZE) HybridMutex RecycleMutex; - atomic_uptr MinSize = {}; - atomic_uptr MaxSize = {}; - alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize = {}; + atomic_uptr MinSize; + atomic_uptr MaxSize; + alignas(SCUDO_CACHE_LINE_SIZE) atomic_uptr MaxCacheSize; void NOINLINE recycle(uptr MinSize, Callback Cb) { CacheT Tmp; diff --git a/standalone/release.h b/standalone/release.h index 293a8bc27ba..b50f36fa0c0 100644 --- a/standalone/release.h +++ b/standalone/release.h @@ -17,19 +17,17 @@ namespace scudo { class ReleaseRecorder { public: - ReleaseRecorder(uptr Base, MapPlatformData *Data = nullptr) - : Base(Base), Data(Data) {} + ReleaseRecorder(uptr BaseAddress, MapPlatformData *Data = nullptr) + : BaseAddress(BaseAddress), Data(Data) {} uptr getReleasedRangesCount() const { return ReleasedRangesCount; } uptr getReleasedBytes() const { return ReleasedBytes; } - uptr getBase() const { return Base; } - // Releases [From, To) range of pages back to OS. 
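
For orientation, ReleaseRecorder is the callback object that releaseFreeMemoryToOS() drives: each releasePageRangeToOS(From, To) call releases one page range at an offset from the recorded base address and tallies the counters the callers above read back. A minimal sketch with the platform call stubbed out:

    #include <cstdint>

    using uptr = uintptr_t;

    // Stub: the real releasePagesToOS() ends in madvise() or the platform
    // equivalent, returning the pages of [Base + From, Base + From + Size).
    void releasePagesToOS(uptr /*Base*/, uptr /*From*/, uptr /*Size*/) {}

    struct Recorder {
      uptr Base;
      uptr RangesReleased = 0;
      uptr BytesReleased = 0;
      void releasePageRangeToOS(uptr From, uptr To) {
        releasePagesToOS(Base, From, To - From);
        RangesReleased++;
        BytesReleased += To - From;
      }
    };

    int main() {
      Recorder R{0x10000000};
      R.releasePageRangeToOS(0, 4096);     // offsets are relative to Base
      R.releasePageRangeToOS(8192, 16384);
      // Now R.RangesReleased == 2 and R.BytesReleased == 12288.
      return 0;
    }
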
void releasePageRangeToOS(uptr From, uptr To) { const uptr Size = To - From; - releasePagesToOS(Base, From, Size, Data); + releasePagesToOS(BaseAddress, From, Size, Data); ReleasedRangesCount++; ReleasedBytes += Size; } @@ -37,7 +35,7 @@ public: private: uptr ReleasedRangesCount = 0; uptr ReleasedBytes = 0; - uptr Base = 0; + uptr BaseAddress = 0; MapPlatformData *Data = nullptr; }; @@ -54,20 +52,20 @@ public: PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion, uptr MaxValue) : Regions(NumberOfRegions), NumCounters(CountersPerRegion) { - DCHECK_GT(Regions, 0); - DCHECK_GT(NumCounters, 0); - DCHECK_GT(MaxValue, 0); + CHECK_GT(Regions, 0); + CHECK_GT(NumCounters, 0); + CHECK_GT(MaxValue, 0); constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL; // Rounding counter storage size up to the power of two allows for using // bit shifts calculating particular counter's Index and offset. const uptr CounterSizeBits = roundUpToPowerOfTwo(getMostSignificantSetBitIndex(MaxValue) + 1); - DCHECK_LE(CounterSizeBits, MaxCounterBits); + CHECK_LE(CounterSizeBits, MaxCounterBits); CounterSizeBitsLog = getLog2(CounterSizeBits); CounterMask = ~(static_cast<uptr>(0)) >> (MaxCounterBits - CounterSizeBits); const uptr PackingRatio = MaxCounterBits >> CounterSizeBitsLog; - DCHECK_GT(PackingRatio, 0); + CHECK_GT(PackingRatio, 0); PackingRatioLog = getLog2(PackingRatio); BitOffsetMask = PackingRatio - 1; @@ -81,8 +79,7 @@ public: memset(Buffer, 0, BufferSize); } else { Buffer = reinterpret_cast<uptr *>( - map(nullptr, roundUpTo(BufferSize, getPageSizeCached()), - "scudo:counters", MAP_ALLOWNOMEM)); + map(nullptr, BufferSize, "scudo:counters", MAP_ALLOWNOMEM)); } } ~PackedCounterArray() { @@ -91,12 +88,12 @@ public: if (Buffer == &StaticBuffer[0]) Mutex.unlock(); else - unmap(reinterpret_cast<void *>(Buffer), - roundUpTo(BufferSize, getPageSizeCached())); + unmap(reinterpret_cast<void *>(Buffer), BufferSize); } bool isAllocated() const { return !!Buffer; } + uptr getCount() const { return NumCounters; } uptr get(uptr Region, uptr I) const { @@ -160,11 +157,6 @@ public: CurrentPage++; } - void skipPages(uptr N) { - closeOpenedRange(); - CurrentPage += N; - } - void finish() { closeOpenedRange(); } private: @@ -183,13 +175,11 @@ private: uptr CurrentRangeStatePage = 0; }; -template <class TransferBatchT, class ReleaseRecorderT, typename DecompactPtrT, - typename SkipRegionT> +template <class TransferBatchT, class ReleaseRecorderT> NOINLINE void -releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, +releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base, uptr RegionSize, uptr NumberOfRegions, uptr BlockSize, - ReleaseRecorderT *Recorder, DecompactPtrT DecompactPtr, - SkipRegionT SkipRegion) { + ReleaseRecorderT *Recorder) { const uptr PageSize = getPageSizeCached(); // Figure out the number of chunks per page and whether we can take a fast @@ -233,45 +223,44 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, return; const uptr PageSizeLog = getLog2(PageSize); - const uptr RoundedRegionSize = PagesCount << PageSizeLog; - const uptr RoundedSize = NumberOfRegions * RoundedRegionSize; + const uptr RoundedSize = NumberOfRegions * (PagesCount << PageSizeLog); // Iterate over free chunks and count how many free chunks affect each // allocated page. if (BlockSize <= PageSize && PageSize % BlockSize == 0) { // Each chunk affects one page only. 
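
A worked example for this fast path: with 4 KiB pages and 256-byte blocks, each page holds exactly 16 blocks, so a page becomes releasable precisely when its free-block counter reaches 16. A toy, single-region version of the counting:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using uptr = uintptr_t;

    int main() {
      const uptr PageSize = 4096, BlockSize = 256; // BlockSize divides PageSize
      const uptr FullPagesBlockCountMax = PageSize / BlockSize; // 16
      const uptr RegionSize = 1 << 20;
      std::vector<uptr> Counters(RegionSize / PageSize, 0);
      // Offsets, relative to the region base, of blocks found in the free list:
      const uptr FreeOffsets[] = {0, 256, 512, 4096, 4352};
      for (uptr P : FreeOffsets)
        Counters[P / PageSize]++;                  // one page per free block
      for (uptr J = 0; J < 2; J++)
        std::printf("page %u: %u/%u free blocks\n", unsigned(J),
                    unsigned(Counters[J]), unsigned(FullPagesBlockCountMax));
      // Prints 3/16 and 2/16: neither page can be released yet.
      return 0;
    }
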
for (const auto &It : FreeList) { - for (u32 I = 0; I < It.getCount(); I++) { - const uptr P = DecompactPtr(It.get(I)) - Recorder->getBase(); - if (P >= RoundedSize) - continue; - const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; - const uptr PInRegion = P - RegionIndex * RegionSize; - Counters.inc(RegionIndex, PInRegion >> PageSizeLog); + // If dealing with a TransferBatch, the first pointer of the batch will + // point to the batch itself; we do not want to mark this for release as + // the batch is in use, so skip the first entry. + const bool IsTransferBatch = + (It.getCount() != 0) && + (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It)); + for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { + const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base; + // This takes care of P < Base and P >= Base + RoundedSize. + if (P < RoundedSize) { + const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; + const uptr PInRegion = P - RegionIndex * RegionSize; + Counters.inc(RegionIndex, PInRegion >> PageSizeLog); + } } } } else { // In all other cases chunks might affect more than one page. - DCHECK_GE(RegionSize, BlockSize); - const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize; for (const auto &It : FreeList) { - for (u32 I = 0; I < It.getCount(); I++) { - const uptr P = DecompactPtr(It.get(I)) - Recorder->getBase(); - if (P >= RoundedSize) - continue; - const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; - uptr PInRegion = P - RegionIndex * RegionSize; - Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, - (PInRegion + BlockSize - 1) >> PageSizeLog); - // The last block in a region might straddle a page, so if it's - // free, we mark the following "pretend" memory block(s) as free. - if (PInRegion == LastBlockInRegion) { - PInRegion += BlockSize; - while (PInRegion < RoundedRegionSize) { - Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, - (PInRegion + BlockSize - 1) >> PageSizeLog); - PInRegion += BlockSize; - } + // See TransferBatch comment above. + const bool IsTransferBatch = + (It.getCount() != 0) && + (reinterpret_cast<uptr>(It.get(0)) == reinterpret_cast<uptr>(&It)); + for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) { + const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base; + // This takes care of P < Base and P >= Base + RoundedSize. + if (P < RoundedSize) { + const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize; + const uptr PInRegion = P - RegionIndex * RegionSize; + Counters.incRange(RegionIndex, PInRegion >> PageSizeLog, + (PInRegion + BlockSize - 1) >> PageSizeLog); + } } } @@ -282,15 +271,10 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, FreePagesRangeTracker<ReleaseRecorderT> RangeTracker(Recorder); if (SameBlockCountPerPage) { // Fast path, every page has the same number of chunks affecting it. - for (uptr I = 0; I < NumberOfRegions; I++) { - if (SkipRegion(I)) { - RangeTracker.skipPages(PagesCount); - continue; - } + for (uptr I = 0; I < NumberOfRegions; I++) for (uptr J = 0; J < PagesCount; J++) RangeTracker.processNextPage(Counters.get(I, J) == FullPagesBlockCountMax); - } } else { // Slow path, go through the pages keeping count how many chunks affect // each page. @@ -302,10 +286,6 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, // up the number of chunks on the current page and checking on every step // whether the page boundary was crossed.
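
To see why the general case is harder, take 48-byte blocks on 4 KiB pages: 48 does not divide 4096, blocks straddle page boundaries, and each of these pages is touched by 86 distinct blocks, all of which must be free before the page can go. A brute-force check of that per-page count (the loop below derives it incrementally instead of rescanning):

    #include <cstdint>
    #include <cstdio>

    using uptr = uintptr_t;

    int main() {
      const uptr PageSize = 4096, BlockSize = 48;  // no even division
      for (uptr J = 0; J < 4; J++) {               // first four pages
        const uptr PageBeg = J * PageSize, PageEnd = PageBeg + PageSize;
        uptr BlocksTouchingPage = 0;
        // Block [B, B + BlockSize) overlaps the page iff it starts before the
        // page ends and ends after the page begins.
        for (uptr B = 0; B < PageEnd; B += BlockSize)
          if (B + BlockSize > PageBeg)
            BlocksTouchingPage++;
        std::printf("page %u: touched by %u blocks\n", unsigned(J),
                    unsigned(BlocksTouchingPage)); // 86 for each of these pages
      }
      return 0;
    }
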
for (uptr I = 0; I < NumberOfRegions; I++) { - if (SkipRegion(I)) { - RangeTracker.skipPages(PagesCount); - continue; - } uptr PrevPageBoundary = 0; uptr CurrentBoundary = 0; for (uptr J = 0; J < PagesCount; J++) { @@ -321,6 +301,7 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, } } PrevPageBoundary = PageBoundary; + RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage); } } diff --git a/standalone/secondary.h b/standalone/secondary.h index ea5d6808aec..9d5f130f2d4 100644 --- a/standalone/secondary.h +++ b/standalone/secondary.h @@ -9,12 +9,9 @@ #ifndef SCUDO_SECONDARY_H_ #define SCUDO_SECONDARY_H_ -#include "chunk.h" #include "common.h" #include "list.h" -#include "memtag.h" #include "mutex.h" -#include "options.h" #include "stats.h" #include "string_utils.h" @@ -31,292 +28,134 @@ namespace LargeBlock { struct Header { LargeBlock::Header *Prev; LargeBlock::Header *Next; - uptr CommitBase; - uptr CommitSize; + uptr BlockEnd; uptr MapBase; uptr MapSize; - [[no_unique_address]] MapPlatformData Data; + MapPlatformData Data; }; constexpr uptr getHeaderSize() { return roundUpTo(sizeof(Header), 1U << SCUDO_MIN_ALIGNMENT_LOG); } -template <typename Config> static uptr addHeaderTag(uptr Ptr) { - if (allocatorSupportsMemoryTagging<Config>()) - return addFixedTag(Ptr, 1); - return Ptr; +static Header *getHeader(uptr Ptr) { + return reinterpret_cast<Header *>(Ptr - getHeaderSize()); } -template <typename Config> static Header *getHeader(uptr Ptr) { - return reinterpret_cast<Header *>(addHeaderTag<Config>(Ptr) - - getHeaderSize()); -} - -template <typename Config> static Header *getHeader(const void *Ptr) { - return getHeader<Config>(reinterpret_cast<uptr>(Ptr)); +static Header *getHeader(const void *Ptr) { + return getHeader(reinterpret_cast<uptr>(Ptr)); } } // namespace LargeBlock -static void unmap(LargeBlock::Header *H) { - MapPlatformData Data = H->Data; - unmap(reinterpret_cast<void *>(H->MapBase), H->MapSize, UNMAP_ALL, &Data); -} - class MapAllocatorNoCache { public: void initLinkerInitialized(UNUSED s32 ReleaseToOsInterval) {} void init(UNUSED s32 ReleaseToOsInterval) {} - bool retrieve(UNUSED Options Options, UNUSED uptr Size, UNUSED uptr Alignment, - UNUSED LargeBlock::Header **H, UNUSED bool *Zeroed) { + bool retrieve(UNUSED uptr Size, UNUSED LargeBlock::Header **H) { return false; } - void store(UNUSED Options Options, LargeBlock::Header *H) { unmap(H); } - bool canCache(UNUSED uptr Size) { return false; } + bool store(UNUSED LargeBlock::Header *H) { return false; } + static bool canCache(UNUSED uptr Size) { return false; } void disable() {} void enable() {} void releaseToOS() {} - void disableMemoryTagging() {} - bool setOption(Option O, UNUSED sptr Value) { - if (O == Option::ReleaseInterval || O == Option::MaxCacheEntriesCount || - O == Option::MaxCacheEntrySize) - return false; - // Not supported by the Secondary Cache, but not an error either. 
- return true; - } + void setReleaseToOsIntervalMs(UNUSED s32 Interval) {} }; -static const uptr MaxUnusedCachePages = 4U; - -template <typename Config> -void mapSecondary(Options Options, uptr CommitBase, uptr CommitSize, - uptr AllocPos, uptr Flags, MapPlatformData *Data) { - const uptr MaxUnusedCacheBytes = MaxUnusedCachePages * getPageSizeCached(); - if (useMemoryTagging<Config>(Options) && CommitSize > MaxUnusedCacheBytes) { - const uptr UntaggedPos = Max(AllocPos, CommitBase + MaxUnusedCacheBytes); - map(reinterpret_cast<void *>(CommitBase), UntaggedPos - CommitBase, - "scudo:secondary", MAP_RESIZABLE | MAP_MEMTAG | Flags, Data); - map(reinterpret_cast<void *>(UntaggedPos), - CommitBase + CommitSize - UntaggedPos, "scudo:secondary", - MAP_RESIZABLE | Flags, Data); - } else { - map(reinterpret_cast<void *>(CommitBase), CommitSize, "scudo:secondary", - MAP_RESIZABLE | (useMemoryTagging<Config>(Options) ? MAP_MEMTAG : 0) | - Flags, - Data); - } -} - -template <typename Config> class MapAllocatorCache { +template <uptr MaxEntriesCount = 32U, uptr MaxEntrySize = 1UL << 19, + s32 MinReleaseToOsIntervalMs = INT32_MIN, + s32 MaxReleaseToOsIntervalMs = INT32_MAX> +class MapAllocatorCache { public: - // Ensure the default maximum specified fits the array. - static_assert(Config::SecondaryCacheDefaultMaxEntriesCount <= - Config::SecondaryCacheEntriesArraySize, - ""); + // Fuchsia doesn't allow releasing Secondary blocks yet. Note that 0 length + // arrays are an extension for some compilers. + // FIXME(kostyak): support (partially) the cache on Fuchsia. + static_assert(!SCUDO_FUCHSIA || MaxEntriesCount == 0U, ""); void initLinkerInitialized(s32 ReleaseToOsInterval) { - setOption(Option::MaxCacheEntriesCount, - static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntriesCount)); - setOption(Option::MaxCacheEntrySize, - static_cast<sptr>(Config::SecondaryCacheDefaultMaxEntrySize)); - setOption(Option::ReleaseInterval, static_cast<sptr>(ReleaseToOsInterval)); + setReleaseToOsIntervalMs(ReleaseToOsInterval); } void init(s32 ReleaseToOsInterval) { memset(this, 0, sizeof(*this)); initLinkerInitialized(ReleaseToOsInterval); } - void store(Options Options, LargeBlock::Header *H) { - if (!canCache(H->CommitSize)) - return unmap(H); - + bool store(LargeBlock::Header *H) { bool EntryCached = false; bool EmptyCache = false; - const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs); const u64 Time = getMonotonicTime(); - const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount); - CachedBlock Entry; - Entry.CommitBase = H->CommitBase; - Entry.CommitSize = H->CommitSize; - Entry.MapBase = H->MapBase; - Entry.MapSize = H->MapSize; - Entry.BlockBegin = reinterpret_cast<uptr>(H + 1); - Entry.Data = H->Data; - Entry.Time = Time; - if (useMemoryTagging<Config>(Options)) { - if (Interval == 0 && !SCUDO_FUCHSIA) { - // Release the memory and make it inaccessible at the same time by - // creating a new MAP_NOACCESS mapping on top of the existing mapping. - // Fuchsia does not support replacing mappings by creating a new mapping - // on top so we just do the two syscalls there. 
- Entry.Time = 0; - mapSecondary<Config>(Options, Entry.CommitBase, Entry.CommitSize, - Entry.CommitBase, MAP_NOACCESS, &Entry.Data); - } else { - setMemoryPermission(Entry.CommitBase, Entry.CommitSize, MAP_NOACCESS, - &Entry.Data); - } - } else if (Interval == 0) { - releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data); - Entry.Time = 0; - } - do { + { ScopedLock L(Mutex); - if (useMemoryTagging<Config>(Options) && QuarantinePos == -1U) { - // If we get here then memory tagging was disabled in between when we - // read Options and when we locked Mutex. We can't insert our entry into - // the quarantine or the cache because the permissions would be wrong so - // just unmap it. - break; - } - if (Config::SecondaryCacheQuarantineSize && - useMemoryTagging<Config>(Options)) { - QuarantinePos = - (QuarantinePos + 1) % Max(Config::SecondaryCacheQuarantineSize, 1u); - if (!Quarantine[QuarantinePos].CommitBase) { - Quarantine[QuarantinePos] = Entry; - return; - } - CachedBlock PrevEntry = Quarantine[QuarantinePos]; - Quarantine[QuarantinePos] = Entry; - if (OldestTime == 0) - OldestTime = Entry.Time; - Entry = PrevEntry; - } - if (EntriesCount >= MaxCount) { + if (EntriesCount == MaxEntriesCount) { if (IsFullEvents++ == 4U) EmptyCache = true; } else { - for (u32 I = 0; I < MaxCount; I++) { - if (Entries[I].CommitBase) + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (Entries[I].Block) continue; if (I != 0) Entries[I] = Entries[0]; - Entries[0] = Entry; + Entries[0].Block = reinterpret_cast<uptr>(H); + Entries[0].BlockEnd = H->BlockEnd; + Entries[0].MapBase = H->MapBase; + Entries[0].MapSize = H->MapSize; + Entries[0].Data = H->Data; + Entries[0].Time = Time; EntriesCount++; - if (OldestTime == 0) - OldestTime = Entry.Time; EntryCached = true; break; } } - } while (0); + } + s32 Interval; if (EmptyCache) empty(); - else if (Interval >= 0) + else if ((Interval = getReleaseToOsIntervalMs()) >= 0) releaseOlderThan(Time - static_cast<u64>(Interval) * 1000000); - if (!EntryCached) - unmap(reinterpret_cast<void *>(Entry.MapBase), Entry.MapSize, UNMAP_ALL, - &Entry.Data); + return EntryCached; } - bool retrieve(Options Options, uptr Size, uptr Alignment, - LargeBlock::Header **H, bool *Zeroed) { + bool retrieve(uptr Size, LargeBlock::Header **H) { const uptr PageSize = getPageSizeCached(); - const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount); - bool Found = false; - CachedBlock Entry; - uptr HeaderPos; - { - ScopedLock L(Mutex); - if (EntriesCount == 0) - return false; - for (u32 I = 0; I < MaxCount; I++) { - const uptr CommitBase = Entries[I].CommitBase; - if (!CommitBase) - continue; - const uptr CommitSize = Entries[I].CommitSize; - const uptr AllocPos = - roundDownTo(CommitBase + CommitSize - Size, Alignment); - HeaderPos = - AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize(); - if (HeaderPos > CommitBase + CommitSize) - continue; - if (HeaderPos < CommitBase || - AllocPos > CommitBase + PageSize * MaxUnusedCachePages) - continue; - Found = true; - Entry = Entries[I]; - Entries[I].CommitBase = 0; - break; - } - } - if (Found) { - *H = reinterpret_cast<LargeBlock::Header *>( - LargeBlock::addHeaderTag<Config>(HeaderPos)); - *Zeroed = Entry.Time == 0; - if (useMemoryTagging<Config>(Options)) - setMemoryPermission(Entry.CommitBase, Entry.CommitSize, 0, &Entry.Data); - uptr NewBlockBegin = reinterpret_cast<uptr>(*H + 1); - if (useMemoryTagging<Config>(Options)) { - if (*Zeroed) - storeTags(LargeBlock::addHeaderTag<Config>(Entry.CommitBase), - NewBlockBegin); - else 
if (Entry.BlockBegin < NewBlockBegin) - storeTags(Entry.BlockBegin, NewBlockBegin); - else - storeTags(untagPointer(NewBlockBegin), - untagPointer(Entry.BlockBegin)); - } - (*H)->CommitBase = Entry.CommitBase; - (*H)->CommitSize = Entry.CommitSize; - (*H)->MapBase = Entry.MapBase; - (*H)->MapSize = Entry.MapSize; - (*H)->Data = Entry.Data; + ScopedLock L(Mutex); + if (EntriesCount == 0) + return false; + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (!Entries[I].Block) + continue; + const uptr BlockSize = Entries[I].BlockEnd - Entries[I].Block; + if (Size > BlockSize) + continue; + if (Size < BlockSize - PageSize * 4U) + continue; + *H = reinterpret_cast<LargeBlock::Header *>(Entries[I].Block); + Entries[I].Block = 0; + (*H)->BlockEnd = Entries[I].BlockEnd; + (*H)->MapBase = Entries[I].MapBase; + (*H)->MapSize = Entries[I].MapSize; + (*H)->Data = Entries[I].Data; EntriesCount--; + return true; } - return Found; + return false; } - bool canCache(uptr Size) { - return atomic_load_relaxed(&MaxEntriesCount) != 0U && - Size <= atomic_load_relaxed(&MaxEntrySize); + static bool canCache(uptr Size) { + return MaxEntriesCount != 0U && Size <= MaxEntrySize; } - bool setOption(Option O, sptr Value) { - if (O == Option::ReleaseInterval) { - const s32 Interval = - Max(Min(static_cast<s32>(Value), - Config::SecondaryCacheMaxReleaseToOsIntervalMs), - Config::SecondaryCacheMinReleaseToOsIntervalMs); - atomic_store_relaxed(&ReleaseToOsIntervalMs, Interval); - return true; + void setReleaseToOsIntervalMs(s32 Interval) { + if (Interval >= MaxReleaseToOsIntervalMs) { + Interval = MaxReleaseToOsIntervalMs; + } else if (Interval <= MinReleaseToOsIntervalMs) { + Interval = MinReleaseToOsIntervalMs; } - if (O == Option::MaxCacheEntriesCount) { - const u32 MaxCount = static_cast<u32>(Value); - if (MaxCount > Config::SecondaryCacheEntriesArraySize) - return false; - atomic_store_relaxed(&MaxEntriesCount, MaxCount); - return true; - } - if (O == Option::MaxCacheEntrySize) { - atomic_store_relaxed(&MaxEntrySize, static_cast<uptr>(Value)); - return true; - } - // Not supported by the Secondary Cache, but not an error either. 
- return true; + atomic_store(&ReleaseToOsIntervalMs, Interval, memory_order_relaxed); } void releaseToOS() { releaseOlderThan(UINT64_MAX); } - void disableMemoryTagging() { - ScopedLock L(Mutex); - for (u32 I = 0; I != Config::SecondaryCacheQuarantineSize; ++I) { - if (Quarantine[I].CommitBase) { - unmap(reinterpret_cast<void *>(Quarantine[I].MapBase), - Quarantine[I].MapSize, UNMAP_ALL, &Quarantine[I].Data); - Quarantine[I].CommitBase = 0; - } - } - const u32 MaxCount = atomic_load_relaxed(&MaxEntriesCount); - for (u32 I = 0; I < MaxCount; I++) - if (Entries[I].CommitBase) - setMemoryPermission(Entries[I].CommitBase, Entries[I].CommitSize, 0, - &Entries[I].Data); - QuarantinePos = -1U; - } - void disable() { Mutex.lock(); } void enable() { Mutex.unlock(); } @@ -327,17 +166,17 @@ private: void *MapBase; uptr MapSize; MapPlatformData Data; - } MapInfo[Config::SecondaryCacheEntriesArraySize]; + } MapInfo[MaxEntriesCount]; uptr N = 0; { ScopedLock L(Mutex); - for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++) { - if (!Entries[I].CommitBase) + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (!Entries[I].Block) continue; MapInfo[N].MapBase = reinterpret_cast<void *>(Entries[I].MapBase); MapInfo[N].MapSize = Entries[I].MapSize; MapInfo[N].Data = Entries[I].Data; - Entries[I].CommitBase = 0; + Entries[I].Block = 0; N++; } EntriesCount = 0; @@ -348,53 +187,42 @@ private: &MapInfo[I].Data); } - struct CachedBlock { - uptr CommitBase; - uptr CommitSize; - uptr MapBase; - uptr MapSize; - uptr BlockBegin; - [[no_unique_address]] MapPlatformData Data; - u64 Time; - }; - - void releaseIfOlderThan(CachedBlock &Entry, u64 Time) { - if (!Entry.CommitBase || !Entry.Time) - return; - if (Entry.Time > Time) { - if (OldestTime == 0 || Entry.Time < OldestTime) - OldestTime = Entry.Time; + void releaseOlderThan(u64 Time) { + ScopedLock L(Mutex); + if (!EntriesCount) return; + for (uptr I = 0; I < MaxEntriesCount; I++) { + if (!Entries[I].Block || !Entries[I].Time || Entries[I].Time > Time) + continue; + releasePagesToOS(Entries[I].Block, 0, + Entries[I].BlockEnd - Entries[I].Block, + &Entries[I].Data); + Entries[I].Time = 0; } - releasePagesToOS(Entry.CommitBase, 0, Entry.CommitSize, &Entry.Data); - Entry.Time = 0; } - void releaseOlderThan(u64 Time) { - ScopedLock L(Mutex); - if (!EntriesCount || OldestTime == 0 || OldestTime > Time) - return; - OldestTime = 0; - for (uptr I = 0; I < Config::SecondaryCacheQuarantineSize; I++) - releaseIfOlderThan(Quarantine[I], Time); - for (uptr I = 0; I < Config::SecondaryCacheEntriesArraySize; I++) - releaseIfOlderThan(Entries[I], Time); + s32 getReleaseToOsIntervalMs() { + return atomic_load(&ReleaseToOsIntervalMs, memory_order_relaxed); } + struct CachedBlock { + uptr Block; + uptr BlockEnd; + uptr MapBase; + uptr MapSize; + MapPlatformData Data; + u64 Time; + }; + HybridMutex Mutex; - u32 EntriesCount = 0; - u32 QuarantinePos = 0; - atomic_u32 MaxEntriesCount = {}; - atomic_uptr MaxEntrySize = {}; - u64 OldestTime = 0; - u32 IsFullEvents = 0; - atomic_s32 ReleaseToOsIntervalMs = {}; - - CachedBlock Entries[Config::SecondaryCacheEntriesArraySize] = {}; - CachedBlock Quarantine[Config::SecondaryCacheQuarantineSize] = {}; + CachedBlock Entries[MaxEntriesCount]; + u32 EntriesCount; + uptr LargestSize; + u32 IsFullEvents; + atomic_s32 ReleaseToOsIntervalMs; }; -template <typename Config> class MapAllocator { +template <class CacheT> class MapAllocator { public: void initLinkerInitialized(GlobalStats *S, s32 ReleaseToOsInterval = -1) { 
Cache.initLinkerInitialized(ReleaseToOsInterval); @@ -407,15 +235,13 @@ public: initLinkerInitialized(S, ReleaseToOsInterval); } - void *allocate(Options Options, uptr Size, uptr AlignmentHint = 0, - uptr *BlockEnd = nullptr, - FillContentsMode FillContents = NoFill); + void *allocate(uptr Size, uptr AlignmentHint = 0, uptr *BlockEnd = nullptr, + bool ZeroContents = false); - void deallocate(Options Options, void *Ptr); + void deallocate(void *Ptr); static uptr getBlockEnd(void *Ptr) { - auto *B = LargeBlock::getHeader<Config>(Ptr); - return B->CommitBase + B->CommitSize; + return LargeBlock::getHeader(Ptr)->BlockEnd; } static uptr getBlockSize(void *Ptr) { @@ -435,32 +261,28 @@ public: } template <typename F> void iterateOverBlocks(F Callback) const { - for (const auto &H : InUseBlocks) { - uptr Ptr = reinterpret_cast<uptr>(&H) + LargeBlock::getHeaderSize(); - if (allocatorSupportsMemoryTagging<Config>()) - Ptr = untagPointer(Ptr); - Callback(Ptr); - } + for (const auto &H : InUseBlocks) + Callback(reinterpret_cast<uptr>(&H) + LargeBlock::getHeaderSize()); } - uptr canCache(uptr Size) { return Cache.canCache(Size); } + static uptr canCache(uptr Size) { return CacheT::canCache(Size); } - bool setOption(Option O, sptr Value) { return Cache.setOption(O, Value); } + void setReleaseToOsIntervalMs(s32 Interval) { + Cache.setReleaseToOsIntervalMs(Interval); + } void releaseToOS() { Cache.releaseToOS(); } - void disableMemoryTagging() { Cache.disableMemoryTagging(); } - private: - typename Config::SecondaryCache Cache; + CacheT Cache; HybridMutex Mutex; DoublyLinkedList<LargeBlock::Header> InUseBlocks; - uptr AllocatedBytes = 0; - uptr FreedBytes = 0; - uptr LargestSize = 0; - u32 NumberOfAllocs = 0; - u32 NumberOfFrees = 0; + uptr AllocatedBytes; + uptr FreedBytes; + uptr LargestSize; + u32 NumberOfAllocs; + u32 NumberOfFrees; LocalStats Stats; }; @@ -475,37 +297,24 @@ private: // For allocations requested with an alignment greater than or equal to a page, // the committed memory will amount to something close to Size - AlignmentHint // (pending rounding and headers). -template <typename Config> -void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, - uptr *BlockEndPtr, - FillContentsMode FillContents) { - if (Options.get(OptionBit::AddLargeAllocationSlack)) - Size += 1UL << SCUDO_MIN_ALIGNMENT_LOG; - Alignment = Max(Alignment, 1UL << SCUDO_MIN_ALIGNMENT_LOG); +template <class CacheT> +void *MapAllocator<CacheT>::allocate(uptr Size, uptr AlignmentHint, + uptr *BlockEnd, bool ZeroContents) { + DCHECK_GE(Size, AlignmentHint); const uptr PageSize = getPageSizeCached(); - uptr RoundedSize = - roundUpTo(roundUpTo(Size, Alignment) + LargeBlock::getHeaderSize() + - Chunk::getHeaderSize(), - PageSize); - if (Alignment > PageSize) - RoundedSize += Alignment - PageSize; - - if (Alignment < PageSize && Cache.canCache(RoundedSize)) { + const uptr RoundedSize = + roundUpTo(Size + LargeBlock::getHeaderSize(), PageSize); + + if (AlignmentHint < PageSize && CacheT::canCache(RoundedSize)) { LargeBlock::Header *H; - bool Zeroed; - if (Cache.retrieve(Options, Size, Alignment, &H, &Zeroed)) { - const uptr BlockEnd = H->CommitBase + H->CommitSize; - if (BlockEndPtr) - *BlockEndPtr = BlockEnd; - uptr HInt = reinterpret_cast<uptr>(H); - if (allocatorSupportsMemoryTagging<Config>()) - HInt = untagPointer(HInt); - const uptr PtrInt = HInt + LargeBlock::getHeaderSize(); - void *Ptr = reinterpret_cast<void *>(PtrInt); - if (FillContents && !Zeroed) - memset(Ptr, FillContents == ZeroFill ? 
0 : PatternFillByte, - BlockEnd - PtrInt); - const uptr BlockSize = BlockEnd - HInt; + if (Cache.retrieve(RoundedSize, &H)) { + if (BlockEnd) + *BlockEnd = H->BlockEnd; + void *Ptr = reinterpret_cast<void *>(reinterpret_cast<uptr>(H) + + LargeBlock::getHeaderSize()); + if (ZeroContents) + memset(Ptr, 0, H->BlockEnd - reinterpret_cast<uptr>(Ptr)); + const uptr BlockSize = H->BlockEnd - reinterpret_cast<uptr>(H); { ScopedLock L(Mutex); InUseBlocks.push_back(H); @@ -520,8 +329,9 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, MapPlatformData Data = {}; const uptr MapSize = RoundedSize + 2 * PageSize; - uptr MapBase = reinterpret_cast<uptr>( - map(nullptr, MapSize, nullptr, MAP_NOACCESS | MAP_ALLOWNOMEM, &Data)); + uptr MapBase = + reinterpret_cast<uptr>(map(nullptr, MapSize, "scudo:secondary", + MAP_NOACCESS | MAP_ALLOWNOMEM, &Data)); if (UNLIKELY(!MapBase)) return nullptr; uptr CommitBase = MapBase + PageSize; @@ -529,11 +339,11 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, // In the unlikely event of alignments larger than a page, adjust the amount // of memory we want to commit, and trim the extra memory. - if (UNLIKELY(Alignment >= PageSize)) { + if (UNLIKELY(AlignmentHint >= PageSize)) { // For alignments greater than or equal to a page, the user pointer (e.g. the // pointer that is returned by the C or C++ allocation APIs) ends up on a // page boundary, and our headers will live in the preceding page. - CommitBase = roundUpTo(MapBase + PageSize + 1, Alignment) - PageSize; + CommitBase = roundUpTo(MapBase + PageSize + 1, AlignmentHint) - PageSize; const uptr NewMapBase = CommitBase - PageSize; DCHECK_GE(NewMapBase, MapBase); // We only trim the extra memory on 32-bit platforms: 64-bit platforms @@ -542,8 +352,9 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, unmap(reinterpret_cast<void *>(MapBase), NewMapBase - MapBase, 0, &Data); MapBase = NewMapBase; } - const uptr NewMapEnd = - CommitBase + PageSize + roundUpTo(Size, PageSize) + PageSize; + const uptr NewMapEnd = CommitBase + PageSize + + roundUpTo((Size - AlignmentHint), PageSize) + + PageSize; DCHECK_LE(NewMapEnd, MapEnd); if (SCUDO_WORDSIZE == 32U && NewMapEnd != MapEnd) { unmap(reinterpret_cast<void *>(NewMapEnd), MapEnd - NewMapEnd, 0, &Data); @@ -552,22 +363,16 @@ } const uptr CommitSize = MapEnd - PageSize - CommitBase; - const uptr AllocPos = roundDownTo(CommitBase + CommitSize - Size, Alignment); - mapSecondary<Config>(Options, CommitBase, CommitSize, AllocPos, 0, &Data); - const uptr HeaderPos = - AllocPos - Chunk::getHeaderSize() - LargeBlock::getHeaderSize(); - LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>( - LargeBlock::addHeaderTag<Config>(HeaderPos)); - if (useMemoryTagging<Config>(Options)) - storeTags(LargeBlock::addHeaderTag<Config>(CommitBase), - reinterpret_cast<uptr>(H + 1)); + const uptr Ptr = + reinterpret_cast<uptr>(map(reinterpret_cast<void *>(CommitBase), + CommitSize, "scudo:secondary", 0, &Data)); + LargeBlock::Header *H = reinterpret_cast<LargeBlock::Header *>(Ptr); H->MapBase = MapBase; H->MapSize = MapEnd - MapBase; - H->CommitBase = CommitBase; - H->CommitSize = CommitSize; + H->BlockEnd = CommitBase + CommitSize; H->Data = Data; - if (BlockEndPtr) - *BlockEndPtr = CommitBase + CommitSize; + if (BlockEnd) + *BlockEnd = CommitBase + CommitSize; { ScopedLock L(Mutex); InUseBlocks.push_back(H); @@
-578,13 +383,13 @@ void *MapAllocator<Config>::allocate(Options Options, uptr Size, uptr Alignment, Stats.add(StatAllocated, CommitSize); Stats.add(StatMapped, H->MapSize); } - return reinterpret_cast<void *>(HeaderPos + LargeBlock::getHeaderSize()); + return reinterpret_cast<void *>(Ptr + LargeBlock::getHeaderSize()); } -template <typename Config> -void MapAllocator<Config>::deallocate(Options Options, void *Ptr) { - LargeBlock::Header *H = LargeBlock::getHeader<Config>(Ptr); - const uptr CommitSize = H->CommitSize; +template <class CacheT> void MapAllocator<CacheT>::deallocate(void *Ptr) { + LargeBlock::Header *H = LargeBlock::getHeader(Ptr); + const uptr Block = reinterpret_cast<uptr>(H); + const uptr CommitSize = H->BlockEnd - Block; { ScopedLock L(Mutex); InUseBlocks.remove(H); @@ -593,11 +398,16 @@ void MapAllocator<Config>::deallocate(Options Options, void *Ptr) { Stats.sub(StatAllocated, CommitSize); Stats.sub(StatMapped, H->MapSize); } - Cache.store(Options, H); + if (CacheT::canCache(CommitSize) && Cache.store(H)) + return; + void *Addr = reinterpret_cast<void *>(H->MapBase); + const uptr Size = H->MapSize; + MapPlatformData Data = H->Data; + unmap(Addr, Size, UNMAP_ALL, &Data); } -template <typename Config> -void MapAllocator<Config>::getStats(ScopedString *Str) const { +template <class CacheT> +void MapAllocator<CacheT>::getStats(ScopedString *Str) const { Str->append( "Stats: MapAllocator: allocated %zu times (%zuK), freed %zu times " "(%zuK), remains %zu (%zuK) max %zuM\n", diff --git a/standalone/size_class_map.h b/standalone/size_class_map.h index 1948802df0b..5ed8e2845b3 100644 --- a/standalone/size_class_map.h +++ b/standalone/size_class_map.h @@ -85,14 +85,6 @@ public: return T + (T >> S) * (ClassId & M) + SizeDelta; } - static u8 getSizeLSBByClassId(uptr ClassId) { - return u8(getLeastSignificantSetBitIndex(getSizeByClassId(ClassId))); - } - - static constexpr bool usesCompressedLSBFormat() { - return false; - } - static uptr getClassIdBySize(uptr Size) { if (Size <= SizeDelta + (1 << Config::MinSizeLog)) return 1; @@ -145,41 +137,7 @@ class TableSizeClassMap : public SizeClassMapBase<Config> { u8 Tab[getTableSize()] = {}; }; - static constexpr SizeTable SzTable = {}; - - struct LSBTable { - constexpr LSBTable() { - u8 Min = 255, Max = 0; - for (uptr I = 0; I != ClassesSize; ++I) { - for (u8 Bit = 0; Bit != 64; ++Bit) { - if (Config::Classes[I] & (1 << Bit)) { - Tab[I] = Bit; - if (Bit < Min) - Min = Bit; - if (Bit > Max) - Max = Bit; - break; - } - } - } - - if (Max - Min > 3 || ClassesSize > 32) - return; - - UseCompressedFormat = true; - CompressedMin = Min; - for (uptr I = 0; I != ClassesSize; ++I) - CompressedValue |= u64(Tab[I] - Min) << (I * 2); - } - - u8 Tab[ClassesSize] = {}; - - bool UseCompressedFormat = false; - u8 CompressedMin = 0; - u64 CompressedValue = 0; - }; - - static constexpr LSBTable LTable = {}; + static constexpr SizeTable Table = {}; public: static const u32 MaxNumCachedHint = Config::MaxNumCachedHint; @@ -194,18 +152,6 @@ public: return Config::Classes[ClassId - 1]; } - static u8 getSizeLSBByClassId(uptr ClassId) { - if (LTable.UseCompressedFormat) - return ((LTable.CompressedValue >> ((ClassId - 1) * 2)) & 3) + - LTable.CompressedMin; - else - return LTable.Tab[ClassId - 1]; - } - - static constexpr bool usesCompressedLSBFormat() { - return LTable.UseCompressedFormat; - } - static uptr getClassIdBySize(uptr Size) { if (Size <= Config::Classes[0]) return 1; @@ -213,7 +159,7 @@ public: DCHECK_LE(Size, MaxSize); if (Size <= (1 << 
Config::MidSizeLog)) return ((Size - 1) >> Config::MinSizeLog) + 1; - return SzTable.Tab[scaledLog2(Size - 1, Config::MidSizeLog, S)]; + return Table.Tab[scaledLog2(Size - 1, Config::MidSizeLog, S)]; } static u32 getMaxCachedHint(uptr Size) { @@ -222,24 +168,13 @@ public: } }; -struct DefaultSizeClassConfig { - static const uptr NumBits = 3; - static const uptr MinSizeLog = 5; - static const uptr MidSizeLog = 8; - static const uptr MaxSizeLog = 17; - static const u32 MaxNumCachedHint = 10; - static const uptr MaxBytesCachedLog = 10; -}; - -typedef FixedSizeClassMap<DefaultSizeClassConfig> DefaultSizeClassMap; - struct AndroidSizeClassConfig { #if SCUDO_WORDSIZE == 64U static const uptr NumBits = 7; static const uptr MinSizeLog = 4; static const uptr MidSizeLog = 6; static const uptr MaxSizeLog = 16; - static const u32 MaxNumCachedHint = 13; + static const u32 MaxNumCachedHint = 14; static const uptr MaxBytesCachedLog = 13; static constexpr u32 Classes[] = { @@ -273,9 +208,16 @@ struct AndroidSizeClassConfig { typedef TableSizeClassMap<AndroidSizeClassConfig> AndroidSizeClassMap; -#if SCUDO_WORDSIZE == 64U && defined(__clang__) -static_assert(AndroidSizeClassMap::usesCompressedLSBFormat(), ""); -#endif +struct DefaultSizeClassConfig { + static const uptr NumBits = 3; + static const uptr MinSizeLog = 5; + static const uptr MidSizeLog = 8; + static const uptr MaxSizeLog = 17; + static const u32 MaxNumCachedHint = 8; + static const uptr MaxBytesCachedLog = 10; +}; + +typedef FixedSizeClassMap<DefaultSizeClassConfig> DefaultSizeClassMap; struct SvelteSizeClassConfig { #if SCUDO_WORDSIZE == 64U @@ -283,14 +225,14 @@ struct SvelteSizeClassConfig { static const uptr MinSizeLog = 4; static const uptr MidSizeLog = 8; static const uptr MaxSizeLog = 14; - static const u32 MaxNumCachedHint = 13; + static const u32 MaxNumCachedHint = 4; static const uptr MaxBytesCachedLog = 10; #else static const uptr NumBits = 4; static const uptr MinSizeLog = 3; static const uptr MidSizeLog = 7; static const uptr MaxSizeLog = 14; - static const u32 MaxNumCachedHint = 14; + static const u32 MaxNumCachedHint = 5; static const uptr MaxBytesCachedLog = 10; #endif }; diff --git a/standalone/stack_depot.h b/standalone/stack_depot.h deleted file mode 100644 index 458198fcb7a..00000000000 --- a/standalone/stack_depot.h +++ /dev/null @@ -1,144 +0,0 @@ -//===-- stack_depot.h -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef SCUDO_STACK_DEPOT_H_ -#define SCUDO_STACK_DEPOT_H_ - -#include "atomic_helpers.h" -#include "mutex.h" - -namespace scudo { - -class MurMur2HashBuilder { - static const u32 M = 0x5bd1e995; - static const u32 Seed = 0x9747b28c; - static const u32 R = 24; - u32 H; - -public: - explicit MurMur2HashBuilder(u32 Init = 0) { H = Seed ^ Init; } - void add(u32 K) { - K *= M; - K ^= K >> R; - K *= M; - H *= M; - H ^= K; - } - u32 get() { - u32 X = H; - X ^= X >> 13; - X *= M; - X ^= X >> 15; - return X; - } -}; - -class StackDepot { - HybridMutex RingEndMu; - u32 RingEnd = 0; - - // This data structure stores a stack trace for each allocation and - // deallocation when stack trace recording is enabled, that may be looked up - // using a hash of the stack trace. 
The lower bits of the hash are an index - // into the Tab array, which stores an index into the Ring array where the - // stack traces are stored. As the name implies, Ring is a ring buffer, so a - // stack trace may wrap around to the start of the array. - // - // Each stack trace in Ring is prefixed by a stack trace marker consisting of - // a fixed 1 bit in bit 0 (this allows disambiguation between stack frames - // and stack trace markers in the case where instruction pointers are 4-byte - // aligned, as they are on arm64), the stack trace hash in bits 1-32, and the - // size of the stack trace in bits 33-63. - // - // The insert() function is potentially racy in its accesses to the Tab and - // Ring arrays, but find() is resilient to races in the sense that, barring - // hash collisions, it will either return the correct stack trace or no stack - // trace at all, even if two instances of insert() raced with one another. - // This is achieved by re-checking the hash of the stack trace before - // returning the trace. - -#ifdef SCUDO_FUZZ - // Use smaller table sizes for fuzzing in order to reduce input size. - static const uptr TabBits = 4; -#else - static const uptr TabBits = 16; -#endif - static const uptr TabSize = 1 << TabBits; - static const uptr TabMask = TabSize - 1; - atomic_u32 Tab[TabSize] = {}; - -#ifdef SCUDO_FUZZ - static const uptr RingBits = 4; -#else - static const uptr RingBits = 19; -#endif - static const uptr RingSize = 1 << RingBits; - static const uptr RingMask = RingSize - 1; - atomic_u64 Ring[RingSize] = {}; - -public: - // Insert hash of the stack trace [Begin, End) into the stack depot, and - // return the hash. - u32 insert(uptr *Begin, uptr *End) { - MurMur2HashBuilder B; - for (uptr *I = Begin; I != End; ++I) - B.add(u32(*I) >> 2); - u32 Hash = B.get(); - - u32 Pos = Hash & TabMask; - u32 RingPos = atomic_load_relaxed(&Tab[Pos]); - u64 Entry = atomic_load_relaxed(&Ring[RingPos]); - u64 Id = (u64(End - Begin) << 33) | (u64(Hash) << 1) | 1; - if (Entry == Id) - return Hash; - - ScopedLock Lock(RingEndMu); - RingPos = RingEnd; - atomic_store_relaxed(&Tab[Pos], RingPos); - atomic_store_relaxed(&Ring[RingPos], Id); - for (uptr *I = Begin; I != End; ++I) { - RingPos = (RingPos + 1) & RingMask; - atomic_store_relaxed(&Ring[RingPos], *I); - } - RingEnd = (RingPos + 1) & RingMask; - return Hash; - } - - // Look up a stack trace by hash. Returns true if successful. The trace may be - // accessed via operator[] passing indexes between *RingPosPtr and - // *RingPosPtr + *SizePtr. 
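For illustration, a lookup built on the contract above might look like the following sketch (Depot, Hash, Frames and MaxFrames are hypothetical names, not part of this header; Frames is an array of scudo::u64):

    scudo::uptr RingPos, Size;
    if (Depot.find(Hash, &RingPos, &Size)) {
      // operator[] masks its index with RingMask, so RingPos + I stays
      // valid even when the stored trace wraps around the ring buffer.
      for (scudo::uptr I = 0; I < Size && I < MaxFrames; ++I)
        Frames[I] = Depot[RingPos + I];
    }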
- bool find(u32 Hash, uptr *RingPosPtr, uptr *SizePtr) const { - u32 Pos = Hash & TabMask; - u32 RingPos = atomic_load_relaxed(&Tab[Pos]); - if (RingPos >= RingSize) - return false; - u64 Entry = atomic_load_relaxed(&Ring[RingPos]); - u64 HashWithTagBit = (u64(Hash) << 1) | 1; - if ((Entry & 0x1ffffffff) != HashWithTagBit) - return false; - u32 Size = u32(Entry >> 33); - if (Size >= RingSize) - return false; - *RingPosPtr = (RingPos + 1) & RingMask; - *SizePtr = Size; - MurMur2HashBuilder B; - for (uptr I = 0; I != Size; ++I) { - RingPos = (RingPos + 1) & RingMask; - B.add(u32(atomic_load_relaxed(&Ring[RingPos])) >> 2); - } - return B.get() == Hash; - } - - u64 operator[](uptr RingPos) const { - return atomic_load_relaxed(&Ring[RingPos & RingMask]); - } -}; - -} // namespace scudo - -#endif // SCUDO_STACK_DEPOT_H_ diff --git a/standalone/stats.h b/standalone/stats.h index e15c0569497..38481e98e48 100644 --- a/standalone/stats.h +++ b/standalone/stats.h @@ -46,11 +46,11 @@ public: uptr get(StatType I) const { return atomic_load_relaxed(&StatsArray[I]); } - LocalStats *Next = nullptr; - LocalStats *Prev = nullptr; + LocalStats *Next; + LocalStats *Prev; private: - atomic_uptr StatsArray[StatCount] = {}; + atomic_uptr StatsArray[StatCount]; }; // Global stats, used for aggregation and querying. @@ -58,9 +58,7 @@ class GlobalStats : public LocalStats { public: void initLinkerInitialized() {} void init() { - LocalStats::init(); - Mutex.init(); - StatsList = {}; + memset(this, 0, sizeof(*this)); initLinkerInitialized(); } @@ -89,11 +87,8 @@ public: S[I] = static_cast<sptr>(S[I]) >= 0 ? S[I] : 0; } - void lock() { Mutex.lock(); } - void unlock() { Mutex.unlock(); } - - void disable() { lock(); } - void enable() { unlock(); } + void disable() { Mutex.lock(); } + void enable() { Mutex.unlock(); } private: mutable HybridMutex Mutex; diff --git a/standalone/string_utils.cpp b/standalone/string_utils.cpp index 25bddbce34d..5de8b57bfcd 100644 --- a/standalone/string_utils.cpp +++ b/standalone/string_utils.cpp @@ -78,11 +78,10 @@ static int appendUnsigned(char **Buffer, const char *BufferEnd, u64 Num, static int appendSignedDecimal(char **Buffer, const char *BufferEnd, s64 Num, u8 MinNumberLength, bool PadWithZero) { const bool Negative = (Num < 0); - const u64 UnsignedNum = (Num == INT64_MIN) - ? static_cast<u64>(INT64_MAX) + 1 - : static_cast<u64>(Negative ? -Num : Num); - return appendNumber(Buffer, BufferEnd, UnsignedNum, 10, MinNumberLength, - PadWithZero, Negative, /*Upper=*/false); + return appendNumber(Buffer, BufferEnd, + static_cast<u64>(Negative ? -Num : Num), 10, + MinNumberLength, PadWithZero, Negative, + /*Upper=*/false); } // Use the fact that explicitly requesting 0 Width (%0s) results in UB and @@ -115,8 +114,8 @@ static int appendPointer(char **Buffer, const char *BufferEnd, u64 ptr_value) { return Res; } -static int formatString(char *Buffer, uptr BufferLength, const char *Format, - va_list Args) { +int formatString(char *Buffer, uptr BufferLength, const char *Format, + va_list Args) { static const char *PrintfFormatsHelp = "Supported formatString formats: %([0-9]*)?(z|ll)?{d,u,x,X}; %p; " "%[-]([0-9]*)?(\\.\\*)?s; %c\n"; @@ -159,18 +158,16 @@ static int formatString(char *Buffer, uptr BufferLength, const char *Format, CHECK(!((Precision >= 0 || LeftJustified) && *Cur != 's')); switch (*Cur) { case 'd': { - DVal = HaveLL ? va_arg(Args, s64) - : HaveZ ? va_arg(Args, sptr) - : va_arg(Args, int); + DVal = HaveLL ? va_arg(Args, s64) + : HaveZ ? 
va_arg(Args, sptr) : va_arg(Args, int); Res += appendSignedDecimal(&Buffer, BufferEnd, DVal, Width, PadWithZero); break; } case 'u': case 'x': case 'X': { - UVal = HaveLL ? va_arg(Args, u64) - : HaveZ ? va_arg(Args, uptr) - : va_arg(Args, unsigned); + UVal = HaveLL ? va_arg(Args, u64) + : HaveZ ? va_arg(Args, uptr) : va_arg(Args, unsigned); const bool Upper = (*Cur == 'X'); Res += appendUnsigned(&Buffer, BufferEnd, UVal, (*Cur == 'u') ? 10 : 16, Width, PadWithZero, Upper); @@ -210,14 +207,6 @@ static int formatString(char *Buffer, uptr BufferLength, const char *Format, return Res; } -int formatString(char *Buffer, uptr BufferLength, const char *Format, ...) { - va_list Args; - va_start(Args, Format); - int Res = formatString(Buffer, BufferLength, Format, Args); - va_end(Args); - return Res; -} - void ScopedString::append(const char *Format, va_list Args) { DCHECK_LT(Length, String.size()); va_list ArgsCopy; @@ -230,7 +219,6 @@ void ScopedString::append(const char *Format, va_list Args) { static_cast<uptr>(formatString(C, sizeof(C), Format, Args)) + 1; String.resize(Length + AdditionalLength); formatString(String.data() + Length, AdditionalLength, Format, ArgsCopy); - va_end(ArgsCopy); Length = strlen(String.data()); CHECK_LT(Length, String.size()); } diff --git a/standalone/string_utils.h b/standalone/string_utils.h index 4880fa1e7cf..acd60bda9d8 100644 --- a/standalone/string_utils.h +++ b/standalone/string_utils.h @@ -36,7 +36,6 @@ private: uptr Length; }; -int formatString(char *Buffer, uptr BufferLength, const char *Format, ...); void Printf(const char *Format, ...); } // namespace scudo diff --git a/standalone/tests/atomic_test.cpp b/standalone/tests/atomic_test.cpp index e90a642fd35..103cd24624b 100644 --- a/standalone/tests/atomic_test.cpp +++ b/standalone/tests/atomic_test.cpp @@ -80,14 +80,26 @@ TEST(ScudoAtomicTest, AtomicStoreLoad) { template <typename T> void checkAtomicCompareExchange() { typedef typename T::Type Type; - Type OldVal = 42; - Type NewVal = 24; - Type V = OldVal; - EXPECT_TRUE(atomic_compare_exchange_strong(reinterpret_cast<T *>(&V), &OldVal, + { + Type OldVal = 42; + Type NewVal = 24; + Type V = OldVal; + EXPECT_TRUE(atomic_compare_exchange_strong( + reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed)); + EXPECT_FALSE(atomic_compare_exchange_strong( + reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed)); + EXPECT_EQ(NewVal, OldVal); + } + { + Type OldVal = 42; + Type NewVal = 24; + Type V = OldVal; + EXPECT_TRUE(atomic_compare_exchange_weak(reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed)); - EXPECT_FALSE(atomic_compare_exchange_strong( - reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed)); - EXPECT_EQ(NewVal, OldVal); + EXPECT_FALSE(atomic_compare_exchange_weak( + reinterpret_cast<T *>(&V), &OldVal, NewVal, memory_order_relaxed)); + EXPECT_EQ(NewVal, OldVal); + } } TEST(ScudoAtomicTest, AtomicCompareExchangeTest) { diff --git a/standalone/tests/checksum_test.cpp b/standalone/tests/checksum_test.cpp index 781f990ecb7..361d33c7e46 100644 --- a/standalone/tests/checksum_test.cpp +++ b/standalone/tests/checksum_test.cpp @@ -41,10 +41,10 @@ template <ComputeChecksum F> void verifyChecksumFunctionBitFlip() { scudo::u8 IdenticalChecksums = 0; for (scudo::uptr I = 0; I < ArraySize; I++) { for (scudo::uptr J = 0; J < SCUDO_WORDSIZE; J++) { - Array[I] ^= scudo::uptr{1} << J; + Array[I] ^= 1U << J; if (F(Seed, Array, ArraySize) == Reference) IdenticalChecksums++; - Array[I] ^= scudo::uptr{1} << J; + Array[I] ^= 
1U << J; } } // Allow for a couple of identical checksums over the whole set of flips. diff --git a/standalone/tests/chunk_test.cpp b/standalone/tests/chunk_test.cpp index 6458e23e142..13da70eff85 100644 --- a/standalone/tests/chunk_test.cpp +++ b/standalone/tests/chunk_test.cpp @@ -41,7 +41,7 @@ TEST(ScudoChunkTest, ChunkCmpXchg) { initChecksum(); const scudo::uptr Size = 0x100U; scudo::Chunk::UnpackedHeader OldHeader = {}; - OldHeader.OriginOrWasZeroed = scudo::Chunk::Origin::Malloc; + OldHeader.Origin = scudo::Chunk::Origin::Malloc; OldHeader.ClassId = 0x42U; OldHeader.SizeOrUnusedBytes = Size; OldHeader.State = scudo::Chunk::State::Allocated; diff --git a/standalone/tests/combined_test.cpp b/standalone/tests/combined_test.cpp index 5db249d0a85..a2c06182a68 100644 --- a/standalone/tests/combined_test.cpp +++ b/standalone/tests/combined_test.cpp @@ -12,18 +12,17 @@ #include "combined.h" #include <condition_variable> -#include <memory> #include <mutex> -#include <set> -#include <stdlib.h> #include <thread> #include <vector> +static std::mutex Mutex; +static std::condition_variable Cv; +static bool Ready = false; + static constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc; -static constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U); -// Fuchsia complains that the function is not used. -UNUSED static void disableDebuggerdMaybe() { +static void disableDebuggerdMaybe() { #if SCUDO_ANDROID // Disable the debuggerd signal handler on Android, without this we can end // up spending a significant amount of time creating tombstones. @@ -32,7 +31,12 @@ UNUSED static void disableDebuggerdMaybe() { } template <class AllocatorT> -bool isPrimaryAllocation(scudo::uptr Size, scudo::uptr Alignment) { +bool isTaggedAllocation(AllocatorT *Allocator, scudo::uptr Size, + scudo::uptr Alignment) { + if (!Allocator->useMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + return false; + const scudo::uptr MinAlignment = 1UL << SCUDO_MIN_ALIGNMENT_LOG; if (Alignment < MinAlignment) Alignment = MinAlignment; @@ -45,110 +49,46 @@ bool isPrimaryAllocation(scudo::uptr Size, scudo::uptr Alignment) { template <class AllocatorT> void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size, scudo::uptr Alignment) { - const scudo::uptr MinAlignment = 1UL << SCUDO_MIN_ALIGNMENT_LOG; - Size = scudo::roundUpTo(Size, MinAlignment); - if (Allocator->useMemoryTaggingTestOnly()) - EXPECT_DEATH( - { - disableDebuggerdMaybe(); - reinterpret_cast<char *>(P)[-1] = 0xaa; - }, - ""); - if (isPrimaryAllocation<AllocatorT>(Size, Alignment) - ? 
Allocator->useMemoryTaggingTestOnly() - : Alignment == MinAlignment) { - EXPECT_DEATH( - { - disableDebuggerdMaybe(); - reinterpret_cast<char *>(P)[Size] = 0xaa; - }, - ""); - } -} - -template <typename Config> struct TestAllocator : scudo::Allocator<Config> { - TestAllocator() { - this->reset(); - this->initThreadMaybe(); - if (scudo::archSupportsMemoryTagging() && - !scudo::systemDetectsMemoryTagFaultsTestOnly()) - this->disableMemoryTagging(); - } - ~TestAllocator() { this->unmapTestOnly(); } - - void *operator new(size_t size) { - void *p = nullptr; - EXPECT_EQ(0, posix_memalign(&p, alignof(TestAllocator), size)); - return p; - } - - void operator delete(void *ptr) { free(ptr); } -}; - -template <class TypeParam> struct ScudoCombinedTest : public Test { - ScudoCombinedTest() { - UseQuarantine = std::is_same<TypeParam, scudo::AndroidConfig>::value; - Allocator = std::make_unique<AllocatorT>(); - } - ~ScudoCombinedTest() { - Allocator->releaseToOS(); - UseQuarantine = true; - } - - void RunTest(); - - void BasicTest(scudo::uptr SizeLogMin, scudo::uptr SizeLogMax); + if (!isTaggedAllocation(Allocator, Size, Alignment)) + return; - using AllocatorT = TestAllocator<TypeParam>; - std::unique_ptr<AllocatorT> Allocator; -}; + Size = scudo::roundUpTo(Size, scudo::archMemoryTagGranuleSize()); + EXPECT_DEATH( + { + disableDebuggerdMaybe(); + reinterpret_cast<char *>(P)[-1] = 0xaa; + }, + ""); + EXPECT_DEATH( + { + disableDebuggerdMaybe(); + reinterpret_cast<char *>(P)[Size] = 0xaa; + }, + ""); +} -#if SCUDO_FUCHSIA -#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidSvelteConfig) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, FuchsiaConfig) -#else -#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidSvelteConfig) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, DefaultConfig) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, AndroidConfig) -#endif +template <class Config> static void testAllocator() { + using AllocatorT = scudo::Allocator<Config>; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, + Deleter); + Allocator->reset(); -#define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE) \ - using FIXTURE##NAME##_##TYPE = FIXTURE##NAME<scudo::TYPE>; \ - TEST_F(FIXTURE##NAME##_##TYPE, NAME) { Run(); } - -#define SCUDO_TYPED_TEST(FIXTURE, NAME) \ - template <class TypeParam> \ - struct FIXTURE##NAME : public FIXTURE<TypeParam> { \ - void Run(); \ - }; \ - SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run() - -SCUDO_TYPED_TEST(ScudoCombinedTest, IsOwned) { - auto *Allocator = this->Allocator.get(); - static scudo::u8 StaticBuffer[scudo::Chunk::getHeaderSize() + 1]; - EXPECT_FALSE( - Allocator->isOwned(&StaticBuffer[scudo::Chunk::getHeaderSize()])); - - scudo::u8 StackBuffer[scudo::Chunk::getHeaderSize() + 1]; - for (scudo::uptr I = 0; I < sizeof(StackBuffer); I++) - StackBuffer[I] = 0x42U; - EXPECT_FALSE(Allocator->isOwned(&StackBuffer[scudo::Chunk::getHeaderSize()])); - for (scudo::uptr I = 0; I < sizeof(StackBuffer); I++) - EXPECT_EQ(StackBuffer[I], 0x42U); -} + EXPECT_FALSE(Allocator->isOwned(&Mutex)); + EXPECT_FALSE(Allocator->isOwned(&Allocator)); + scudo::u64 StackVariable = 0x42424242U; + EXPECT_FALSE(Allocator->isOwned(&StackVariable)); + EXPECT_EQ(StackVariable, 0x42424242U); -template <class Config> -void ScudoCombinedTest<Config>::BasicTest(scudo::uptr SizeLogMin, - 
scudo::uptr SizeLogMax) { - auto *Allocator = this->Allocator.get(); + constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U); // This allocates and deallocates a bunch of chunks, with a wide range of // sizes and alignments, with a focus on sizes that could trigger weird // behaviors (plus or minus a small delta of a power of two for example). - for (scudo::uptr SizeLog = SizeLogMin; SizeLog <= SizeLogMax; SizeLog++) { + for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { for (scudo::uptr AlignLog = MinAlignLog; AlignLog <= 16U; AlignLog++) { const scudo::uptr Align = 1U << AlignLog; for (scudo::sptr Delta = -32; Delta <= 32; Delta++) { @@ -161,20 +101,12 @@ void ScudoCombinedTest<Config>::BasicTest(scudo::uptr SizeLogMin, EXPECT_TRUE(scudo::isAligned(reinterpret_cast<scudo::uptr>(P), Align)); EXPECT_LE(Size, Allocator->getUsableSize(P)); memset(P, 0xaa, Size); - checkMemoryTaggingMaybe(Allocator, P, Size, Align); + checkMemoryTaggingMaybe(Allocator.get(), P, Size, Align); Allocator->deallocate(P, Origin, Size); } } } -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined0) { this->BasicTest(0, 16); } -SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined1) { this->BasicTest(17, 18); } -SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined2) { this->BasicTest(19, 19); } -SCUDO_TYPED_TEST(ScudoCombinedTest, BasicCombined3) { this->BasicTest(20, 20); } - -SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroContents) { - auto *Allocator = this->Allocator.get(); + Allocator->releaseToOS(); // Ensure that specifying ZeroContents returns a zero'd out block. for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { @@ -183,60 +115,12 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroContents) { void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true); EXPECT_NE(P, nullptr); for (scudo::uptr I = 0; I < Size; I++) - ASSERT_EQ((reinterpret_cast<char *>(P))[I], 0); + EXPECT_EQ((reinterpret_cast<char *>(P))[I], 0); memset(P, 0xaa, Size); Allocator->deallocate(P, Origin, Size); } } -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, ZeroFill) { - auto *Allocator = this->Allocator.get(); - - // Ensure that specifying ZeroContents returns a zero'd out block. - Allocator->setFillContents(scudo::ZeroFill); - for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { - for (scudo::uptr Delta = 0U; Delta <= 4U; Delta++) { - const scudo::uptr Size = (1U << SizeLog) + Delta * 128U; - void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, false); - EXPECT_NE(P, nullptr); - for (scudo::uptr I = 0; I < Size; I++) - ASSERT_EQ((reinterpret_cast<char *>(P))[I], 0); - memset(P, 0xaa, Size); - Allocator->deallocate(P, Origin, Size); - } - } -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, PatternOrZeroFill) { - auto *Allocator = this->Allocator.get(); - - // Ensure that specifying PatternOrZeroFill returns a pattern or zero filled - // block. The primary allocator only produces pattern filled blocks if MTE - // is disabled, so we only require pattern filled blocks in that case. 
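Put differently, the acceptance rule that the deleted loop below encodes is roughly the following sketch (an interpretation, not code from this file; scudo::PatternFillByte is the allocator's fill constant):

    // A byte B of a fresh PatternOrZeroFill allocation is acceptable if:
    // - it came from the primary with MTE disabled: B must equal
    //   scudo::PatternFillByte;
    // - otherwise: B may be PatternFillByte or 0, since tagged or
    //   secondary memory may be zeroed instead of pattern-filled.
    bool acceptableFill(unsigned char B, bool PrimaryWithoutMTE) {
      if (PrimaryWithoutMTE)
        return B == scudo::PatternFillByte;
      return B == scudo::PatternFillByte || B == 0;
    }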
- Allocator->setFillContents(scudo::PatternOrZeroFill); - for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { - for (scudo::uptr Delta = 0U; Delta <= 4U; Delta++) { - const scudo::uptr Size = (1U << SizeLog) + Delta * 128U; - void *P = Allocator->allocate(Size, Origin, 1U << MinAlignLog, false); - EXPECT_NE(P, nullptr); - for (scudo::uptr I = 0; I < Size; I++) { - unsigned char V = (reinterpret_cast<unsigned char *>(P))[I]; - if (isPrimaryAllocation<TestAllocator<TypeParam>>(Size, - 1U << MinAlignLog) && - !Allocator->useMemoryTaggingTestOnly()) - ASSERT_EQ(V, scudo::PatternFillByte); - else - ASSERT_TRUE(V == scudo::PatternFillByte || V == 0); - } - memset(P, 0xaa, Size); - Allocator->deallocate(P, Origin, Size); - } - } -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, BlockReuse) { - auto *Allocator = this->Allocator.get(); + Allocator->releaseToOS(); // Verify that a chunk will end up being reused, at some point. const scudo::uptr NeedleSize = 1024U; @@ -245,20 +129,18 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, BlockReuse) { bool Found = false; for (scudo::uptr I = 0; I < 1024U && !Found; I++) { void *P = Allocator->allocate(NeedleSize, Origin); - if (Allocator->getHeaderTaggedPointer(P) == - Allocator->getHeaderTaggedPointer(NeedleP)) + if (Allocator->untagPointerMaybe(P) == + Allocator->untagPointerMaybe(NeedleP)) Found = true; Allocator->deallocate(P, Origin); } EXPECT_TRUE(Found); -} -SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateLarge) { - auto *Allocator = this->Allocator.get(); + constexpr scudo::uptr MaxSize = Config::Primary::SizeClassMap::MaxSize; // Reallocate a large chunk all the way down to a byte, verifying that we // preserve the data in the process. - scudo::uptr Size = TypeParam::Primary::SizeClassMap::MaxSize * 2; + scudo::uptr Size = MaxSize * 2; const scudo::uptr DataSize = 2048U; void *P = Allocator->allocate(Size, Origin); const char Marker = 0xab; @@ -272,19 +154,13 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateLarge) { P = NewP; } Allocator->deallocate(P, Origin); -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateSame) { - auto *Allocator = this->Allocator.get(); // Check that reallocating a chunk to a slightly smaller or larger size // returns the same chunk. This requires that all the sizes we iterate on use // the same block size, but that should be the case for MaxSize - 64 with our // default class size maps. - constexpr scudo::uptr ReallocSize = - TypeParam::Primary::SizeClassMap::MaxSize - 64; - void *P = Allocator->allocate(ReallocSize, Origin); - const char Marker = 0xab; + constexpr scudo::uptr ReallocSize = MaxSize - 64; + P = Allocator->allocate(ReallocSize, Origin); memset(P, Marker, ReallocSize); for (scudo::sptr Delta = -32; Delta < 32; Delta += 8) { const scudo::uptr NewSize = ReallocSize + Delta; @@ -292,24 +168,17 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateSame) { EXPECT_EQ(NewP, P); for (scudo::uptr I = 0; I < ReallocSize - 32; I++) EXPECT_EQ((reinterpret_cast<char *>(NewP))[I], Marker); - checkMemoryTaggingMaybe(Allocator, NewP, NewSize, 0); + checkMemoryTaggingMaybe(Allocator.get(), NewP, NewSize, 0); } Allocator->deallocate(P, Origin); -} -SCUDO_TYPED_TEST(ScudoCombinedTest, IterateOverChunks) { - auto *Allocator = this->Allocator.get(); // Allocates a bunch of chunks, then iterate over all the chunks, ensuring // they are the ones we allocated. This requires the allocator to not have any // other allocated chunk at this point (eg: won't work with the Quarantine). - // FIXME: Make it work with UseQuarantine and tagging enabled. 
Internals of - // iterateOverChunks reads header by tagged and non-tagger pointers so one of - // them will fail. if (!UseQuarantine) { std::vector<void *> V; for (scudo::uptr I = 0; I < 64U; I++) - V.push_back(Allocator->allocate( - rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin)); + V.push_back(Allocator->allocate(rand() % (MaxSize / 2U), Origin)); Allocator->disable(); Allocator->iterateOverChunks( 0U, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1), @@ -320,42 +189,46 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, IterateOverChunks) { }, reinterpret_cast<void *>(&V)); Allocator->enable(); - for (auto P : V) - Allocator->deallocate(P, Origin); + while (!V.empty()) { + Allocator->deallocate(V.back(), Origin); + V.pop_back(); + } } -} -SCUDO_TYPED_TEST(ScudoCombinedTest, UseAfterFree) { - auto *Allocator = this->Allocator.get(); - - // Check that use-after-free is detected. - for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { - const scudo::uptr Size = 1U << SizeLog; - if (!Allocator->useMemoryTaggingTestOnly()) - continue; - EXPECT_DEATH( - { - disableDebuggerdMaybe(); - void *P = Allocator->allocate(Size, Origin); - Allocator->deallocate(P, Origin); - reinterpret_cast<char *>(P)[0] = 0xaa; - }, - ""); - EXPECT_DEATH( - { - disableDebuggerdMaybe(); - void *P = Allocator->allocate(Size, Origin); - Allocator->deallocate(P, Origin); - reinterpret_cast<char *>(P)[Size - 1] = 0xaa; - }, - ""); - } -} + Allocator->releaseToOS(); -SCUDO_TYPED_TEST(ScudoCombinedTest, DisableMemoryTagging) { - auto *Allocator = this->Allocator.get(); + if (Allocator->useMemoryTagging() && + scudo::systemDetectsMemoryTagFaultsTestOnly()) { + // Check that use-after-free is detected. + for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) { + const scudo::uptr Size = 1U << SizeLog; + if (!isTaggedAllocation(Allocator.get(), Size, 1)) + continue; + // UAF detection is probabilistic, so we repeat the test up to 256 times + // if necessary. With 15 possible tags this means a 1 in 15^256 chance of + // a false positive. + EXPECT_DEATH( + { + disableDebuggerdMaybe(); + for (unsigned I = 0; I != 256; ++I) { + void *P = Allocator->allocate(Size, Origin); + Allocator->deallocate(P, Origin); + reinterpret_cast<char *>(P)[0] = 0xaa; + } + }, + ""); + EXPECT_DEATH( + { + disableDebuggerdMaybe(); + for (unsigned I = 0; I != 256; ++I) { + void *P = Allocator->allocate(Size, Origin); + Allocator->deallocate(P, Origin); + reinterpret_cast<char *>(P)[Size - 1] = 0xaa; + } + }, + ""); + } - if (Allocator->useMemoryTaggingTestOnly()) { // Check that disabling memory tagging works correctly. void *P = Allocator->allocate(2048, Origin); EXPECT_DEATH(reinterpret_cast<char *>(P)[2048] = 0xaa, ""); @@ -365,7 +238,7 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, DisableMemoryTagging) { Allocator->deallocate(P, Origin); P = Allocator->allocate(2048, Origin); - EXPECT_EQ(scudo::untagPointer(P), P); + EXPECT_EQ(Allocator->untagPointerMaybe(P), P); reinterpret_cast<char *>(P)[2048] = 0xaa; Allocator->deallocate(P, Origin); @@ -375,10 +248,6 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, DisableMemoryTagging) { // Re-enable them now. 
scudo::enableMemoryTagChecksTestOnly(); } -} - -SCUDO_TYPED_TEST(ScudoCombinedTest, Stats) { - auto *Allocator = this->Allocator.get(); scudo::uptr BufferSize = 8192; std::vector<char> Buffer(BufferSize); @@ -396,52 +265,63 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, Stats) { EXPECT_NE(Stats.find("Stats: Quarantine"), std::string::npos); } -SCUDO_TYPED_TEST(ScudoCombinedTest, CacheDrain) { - auto *Allocator = this->Allocator.get(); +// Test that multiple instantiations of the allocator have not messed up the +// process's signal handlers (GWP-ASan used to do this). +void testSEGV() { + const scudo::uptr Size = 4 * scudo::getPageSizeCached(); + scudo::MapPlatformData Data = {}; + void *P = scudo::map(nullptr, Size, "testSEGV", MAP_NOACCESS, &Data); + EXPECT_NE(P, nullptr); + EXPECT_DEATH(memset(P, 0xaa, Size), ""); + scudo::unmap(P, Size, UNMAP_ALL, &Data); +} - std::vector<void *> V; - for (scudo::uptr I = 0; I < 64U; I++) - V.push_back(Allocator->allocate( - rand() % (TypeParam::Primary::SizeClassMap::MaxSize / 2U), Origin)); - for (auto P : V) - Allocator->deallocate(P, Origin); +TEST(ScudoCombinedTest, BasicCombined) { + UseQuarantine = false; + testAllocator<scudo::AndroidSvelteConfig>(); +#if SCUDO_FUCHSIA + testAllocator<scudo::FuchsiaConfig>(); +#else + testAllocator<scudo::DefaultConfig>(); + UseQuarantine = true; + testAllocator<scudo::AndroidConfig>(); + testSEGV(); +#endif +} - bool UnlockRequired; - auto *TSD = Allocator->getTSDRegistry()->getTSDAndLock(&UnlockRequired); - EXPECT_TRUE(!TSD->Cache.isEmpty()); - TSD->Cache.drain(); - EXPECT_TRUE(TSD->Cache.isEmpty()); - if (UnlockRequired) - TSD->unlock(); +template <typename AllocatorT> static void stressAllocator(AllocatorT *A) { + { + std::unique_lock<std::mutex> Lock(Mutex); + while (!Ready) + Cv.wait(Lock); + } + std::vector<std::pair<void *, scudo::uptr>> V; + for (scudo::uptr I = 0; I < 256U; I++) { + const scudo::uptr Size = std::rand() % 4096U; + void *P = A->allocate(Size, Origin); + // A region could have run out of memory, resulting in a null P. + if (P) + V.push_back(std::make_pair(P, Size)); + } + while (!V.empty()) { + auto Pair = V.back(); + A->deallocate(Pair.first, Origin, Pair.second); + V.pop_back(); + } } -SCUDO_TYPED_TEST(ScudoCombinedTest, ThreadedCombined) { - std::mutex Mutex; - std::condition_variable Cv; - bool Ready = false; - auto *Allocator = this->Allocator.get(); +template <class Config> static void testAllocatorThreaded() { + using AllocatorT = scudo::Allocator<Config>; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, + Deleter); + Allocator->reset(); std::thread Threads[32]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread([&]() { - { - std::unique_lock<std::mutex> Lock(Mutex); - while (!Ready) - Cv.wait(Lock); - } - std::vector<std::pair<void *, scudo::uptr>> V; - for (scudo::uptr I = 0; I < 256U; I++) { - const scudo::uptr Size = std::rand() % 4096U; - void *P = Allocator->allocate(Size, Origin); - // A region could have run out of memory, resulting in a null P.
- if (P) - V.push_back(std::make_pair(P, Size)); - } - while (!V.empty()) { - auto Pair = V.back(); - Allocator->deallocate(Pair.first, Origin, Pair.second); - V.pop_back(); - } - }); + Threads[I] = std::thread(stressAllocator<AllocatorT>, Allocator.get()); { std::unique_lock<std::mutex> Lock(Mutex); Ready = true; @@ -452,21 +332,16 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ThreadedCombined) { Allocator->releaseToOS(); } +TEST(ScudoCombinedTest, ThreadedCombined) { + UseQuarantine = false; + testAllocatorThreaded<scudo::AndroidSvelteConfig>(); #if SCUDO_FUCHSIA -#define SKIP_ON_FUCHSIA(T) DISABLED_##T + testAllocatorThreaded<scudo::FuchsiaConfig>(); #else -#define SKIP_ON_FUCHSIA(T) T + testAllocatorThreaded<scudo::DefaultConfig>(); + UseQuarantine = true; + testAllocatorThreaded<scudo::AndroidConfig>(); #endif - -// Test that multiple instantiations of the allocator have not messed up the -// process's signal handlers (GWP-ASan used to do this). -TEST(ScudoCombinedTest, SKIP_ON_FUCHSIA(testSEGV)) { - const scudo::uptr Size = 4 * scudo::getPageSizeCached(); - scudo::MapPlatformData Data = {}; - void *P = scudo::map(nullptr, Size, "testSEGV", MAP_NOACCESS, &Data); - EXPECT_NE(P, nullptr); - EXPECT_DEATH(memset(P, 0xaa, Size), ""); - scudo::unmap(P, Size, UNMAP_ALL, &Data); } struct DeathSizeClassConfig { @@ -480,24 +355,23 @@ struct DeathSizeClassConfig { static const scudo::uptr DeathRegionSizeLog = 20U; struct DeathConfig { - static const bool MaySupportMemoryTagging = false; - // Tiny allocator, its Primary only serves chunks of four sizes. - using SizeClassMap = scudo::FixedSizeClassMap<DeathSizeClassConfig>; - typedef scudo::SizeClassAllocator64<DeathConfig> Primary; - static const scudo::uptr PrimaryRegionSizeLog = DeathRegionSizeLog; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; - - typedef scudo::MapAllocatorNoCache SecondaryCache; - template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>; + using DeathSizeClassMap = scudo::FixedSizeClassMap<DeathSizeClassConfig>; + typedef scudo::SizeClassAllocator64<DeathSizeClassMap, DeathRegionSizeLog> + Primary; + typedef scudo::MapAllocator<scudo::MapAllocatorNoCache> Secondary; + template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U>; }; TEST(ScudoCombinedTest, DeathCombined) { - using AllocatorT = TestAllocator<DeathConfig>; - auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + using AllocatorT = scudo::Allocator<DeathConfig>; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, + Deleter); + Allocator->reset(); const scudo::uptr Size = 1000U; void *P = Allocator->allocate(Size, Origin); @@ -531,8 +405,14 @@ TEST(ScudoCombinedTest, DeathCombined) { // Ensure that releaseToOS can be called prior to any other allocator // operation without issue. 
TEST(ScudoCombinedTest, ReleaseToOS) { - using AllocatorT = TestAllocator<DeathConfig>; - auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + using AllocatorT = scudo::Allocator<DeathConfig>; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, + Deleter); + Allocator->reset(); Allocator->releaseToOS(); } @@ -540,19 +420,25 @@ TEST(ScudoCombinedTest, ReleaseToOS) { // Verify that when a region gets full, the allocator will still manage to // fulfill the allocation through a larger size class. TEST(ScudoCombinedTest, FullRegion) { - using AllocatorT = TestAllocator<DeathConfig>; - auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + using AllocatorT = scudo::Allocator<DeathConfig>; + auto Deleter = [](AllocatorT *A) { + A->unmapTestOnly(); + delete A; + }; + std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, + Deleter); + Allocator->reset(); std::vector<void *> V; scudo::uptr FailedAllocationsCount = 0; for (scudo::uptr ClassId = 1U; - ClassId <= DeathConfig::SizeClassMap::LargestClassId; ClassId++) { + ClassId <= DeathConfig::DeathSizeClassMap::LargestClassId; ClassId++) { const scudo::uptr Size = - DeathConfig::SizeClassMap::getSizeByClassId(ClassId); + DeathConfig::DeathSizeClassMap::getSizeByClassId(ClassId); // Allocate enough to fill all of the regions above this one. const scudo::uptr MaxNumberOfChunks = ((1U << DeathRegionSizeLog) / Size) * - (DeathConfig::SizeClassMap::LargestClassId - ClassId + 1); + (DeathConfig::DeathSizeClassMap::LargestClassId - ClassId + 1); void *P; for (scudo::uptr I = 0; I <= MaxNumberOfChunks; I++) { P = Allocator->allocate(Size - 64U, Origin); @@ -568,83 +454,3 @@ TEST(ScudoCombinedTest, FullRegion) { } EXPECT_EQ(FailedAllocationsCount, 0U); } - -TEST(ScudoCombinedTest, OddEven) { - using AllocatorT = TestAllocator<scudo::AndroidConfig>; - using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap; - auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); - - if (!Allocator->useMemoryTaggingTestOnly()) - return; - - auto CheckOddEven = [](scudo::uptr P1, scudo::uptr P2) { - scudo::uptr Tag1 = scudo::extractTag(scudo::loadTag(P1)); - scudo::uptr Tag2 = scudo::extractTag(scudo::loadTag(P2)); - EXPECT_NE(Tag1 % 2, Tag2 % 2); - }; - - for (scudo::uptr ClassId = 1U; ClassId <= SizeClassMap::LargestClassId; - ClassId++) { - const scudo::uptr Size = SizeClassMap::getSizeByClassId(ClassId); - - std::set<scudo::uptr> Ptrs; - bool Found = false; - for (unsigned I = 0; I != 65536; ++I) { - scudo::uptr P = scudo::untagPointer(reinterpret_cast<scudo::uptr>( - Allocator->allocate(Size - scudo::Chunk::getHeaderSize(), Origin))); - if (Ptrs.count(P - Size)) { - Found = true; - CheckOddEven(P, P - Size); - break; - } - if (Ptrs.count(P + Size)) { - Found = true; - CheckOddEven(P, P + Size); - break; - } - Ptrs.insert(P); - } - EXPECT_TRUE(Found); - } -} - -TEST(ScudoCombinedTest, DisableMemInit) { - using AllocatorT = TestAllocator<scudo::AndroidConfig>; - using SizeClassMap = AllocatorT::PrimaryT::SizeClassMap; - auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); - - std::vector<void *> Ptrs(65536, nullptr); - - Allocator->setOption(scudo::Option::ThreadDisableMemInit, 1); - - constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U); - - // Test that if mem-init is disabled on a thread, calloc should still work as - // expected. 
This is tricky to ensure when MTE is enabled, so this test tries - // to exercise the relevant code on our MTE path. - for (scudo::uptr ClassId = 1U; ClassId <= 8; ClassId++) { - const scudo::uptr Size = - SizeClassMap::getSizeByClassId(ClassId) - scudo::Chunk::getHeaderSize(); - if (Size < 8) - continue; - for (unsigned I = 0; I != Ptrs.size(); ++I) { - Ptrs[I] = Allocator->allocate(Size, Origin); - memset(Ptrs[I], 0xaa, Size); - } - for (unsigned I = 0; I != Ptrs.size(); ++I) - Allocator->deallocate(Ptrs[I], Origin, Size); - for (unsigned I = 0; I != Ptrs.size(); ++I) { - Ptrs[I] = Allocator->allocate(Size - 8, Origin); - memset(Ptrs[I], 0xbb, Size - 8); - } - for (unsigned I = 0; I != Ptrs.size(); ++I) - Allocator->deallocate(Ptrs[I], Origin, Size - 8); - for (unsigned I = 0; I != Ptrs.size(); ++I) { - Ptrs[I] = Allocator->allocate(Size, Origin, 1U << MinAlignLog, true); - for (scudo::uptr J = 0; J < Size; ++J) - ASSERT_EQ((reinterpret_cast<char *>(Ptrs[I]))[J], 0); - } - } - - Allocator->setOption(scudo::Option::ThreadDisableMemInit, 0); -} diff --git a/standalone/tests/mutex_test.cpp b/standalone/tests/mutex_test.cpp index ed56cb5219e..ce715a19332 100644 --- a/standalone/tests/mutex_test.cpp +++ b/standalone/tests/mutex_test.cpp @@ -52,7 +52,7 @@ private: static const scudo::u32 Size = 64U; typedef scudo::u64 T; scudo::HybridMutex &Mutex; - alignas(SCUDO_CACHE_LINE_SIZE) T Data[Size]; + ALIGNED(SCUDO_CACHE_LINE_SIZE) T Data[Size]; }; const scudo::u32 NumberOfThreads = 8; diff --git a/standalone/tests/primary_test.cpp b/standalone/tests/primary_test.cpp index e7aa6f795b6..010bf84490e 100644 --- a/standalone/tests/primary_test.cpp +++ b/standalone/tests/primary_test.cpp @@ -14,7 +14,6 @@ #include <condition_variable> #include <mutex> -#include <stdlib.h> #include <thread> #include <vector> @@ -22,90 +21,16 @@ // 32-bit architectures. It's not something we want to encourage, but we still // should ensure the tests pass. 
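As context for the rewrite below: both the old and the new versions drive the primary through a per-thread cache rather than allocating from it directly. The basic lifecycle, condensed from testPrimary (Size is a placeholder; real callers check Primary::canAllocate(Size) first):

    Primary Allocator;
    Allocator.init(/*ReleaseToOsInterval=*/-1);
    Primary::CacheT Cache;
    Cache.init(/*Stats=*/nullptr, &Allocator);
    const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
    void *P = Cache.allocate(ClassId); // may be null if the region is full
    if (P)
      Cache.deallocate(ClassId, P);
    Cache.destroy(/*Stats=*/nullptr);
    Allocator.unmapTestOnly();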
-struct TestConfig1 { - static const scudo::uptr PrimaryRegionSizeLog = 18U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = false; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; -}; - -struct TestConfig2 { - static const scudo::uptr PrimaryRegionSizeLog = 24U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = false; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; -}; - -struct TestConfig3 { - static const scudo::uptr PrimaryRegionSizeLog = 24U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = true; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; -}; - -template <typename BaseConfig, typename SizeClassMapT> -struct Config : public BaseConfig { - using SizeClassMap = SizeClassMapT; -}; - -template <typename BaseConfig, typename SizeClassMapT> -struct SizeClassAllocator - : public scudo::SizeClassAllocator64<Config<BaseConfig, SizeClassMapT>> {}; -template <typename SizeClassMapT> -struct SizeClassAllocator<TestConfig1, SizeClassMapT> - : public scudo::SizeClassAllocator32<Config<TestConfig1, SizeClassMapT>> {}; - -template <typename BaseConfig, typename SizeClassMapT> -struct TestAllocator : public SizeClassAllocator<BaseConfig, SizeClassMapT> { - ~TestAllocator() { this->unmapTestOnly(); } - - void *operator new(size_t size) { - void *p = nullptr; - EXPECT_EQ(0, posix_memalign(&p, alignof(TestAllocator), size)); - return p; - } - - void operator delete(void *ptr) { free(ptr); } -}; - -template <class BaseConfig> struct ScudoPrimaryTest : public Test {}; - -#if SCUDO_FUCHSIA -#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3) -#else -#define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2) \ - SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3) -#endif - -#define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE) \ - using FIXTURE##NAME##_##TYPE = FIXTURE##NAME<TYPE>; \ - TEST_F(FIXTURE##NAME##_##TYPE, NAME) { Run(); } - -#define SCUDO_TYPED_TEST(FIXTURE, NAME) \ - template <class TypeParam> \ - struct FIXTURE##NAME : public FIXTURE<TypeParam> { \ - void Run(); \ - }; \ - SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME) \ - template <class TypeParam> void FIXTURE##NAME<TypeParam>::Run() - -SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) { - using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>; - std::unique_ptr<Primary> Allocator(new Primary); +template <typename Primary> static void testPrimary() { + const scudo::uptr NumberOfAllocations = 32U; + auto Deleter = [](Primary *P) { + P->unmapTestOnly(); + delete P; + }; + std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter); Allocator->init(/*ReleaseToOsInterval=*/-1); typename Primary::CacheT Cache; Cache.init(nullptr, Allocator.get()); - const scudo::uptr NumberOfAllocations = 32U; for (scudo::uptr I = 0; I <= 16U; I++) { const scudo::uptr Size = 1UL << I; if 
(!Primary::canAllocate(Size)) @@ -127,20 +52,19 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) { Str.output(); } -struct SmallRegionsConfig { +TEST(ScudoPrimaryTest, BasicPrimary) { using SizeClassMap = scudo::DefaultSizeClassMap; - static const scudo::uptr PrimaryRegionSizeLog = 20U; - static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX; - static const bool MaySupportMemoryTagging = false; - typedef scudo::uptr PrimaryCompactPtrT; - static const scudo::uptr PrimaryCompactPtrScale = 0; -}; +#if !SCUDO_FUCHSIA + testPrimary<scudo::SizeClassAllocator32<SizeClassMap, 18U>>(); +#endif + testPrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U>>(); + testPrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>(); +} // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes. // For the 32-bit one, it requires actually exhausting memory, so we skip it. TEST(ScudoPrimaryTest, Primary64OOM) { - using Primary = scudo::SizeClassAllocator64<SmallRegionsConfig>; + using Primary = scudo::SizeClassAllocator64<scudo::DefaultSizeClassMap, 20U>; using TransferBatch = Primary::CacheT::TransferBatch; Primary Allocator; Allocator.init(/*ReleaseToOsInterval=*/-1); @@ -159,7 +83,7 @@ TEST(ScudoPrimaryTest, Primary64OOM) { break; } for (scudo::u32 J = 0; J < B->getCount(); J++) - memset(Allocator.decompactPtr(ClassId, B->get(J)), 'B', Size); + memset(B->get(J), 'B', Size); Batches.push_back(B); } while (!Batches.empty()) { @@ -175,9 +99,12 @@ TEST(ScudoPrimaryTest, Primary64OOM) { Allocator.unmapTestOnly(); } -SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryIterate) { - using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>; - std::unique_ptr<Primary> Allocator(new Primary); +template <typename Primary> static void testIteratePrimary() { + auto Deleter = [](Primary *P) { + P->unmapTestOnly(); + delete P; + }; + std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter); Allocator->init(/*ReleaseToOsInterval=*/-1); typename Primary::CacheT Cache; Cache.init(nullptr, Allocator.get()); @@ -211,40 +138,53 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryIterate) { Str.output(); } -SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { - using Primary = TestAllocator<TypeParam, scudo::SvelteSizeClassMap>; - std::unique_ptr<Primary> Allocator(new Primary); +TEST(ScudoPrimaryTest, PrimaryIterate) { + using SizeClassMap = scudo::DefaultSizeClassMap; +#if !SCUDO_FUCHSIA + testIteratePrimary<scudo::SizeClassAllocator32<SizeClassMap, 18U>>(); +#endif + testIteratePrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U>>(); + testIteratePrimary<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>(); +} + +static std::mutex Mutex; +static std::condition_variable Cv; +static bool Ready = false; + +template <typename Primary> static void performAllocations(Primary *Allocator) { + static THREADLOCAL typename Primary::CacheT Cache; + Cache.init(nullptr, Allocator); + std::vector<std::pair<scudo::uptr, void *>> V; + { + std::unique_lock<std::mutex> Lock(Mutex); + while (!Ready) + Cv.wait(Lock); + } + for (scudo::uptr I = 0; I < 256U; I++) { + const scudo::uptr Size = std::rand() % Primary::SizeClassMap::MaxSize / 4; + const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size); + void *P = Cache.allocate(ClassId); + if (P) + V.push_back(std::make_pair(ClassId, P)); + } + while (!V.empty()) { + auto Pair = V.back(); + Cache.deallocate(Pair.first, Pair.second); + V.pop_back(); + } + 
Cache.destroy(nullptr); +} + +template <typename Primary> static void testPrimaryThreaded() { + auto Deleter = [](Primary *P) { + P->unmapTestOnly(); + delete P; + }; + std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter); Allocator->init(/*ReleaseToOsInterval=*/-1); - std::mutex Mutex; - std::condition_variable Cv; - bool Ready = false; std::thread Threads[32]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread([&]() { - static thread_local typename Primary::CacheT Cache; - Cache.init(nullptr, Allocator.get()); - std::vector<std::pair<scudo::uptr, void *>> V; - { - std::unique_lock<std::mutex> Lock(Mutex); - while (!Ready) - Cv.wait(Lock); - } - for (scudo::uptr I = 0; I < 256U; I++) { - const scudo::uptr Size = - std::rand() % Primary::SizeClassMap::MaxSize / 4; - const scudo::uptr ClassId = - Primary::SizeClassMap::getClassIdBySize(Size); - void *P = Cache.allocate(ClassId); - if (P) - V.push_back(std::make_pair(ClassId, P)); - } - while (!V.empty()) { - auto Pair = V.back(); - Cache.deallocate(Pair.first, Pair.second); - V.pop_back(); - } - Cache.destroy(nullptr); - }); + Threads[I] = std::thread(performAllocations<Primary>, Allocator.get()); { std::unique_lock<std::mutex> Lock(Mutex); Ready = true; @@ -258,12 +198,24 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { Str.output(); } +TEST(ScudoPrimaryTest, PrimaryThreaded) { + using SizeClassMap = scudo::SvelteSizeClassMap; +#if !SCUDO_FUCHSIA + testPrimaryThreaded<scudo::SizeClassAllocator32<SizeClassMap, 18U>>(); +#endif + testPrimaryThreaded<scudo::SizeClassAllocator64<SizeClassMap, 24U>>(); + testPrimaryThreaded<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>(); +} + // Through a simple allocation that spans two pages, verify that releaseToOS // actually releases some bytes (at least one page worth). This is a regression // test for an error in how the release criteria were computed. 
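The rule being checked is that release works at page granularity: a page can be returned to the OS only when free blocks cover it entirely, which makes a block spanning two pages the smallest interesting case. A worked example (4 KiB pages assumed for illustration):

    const scudo::uptr PageSize = 4096;
    const scudo::uptr BlockSize = 6000; // freed block spanning two pages
    // Page [0, 4096): fully covered by the free block -> releasable.
    // Page [4096, 8192): only bytes [4096, 6000) are free -> stays
    // resident, so releaseToOS() can report one page here, not two.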
-SCUDO_TYPED_TEST(ScudoPrimaryTest, ReleaseToOS) { - using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>; - std::unique_ptr<Primary> Allocator(new Primary); +template <typename Primary> static void testReleaseToOS() { + auto Deleter = [](Primary *P) { + P->unmapTestOnly(); + delete P; + }; + std::unique_ptr<Primary, decltype(Deleter)> Allocator(new Primary, Deleter); Allocator->init(/*ReleaseToOsInterval=*/-1); typename Primary::CacheT Cache; Cache.init(nullptr, Allocator.get()); @@ -276,3 +228,12 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, ReleaseToOS) { Cache.destroy(nullptr); EXPECT_GT(Allocator->releaseToOS(), 0U); } + +TEST(ScudoPrimaryTest, ReleaseToOS) { + using SizeClassMap = scudo::DefaultSizeClassMap; +#if !SCUDO_FUCHSIA + testReleaseToOS<scudo::SizeClassAllocator32<SizeClassMap, 18U>>(); +#endif + testReleaseToOS<scudo::SizeClassAllocator64<SizeClassMap, 24U>>(); + testReleaseToOS<scudo::SizeClassAllocator64<SizeClassMap, 24U, true>>(); +} diff --git a/standalone/tests/quarantine_test.cpp b/standalone/tests/quarantine_test.cpp index 91de56a78c9..0422c2ff373 100644 --- a/standalone/tests/quarantine_test.cpp +++ b/standalone/tests/quarantine_test.cpp @@ -219,17 +219,12 @@ TEST(ScudoQuarantineTest, GlobalQuarantine) { Str.output(); } -struct PopulateQuarantineThread { - pthread_t Thread; - QuarantineT *Quarantine; - CacheT Cache; -}; - void *populateQuarantine(void *Param) { - PopulateQuarantineThread *P = static_cast<PopulateQuarantineThread *>(Param); - P->Cache.init(); + CacheT Cache; + Cache.init(); + QuarantineT *Quarantine = reinterpret_cast<QuarantineT *>(Param); for (scudo::uptr I = 0; I < 128UL; I++) - P->Quarantine->put(&P->Cache, Cb, FakePtr, LargeBlockSize); + Quarantine->put(&Cache, Cb, FakePtr, LargeBlockSize); return 0; } @@ -238,18 +233,13 @@ TEST(ScudoQuarantineTest, ThreadedGlobalQuarantine) { Quarantine.init(MaxQuarantineSize, MaxCacheSize); const scudo::uptr NumberOfThreads = 32U; - PopulateQuarantineThread T[NumberOfThreads]; - for (scudo::uptr I = 0; I < NumberOfThreads; I++) { - T[I].Quarantine = &Quarantine; - pthread_create(&T[I].Thread, 0, populateQuarantine, &T[I]); - } + pthread_t T[NumberOfThreads]; + for (scudo::uptr I = 0; I < NumberOfThreads; I++) + pthread_create(&T[I], 0, populateQuarantine, &Quarantine); for (scudo::uptr I = 0; I < NumberOfThreads; I++) - pthread_join(T[I].Thread, 0); + pthread_join(T[I], 0); scudo::ScopedString Str(1024); Quarantine.getStats(&Str); Str.output(); - - for (scudo::uptr I = 0; I < NumberOfThreads; I++) - Quarantine.drainAndRecycle(&T[I].Cache, Cb); } diff --git a/standalone/tests/release_test.cpp b/standalone/tests/release_test.cpp index 04c02891e91..8907520d30c 100644 --- a/standalone/tests/release_test.cpp +++ b/standalone/tests/release_test.cpp @@ -38,8 +38,7 @@ TEST(ScudoReleaseTest, PackedCounterArray) { // Make sure counters request one memory page for the buffer. 
const scudo::uptr NumCounters = (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I); - scudo::PackedCounterArray Counters(1U, NumCounters, - 1UL << ((1UL << I) - 1)); + scudo::PackedCounterArray Counters(1U, NumCounters, 1UL << ((1UL << I) - 1)); Counters.inc(0U, 0U); for (scudo::uptr C = 1; C < NumCounters - 1; C++) { EXPECT_EQ(0UL, Counters.get(0U, C)); @@ -49,7 +48,7 @@ TEST(ScudoReleaseTest, PackedCounterArray) { EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1)); Counters.inc(0U, NumCounters - 1); if (I > 0) { - Counters.incRange(0u, 0U, NumCounters - 1); + Counters.incRange(0U, 0U, NumCounters - 1); for (scudo::uptr C = 0; C < NumCounters; C++) EXPECT_EQ(2UL, Counters.get(0U, C)); } @@ -124,8 +123,6 @@ public: for (scudo::uptr I = From; I < To; I += PageSize) ReportedPages.insert(I); } - - scudo::uptr getBase() const { return 0; } }; // Simplified version of a TransferBatch. @@ -192,11 +189,9 @@ template <class SizeClassMap> void testReleaseFreeMemoryToOS() { } // Release the memory. - auto SkipRegion = [](UNUSED scudo::uptr RegionIndex) { return false; }; - auto DecompactPtr = [](scudo::uptr P) { return P; }; ReleasedPagesRecorder Recorder; - releaseFreeMemoryToOS(FreeList, MaxBlocks * BlockSize, 1U, BlockSize, - &Recorder, DecompactPtr, SkipRegion); + releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize, + &Recorder); // Verify that there are no released pages touched by used chunks and all // ranges of free chunks big enough to contain the entire memory pages had @@ -245,9 +240,7 @@ template <class SizeClassMap> void testReleaseFreeMemoryToOS() { if (InFreeRange) { scudo::uptr P = scudo::roundUpTo(CurrentFreeRangeStart, PageSize); - const scudo::uptr EndPage = - scudo::roundUpTo(MaxBlocks * BlockSize, PageSize); - while (P + PageSize <= EndPage) { + while (P + PageSize <= MaxBlocks * BlockSize) { const bool PageReleased = Recorder.ReportedPages.find(P) != Recorder.ReportedPages.end(); EXPECT_EQ(true, PageReleased); diff --git a/standalone/tests/scudo_unit_test.h b/standalone/tests/scudo_unit_test.h index 555a935254c..55d039ef77c 100644 --- a/standalone/tests/scudo_unit_test.h +++ b/standalone/tests/scudo_unit_test.h @@ -10,23 +10,16 @@ #if SCUDO_FUCHSIA #include <zxtest/zxtest.h> -using Test = ::zxtest::Test; #else #include "gtest/gtest.h" -using Test = ::testing::Test; #endif // If EXPECT_DEATH isn't defined, make it a no-op. #ifndef EXPECT_DEATH -// If ASSERT_DEATH is defined, make EXPECT_DEATH a wrapper to it. -#ifdef ASSERT_DEATH -#define EXPECT_DEATH(X, Y) ASSERT_DEATH(([&] { X; }), "") -#else #define EXPECT_DEATH(X, Y) \ do { \ } while (0) -#endif // ASSERT_DEATH -#endif // EXPECT_DEATH +#endif // If EXPECT_STREQ isn't defined, define our own simple one. #ifndef EXPECT_STREQ diff --git a/standalone/tests/scudo_unit_test_main.cpp b/standalone/tests/scudo_unit_test_main.cpp index 9bbf6e75a5c..20deca998d9 100644 --- a/standalone/tests/scudo_unit_test_main.cpp +++ b/standalone/tests/scudo_unit_test_main.cpp @@ -29,11 +29,11 @@ __scudo_default_options() { "dealloc_type_mismatch=" DEALLOC_TYPE_MISMATCH; } -// The zxtest library provides a default main function that does the same thing -// for Fuchsia builds. 
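For readability, the entry point that the hunk below produces (reconstructed from its + and context lines) is:

    int main(int argc, char **argv) {
    #if !SCUDO_FUCHSIA
      testing::InitGoogleTest(&argc, argv);
      return RUN_ALL_TESTS();
    #else
      // zxtest's RUN_ALL_TESTS takes argc/argv directly.
      return RUN_ALL_TESTS(argc, argv);
    #endif
    }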
-#if !SCUDO_FUCHSIA int main(int argc, char **argv) { +#if !SCUDO_FUCHSIA testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} +#else + return RUN_ALL_TESTS(argc, argv); #endif +} diff --git a/standalone/tests/secondary_test.cpp b/standalone/tests/secondary_test.cpp index a55704297de..d2260b9c15b 100644 --- a/standalone/tests/secondary_test.cpp +++ b/standalone/tests/secondary_test.cpp @@ -8,7 +8,6 @@ #include "tests/scudo_unit_test.h" -#include "allocator_config.h" #include "secondary.h" #include <stdio.h> @@ -19,37 +18,35 @@ #include <thread> #include <vector> -template <typename Config> static void testSecondaryBasic(void) { - using SecondaryT = scudo::MapAllocator<Config>; - +template <class SecondaryT> static void testSecondaryBasic(void) { scudo::GlobalStats S; S.init(); - std::unique_ptr<SecondaryT> L(new SecondaryT); + SecondaryT *L = new SecondaryT; L->init(&S); const scudo::uptr Size = 1U << 16; - void *P = L->allocate(scudo::Options{}, Size); + void *P = L->allocate(Size); EXPECT_NE(P, nullptr); memset(P, 'A', Size); EXPECT_GE(SecondaryT::getBlockSize(P), Size); - L->deallocate(scudo::Options{}, P); + L->deallocate(P); // If the Secondary can't cache that pointer, it will be unmapped. - if (!L->canCache(Size)) + if (!SecondaryT::canCache(Size)) EXPECT_DEATH(memset(P, 'A', Size), ""); const scudo::uptr Align = 1U << 16; - P = L->allocate(scudo::Options{}, Size + Align, Align); + P = L->allocate(Size + Align, Align); EXPECT_NE(P, nullptr); void *AlignedP = reinterpret_cast<void *>( scudo::roundUpTo(reinterpret_cast<scudo::uptr>(P), Align)); memset(AlignedP, 'A', Size); - L->deallocate(scudo::Options{}, P); + L->deallocate(P); std::vector<void *> V; for (scudo::uptr I = 0; I < 32U; I++) - V.push_back(L->allocate(scudo::Options{}, Size)); + V.push_back(L->allocate(Size)); std::shuffle(V.begin(), V.end(), std::mt19937(std::random_device()())); while (!V.empty()) { - L->deallocate(scudo::Options{}, V.back()); + L->deallocate(V.back()); V.pop_back(); } scudo::ScopedString Str(1024); @@ -57,29 +54,20 @@ template <typename Config> static void testSecondaryBasic(void) { Str.output(); } -struct NoCacheConfig { - typedef scudo::MapAllocatorNoCache SecondaryCache; - static const bool MaySupportMemoryTagging = false; -}; - -struct TestConfig { - typedef scudo::MapAllocatorCache<TestConfig> SecondaryCache; - static const bool MaySupportMemoryTagging = false; - static const scudo::u32 SecondaryCacheEntriesArraySize = 128U; - static const scudo::u32 SecondaryCacheQuarantineSize = 0U; - static const scudo::u32 SecondaryCacheDefaultMaxEntriesCount = 64U; - static const scudo::uptr SecondaryCacheDefaultMaxEntrySize = 1UL << 20; - static const scudo::s32 SecondaryCacheMinReleaseToOsIntervalMs = INT32_MIN; - static const scudo::s32 SecondaryCacheMaxReleaseToOsIntervalMs = INT32_MAX; -}; - TEST(ScudoSecondaryTest, SecondaryBasic) { - testSecondaryBasic<NoCacheConfig>(); - testSecondaryBasic<scudo::DefaultConfig>(); - testSecondaryBasic<TestConfig>(); + testSecondaryBasic<scudo::MapAllocator<scudo::MapAllocatorNoCache>>(); +#if !SCUDO_FUCHSIA + testSecondaryBasic<scudo::MapAllocator<scudo::MapAllocatorCache<>>>(); + testSecondaryBasic< + scudo::MapAllocator<scudo::MapAllocatorCache<64U, 1UL << 20>>>(); +#endif } -using LargeAllocator = scudo::MapAllocator<scudo::DefaultConfig>; +#if SCUDO_FUCHSIA +using LargeAllocator = scudo::MapAllocator<scudo::MapAllocatorNoCache>; +#else +using LargeAllocator = scudo::MapAllocator<scudo::MapAllocatorCache<>>; +#endif // This exercises a variety of 
combinations of size and alignment for the // MapAllocator. The size computation done here mimic the ones done by the @@ -87,7 +75,7 @@ using LargeAllocator = scudo::MapAllocator<scudo::DefaultConfig>; TEST(ScudoSecondaryTest, SecondaryCombinations) { constexpr scudo::uptr MinAlign = FIRST_32_SECOND_64(8, 16); constexpr scudo::uptr HeaderSize = scudo::roundUpTo(8, MinAlign); - std::unique_ptr<LargeAllocator> L(new LargeAllocator); + LargeAllocator *L = new LargeAllocator; L->init(nullptr); for (scudo::uptr SizeLog = 0; SizeLog <= 20; SizeLog++) { for (scudo::uptr AlignLog = FIRST_32_SECOND_64(3, 4); AlignLog <= 16; @@ -100,12 +88,12 @@ TEST(ScudoSecondaryTest, SecondaryCombinations) { scudo::roundUpTo((1U << SizeLog) + Delta, MinAlign); const scudo::uptr Size = HeaderSize + UserSize + (Align > MinAlign ? Align - HeaderSize : 0); - void *P = L->allocate(scudo::Options{}, Size, Align); + void *P = L->allocate(Size, Align); EXPECT_NE(P, nullptr); void *AlignedP = reinterpret_cast<void *>( scudo::roundUpTo(reinterpret_cast<scudo::uptr>(P), Align)); memset(AlignedP, 0xff, UserSize); - L->deallocate(scudo::Options{}, P); + L->deallocate(P); } } } @@ -115,12 +103,12 @@ TEST(ScudoSecondaryTest, SecondaryCombinations) { } TEST(ScudoSecondaryTest, SecondaryIterate) { - std::unique_ptr<LargeAllocator> L(new LargeAllocator); + LargeAllocator *L = new LargeAllocator; L->init(nullptr); std::vector<void *> V; const scudo::uptr PageSize = scudo::getPageSizeCached(); for (scudo::uptr I = 0; I < 32U; I++) - V.push_back(L->allocate(scudo::Options{}, (std::rand() % 16) * PageSize)); + V.push_back(L->allocate((std::rand() % 16) * PageSize)); auto Lambda = [V](scudo::uptr Block) { EXPECT_NE(std::find(V.begin(), V.end(), reinterpret_cast<void *>(Block)), V.end()); @@ -129,7 +117,7 @@ TEST(ScudoSecondaryTest, SecondaryIterate) { L->iterateOverBlocks(Lambda); L->enable(); while (!V.empty()) { - L->deallocate(scudo::Options{}, V.back()); + L->deallocate(V.back()); V.pop_back(); } scudo::ScopedString Str(1024); @@ -137,32 +125,9 @@ TEST(ScudoSecondaryTest, SecondaryIterate) { Str.output(); } -TEST(ScudoSecondaryTest, SecondaryOptions) { - std::unique_ptr<LargeAllocator> L(new LargeAllocator); - L->init(nullptr); - // Attempt to set a maximum number of entries higher than the array size. - EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4096U)); - // A negative number will be cast to a scudo::u32, and fail. - EXPECT_FALSE(L->setOption(scudo::Option::MaxCacheEntriesCount, -1)); - if (L->canCache(0U)) { - // Various valid combinations. - EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); - EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); - EXPECT_TRUE(L->canCache(1UL << 18)); - EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 17)); - EXPECT_FALSE(L->canCache(1UL << 18)); - EXPECT_TRUE(L->canCache(1UL << 16)); - EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 0U)); - EXPECT_FALSE(L->canCache(1UL << 16)); - EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntriesCount, 4U)); - EXPECT_TRUE(L->setOption(scudo::Option::MaxCacheEntrySize, 1UL << 20)); - EXPECT_TRUE(L->canCache(1UL << 16)); - } -} - static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready; +static bool Ready = false; static void performAllocations(LargeAllocator *L) { std::vector<void *> V; @@ -175,25 +140,24 @@ static void performAllocations(LargeAllocator *L) { for (scudo::uptr I = 0; I < 128U; I++) { // Deallocate 75% of the blocks. 
const bool Deallocate = (rand() & 3) != 0; - void *P = L->allocate(scudo::Options{}, (std::rand() % 16) * PageSize); + void *P = L->allocate((std::rand() % 16) * PageSize); if (Deallocate) - L->deallocate(scudo::Options{}, P); + L->deallocate(P); else V.push_back(P); } while (!V.empty()) { - L->deallocate(scudo::Options{}, V.back()); + L->deallocate(V.back()); V.pop_back(); } } TEST(ScudoSecondaryTest, SecondaryThreadsRace) { - Ready = false; - std::unique_ptr<LargeAllocator> L(new LargeAllocator); + LargeAllocator *L = new LargeAllocator; L->init(nullptr, /*ReleaseToOsInterval=*/0); std::thread Threads[16]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread(performAllocations, L.get()); + Threads[I] = std::thread(performAllocations, L); { std::unique_lock<std::mutex> Lock(Mutex); Ready = true; diff --git a/standalone/tests/tsd_test.cpp b/standalone/tests/tsd_test.cpp index 58ac9e74b98..4a3cf1cd0fc 100644 --- a/standalone/tests/tsd_test.cpp +++ b/standalone/tests/tsd_test.cpp @@ -13,7 +13,6 @@ #include <condition_variable> #include <mutex> -#include <set> #include <thread> // We mock out an allocator with a TSD registry, mostly using empty stubs. The @@ -48,12 +47,12 @@ private: struct OneCache { template <class Allocator> - using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 1U, 1U>; + using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 1U>; }; struct SharedCaches { template <class Allocator> - using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 16U, 8U>; + using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 16U>; }; struct ExclusiveCaches { @@ -117,7 +116,7 @@ TEST(ScudoTSDTest, TSDRegistryBasic) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready; +static bool Ready = false; template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) { auto Registry = Allocator->getTSDRegistry(); @@ -146,7 +145,6 @@ template <typename AllocatorT> static void stressCache(AllocatorT *Allocator) { } template <class AllocatorT> static void testRegistryThreaded() { - Ready = false; auto Deleter = [](AllocatorT *A) { A->unmapTestOnly(); delete A; }; @@ -173,74 +171,3 @@ TEST(ScudoTSDTest, TSDRegistryThreaded) { testRegistryThreaded<MockAllocator<ExclusiveCaches>>(); #endif } - -static std::set<void *> Pointers; - -static void stressSharedRegistry(MockAllocator<SharedCaches> *Allocator) { - std::set<void *> Set; - auto Registry = Allocator->getTSDRegistry(); - { - std::unique_lock<std::mutex> Lock(Mutex); - while (!Ready) - Cv.wait(Lock); - } - Registry->initThreadMaybe(Allocator, /*MinimalInit=*/false); - bool UnlockRequired; - for (scudo::uptr I = 0; I < 4096U; I++) { - auto TSD = Registry->getTSDAndLock(&UnlockRequired); - EXPECT_NE(TSD, nullptr); - Set.insert(reinterpret_cast<void *>(TSD)); - if (UnlockRequired) - TSD->unlock(); - } - { - std::unique_lock<std::mutex> Lock(Mutex); - Pointers.insert(Set.begin(), Set.end()); - } -} - -TEST(ScudoTSDTest, TSDRegistryTSDsCount) { - Ready = false; - Pointers.clear(); - using AllocatorT = MockAllocator<SharedCaches>; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr<AllocatorT, decltype(Deleter)> Allocator(new AllocatorT, - Deleter); - Allocator->reset(); - // We attempt to use as many TSDs as the shared cache offers by creating a - // decent number of threads that will be run concurrently and attempt to get - // and lock TSDs. We put them all in a set and count the number of entries - // after we are done.
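All of the threaded tests in this section share the same start-gate idiom: worker threads block on a condition variable until the main thread flips Ready under the mutex, so every thread starts hammering the allocator at the same instant. In isolation the pattern looks like this (names are illustrative, not Scudo APIs):

```cpp
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

static std::mutex Mutex;
static std::condition_variable Cv;
static bool Ready = false;

static void worker() {
  {
    std::unique_lock<std::mutex> Lock(Mutex);
    while (!Ready) // Loop to guard against spurious wakeups.
      Cv.wait(Lock);
  }
  // ... the actual allocator stress work would go here ...
}

int main() {
  std::vector<std::thread> Threads;
  for (int I = 0; I < 8; I++)
    Threads.emplace_back(worker);
  {
    std::unique_lock<std::mutex> Lock(Mutex);
    Ready = true; // Release every worker at once.
    Cv.notify_all();
  }
  for (auto &T : Threads)
    T.join();
  return 0;
}
```

Because Ready is shared by every test in the binary, it has to be re-armed between runs; that is what the `Ready = false;` initializer and the per-test resets on the two sides of this diff are both dealing with.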
- std::thread Threads[32]; - for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread(stressSharedRegistry, Allocator.get()); - { - std::unique_lock<std::mutex> Lock(Mutex); - Ready = true; - Cv.notify_all(); - } - for (auto &T : Threads) - T.join(); - // The initial number of TSDs we get will be the minimum of the default count - // and the number of CPUs. - EXPECT_LE(Pointers.size(), 8U); - Pointers.clear(); - auto Registry = Allocator->getTSDRegistry(); - // Increase the number of TSDs to 16. - Registry->setOption(scudo::Option::MaxTSDsCount, 16); - Ready = false; - for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) - Threads[I] = std::thread(stressSharedRegistry, Allocator.get()); - { - std::unique_lock<std::mutex> Lock(Mutex); - Ready = true; - Cv.notify_all(); - } - for (auto &T : Threads) - T.join(); - // We should get 16 distinct TSDs back. - EXPECT_EQ(Pointers.size(), 16U); -} diff --git a/standalone/tests/wrappers_c_test.cpp b/standalone/tests/wrappers_c_test.cpp index eed8f031933..8b2bc6ecbd5 100644 --- a/standalone/tests/wrappers_c_test.cpp +++ b/standalone/tests/wrappers_c_test.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "scudo/interface.h" #include "tests/scudo_unit_test.h" #include <errno.h> @@ -42,19 +41,8 @@ TEST(ScudoWrappersCTest, Malloc) { EXPECT_NE(P, nullptr); EXPECT_LE(Size, malloc_usable_size(P)); EXPECT_EQ(reinterpret_cast<uintptr_t>(P) % FIRST_32_SECOND_64(8U, 16U), 0U); - - // An update to this warning in Clang now triggers in this line, but it's ok - // because the check is expecting a bad pointer and should fail. -#if defined(__has_warning) && __has_warning("-Wfree-nonheap-object") -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wfree-nonheap-object" -#endif EXPECT_DEATH( free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) | 1U)), ""); -#if defined(__has_warning) && __has_warning("-Wfree-nonheap-object") -#pragma GCC diagnostic pop -#endif - free(P); EXPECT_DEATH(free(P), ""); @@ -94,18 +82,6 @@ TEST(ScudoWrappersCTest, Calloc) { EXPECT_EQ(errno, ENOMEM); } -TEST(ScudoWrappersCTest, SmallAlign) { - void *P; - for (size_t Size = 1; Size <= 0x10000; Size <<= 1) { - for (size_t Align = 1; Align <= 0x10000; Align <<= 1) { - for (size_t Count = 0; Count < 3; ++Count) { - P = memalign(Align, Size); - EXPECT_TRUE(reinterpret_cast<uintptr_t>(P) % Align == 0); - } - } - } -} - TEST(ScudoWrappersCTest, Memalign) { void *P; for (size_t I = FIRST_32_SECOND_64(2U, 3U); I <= 18U; I++) { @@ -212,6 +188,14 @@ TEST(ScudoWrappersCTest, Realloc) { } } +#ifndef M_DECAY_TIME +#define M_DECAY_TIME -100 +#endif + +#ifndef M_PURGE +#define M_PURGE -101 +#endif + #if !SCUDO_FUCHSIA TEST(ScudoWrappersCTest, MallOpt) { errno = 0; @@ -225,12 +209,6 @@ TEST(ScudoWrappersCTest, MallOpt) { EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1); EXPECT_EQ(mallopt(M_DECAY_TIME, 1), 1); EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1); - - if (SCUDO_ANDROID) { - EXPECT_EQ(mallopt(M_CACHE_COUNT_MAX, 100), 1); - EXPECT_EQ(mallopt(M_CACHE_SIZE_MAX, 1024 * 1024 * 2), 1); - EXPECT_EQ(mallopt(M_TSDS_COUNT_MAX, 10), 1); - } } #endif @@ -326,10 +304,8 @@ TEST(ScudoWrappersCTest, MallocIterateBoundary) { } } -// Fuchsia doesn't have alarm, fork or malloc_info. -#if !SCUDO_FUCHSIA +// We expect heap operations within a disable/enable scope to deadlock. TEST(ScudoWrappersCTest, MallocDisableDeadlock) { - // We expect heap operations within a disable/enable scope to deadlock. 
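malloc_disable() takes every allocator lock and malloc_enable() releases them, so any heap operation attempted inside that window blocks forever; the test body that follows asserts exactly that. The legitimate use of the pair is bracketing fork(), as in this sketch (Bionic-style prototypes assumed; other platforms may prefix or omit these symbols):

```cpp
#include <sys/types.h>
#include <unistd.h>

// Assumed Bionic-style prototypes for the wrappers exercised here.
extern "C" void malloc_disable();
extern "C" void malloc_enable();

pid_t safe_fork() {
  malloc_disable();         // Quiesce the heap: all allocator locks held.
  const pid_t Pid = fork(); // No thread can be mid-allocation now.
  malloc_enable();          // Runs in both parent and child, releasing locks.
  return Pid;
}
```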
EXPECT_DEATH( { void *P = malloc(Size); @@ -343,6 +319,9 @@ TEST(ScudoWrappersCTest, MallocDisableDeadlock) { ""); } +// Fuchsia doesn't have fork or malloc_info. +#if !SCUDO_FUCHSIA + TEST(ScudoWrappersCTest, MallocInfo) { // Use volatile so that the allocations don't get optimized away. void *volatile P1 = malloc(1234); @@ -393,7 +372,6 @@ TEST(ScudoWrappersCTest, Fork) { static pthread_mutex_t Mutex; static pthread_cond_t Conditional = PTHREAD_COND_INITIALIZER; -static bool Ready; static void *enableMalloc(void *Unused) { // Initialize the allocator for this thread. @@ -404,7 +382,6 @@ static void *enableMalloc(void *Unused) { // Signal the main thread we are ready. pthread_mutex_lock(&Mutex); - Ready = true; pthread_cond_signal(&Conditional); pthread_mutex_unlock(&Mutex); @@ -417,13 +394,11 @@ static void *enableMalloc(void *Unused) { TEST(ScudoWrappersCTest, DisableForkEnable) { pthread_t ThreadId; - Ready = false; EXPECT_EQ(pthread_create(&ThreadId, nullptr, &enableMalloc, nullptr), 0); // Wait for the thread to be warmed up. pthread_mutex_lock(&Mutex); - while (!Ready) - pthread_cond_wait(&Conditional, &Mutex); + pthread_cond_wait(&Conditional, &Mutex); pthread_mutex_unlock(&Mutex); // Disable the allocator and fork. fork should succeed after malloc_enable. diff --git a/standalone/tests/wrappers_cpp_test.cpp b/standalone/tests/wrappers_cpp_test.cpp index 9df06dcdf14..4ccef5bb0de 100644 --- a/standalone/tests/wrappers_cpp_test.cpp +++ b/standalone/tests/wrappers_cpp_test.cpp @@ -66,10 +66,6 @@ public: }; TEST(ScudoWrappersCppTest, New) { - if (getenv("SKIP_TYPE_MISMATCH")) { - printf("Skipped type mismatch tests.\n"); - return; - } testCxxNew<bool>(); testCxxNew<uint8_t>(); testCxxNew<uint16_t>(); @@ -83,7 +79,7 @@ TEST(ScudoWrappersCppTest, New) { static std::mutex Mutex; static std::condition_variable Cv; -static bool Ready; +static bool Ready = false; static void stressNew() { std::vector<uintptr_t *> V; @@ -107,7 +103,6 @@ static void stressNew() { } TEST(ScudoWrappersCppTest, ThreadedNew) { - Ready = false; std::thread Threads[32]; for (size_t I = 0U; I < sizeof(Threads) / sizeof(Threads[0]); I++) Threads[I] = std::thread(stressNew); diff --git a/standalone/tools/compute_size_class_config.cpp b/standalone/tools/compute_size_class_config.cpp index 8b17be0e965..82f37b6647e 100644 --- a/standalone/tools/compute_size_class_config.cpp +++ b/standalone/tools/compute_size_class_config.cpp @@ -19,8 +19,9 @@ struct Alloc { }; size_t measureWastage(const std::vector<Alloc> &allocs, - const std::vector<size_t> &classes, size_t pageSize, - size_t headerSize) { + const std::vector<size_t> &classes, + size_t pageSize, + size_t headerSize) { size_t totalWastage = 0; for (auto &a : allocs) { size_t sizePlusHeader = a.size + headerSize; @@ -54,8 +55,7 @@ void readAllocs(std::vector<Alloc> &allocs, const char *path) { } Alloc a; - while (fscanf(f, "<alloc size=\"%zu\" count=\"%zu\"/>\n", &a.size, - &a.count) == 2) + while (fscanf(f, "<alloc size=\"%zu\" count=\"%zu\"/>\n", &a.size, &a.count) == 2) allocs.push_back(a); fclose(f); } @@ -157,6 +157,5 @@ struct MySizeClassConfig { }; static const uptr SizeDelta = %zu; }; -)", - headerSize); +)", headerSize); } diff --git a/standalone/tsd.h b/standalone/tsd.h index a6e669b66e6..20f0d69cabf 100644 --- a/standalone/tsd.h +++ b/standalone/tsd.h @@ -23,10 +23,10 @@ namespace scudo { -template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD { +template <class Allocator> struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD { typename Allocator::CacheT 
Cache; typename Allocator::QuarantineCacheT QuarantineCache; - u8 DestructorIterations = 0; + u8 DestructorIterations; void initLinkerInitialized(Allocator *Instance) { Instance->initCache(&Cache); @@ -59,7 +59,7 @@ template <class Allocator> struct alignas(SCUDO_CACHE_LINE_SIZE) TSD { private: HybridMutex Mutex; - atomic_uptr Precedence = {}; + atomic_uptr Precedence; }; } // namespace scudo diff --git a/standalone/tsd_exclusive.h b/standalone/tsd_exclusive.h index a907ed4684a..3492509b5a8 100644 --- a/standalone/tsd_exclusive.h +++ b/standalone/tsd_exclusive.h @@ -13,13 +13,10 @@ namespace scudo { -struct ThreadState { - bool DisableMemInit : 1; - enum { - NotInitialized = 0, - Initialized, - TornDown, - } InitState : 2; +enum class ThreadState : u8 { + NotInitialized = 0, + Initialized, + TornDown, }; template <class Allocator> void teardownThread(void *Ptr); @@ -36,30 +33,16 @@ template <class Allocator> struct TSDRegistryExT { initLinkerInitialized(Instance); } - void initOnceMaybe(Allocator *Instance) { - ScopedLock L(Mutex); - if (LIKELY(Initialized)) - return; - initLinkerInitialized(Instance); // Sets Initialized. - } - - void unmapTestOnly() { - Allocator *Instance = - reinterpret_cast<Allocator *>(pthread_getspecific(PThreadKey)); - if (!Instance) - return; - ThreadTSD.commitBack(Instance); - State = {}; - } + void unmapTestOnly() {} ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, bool MinimalInit) { - if (LIKELY(State.InitState != ThreadState::NotInitialized)) + if (LIKELY(State != ThreadState::NotInitialized)) return; initThread(Instance, MinimalInit); } ALWAYS_INLINE TSD<Allocator> *getTSDAndLock(bool *UnlockRequired) { - if (LIKELY(State.InitState == ThreadState::Initialized && + if (LIKELY(State == ThreadState::Initialized && !atomic_load(&Disabled, memory_order_acquire))) { *UnlockRequired = false; return &ThreadTSD; @@ -83,17 +66,14 @@ template <class Allocator> struct TSDRegistryExT { Mutex.unlock(); } - bool setOption(Option O, UNUSED sptr Value) { - if (O == Option::ThreadDisableMemInit) - State.DisableMemInit = Value; - if (O == Option::MaxTSDsCount) - return false; - return true; +private: + void initOnceMaybe(Allocator *Instance) { + ScopedLock L(Mutex); + if (LIKELY(Initialized)) + return; + initLinkerInitialized(Instance); // Sets Initialized. } - bool getDisableMemInit() { return State.DisableMemInit; } - -private: // Using minimal initialization allows for global initialization while keeping // the thread specific structure untouched. The fallback structure will be // used instead. 
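The comment above captures the exclusive registry's design: a fast path that hands back the thread's own TSD with no locking, and a shared fallback TSD taken under a lock for threads that are only minimally initialized or already torn down. A condensed sketch of that two-tier lookup, with simplified stand-in types rather than Scudo's real ones:

```cpp
#include <mutex>

// Simplified stand-ins for Scudo's TSD and thread-state tracking.
struct TSD {
  std::mutex Mutex; // Only ever contended on the fallback instance.
  // Per-thread caches would live here.
};

enum class ThreadState { NotInitialized, Initialized, TornDown };

static thread_local ThreadState State = ThreadState::NotInitialized;
static thread_local TSD ThreadTSD;
static TSD FallbackTSD;

// Returns a usable TSD; *UnlockRequired tells the caller whether it must
// unlock it afterwards (true only for the shared fallback).
static TSD *getTSDAndLock(bool *UnlockRequired) {
  if (State == ThreadState::Initialized) {
    *UnlockRequired = false; // Exclusive TSD: no locking needed.
    return &ThreadTSD;
  }
  FallbackTSD.Mutex.lock(); // Shared fallback: serialize access.
  *UnlockRequired = true;
  return &FallbackTSD;
}

int main() {
  State = ThreadState::Initialized; // Normally set by initThread().
  bool UnlockRequired;
  TSD *T = getTSDAndLock(&UnlockRequired);
  if (UnlockRequired)
    T->Mutex.unlock();
  return T == &ThreadTSD ? 0 : 1;
}
```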
@@ -104,25 +84,25 @@ private: CHECK_EQ( pthread_setspecific(PThreadKey, reinterpret_cast<void *>(Instance)), 0); ThreadTSD.initLinkerInitialized(Instance); - State.InitState = ThreadState::Initialized; + State = ThreadState::Initialized; Instance->callPostInitCallback(); } - pthread_key_t PThreadKey = {}; - bool Initialized = false; - atomic_u8 Disabled = {}; + pthread_key_t PThreadKey; + bool Initialized; + atomic_u8 Disabled; TSD<Allocator> FallbackTSD; HybridMutex Mutex; - static thread_local ThreadState State; - static thread_local TSD<Allocator> ThreadTSD; + static THREADLOCAL ThreadState State; + static THREADLOCAL TSD<Allocator> ThreadTSD; friend void teardownThread<Allocator>(void *Ptr); }; template <class Allocator> -thread_local TSD<Allocator> TSDRegistryExT<Allocator>::ThreadTSD; +THREADLOCAL TSD<Allocator> TSDRegistryExT<Allocator>::ThreadTSD; template <class Allocator> -thread_local ThreadState TSDRegistryExT<Allocator>::State; +THREADLOCAL ThreadState TSDRegistryExT<Allocator>::State; template <class Allocator> void teardownThread(void *Ptr) { typedef TSDRegistryExT<Allocator> TSDRegistryT; @@ -140,7 +120,7 @@ template <class Allocator> void teardownThread(void *Ptr) { return; } TSDRegistryT::ThreadTSD.commitBack(Instance); - TSDRegistryT::State.InitState = ThreadState::TornDown; + TSDRegistryT::State = ThreadState::TornDown; } } // namespace scudo diff --git a/standalone/tsd_shared.h b/standalone/tsd_shared.h index afe3623ce40..038a5905ff4 100644 --- a/standalone/tsd_shared.h +++ b/standalone/tsd_shared.h @@ -9,28 +9,36 @@ #ifndef SCUDO_TSD_SHARED_H_ #define SCUDO_TSD_SHARED_H_ +#include "linux.h" // for getAndroidTlsPtr() #include "tsd.h" -#if SCUDO_HAS_PLATFORM_TLS_SLOT -// This is a platform-provided header that needs to be on the include path when -// Scudo is compiled. It must declare a function with the prototype: -// uintptr_t *getPlatformAllocatorTlsSlot() -// that returns the address of a thread-local word of storage reserved for -// Scudo, that must be zero-initialized in newly created threads. -#include "scudo_platform_tls_slot.h" -#endif - namespace scudo { -template <class Allocator, u32 TSDsArraySize, u32 DefaultTSDCount> -struct TSDRegistrySharedT { +template <class Allocator, u32 MaxTSDCount> struct TSDRegistrySharedT { void initLinkerInitialized(Allocator *Instance) { Instance->initLinkerInitialized(); - for (u32 I = 0; I < TSDsArraySize; I++) - TSDs[I].initLinkerInitialized(Instance); + CHECK_EQ(pthread_key_create(&PThreadKey, nullptr), 0); // For non-TLS const u32 NumberOfCPUs = getNumberOfCPUs(); - setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount - : Min(NumberOfCPUs, DefaultTSDCount)); + NumberOfTSDs = (SCUDO_ANDROID || NumberOfCPUs == 0) + ? MaxTSDCount + : Min(NumberOfCPUs, MaxTSDCount); + for (u32 I = 0; I < NumberOfTSDs; I++) + TSDs[I].initLinkerInitialized(Instance); + // Compute all the coprimes of NumberOfTSDs. This will be used to walk the + // array of TSDs in a random order. For details, see: + // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/ + for (u32 I = 0; I < NumberOfTSDs; I++) { + u32 A = I + 1; + u32 B = NumberOfTSDs; + // Find the GCD between I + 1 and NumberOfTSDs. If 1, they are coprimes. 
+ while (B != 0) { + const u32 T = A; + A = B; + B = T % B; + } + if (A == 1) + CoPrimes[NumberOfCoPrimes++] = I + 1; + } Initialized = true; } void init(Allocator *Instance) { @@ -38,15 +46,11 @@ struct TSDRegistrySharedT { initLinkerInitialized(Instance); } - void initOnceMaybe(Allocator *Instance) { - ScopedLock L(Mutex); - if (LIKELY(Initialized)) - return; - initLinkerInitialized(Instance); // Sets Initialized. + void unmapTestOnly() { + setCurrentTSD(nullptr); + pthread_key_delete(PThreadKey); } - void unmapTestOnly() { setCurrentTSD(nullptr); } - ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, UNUSED bool MinimalInit) { if (LIKELY(getCurrentTSD())) @@ -62,88 +66,49 @@ if (TSD->tryLock()) return TSD; // If that fails, go down the slow path. - if (TSDsArraySize == 1U) { - // Only 1 TSD, no need to go any further. - // The compiler will optimize this one way or the other. - TSD->lock(); - return TSD; - } return getTSDAndLockSlow(TSD); } void disable() { Mutex.lock(); - for (u32 I = 0; I < TSDsArraySize; I++) + for (u32 I = 0; I < NumberOfTSDs; I++) TSDs[I].lock(); } void enable() { - for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--) + for (s32 I = static_cast<s32>(NumberOfTSDs - 1); I >= 0; I--) TSDs[I].unlock(); Mutex.unlock(); } - bool setOption(Option O, sptr Value) { - if (O == Option::MaxTSDsCount) - return setNumberOfTSDs(static_cast<u32>(Value)); - if (O == Option::ThreadDisableMemInit) - setDisableMemInit(Value); - // Not supported by the TSD Registry, but not an error either. - return true; - } - - bool getDisableMemInit() const { return *getTlsPtr() & 1; } - private: - ALWAYS_INLINE uptr *getTlsPtr() const { -#if SCUDO_HAS_PLATFORM_TLS_SLOT - return reinterpret_cast<uptr *>(getPlatformAllocatorTlsSlot()); + ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) { +#if _BIONIC + *getAndroidTlsPtr() = reinterpret_cast<uptr>(CurrentTSD); +#elif SCUDO_LINUX + ThreadTSD = CurrentTSD; #else - static thread_local uptr ThreadTSD; - return &ThreadTSD; + CHECK_EQ( + pthread_setspecific(PThreadKey, reinterpret_cast<void *>(CurrentTSD)), + 0); #endif } - static_assert(alignof(TSD<Allocator>) >= 2, ""); - - ALWAYS_INLINE void setCurrentTSD(TSD<Allocator> *CurrentTSD) { - *getTlsPtr() &= 1; - *getTlsPtr() |= reinterpret_cast<uptr>(CurrentTSD); - } - ALWAYS_INLINE TSD<Allocator> *getCurrentTSD() { - return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL); - } - - bool setNumberOfTSDs(u32 N) { - ScopedLock L(MutexTSDs); - if (N < NumberOfTSDs) - return false; - if (N > TSDsArraySize) - N = TSDsArraySize; - NumberOfTSDs = N; - NumberOfCoPrimes = 0; - // Compute all the coprimes of NumberOfTSDs. This will be used to walk the - // array of TSDs in a random order. For details, see: - // https://lemire.me/blog/2017/09/18/visiting-all-values-in-an-array-exactly-once-in-random-order/ - for (u32 I = 0; I < N; I++) { - u32 A = I + 1; - u32 B = N; - // Find the GCD between I + 1 and N. If 1, they are coprimes.
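Both sides of this hunk build the CoPrimes array the same way; they differ only in when it happens (at init time on one side, inside setNumberOfTSDs on the other). The point of the array is the trick from the Lemire post linked above: stepping through N slots with a stride coprime to N visits every slot exactly once before repeating, which gives a cheap pseudo-random scan order. A self-contained demonstration:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Euclid's algorithm, as in the registry code.
static uint32_t gcd(uint32_t A, uint32_t B) {
  while (B != 0) {
    const uint32_t T = A;
    A = B;
    B = T % B;
  }
  return A;
}

int main() {
  const uint32_t N = 12; // Imagine 12 TSDs.
  // Collect every stride in [1, N] coprime to N (the CoPrimes array).
  std::vector<uint32_t> CoPrimes;
  for (uint32_t I = 0; I < N; I++)
    if (gcd(I + 1, N) == 1)
      CoPrimes.push_back(I + 1);
  // Walking with any coprime stride touches each index exactly once.
  for (const uint32_t Inc : CoPrimes) {
    std::vector<bool> Seen(N, false);
    uint32_t Index = 7 % N; // Arbitrary start, like R % NumberOfTSDs.
    for (uint32_t I = 0; I < N; I++) {
      assert(!Seen[Index]);
      Seen[Index] = true;
      Index += Inc;
      if (Index >= N)
        Index -= N;
    }
  }
  return 0;
}
```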
- while (B != 0) { - const u32 T = A; - A = B; - B = T % B; - } - if (A == 1) - CoPrimes[NumberOfCoPrimes++] = I + 1; - } - return true +#if _BIONIC + return reinterpret_cast<TSD<Allocator> *>(*getAndroidTlsPtr()); +#elif SCUDO_LINUX + return ThreadTSD; +#else + return reinterpret_cast<TSD<Allocator> *>(pthread_getspecific(PThreadKey)); +#endif } - void setDisableMemInit(bool B) { - *getTlsPtr() &= ~1ULL; - *getTlsPtr() |= B; + void initOnceMaybe(Allocator *Instance) { + ScopedLock L(Mutex); + if (LIKELY(Initialized)) + return; + initLinkerInitialized(Instance); // Sets Initialized. } NOINLINE void initThread(Allocator *Instance) { @@ -155,23 +120,17 @@ private: } NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) { - // Use the Precedence of the current TSD as our random seed. Since we are - // in the slow path, it means that tryLock failed, and as a result it's - // very likely that said Precedence is non-zero. - const u32 R = static_cast<u32>(CurrentTSD->getPrecedence()); - u32 N, Inc; - { - ScopedLock L(MutexTSDs); - N = NumberOfTSDs; - DCHECK_NE(NumberOfCoPrimes, 0U); - Inc = CoPrimes[R % NumberOfCoPrimes]; - } - if (N > 1U) { - u32 Index = R % N; + if (MaxTSDCount > 1U && NumberOfTSDs > 1U) { + // Use the Precedence of the current TSD as our random seed. Since we are + // in the slow path, it means that tryLock failed, and as a result it's + // very likely that said Precedence is non-zero. + const u32 R = static_cast<u32>(CurrentTSD->getPrecedence()); + const u32 Inc = CoPrimes[R % NumberOfCoPrimes]; + u32 Index = R % NumberOfTSDs; uptr LowestPrecedence = UINTPTR_MAX; TSD<Allocator> *CandidateTSD = nullptr; // Go randomly through at most 4 contexts and find a candidate. - for (u32 I = 0; I < Min(4U, N); I++) { + for (u32 I = 0; I < Min(4U, NumberOfTSDs); I++) { if (TSDs[Index].tryLock()) { setCurrentTSD(&TSDs[Index]); return &TSDs[Index]; @@ -183,8 +142,8 @@ private: LowestPrecedence = Precedence; } Index += Inc; - if (Index >= N) - Index -= N; + if (Index >= NumberOfTSDs) + Index -= NumberOfTSDs; } if (CandidateTSD) { CandidateTSD->lock(); @@ -197,16 +156,25 @@ private: return CurrentTSD; } - atomic_u32 CurrentIndex = {}; - u32 NumberOfTSDs = 0; - u32 NumberOfCoPrimes = 0; - u32 CoPrimes[TSDsArraySize] = {}; - bool Initialized = false; + pthread_key_t PThreadKey; + atomic_u32 CurrentIndex; + u32 NumberOfTSDs; + u32 NumberOfCoPrimes; + u32 CoPrimes[MaxTSDCount]; + bool Initialized; HybridMutex Mutex; - HybridMutex MutexTSDs; - TSD<Allocator> TSDs[TSDsArraySize]; + TSD<Allocator> TSDs[MaxTSDCount]; +#if SCUDO_LINUX && !_BIONIC + static THREADLOCAL TSD<Allocator> *ThreadTSD; +#endif }; +#if SCUDO_LINUX && !_BIONIC +template <class Allocator, u32 MaxTSDCount> +THREADLOCAL TSD<Allocator> + *TSDRegistrySharedT<Allocator, MaxTSDCount>::ThreadTSD; +#endif + } // namespace scudo #endif // SCUDO_TSD_SHARED_H_ diff --git a/standalone/wrappers_c.cpp b/standalone/wrappers_c.cpp index 81c7dd60ee3..098cc089a1c 100644 --- a/standalone/wrappers_c.cpp +++ b/standalone/wrappers_c.cpp @@ -26,7 +26,6 @@ extern "C" void SCUDO_PREFIX(malloc_postinit)(); // Export the static allocator so that the C++ wrappers can access it. // Technically we could have a completely separate heap for C & C++ but in // reality the amount of cross pollination between the two is staggering.
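One detail worth calling out from the tsd_shared.h hunk above: the getTlsPtr() variant being removed packs two values into a single TLS word, with bit 0 holding the per-thread disable-mem-init flag and the remaining bits holding the TSD pointer, which is exactly what its static_assert on alignof(TSD<Allocator>) >= 2 guards. The tagging scheme in isolation (illustrative types, not Scudo's):

```cpp
#include <cassert>
#include <cstdint>

struct alignas(64) TSD {}; // Any alignment >= 2 keeps bit 0 free.

static thread_local uintptr_t TlsWord = 0;

static void setCurrentTSD(TSD *P) {
  TlsWord &= uintptr_t(1);                   // Preserve the flag bit.
  TlsWord |= reinterpret_cast<uintptr_t>(P); // Store the pointer bits.
}
static TSD *getCurrentTSD() {
  return reinterpret_cast<TSD *>(TlsWord & ~uintptr_t(1));
}
static void setDisableMemInit(bool B) {
  TlsWord &= ~uintptr_t(1); // Preserve the pointer bits.
  TlsWord |= uintptr_t(B);  // Store the flag in bit 0.
}
static bool getDisableMemInit() { return TlsWord & 1; }

int main() {
  static TSD T;
  setDisableMemInit(true);
  setCurrentTSD(&T);
  assert(getCurrentTSD() == &T);
  assert(getDisableMemInit());
  return 0;
}
```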
-SCUDO_REQUIRE_CONSTANT_INITIALIZATION scudo::Allocator<scudo::Config, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; #include "wrappers_c.inc" diff --git a/standalone/wrappers_c.h b/standalone/wrappers_c.h index 6d0cecdc4b4..33a0c53cec0 100644 --- a/standalone/wrappers_c.h +++ b/standalone/wrappers_c.h @@ -41,4 +41,12 @@ struct __scudo_mallinfo { #define SCUDO_MALLINFO __scudo_mallinfo #endif +#ifndef M_DECAY_TIME +#define M_DECAY_TIME -100 +#endif + +#ifndef M_PURGE +#define M_PURGE -101 +#endif + #endif // SCUDO_WRAPPERS_C_H_ diff --git a/standalone/wrappers_c.inc b/standalone/wrappers_c.inc index 43efb02cb86..5a6c1a8d408 100644 --- a/standalone/wrappers_c.inc +++ b/standalone/wrappers_c.inc @@ -155,7 +155,7 @@ void SCUDO_PREFIX(malloc_postinit)() { SCUDO_PREFIX(malloc_enable)); } -INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) { +INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, UNUSED int value) { if (param == M_DECAY_TIME) { if (SCUDO_ANDROID) { if (value == 0) { @@ -173,29 +173,8 @@ INTERFACE WEAK int SCUDO_PREFIX(mallopt)(int param, int value) { } else if (param == M_PURGE) { SCUDO_ALLOCATOR.releaseToOS(); return 1; - } else { - scudo::Option option; - switch (param) { - case M_MEMTAG_TUNING: - option = scudo::Option::MemtagTuning; - break; - case M_THREAD_DISABLE_MEM_INIT: - option = scudo::Option::ThreadDisableMemInit; - break; - case M_CACHE_COUNT_MAX: - option = scudo::Option::MaxCacheEntriesCount; - break; - case M_CACHE_SIZE_MAX: - option = scudo::Option::MaxCacheEntrySize; - break; - case M_TSDS_COUNT_MAX: - option = scudo::Option::MaxTSDsCount; - break; - default: - return 0; - } - return SCUDO_ALLOCATOR.setOption(option, static_cast<scudo::sptr>(value)); } + return 0; } INTERFACE WEAK void *SCUDO_PREFIX(aligned_alloc)(size_t alignment, @@ -234,38 +213,10 @@ INTERFACE WEAK int SCUDO_PREFIX(malloc_info)(UNUSED int options, FILE *stream) { // Disable memory tagging for the heap. The caller must disable memory tag // checks globally (e.g. by clearing TCF0 on aarch64) before calling this -// function, and may not re-enable them after calling the function. +// function, and may not re-enable them after calling the function. The program +// must be single threaded at the point when the function is called. INTERFACE WEAK void SCUDO_PREFIX(malloc_disable_memory_tagging)() { SCUDO_ALLOCATOR.disableMemoryTagging(); } -// Sets whether scudo records stack traces and other metadata for allocations -// and deallocations. This function only has an effect if the allocator and -// hardware support memory tagging. -INTERFACE WEAK void -SCUDO_PREFIX(malloc_set_track_allocation_stacks)(int track) { - SCUDO_ALLOCATOR.setTrackAllocationStacks(track); -} - -// Sets whether scudo zero-initializes all allocated memory. -INTERFACE WEAK void SCUDO_PREFIX(malloc_set_zero_contents)(int zero_contents) { - SCUDO_ALLOCATOR.setFillContents(zero_contents ? scudo::ZeroFill - : scudo::NoFill); -} - -// Sets whether scudo pattern-initializes all allocated memory. -INTERFACE WEAK void -SCUDO_PREFIX(malloc_set_pattern_fill_contents)(int pattern_fill_contents) { - SCUDO_ALLOCATOR.setFillContents( - pattern_fill_contents ? scudo::PatternOrZeroFill : scudo::NoFill); -} - -// Sets whether scudo adds a small amount of slack at the end of large -// allocations, before the guard page. This can be enabled to work around buggy -// applications that read a few bytes past the end of their allocation. 
-INTERFACE WEAK void -SCUDO_PREFIX(malloc_set_add_large_allocation_slack)(int add_slack) { - SCUDO_ALLOCATOR.setAddLargeAllocationSlack(add_slack); -} - } // extern "C" diff --git a/standalone/wrappers_c_bionic.cpp b/standalone/wrappers_c_bionic.cpp index 18c3bf2c0ed..7a012a23bcf 100644 --- a/standalone/wrappers_c_bionic.cpp +++ b/standalone/wrappers_c_bionic.cpp @@ -23,7 +23,6 @@ #define SCUDO_ALLOCATOR Allocator extern "C" void SCUDO_PREFIX(malloc_postinit)(); -SCUDO_REQUIRE_CONSTANT_INITIALIZATION static scudo::Allocator<scudo::AndroidConfig, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; @@ -37,7 +36,6 @@ #define SCUDO_ALLOCATOR SvelteAllocator extern "C" void SCUDO_PREFIX(malloc_postinit)(); -SCUDO_REQUIRE_CONSTANT_INITIALIZATION static scudo::Allocator<scudo::AndroidSvelteConfig, SCUDO_PREFIX(malloc_postinit)> SCUDO_ALLOCATOR; @@ -50,39 +48,4 @@ static scudo::Allocator<scudo::AndroidSvelteConfig, // TODO(kostyak): support both allocators. INTERFACE void __scudo_print_stats(void) { Allocator.printStats(); } -INTERFACE void -__scudo_get_error_info(struct scudo_error_info *error_info, - uintptr_t fault_addr, const char *stack_depot, - const char *region_info, const char *ring_buffer, - const char *memory, const char *memory_tags, - uintptr_t memory_addr, size_t memory_size) { - Allocator.getErrorInfo(error_info, fault_addr, stack_depot, region_info, - ring_buffer, memory, memory_tags, memory_addr, - memory_size); -} - -INTERFACE const char *__scudo_get_stack_depot_addr() { - return Allocator.getStackDepotAddress(); -} - -INTERFACE size_t __scudo_get_stack_depot_size() { - return sizeof(scudo::StackDepot); -} - -INTERFACE const char *__scudo_get_region_info_addr() { - return Allocator.getRegionInfoArrayAddress(); -} - -INTERFACE size_t __scudo_get_region_info_size() { - return Allocator.getRegionInfoArraySize(); -} - -INTERFACE const char *__scudo_get_ring_buffer_addr() { - return Allocator.getRingBufferAddress(); -} - -INTERFACE size_t __scudo_get_ring_buffer_size() { - return Allocator.getRingBufferSize(); -} - #endif // SCUDO_ANDROID && _BIONIC
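For reference, the two mallopt() parameters this version keeps are the Android extensions defined in wrappers_c.h above: M_DECAY_TIME toggles how freed memory is released back to the OS, and M_PURGE triggers an immediate releaseToOS(). A hedged usage sketch (the values match the fallback defines; on a libc without these extensions the calls simply return 0):

```cpp
#include <malloc.h>

#ifndef M_DECAY_TIME
#define M_DECAY_TIME -100 // Matches the fallback in wrappers_c.h.
#endif
#ifndef M_PURGE
#define M_PURGE -101
#endif

int main() {
  mallopt(M_DECAY_TIME, 1); // Defer release of freed memory to the OS.
  mallopt(M_PURGE, 0);      // Purge cached free pages immediately.
  return 0;
}
```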