From 8965c6d7386e6a625e8a7f1d3dfc830b396a5d1b Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Wed, 17 Jan 2024 16:24:54 -0800 Subject: Import 'lz4_flex' crate Request Document: go/android-rust-importing-crates For CL Reviewers: go/android3p#cl-review For Build Team: go/ab-third-party-imports Bug: 319233671 Test: n/a Change-Id: I9d65f7035d039557e5947767b401f576628e3d06 --- .cargo_vcs_info.json | 6 + Cargo.lock | 389 +++++++++++++++++ Cargo.toml | 100 +++++ Cargo.toml.orig | 87 ++++ LICENSE | 20 + METADATA | 19 + MODULE_LICENSE_MIT | 0 OWNERS | 3 + README.md | 129 ++++++ src/block/compress.rs | 969 +++++++++++++++++++++++++++++++++++++++++++ src/block/decompress.rs | 544 ++++++++++++++++++++++++ src/block/decompress_safe.rs | 399 ++++++++++++++++++ src/block/hashtable.rs | 161 +++++++ src/block/mod.rs | 157 +++++++ src/fastcpy.rs | 145 +++++++ src/fastcpy_unsafe.rs | 165 ++++++++ src/frame/compress.rs | 471 +++++++++++++++++++++ src/frame/decompress.rs | 449 ++++++++++++++++++++ src/frame/header.rs | 412 ++++++++++++++++++ src/frame/mod.rs | 111 +++++ src/lib.rs | 109 +++++ src/sink.rs | 330 +++++++++++++++ 22 files changed, 5175 insertions(+) create mode 100644 .cargo_vcs_info.json create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 100644 LICENSE create mode 100644 METADATA create mode 100644 MODULE_LICENSE_MIT create mode 100644 OWNERS create mode 100644 README.md create mode 100644 src/block/compress.rs create mode 100644 src/block/decompress.rs create mode 100644 src/block/decompress_safe.rs create mode 100644 src/block/hashtable.rs create mode 100644 src/block/mod.rs create mode 100644 src/fastcpy.rs create mode 100644 src/fastcpy_unsafe.rs create mode 100644 src/frame/compress.rs create mode 100644 src/frame/decompress.rs create mode 100644 src/frame/header.rs create mode 100644 src/frame/mod.rs create mode 100644 src/lib.rs create mode 100644 src/sink.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..7ae5865 --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "4e87de7f57ce8be339a73b385f3789a129b73fb0" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..0007166 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,389 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "byteorder" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "itoa" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b" + +[[package]] +name = "libm" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" + +[[package]] +name = "lz4-compress" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f966533a922a9bba9e95e594c1fdb3b9bf5fdcdb11e37e51ad84cd76e468b91" +dependencies = [ + "byteorder 0.5.3", + "quick-error 1.2.3", +] + +[[package]] +name = "lz4_flex" +version = "0.11.2" +dependencies = [ + "lz4-compress", + "lzzzz", + "more-asserts", + "proptest", + "serde_json", + "snap", + "twox-hash", +] + +[[package]] +name = "lzzzz" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8014d1362004776e6a91e4c15a3aa7830d1b6650a075b51a9969ebb6d6af13bc" +dependencies = [ + "cc", +] + +[[package]] +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proptest" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29f1b898011ce9595050a68e60f90bad083ff2987a695a42357134c8381fba70" +dependencies = [ + "bit-set", + "bitflags", + "byteorder 1.4.3", + "lazy_static", + "num-traits", + "quick-error 2.0.1", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error 1.2.3", + "tempfile", + "wait-timeout", +] + +[[package]] +name = "ryu" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" + +[[package]] +name = "serde" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1" + +[[package]] +name = "serde_json" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "snap" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..40fbadd --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,100 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +name = "lz4_flex" +version = "0.11.2" +authors = [ + "Pascal Seitz ", + "Arthur Silva ", + "ticki ", +] +include = [ + "src/*.rs", + "src/frame/**/*", + "src/block/**/*", + "README.md", + "LICENSE", +] +description = "Fastest LZ4 implementation in Rust, no unsafe by default." +homepage = "https://github.com/pseitz/lz4_flex" +readme = "README.md" +keywords = [ + "compression", + "lz4", + "compress", + "decompression", + "decompress", +] +license = "MIT" +repository = "https://github.com/pseitz/lz4_flex" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = [ + "--cfg", + "docsrs", +] + +[profile.bench] +opt-level = 3 +lto = true +codegen-units = 1 + +[profile.release] +opt-level = 3 +codegen-units = 1 +panic = "unwind" + +[[bench]] +name = "crit_bench" +path = "benches/crit_bench.rs" +harness = false + +[dependencies.twox-hash] +version = "1.6.3" +optional = true +default-features = false + +[dev-dependencies.lz4-compress] +version = "0.1.1" + +[dev-dependencies.lzzzz] +version = "1.0.4" + +[dev-dependencies.more-asserts] +version = "0.3.1" + +[dev-dependencies.proptest] +version = "1.0.0" + +[dev-dependencies.serde_json] +version = "1.0.91" + +[dev-dependencies.snap] +version = "1.1.0" + +[features] +default = [ + "std", + "safe-encode", + "safe-decode", + "frame", +] +frame = [ + "std", + "dep:twox-hash", +] +nightly = [] +safe-decode = [] +safe-encode = [] +std = [] diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..3eea255 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,87 @@ +[package] +authors = ["Pascal Seitz ", "Arthur Silva ", "ticki "] +description = "Fastest LZ4 implementation in Rust, no unsafe by default." 
+edition = "2021" +keywords = ["compression", "lz4", "compress", "decompression", "decompress"] +name = "lz4_flex" +homepage = "https://github.com/pseitz/lz4_flex" +repository = "https://github.com/pseitz/lz4_flex" +readme = "README.md" +license = "MIT" +version = "0.11.2" +include = ["src/*.rs", "src/frame/**/*", "src/block/**/*", "README.md", "LICENSE"] + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[[bench]] +harness = false +name = "crit_bench" +path = "benches/crit_bench.rs" + +[dev-dependencies] +criterion = { git = "https://github.com/PSeitz/criterion.rs/", rev = "cf60ffc"} +lzzzz = "1.0.4" +lz4-compress = "0.1.1" +more-asserts = "0.3.1" +snap = "1.1.0" +serde_json = "1.0.91" +proptest = "1.0.0" + +[dev-dependencies.lz-fear] +git = "https://github.com/main--/rust-lz-fear" + + #Uncomment to make lz4_flex master available as lz4_flex_master + #[dev-dependencies.lz4_flex_master] + #rev= "a122673" # v10 + #git = "https://github.com/PSeitz/lz4_flex" + #package = "lz4_flex" + #default-features=false + #features = ["std", "safe-encode", "safe-decode", "frame"] + +[features] +default = ["std", "safe-encode", "safe-decode", "frame"] +safe-decode = [] +safe-encode = [] +#unchecked-decode = [] # Removes some checks for additional performance. Only enable on trusted input! +frame = ["std", "dep:twox-hash"] +std = [] +# use nightly compiler features +nightly = [] + +[dependencies] +twox-hash = { version = "1.6.3", default-features = false, optional = true } + +[profile.bench] +codegen-units = 1 +lto = true +opt-level = 3 + +[profile.release] +codegen-units = 1 +#debug = true +opt-level = 3 +panic = "unwind" + +# [[bench]] +# harness = false +# name = "quickbench" +# path = "benches/quickbench.rs" + +# [[bench]] +# harness = false +# name = "bench" +# path = "benches/bench.rs" + +# [[bin]] +# name = "decompress_with_stats" +# path = "src/test_bins/decompress_with_stats.rs" + +# [[bin]] +# name = "profile_decomp" +# path = "src/test_bins/profile_decomp.rs" + +# [[bin]] +# name = "profile_comp" +# path = "src/test_bins/profile_comp.rs" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0037a72 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2020 Pascal Seitz + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..858b3d6 --- /dev/null +++ b/METADATA @@ -0,0 +1,19 @@ +name: "lz4_flex" +description: "Fastest LZ4 implementation in Rust, no unsafe by default." 
+third_party {
+  identifier {
+    type: "crates.io"
+    value: "https://crates.io/crates/lz4_flex"
+  }
+  identifier {
+    type: "Archive"
+    value: "https://static.crates.io/crates/lz4_flex/lz4_flex-0.11.2.crate"
+  }
+  version: "0.11.2"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2024
+    month: 1
+    day: 17
+  }
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..41179fc
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,3 @@
+# Bug component: 688011
+include platform/prebuilts/rust:main:/OWNERS
+fmayle@google.com
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7b303a6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,129 @@
+![Rust](https://github.com/PSeitz/lz4_flex/workflows/Rust/badge.svg)
+[![Docs](https://docs.rs/lz4_flex/badge.svg)](https://docs.rs/crate/lz4_flex/)
+[![Crates.io](https://img.shields.io/crates/v/lz4_flex.svg)](https://crates.io/crates/lz4_flex)
+
+# lz4_flex
+
+![lz4_flex_logo](https://raw.githubusercontent.com/PSeitz/lz4_flex/master/logo.jpg)
+
+Fastest LZ4 implementation in Rust. Originally based on [redox-os' lz4 compression](https://crates.io/crates/lz4-compress), but now a complete rewrite.
+The results in the tables below are from a benchmark in this project (66 KB JSON, 10 MB dickens) with the block format.
+
+AMD Ryzen 7 5900HX, rustc 1.69.0 (84c898d65 2023-04-16), Manjaro, CPU Boost Disabled, CPU Governor: Performance
+
+66 KB JSON
+| Compressor           | Compression | Decompression | Ratio         |
+|----------------------|-------------|---------------|---------------|
+| lz4_flex unsafe w. unchecked_decode | 1615 MiB/s | 5973 MiB/s | 0.2284 |
+| lz4_flex unsafe | 1615 MiB/s | 5512 MiB/s | 0.2284 |
+| lz4_flex safe | 1272 MiB/s | 4540 MiB/s | 0.2284 |
+| lzzz (lz4 1.9.3) | 1469 MiB/s | 5313 MiB/s | 0.2283 |
+| lz4_fear | 662 MiB/s | 939 MiB/s | 0.2283 |
+| snap | 1452 MiB/s | 1649 MiB/s | 0.2242 |
+
+10 MB dickens
+| Compressor           | Compression | Decompression | Ratio         |
+|----------------------|-------------|---------------|---------------|
+| lz4_flex unsafe w. unchecked_decode | 347 MiB/s | 3168 MiB/s | 0.6372 |
+| lz4_flex unsafe | 347 MiB/s | 2734 MiB/s | 0.6372 |
+| lz4_flex safe | 259 MiB/s | 2338 MiB/s | 0.6372 |
+| lzzz (lz4 1.9.3) | 324 MiB/s | 2759 MiB/s | 0.6372 |
+| lz4_fear | 201 MiB/s | 370 MiB/s | 0.6372 |
+| snap | 286 MiB/s | 679 MiB/s | 0.6276 |
+
+## Features
+- Very good logo
+- LZ4 Block format
+- LZ4 Frame format (thanks @arthurprs)
+- High performance
+- 1.5s clean release build time
+- Feature flags to configure safe/unsafe code usage
+- no-std support with block format (thanks @coolreader18)
+- 32-bit support
+
+## Usage
+Compression and decompression use no unsafe code via the default feature flags "safe-encode" and "safe-decode". If you need more performance you can disable them (e.g. with no-default-features).
+
+Safe:
+```
+lz4_flex = { version = "0.11" }
+```
+
+Performance:
+```
+lz4_flex = { version = "0.11", default-features = false }
+```
+
+### Block Format
+The block format is only valid for smaller data chunks, as a block is de/compressed entirely in memory.
+For larger data use the frame format, which consists of multiple blocks.
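+
+For illustration, a frame round trip over `std::io` might look like this (a sketch
+against the crate's `frame` module, which is in the default feature set; errors are
+simply unwrapped, and this is an illustrative example rather than the crate's own
+documented snippet):
+
+```rust
+use std::io::{Read, Write};
+use lz4_flex::frame::{FrameDecoder, FrameEncoder};
+
+fn main() {
+    let input = b"lots of data, streamed in multiple blocks";
+
+    // Compress into an in-memory Vec via the io::Write impl.
+    let mut enc = FrameEncoder::new(Vec::new());
+    enc.write_all(input).unwrap();
+    let compressed = enc.finish().unwrap();
+
+    // Decompress by reading the frame back via the io::Read impl.
+    let mut dec = FrameDecoder::new(compressed.as_slice());
+    let mut decompressed = Vec::new();
+    dec.read_to_end(&mut decompressed).unwrap();
+    assert_eq!(&decompressed[..], &input[..]);
+}
+```
+
+The block format, shown next, instead de/compresses a single buffer in one call: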
+
+```rust
+use lz4_flex::block::{compress_prepend_size, decompress_size_prepended};
+
+fn main(){
+    let input: &[u8] = b"Hello people, what's up?";
+    let compressed = compress_prepend_size(input);
+    let uncompressed = decompress_size_prepended(&compressed).unwrap();
+    assert_eq!(input, uncompressed);
+}
+```
+
+
+## no_std support
+
+no_std support is currently only available for the block format, since the frame format uses `std::io::Write`, which is not available in core.
+
+## Benchmarks
+The benchmarks are run with criterion; the test files are in the benches folder.
+
+Currently 4 implementations are compared: this one, [lz-fear](https://github.com/main--/rust-lz-fear), the [C version via Rust bindings](https://crates.io/crates/lzzzz) and [snappy](https://github.com/burntsushi/rust-snappy).
+The lz4_flex version is tested with the feature flags safe-decode and safe-encode switched on and off.
+
+- lz4_cpp: https://crates.io/crates/lzzzz
+- lz-fear: https://github.com/main--/rust-lz-fear
+- snap: https://github.com/burntsushi/rust-snappy
+
+Tested on AMD Ryzen 7 5900HX, rustc 1.69.0 (84c898d65 2023-04-16), Manjaro, CPU Boost Disabled, CPU 3GHz
+
+### Results v0.11.0 02-06-2023 (safe-decode and safe-encode off)
+`cargo bench --no-default-features`
+
+![Compress](./compress_bench.svg)
+
+![Decompress](./decompress_bench.svg)
+
+### Results v0.11.0 02-06-2023 (safe-decode and safe-encode on)
+`cargo bench`
+
+![Compress](./compress_bench_safe.svg)
+
+![Decompress](./decompress_bench_safe.svg)
+
+## Miri
+
+[Miri](https://github.com/rust-lang/miri) can be used to find issues related to incorrect unsafe usage:
+
+`MIRIFLAGS="-Zmiri-disable-isolation -Zmiri-disable-stacked-borrows" cargo +nightly miri test --no-default-features --features frame`
+
+## Fuzzer
+This fuzz target generates corrupted data for the decompressor:
+`cargo +nightly fuzz run fuzz_decomp_corrupt_block` and `cargo +nightly fuzz run fuzz_decomp_corrupt_frame`
+
+This fuzz target asserts that a compression and decompression roundtrip returns the original input:
+`cargo +nightly fuzz run fuzz_roundtrip` and `cargo +nightly fuzz run fuzz_roundtrip_frame`
+
+This fuzz target asserts that compression with the C++ implementation and decompression with lz4_flex returns the original input:
+`cargo +nightly fuzz run fuzz_roundtrip_cpp_compress`
+
+## Bindings in other languages
+ - Node.js: [lz4-napi](https://github.com/antoniomuso/lz4-napi)
+ - Wasm: [lz4-wasm](https://github.com/PSeitz/lz4-wasm)
+
+## TODO
+- High compression
+
+## Migrate from v0.10 to v0.11.1
+To migrate, just remove the `checked-decode` feature flag if you used it.
+
+
diff --git a/src/block/compress.rs b/src/block/compress.rs
new file mode 100644
index 0000000..c18474a
--- /dev/null
+++ b/src/block/compress.rs
@@ -0,0 +1,969 @@
+//! The compression algorithm.
+//!
+//! We make use of hash tables to find duplicates. This gives a reasonable compression ratio with
+//! high performance. It has fixed memory usage, which, contrary to other approaches, makes it
+//! less memory hungry.
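+//!
+//! For intuition only, the core lookup can be sketched like this (a simplified,
+//! hypothetical helper; the real tables, hash, and bounds handling below differ):
+//!
+//! ```ignore
+//! fn find_candidate(input: &[u8], cur: usize, table: &mut [usize; 4096]) -> Option<usize> {
+//!     // Hash the 4-byte window at `cur` (LZ4-style multiplicative hash, top 12 bits).
+//!     let window: [u8; 4] = input.get(cur..cur + 4)?.try_into().ok()?;
+//!     let slot = (u32::from_ne_bytes(window).wrapping_mul(2654435761) >> 20) as usize;
+//!     let candidate = table[slot];
+//!     table[slot] = cur;
+//!     // A hit may be a hash collision; verify the bytes actually match.
+//!     (candidate < cur && input[candidate..candidate + 4] == window).then_some(candidate)
+//! }
+//! ```
+//!
+//! The real compressor additionally rejects candidates farther away than the 16-bit
+//! offset limit (`MAX_DISTANCE`).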
+
+use crate::block::hashtable::HashTable;
+use crate::block::END_OFFSET;
+use crate::block::LZ4_MIN_LENGTH;
+use crate::block::MAX_DISTANCE;
+use crate::block::MFLIMIT;
+use crate::block::MINMATCH;
+#[cfg(not(feature = "safe-encode"))]
+use crate::sink::PtrSink;
+use crate::sink::Sink;
+use crate::sink::SliceSink;
+#[allow(unused_imports)]
+use alloc::vec;
+use alloc::vec::Vec;
+
+#[cfg(feature = "safe-encode")]
+use core::convert::TryInto;
+
+use super::hashtable::HashTable4K;
+use super::hashtable::HashTable4KU16;
+use super::{CompressError, WINDOW_SIZE};
+
+/// Increase the step size after 1 << INCREASE_STEPSIZE_BITSHIFT non-matches.
+const INCREASE_STEPSIZE_BITSHIFT: usize = 5;
+
+/// Read a 4-byte "batch" from some position.
+///
+/// This will read a native-endian 4-byte integer from some position.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+pub(super) fn get_batch(input: &[u8], n: usize) -> u32 {
+    unsafe { read_u32_ptr(input.as_ptr().add(n)) }
+}
+
+#[inline]
+#[cfg(feature = "safe-encode")]
+pub(super) fn get_batch(input: &[u8], n: usize) -> u32 {
+    u32::from_ne_bytes(input[n..n + 4].try_into().unwrap())
+}
+
+/// Read a usize-sized "batch" from some position.
+///
+/// This will read a native-endian usize from some position.
+#[inline]
+#[allow(dead_code)]
+#[cfg(not(feature = "safe-encode"))]
+pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize {
+    unsafe { read_usize_ptr(input.as_ptr().add(n)) }
+}
+
+#[inline]
+#[allow(dead_code)]
+#[cfg(feature = "safe-encode")]
+pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize {
+    const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+    let arr: &[u8; USIZE_SIZE] = input[n..n + USIZE_SIZE].try_into().unwrap();
+    usize::from_ne_bytes(*arr)
+}
+
+#[inline]
+fn token_from_literal(lit_len: usize) -> u8 {
+    if lit_len < 0xF {
+        // Since we can fit the literals length into it, there is no need for saturation.
+        (lit_len as u8) << 4
+    } else {
+        // We were unable to fit the literals into it, so we saturate to 0xF. We will later
+        // write the extensional value.
+        0xF0
+    }
+}
+
+#[inline]
+fn token_from_literal_and_match_length(lit_len: usize, duplicate_length: usize) -> u8 {
+    let mut token = if lit_len < 0xF {
+        // Since we can fit the literals length into it, there is no need for saturation.
+        (lit_len as u8) << 4
+    } else {
+        // We were unable to fit the literals into it, so we saturate to 0xF. We will later
+        // write the extensional value.
+        0xF0
+    };
+
+    token |= if duplicate_length < 0xF {
+        // We could fit it in.
+        duplicate_length as u8
+    } else {
+        // We were unable to fit it in, so we default to 0xF, which will later be extended.
+        0xF
+    };
+
+    token
+}
+
+/// Counts the number of same bytes in two byte streams.
+/// `input` is the complete input.
+/// `cur` is the current position in the input; it will be incremented by the number of matched
+/// bytes.
+/// `source` is either the same as `input` or an external slice.
+/// `candidate` is the candidate position in `source`.
+///
+/// The function ignores the last END_OFFSET bytes in input as those should be literals.
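+///
+/// For intuition: both implementations below compare whole native-endian words and use
+/// `trailing_zeros` on the XOR of two words to count matching bytes. For example, for
+/// the 8-byte blocks `*b"abcdefgh"` and `*b"abcdXYZ!"`, the XOR of the two
+/// `u64::from_le_bytes` values has its lowest non-zero byte at index 4, so
+/// `trailing_zeros() / 8 == 4`, exactly the 4 matching leading bytes (`diff.to_le()`
+/// makes this hold on big-endian targets too).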
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize {
+    const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+    let cur_slice = &input[*cur..input.len() - END_OFFSET];
+    let cand_slice = &source[candidate..];
+
+    let mut num = 0;
+    for (block1, block2) in cur_slice
+        .chunks_exact(USIZE_SIZE)
+        .zip(cand_slice.chunks_exact(USIZE_SIZE))
+    {
+        let input_block = usize::from_ne_bytes(block1.try_into().unwrap());
+        let match_block = usize::from_ne_bytes(block2.try_into().unwrap());
+
+        if input_block == match_block {
+            num += USIZE_SIZE;
+        } else {
+            let diff = input_block ^ match_block;
+            num += (diff.to_le().trailing_zeros() / 8) as usize;
+            *cur += num;
+            return num;
+        }
+    }
+
+    // If we're here we may have 1 to 7 bytes left to check close to the end of the input
+    // or source slices. Since this is a rare occurrence, we mark it cold to get
+    // ~5% better performance.
+    #[cold]
+    fn count_same_bytes_tail(a: &[u8], b: &[u8], offset: usize) -> usize {
+        a.iter()
+            .zip(b)
+            .skip(offset)
+            .take_while(|(a, b)| a == b)
+            .count()
+    }
+    num += count_same_bytes_tail(cur_slice, cand_slice, num);
+
+    *cur += num;
+    num
+}
+
+/// Counts the number of same bytes in two byte streams.
+/// `input` is the complete input.
+/// `cur` is the current position in the input; it will be incremented by the number of matched
+/// bytes.
+/// `source` is either the same as `input` OR an external slice.
+/// `candidate` is the candidate position in `source`.
+///
+/// The function ignores the last END_OFFSET bytes in input as those should be literals.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize {
+    let max_input_match = input.len().saturating_sub(*cur + END_OFFSET);
+    let max_candidate_match = source.len() - candidate;
+    // Considering both limits, calculate how far we may match in input.
+    let input_end = *cur + max_input_match.min(max_candidate_match);
+
+    let start = *cur;
+    let mut source_ptr = unsafe { source.as_ptr().add(candidate) };
+
+    // compare 4/8 byte blocks depending on the arch
+    const STEP_SIZE: usize = core::mem::size_of::<usize>();
+    while *cur + STEP_SIZE <= input_end {
+        let diff = read_usize_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_usize_ptr(source_ptr);
+
+        if diff == 0 {
+            *cur += STEP_SIZE;
+            unsafe {
+                source_ptr = source_ptr.add(STEP_SIZE);
+            }
+        } else {
+            *cur += (diff.to_le().trailing_zeros() / 8) as usize;
+            return *cur - start;
+        }
+    }
+
+    // compare 4-byte block
+    #[cfg(target_pointer_width = "64")]
+    {
+        if input_end - *cur >= 4 {
+            let diff = read_u32_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_u32_ptr(source_ptr);
+
+            if diff == 0 {
+                *cur += 4;
+                unsafe {
+                    source_ptr = source_ptr.add(4);
+                }
+            } else {
+                *cur += (diff.to_le().trailing_zeros() / 8) as usize;
+                return *cur - start;
+            }
+        }
+    }
+
+    // compare 2-byte block
+    if input_end - *cur >= 2
+        && unsafe { read_u16_ptr(input.as_ptr().add(*cur)) == read_u16_ptr(source_ptr) }
+    {
+        *cur += 2;
+        unsafe {
+            source_ptr = source_ptr.add(2);
+        }
+    }
+
+    if *cur < input_end
+        && unsafe { input.as_ptr().add(*cur).read() } == unsafe { source_ptr.read() }
+    {
+        *cur += 1;
+    }
+
+    *cur - start
+}
+
+/// Write an integer to the output.
+///
+/// Each additional byte then represents a value from 0 to 255, which is added to the previous
+/// value to produce a total length. When the byte value is 255, another byte must be read and
+/// added, and so on.
There can be any number of bytes of value "255" following token +#[inline] +#[cfg(feature = "safe-encode")] +fn write_integer(output: &mut impl Sink, mut n: usize) { + // Note: Since `n` is usually < 0xFF and writing multiple bytes to the output + // requires 2 branches of bound check (due to the possibility of add overflows) + // the simple byte at a time implementation below is faster in most cases. + while n >= 0xFF { + n -= 0xFF; + push_byte(output, 0xFF); + } + push_byte(output, n as u8); +} + +/// Write an integer to the output. +/// +/// Each additional byte then represent a value from 0 to 255, which is added to the previous value +/// to produce a total length. When the byte value is 255, another byte must read and added, and so +/// on. There can be any number of bytes of value "255" following token +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn write_integer(output: &mut impl Sink, mut n: usize) { + // Write the 0xFF bytes as long as the integer is higher than said value. + if n >= 4 * 0xFF { + // In this unlikelly branch we use a fill instead of a loop, + // otherwise rustc may output a large unrolled/vectorized loop. + let bulk = n / (4 * 0xFF); + n %= 4 * 0xFF; + unsafe { + core::ptr::write_bytes(output.pos_mut_ptr(), 0xFF, 4 * bulk); + output.set_pos(output.pos() + 4 * bulk); + } + } + + // Handle last 1 to 4 bytes + push_u32(output, 0xFFFFFFFF); + // Updating output len for the remainder + unsafe { + output.set_pos(output.pos() - 4 + 1 + n / 255); + // Write the remaining byte. + *output.pos_mut_ptr().sub(1) = (n % 255) as u8; + } +} + +/// Handle the last bytes from the input as literals +#[cold] +fn handle_last_literals(output: &mut impl Sink, input: &[u8], start: usize) { + let lit_len = input.len() - start; + + let token = token_from_literal(lit_len); + push_byte(output, token); + if lit_len >= 0xF { + write_integer(output, lit_len - 0xF); + } + // Now, write the actual literals. + output.extend_from_slice(&input[start..]); +} + +/// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate +#[inline] +#[cfg(feature = "safe-encode")] +fn backtrack_match( + input: &[u8], + cur: &mut usize, + literal_start: usize, + source: &[u8], + candidate: &mut usize, +) { + // Note: Even if iterator version of this loop has less branches inside the loop it has more + // branches before the loop. That in practice seems to make it slower than the while version + // bellow. TODO: It should be possible remove all bounds checks, since we are walking + // backwards + while *candidate > 0 && *cur > literal_start && input[*cur - 1] == source[*candidate - 1] { + *cur -= 1; + *candidate -= 1; + } +} + +/// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn backtrack_match( + input: &[u8], + cur: &mut usize, + literal_start: usize, + source: &[u8], + candidate: &mut usize, +) { + while unsafe { + *candidate > 0 + && *cur > literal_start + && input.get_unchecked(*cur - 1) == source.get_unchecked(*candidate - 1) + } { + *cur -= 1; + *candidate -= 1; + } +} + +/// Compress all bytes of `input[input_pos..]` into `output`. +/// +/// Bytes in `input[..input_pos]` are treated as a preamble and can be used for lookback. +/// This part is known as the compressor "prefix". +/// Bytes in `ext_dict` logically precede the bytes in `input` and can also be used for lookback. +/// +/// `input_stream_offset` is the logical position of the first byte of `input`. 
+/// This allows the same `dict` to be used for many calls to `compress_internal` as we can
+/// "readdress" the first byte of `input` to be something other than 0.
+///
+/// `dict` is the dictionary of previously encoded sequences.
+///
+/// This is used to find duplicates in the stream so they are not written multiple times.
+///
+/// Every four bytes are hashed, and in the resulting slot their position in the input buffer
+/// is placed in the dict. This way we can easily look up candidates for back references.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+///
+/// # Const parameters
+/// `USE_DICT`: Disables usage of ext_dict (it'll panic if a non-empty slice is used).
+/// In other words, this generates more optimized code when an external dictionary isn't used.
+///
+/// A similar const argument could be used to disable the Prefix mode (e.g. USE_PREFIX),
+/// which would impose `input_pos == 0 && input_stream_offset == 0`. Experiments didn't
+/// show significant improvement though.
+// Intentionally avoid inlining.
+// Empirical tests revealed it to be rarely better but often significantly detrimental.
+#[inline(never)]
+pub(crate) fn compress_internal<T: HashTable, const USE_DICT: bool, S: Sink>(
+    input: &[u8],
+    input_pos: usize,
+    output: &mut S,
+    dict: &mut T,
+    ext_dict: &[u8],
+    input_stream_offset: usize,
+) -> Result<usize, CompressError> {
+    assert!(input_pos <= input.len());
+    if USE_DICT {
+        assert!(ext_dict.len() <= super::WINDOW_SIZE);
+        assert!(ext_dict.len() <= input_stream_offset);
+        // Check for overflow hazard when using ext_dict
+        assert!(input_stream_offset
+            .checked_add(input.len())
+            .and_then(|i| i.checked_add(ext_dict.len()))
+            .map_or(false, |i| i <= isize::MAX as usize));
+    } else {
+        assert!(ext_dict.is_empty());
+    }
+    if output.capacity() - output.pos() < get_maximum_output_size(input.len() - input_pos) {
+        return Err(CompressError::OutputTooSmall);
+    }
+
+    let output_start_pos = output.pos();
+    if input.len() - input_pos < LZ4_MIN_LENGTH {
+        handle_last_literals(output, input, input_pos);
+        return Ok(output.pos() - output_start_pos);
+    }
+
+    let ext_dict_stream_offset = input_stream_offset - ext_dict.len();
+    let end_pos_check = input.len() - MFLIMIT;
+    let mut literal_start = input_pos;
+    let mut cur = input_pos;
+
+    if cur == 0 && input_stream_offset == 0 {
+        // According to the spec we can't start with a match,
+        // except when referencing another block.
+        let hash = T::get_hash_at(input, 0);
+        dict.put_at(hash, 0);
+        cur = 1;
+    }
+
+    loop {
+        // Read the next block into two sections, the literals and the duplicates.
+        let mut step_size;
+        let mut candidate;
+        let mut candidate_source;
+        let mut offset;
+        let mut non_match_count = 1 << INCREASE_STEPSIZE_BITSHIFT;
+        // The number of bytes before our cursor, where the duplicate starts.
+        let mut next_cur = cur;
+
+        // In this loop we search for duplicates via the hashtable. 4 or 8 bytes are hashed and
+        // compared.
+        loop {
+            step_size = non_match_count >> INCREASE_STEPSIZE_BITSHIFT;
+            non_match_count += 1;
+
+            cur = next_cur;
+            next_cur += step_size;
+
+            // Same as cur + MFLIMIT > input.len()
+            if cur > end_pos_check {
+                handle_last_literals(output, input, literal_start);
+                return Ok(output.pos() - output_start_pos);
+            }
+            // Find a candidate in the dictionary with the hash of the current four bytes.
+            // Unchecked is safe as long as the values from the hash function don't exceed the size
+            // of the table.
This is ensured by right shifting the hash values + // (`dict_bitshift`) to fit them in the table + + // [Bounds Check]: Can be elided due to `end_pos_check` above + let hash = T::get_hash_at(input, cur); + candidate = dict.get_at(hash); + dict.put_at(hash, cur + input_stream_offset); + + // Sanity check: Matches can't be ahead of `cur`. + debug_assert!(candidate <= input_stream_offset + cur); + + // Two requirements to the candidate exists: + // - We should not return a position which is merely a hash collision, so that the + // candidate actually matches what we search for. + // - We can address up to 16-bit offset, hence we are only able to address the candidate + // if its offset is less than or equals to 0xFFFF. + if input_stream_offset + cur - candidate > MAX_DISTANCE { + continue; + } + + if candidate >= input_stream_offset { + // match within input + offset = (input_stream_offset + cur - candidate) as u16; + candidate -= input_stream_offset; + candidate_source = input; + } else if USE_DICT { + // Sanity check, which may fail if we lost history beyond MAX_DISTANCE + debug_assert!( + candidate >= ext_dict_stream_offset, + "Lost history in ext dict mode" + ); + // match within ext dict + offset = (input_stream_offset + cur - candidate) as u16; + candidate -= ext_dict_stream_offset; + candidate_source = ext_dict; + } else { + // Match is not reachable anymore + // eg. compressing an independent block frame w/o clearing + // the matches tables, only increasing input_stream_offset. + // Sanity check + debug_assert!(input_pos == 0, "Lost history in prefix mode"); + continue; + } + // [Bounds Check]: Candidate is coming from the Hashmap. It can't be out of bounds, but + // impossible to prove for the compiler and remove the bounds checks. + let cand_bytes: u32 = get_batch(candidate_source, candidate); + // [Bounds Check]: Should be able to be elided due to `end_pos_check`. + let curr_bytes: u32 = get_batch(input, cur); + + if cand_bytes == curr_bytes { + break; + } + } + + // Extend the match backwards if we can + backtrack_match( + input, + &mut cur, + literal_start, + candidate_source, + &mut candidate, + ); + + // The length (in bytes) of the literals section. + let lit_len = cur - literal_start; + + // Generate the higher half of the token. + cur += MINMATCH; + candidate += MINMATCH; + let duplicate_length = count_same_bytes(input, &mut cur, candidate_source, candidate); + + // Note: The `- 2` offset was copied from the reference implementation, it could be + // arbitrary. + let hash = T::get_hash_at(input, cur - 2); + dict.put_at(hash, cur - 2 + input_stream_offset); + + let token = token_from_literal_and_match_length(lit_len, duplicate_length); + + // Push the token to the output stream. + push_byte(output, token); + // If we were unable to fit the literals length into the token, write the extensional + // part. + if lit_len >= 0xF { + write_integer(output, lit_len - 0xF); + } + + // Now, write the actual literals. + // + // The unsafe version copies blocks of 8bytes, and therefore may copy up to 7bytes more than + // needed. This is safe, because the last 12 bytes (MF_LIMIT) are handled in + // handle_last_literals. + copy_literals_wild(output, input, literal_start, lit_len); + // write the offset in little endian. + push_u16(output, offset); + + // If we were unable to fit the duplicates length into the token, write the + // extensional part. 
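+        // (e.g. duplicate_length == 30 is emitted as a 0xF nibble in the token plus
+        // a single extension byte of 30 - 0xF = 15)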
+ if duplicate_length >= 0xF { + write_integer(output, duplicate_length - 0xF); + } + literal_start = cur; + } +} + +#[inline] +#[cfg(feature = "safe-encode")] +fn push_byte(output: &mut impl Sink, el: u8) { + output.push(el); +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn push_byte(output: &mut impl Sink, el: u8) { + unsafe { + core::ptr::write(output.pos_mut_ptr(), el); + output.set_pos(output.pos() + 1); + } +} + +#[inline] +#[cfg(feature = "safe-encode")] +fn push_u16(output: &mut impl Sink, el: u16) { + output.extend_from_slice(&el.to_le_bytes()); +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn push_u16(output: &mut impl Sink, el: u16) { + unsafe { + core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 2); + output.set_pos(output.pos() + 2); + } +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn push_u32(output: &mut impl Sink, el: u32) { + unsafe { + core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 4); + output.set_pos(output.pos() + 4); + } +} + +#[inline(always)] // (always) necessary otherwise compiler fails to inline it +#[cfg(feature = "safe-encode")] +fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) { + output.extend_from_slice_wild(&input[input_start..input_start + len], len) +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) { + debug_assert!(input_start + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= input.len()); + debug_assert!(output.pos() + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= output.capacity()); + unsafe { + // Note: This used to be a wild copy loop of 8 bytes, but the compiler consistently + // transformed it into a call to memcopy, which hurts performance significantly for + // small copies, which are common. + let start_ptr = input.as_ptr().add(input_start); + match len { + 0..=8 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 8), + 9..=16 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 16), + 17..=24 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 24), + _ => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), len), + } + output.set_pos(output.pos() + len); + } +} + +/// Compress all bytes of `input` into `output`. +/// The method chooses an appropriate hashtable to lookup duplicates. +/// output should be preallocated with a size of +/// `get_maximum_output_size`. +/// +/// Returns the number of bytes written (compressed) into `output`. 
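+// (for example, a 1000-byte input needs at most 16 + 4 + (1000 as f64 * 1.1) as usize
+// = 1120 bytes of preallocated output; see `get_maximum_output_size` below)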
+
+#[inline]
+pub(crate) fn compress_into_sink_with_dict<const USE_DICT: bool>(
+    input: &[u8],
+    output: &mut impl Sink,
+    mut dict_data: &[u8],
+) -> Result<usize, CompressError> {
+    if dict_data.len() + input.len() < u16::MAX as usize {
+        let mut dict = HashTable4KU16::new();
+        init_dict(&mut dict, &mut dict_data);
+        compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len())
+    } else {
+        let mut dict = HashTable4K::new();
+        init_dict(&mut dict, &mut dict_data);
+        compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len())
+    }
+}
+
+#[inline]
+fn init_dict<T: HashTable>(dict: &mut T, dict_data: &mut &[u8]) {
+    if dict_data.len() > WINDOW_SIZE {
+        *dict_data = &dict_data[dict_data.len() - WINDOW_SIZE..];
+    }
+    let mut i = 0usize;
+    while i + core::mem::size_of::<u32>() <= dict_data.len() {
+        let hash = T::get_hash_at(dict_data, i);
+        dict.put_at(hash, i);
+        // Note: The 3 byte step was copied from the reference implementation, it could be
+        // arbitrary.
+        i += 3;
+    }
+}
+
+/// Returns the maximum output size of the compressed data.
+/// Can be used to preallocate capacity on the output vector.
+#[inline]
+pub fn get_maximum_output_size(input_len: usize) -> usize {
+    16 + 4 + (input_len as f64 * 1.1) as usize
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to lookup duplicates.
+/// output should be preallocated with a size of
+/// `get_maximum_output_size`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub fn compress_into(input: &[u8], output: &mut [u8]) -> Result<usize, CompressError> {
+    compress_into_sink_with_dict::<false>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to lookup duplicates.
+/// output should be preallocated with a size of
+/// `get_maximum_output_size`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub fn compress_into_with_dict(
+    input: &[u8],
+    output: &mut [u8],
+    dict_data: &[u8],
+) -> Result<usize, CompressError> {
+    compress_into_sink_with_dict::<true>(input, &mut SliceSink::new(output, 0), dict_data)
+}
+
+#[inline]
+fn compress_into_vec_with_dict<const USE_DICT: bool>(
+    input: &[u8],
+    prepend_size: bool,
+    mut dict_data: &[u8],
+) -> Vec<u8> {
+    let prepend_size_num_bytes = if prepend_size { 4 } else { 0 };
+    let max_compressed_size = get_maximum_output_size(input.len()) + prepend_size_num_bytes;
+    if dict_data.len() <= 3 {
+        dict_data = b"";
+    }
+    #[cfg(feature = "safe-encode")]
+    let mut compressed = {
+        let mut compressed: Vec<u8> = vec![0u8; max_compressed_size];
+        let out = if prepend_size {
+            compressed[..4].copy_from_slice(&(input.len() as u32).to_le_bytes());
+            &mut compressed[4..]
+        } else {
+            &mut compressed
+        };
+        let compressed_len =
+            compress_into_sink_with_dict::<USE_DICT>(input, &mut SliceSink::new(out, 0), dict_data)
+                .unwrap();
+
+        compressed.truncate(prepend_size_num_bytes + compressed_len);
+        compressed
+    };
+    #[cfg(not(feature = "safe-encode"))]
+    let mut compressed = {
+        let mut vec = Vec::with_capacity(max_compressed_size);
+        let start_pos = if prepend_size {
+            vec.extend_from_slice(&(input.len() as u32).to_le_bytes());
+            4
+        } else {
+            0
+        };
+        let compressed_len = compress_into_sink_with_dict::<USE_DICT>(
+            input,
+            &mut PtrSink::from_vec(&mut vec, start_pos),
+            dict_data,
+        )
+        .unwrap();
+        unsafe {
+            vec.set_len(prepend_size_num_bytes + compressed_len);
+        }
+        vec
+    };
+
+    compressed.shrink_to_fit();
+    compressed
+}
+
+/// Compress all bytes of `input` into `output`.
The uncompressed size will be prepended as a little +/// endian u32. Can be used in conjunction with `decompress_size_prepended` +#[inline] +pub fn compress_prepend_size(input: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, true, b"") +} + +/// Compress all bytes of `input`. +#[inline] +pub fn compress(input: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, false, b"") +} + +/// Compress all bytes of `input` with an external dictionary. +#[inline] +pub fn compress_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, false, ext_dict) +} + +/// Compress all bytes of `input` into `output`. The uncompressed size will be prepended as a little +/// endian u32. Can be used in conjunction with `decompress_size_prepended_with_dict` +#[inline] +pub fn compress_prepend_size_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, true, ext_dict) +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn read_u16_ptr(input: *const u8) -> u16 { + let mut num: u16 = 0; + unsafe { + core::ptr::copy_nonoverlapping(input, &mut num as *mut u16 as *mut u8, 2); + } + num +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn read_u32_ptr(input: *const u8) -> u32 { + let mut num: u32 = 0; + unsafe { + core::ptr::copy_nonoverlapping(input, &mut num as *mut u32 as *mut u8, 4); + } + num +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn read_usize_ptr(input: *const u8) -> usize { + let mut num: usize = 0; + unsafe { + core::ptr::copy_nonoverlapping( + input, + &mut num as *mut usize as *mut u8, + core::mem::size_of::(), + ); + } + num +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_count_same_bytes() { + // 8byte aligned block, zeros and ones are added because the end/offset + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 16); + + // 4byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 20); + + // 2byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22); + + // 1byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 23); + + // 1byte aligned block - last byte different + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22); + + 
// 1byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 9, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 21); + + for diff_idx in 8..100 { + let first: Vec = (0u8..255).cycle().take(100 + 12).collect(); + let mut second = first.clone(); + second[diff_idx] = 255; + for start in 0..=diff_idx { + let same_bytes = count_same_bytes(&first, &mut start.clone(), &second, start); + assert_eq!(same_bytes, diff_idx - start); + } + } + } + + #[test] + fn test_bug() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let _out = compress(input); + } + + #[test] + fn test_dict() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let dict = input; + let compressed = compress_with_dict(input, dict); + assert_lt!(compressed.len(), compress(input).len()); + + assert!(compressed.len() < compress(input).len()); + let mut uncompressed = vec![0u8; input.len()]; + let uncomp_size = crate::block::decompress::decompress_into_with_dict( + &compressed, + &mut uncompressed, + dict, + ) + .unwrap(); + uncompressed.truncate(uncomp_size); + assert_eq!(input, uncompressed); + } + + #[test] + fn test_dict_no_panic() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let dict = &[10, 12, 14]; + let _compressed = compress_with_dict(input, dict); + } + + #[test] + fn test_dict_match_crossing() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let dict = input; + let compressed = compress_with_dict(input, dict); + assert_lt!(compressed.len(), compress(input).len()); + + let mut uncompressed = vec![0u8; input.len() * 2]; + // copy first half of the input into output + let dict_cutoff = dict.len() / 2; + let output_start = dict.len() - dict_cutoff; + uncompressed[..output_start].copy_from_slice(&dict[dict_cutoff..]); + let uncomp_len = { + let mut sink = SliceSink::new(&mut uncompressed[..], output_start); + crate::block::decompress::decompress_internal::( + &compressed, + &mut sink, + &dict[..dict_cutoff], + ) + .unwrap() + }; + assert_eq!(input.len(), uncomp_len); + assert_eq!( + input, + &uncompressed[output_start..output_start + uncomp_len] + ); + } + + #[test] + fn test_conformant_last_block() { + // From the spec: + // The last match must start at least 12 bytes before the end of block. + // The last match is part of the penultimate sequence. It is followed by the last sequence, + // which contains only literals. Note that, as a consequence, an independent block < + // 13 bytes cannot be compressed, because the match must copy "something", + // so it needs at least one prior byte. + // When a block can reference data from another block, it can start immediately with a match + // and no literal, so a block of 12 bytes can be compressed. 
+ let aaas: &[u8] = b"aaaaaaaaaaaaaaa"; + + // uncompressible + let out = compress(&aaas[..12]); + assert_gt!(out.len(), 12); + // compressible + let out = compress(&aaas[..13]); + assert_le!(out.len(), 13); + let out = compress(&aaas[..14]); + assert_le!(out.len(), 14); + let out = compress(&aaas[..15]); + assert_le!(out.len(), 15); + + // dict uncompressible + let out = compress_with_dict(&aaas[..11], aaas); + assert_gt!(out.len(), 11); + // compressible + let out = compress_with_dict(&aaas[..12], aaas); + // According to the spec this _could_ compres, but it doesn't in this lib + // as it aborts compression for any input len < LZ4_MIN_LENGTH + assert_gt!(out.len(), 12); + let out = compress_with_dict(&aaas[..13], aaas); + assert_le!(out.len(), 13); + let out = compress_with_dict(&aaas[..14], aaas); + assert_le!(out.len(), 14); + let out = compress_with_dict(&aaas[..15], aaas); + assert_le!(out.len(), 15); + } + + #[test] + fn test_dict_size() { + let dict = vec![b'a'; 1024 * 1024]; + let input = &b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"[..]; + let compressed = compress_prepend_size_with_dict(input, &dict); + let decompressed = + crate::block::decompress_size_prepended_with_dict(&compressed, &dict).unwrap(); + assert_eq!(decompressed, input); + } +} diff --git a/src/block/decompress.rs b/src/block/decompress.rs new file mode 100644 index 0000000..62065a9 --- /dev/null +++ b/src/block/decompress.rs @@ -0,0 +1,544 @@ +//! The block decompression algorithm. +use crate::block::{DecompressError, MINMATCH}; +use crate::fastcpy_unsafe; +use crate::sink::SliceSink; +use crate::sink::{PtrSink, Sink}; +use alloc::vec::Vec; + +/// Copies data to output_ptr by self-referential copy from start and match_length +#[inline] +unsafe fn duplicate( + output_ptr: &mut *mut u8, + output_end: *mut u8, + start: *const u8, + match_length: usize, +) { + // We cannot simply use memcpy or `extend_from_slice`, because these do not allow + // self-referential copies: http://ticki.github.io/img/lz4_runs_encoding_diagram.svg + + // Considering that `wild_copy_match_16` can copy up to `16 - 1` extra bytes. + // Defer to `duplicate_overlapping` in case of an overlapping match + // OR the if the wild copy would copy beyond the end of the output. + if (output_ptr.offset_from(start) as usize) < match_length + 16 - 1 + || (output_end.offset_from(*output_ptr) as usize) < match_length + 16 - 1 + { + duplicate_overlapping(output_ptr, start, match_length); + } else { + debug_assert!( + output_ptr.add(match_length / 16 * 16 + ((match_length % 16) != 0) as usize * 16) + <= output_end + ); + wild_copy_from_src_16(start, *output_ptr, match_length); + *output_ptr = output_ptr.add(match_length); + } +} + +#[inline] +fn wild_copy_from_src_16(mut source: *const u8, mut dst_ptr: *mut u8, num_items: usize) { + // Note: if the compiler auto-vectorizes this it'll hurt performance! + // It's not the case for 16 bytes stepsize, but for 8 bytes. 
+ unsafe { + let dst_ptr_end = dst_ptr.add(num_items); + loop { + core::ptr::copy_nonoverlapping(source, dst_ptr, 16); + source = source.add(16); + dst_ptr = dst_ptr.add(16); + if dst_ptr >= dst_ptr_end { + break; + } + } + } +} + +/// Copy function, if the data start + match_length overlaps into output_ptr +#[inline] +#[cfg_attr(nightly, optimize(size))] // to avoid loop unrolling +unsafe fn duplicate_overlapping( + output_ptr: &mut *mut u8, + mut start: *const u8, + match_length: usize, +) { + // There is an edge case when output_ptr == start, which causes the decoder to potentially + // expose up to match_length bytes of uninitialized data in the decompression buffer. + // To prevent that we write a dummy zero to output, which will zero out output in such cases. + // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772 + output_ptr.write(0u8); + let dst_ptr_end = output_ptr.add(match_length); + + while output_ptr.add(1) < dst_ptr_end { + // Note that this loop unrolling is done, so that the compiler doesn't do it in a awful + // way. + // Without that the compiler will unroll/auto-vectorize the copy with a lot of branches. + // This is not what we want, as large overlapping copies are not that common. + core::ptr::copy(start, *output_ptr, 1); + start = start.add(1); + *output_ptr = output_ptr.add(1); + + core::ptr::copy(start, *output_ptr, 1); + start = start.add(1); + *output_ptr = output_ptr.add(1); + } + + if *output_ptr < dst_ptr_end { + core::ptr::copy(start, *output_ptr, 1); + *output_ptr = output_ptr.add(1); + } +} + +#[inline] +unsafe fn copy_from_dict( + output_base: *mut u8, + output_ptr: &mut *mut u8, + ext_dict: &[u8], + offset: usize, + match_length: usize, +) -> usize { + // If we're here we know offset > output pos, so we have at least 1 byte to copy from dict + debug_assert!(output_ptr.offset_from(output_base) >= 0); + debug_assert!(offset > output_ptr.offset_from(output_base) as usize); + // If unchecked-decode is not disabled we also know that the offset falls within ext_dict + debug_assert!(ext_dict.len() + output_ptr.offset_from(output_base) as usize >= offset); + + let dict_offset = ext_dict.len() + output_ptr.offset_from(output_base) as usize - offset; + // Can't copy past ext_dict len, the match may cross dict and output + let dict_match_length = match_length.min(ext_dict.len() - dict_offset); + // TODO test fastcpy_unsafe + core::ptr::copy_nonoverlapping( + ext_dict.as_ptr().add(dict_offset), + *output_ptr, + dict_match_length, + ); + *output_ptr = output_ptr.add(dict_match_length); + dict_match_length +} + +/// Read an integer. +/// +/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In +/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add +/// this byte to our sum and terminate the loop. +/// +/// # Example +/// +/// ```notest +/// 255, 255, 255, 4, 2, 3, 4, 6, 7 +/// ``` +/// +/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 is ignored, because +/// 4 is the first non-0xFF byte. +#[inline] +fn read_integer_ptr( + input_ptr: &mut *const u8, + _input_ptr_end: *const u8, +) -> Result { + // We start at zero and count upwards. + let mut n: u32 = 0; + // If this byte takes value 255 (the maximum value it can take), another byte is read + // and added to the sum. This repeats until a byte lower than 255 is read. + loop { + // We add the next byte until we get a byte which we add to the counting variable. 
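+ // Worked example: for the bytes [255, 255, 4] the loop accumulates
+ // 255 + 255 + 4 = 514 and stops at 4, the first byte below 0xFF.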
+
+
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if *input_ptr >= _input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ let extra = unsafe { input_ptr.read() };
+ *input_ptr = unsafe { input_ptr.add(1) };
+ n += extra as u32;
+
+ // We continue if we got 255, break otherwise.
+ if extra != 0xFF {
+ break;
+ }
+ }
+
+ // e.g. input 255, 255, 255, 8 yields n = 255 + 255 + 255 + 8 = 773.
+ Ok(n)
+}
+
+/// Read a little-endian 16-bit integer from the input stream.
+#[inline]
+fn read_u16_ptr(input_ptr: &mut *const u8) -> u16 {
+ let mut num: u16 = 0;
+ unsafe {
+ core::ptr::copy_nonoverlapping(*input_ptr, &mut num as *mut u16 as *mut u8, 2);
+ *input_ptr = input_ptr.add(2);
+ }
+
+ u16::from_le(num)
+}
+
+const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111;
+const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000;
+
+#[test]
+fn check_token() {
+ assert!(!does_token_fit(15));
+ assert!(does_token_fit(14));
+ assert!(does_token_fit(114));
+ assert!(!does_token_fit(0b11110000));
+ assert!(does_token_fit(0b10110000));
+}
+
+/// The token consists of two parts, the literal length (upper 4 bits) and the match length
+/// (lower 4 bits). If the literal length and match length are both below 15, we don't need to
+/// read additional data, so the token fits the metadata in a single u8.
+#[inline]
+fn does_token_fit(token: u8) -> bool {
+ !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL
+ || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH)
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ output: &mut S,
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ // Prevent segfault for empty input
+ if input.is_empty() {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+
+ let ext_dict = if USE_DICT {
+ ext_dict
+ } else {
+ // ensure the optimizer knows ext_dict length is 0 if !USE_DICT
+ debug_assert!(ext_dict.is_empty());
+ &[]
+ };
+ let output_base = unsafe { output.base_mut_ptr() };
+ let output_end = unsafe { output_base.add(output.capacity()) };
+ let output_start_pos_ptr = unsafe { output.base_mut_ptr().add(output.pos()) as *mut u8 };
+ let mut output_ptr = output_start_pos_ptr;
+
+ let mut input_ptr = input.as_ptr();
+ let input_ptr_end = unsafe { input.as_ptr().add(input.len()) };
+ let safe_distance_from_end = (16 /* literal copy */ + 2 /* u16 match offset */ + 1 /* the next token to read (we can skip the check) */).min(input.len());
+ let input_ptr_safe = unsafe { input_ptr_end.sub(safe_distance_from_end) };
+
+ let safe_output_ptr = unsafe {
+ let mut output_num_safe_bytes = output
+ .capacity()
+ .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
+ if USE_DICT {
+ // In the dictionary case the output pointer is moved by the match length in the
+ // dictionary. This may be up to 17 bytes without exiting the loop. So we need to
+ // ensure that we have at least 17 additional bytes of space left in the output
+ // buffer in the fast loop.
+ output_num_safe_bytes = output_num_safe_bytes.saturating_sub(17);
+ };
+
+ output_base.add(output_num_safe_bytes)
+ };
+
+ // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
+ // empty.
+ loop {
+ // Read the token. The token is the first byte in a block. It contains two 4-bit
+ // "fields", a higher and a lower, representing the literals' length and the back
+ // reference's length, respectively.
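+ // Example: token 0x34 encodes a literal length of 3 (high nibble) and a
+ // match length of 0x4 + MINMATCH = 8 (low nibble); 0xF in either nibble
+ // means extra length bytes follow.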
+ let token = unsafe { input_ptr.read() };
+ input_ptr = unsafe { input_ptr.add(1) };
+
+ // Check for the hot loop.
+ // In most cases the metadata fits in a single 1-byte token (statistically) and we are at
+ // a safe distance from the end. This enables some optimized handling.
+ //
+ // Ideally we want to check for a safe output pos like: output.pos() <= safe_output_pos;
+ // but that doesn't work when safe_output_ptr == output_ptr due to insufficient
+ // capacity. So we use `<` instead of `<=`, which covers that case.
+ if does_token_fit(token)
+ && (input_ptr as usize) <= input_ptr_safe as usize
+ && output_ptr < safe_output_ptr
+ {
+ let literal_length = (token >> 4) as usize;
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+
+ // output_ptr <= safe_output_ptr should guarantee we have enough space in output
+ debug_assert!(
+ unsafe { output_ptr.add(literal_length + match_length) } <= output_end,
+ "{literal_length} + {match_length} = {} won't fit",
+ literal_length + match_length
+ );
+
+ // Copy the literal
+ // The literal is at max 16 bytes, and the is_safe_distance check assures
+ // that we are far enough away from the end so we can safely copy 16 bytes
+ unsafe {
+ core::ptr::copy_nonoverlapping(input_ptr, output_ptr, 16);
+ input_ptr = input_ptr.add(literal_length);
+ output_ptr = output_ptr.add(literal_length);
+ }
+
+ // input_ptr <= input_ptr_safe should guarantee we have enough space in input
+ debug_assert!(input_ptr_end as usize - input_ptr as usize >= 2);
+ let offset = read_u16_ptr(&mut input_ptr) as usize;
+
+ let output_len = unsafe { output_ptr.offset_from(output_base) as usize };
+ let offset = offset.min(output_len + ext_dict.len());
+
+ // Check if part of the match is in the external dict
+ if USE_DICT && offset > output_len {
+ let copied = unsafe {
+ copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length)
+ };
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output
+ match_length -= copied;
+ }
+
+ // Calculate the start of this duplicate segment. At this point offset was already
+ // checked to be in bounds and the external dictionary copy, if any, was
+ // already copied and subtracted from match_length.
+ let start_ptr = unsafe { output_ptr.sub(offset) };
+ debug_assert!(start_ptr >= output_base);
+ debug_assert!(start_ptr < output_end);
+ debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
+
+ // In this branch we know that match_length is at most 18 (14 + MINMATCH).
+ // But the blocks can overlap, so make sure they are at least 18 bytes apart
+ // to enable an optimized copy of 18 bytes.
+ if offset >= match_length {
+ unsafe {
+ // _copy_, not copy_nonoverlapping, as the regions may overlap.
+ // Compiles to the same assembly on x86_64.
+ core::ptr::copy(start_ptr, output_ptr, 18);
+ output_ptr = output_ptr.add(match_length);
+ }
+ } else {
+ unsafe {
+ duplicate_overlapping(&mut output_ptr, start_ptr, match_length);
+ }
+ }
+
+ continue;
+ }
+
+ // Now, we read the literal section.
+ // If the initial value is 15, another byte will be read and added to it.
+ let mut literal_length = (token >> 4) as usize;
+ if literal_length != 0 {
+ if literal_length == 15 {
+ // The literal_length took the maximal value, indicating that there are more
+ // than 15 literal bytes. We read the extra integer.
+ literal_length += read_integer_ptr(&mut input_ptr, input_ptr_end)?
as usize; + } + + #[cfg(not(feature = "unchecked-decode"))] + { + // Check if literal is out of bounds for the input, and if there is enough space on + // the output + if literal_length > input_ptr_end as usize - input_ptr as usize { + return Err(DecompressError::LiteralOutOfBounds); + } + if literal_length > unsafe { output_end.offset_from(output_ptr) as usize } { + return Err(DecompressError::OutputTooSmall { + expected: unsafe { output_ptr.offset_from(output_base) as usize } + + literal_length, + actual: output.capacity(), + }); + } + } + unsafe { + fastcpy_unsafe::slice_copy(input_ptr, output_ptr, literal_length); + output_ptr = output_ptr.add(literal_length); + input_ptr = input_ptr.add(literal_length); + } + } + + // If the input stream is emptied, we break out of the loop. This is only the case + // in the end of the stream, since the block is intact otherwise. + if input_ptr >= input_ptr_end { + break; + } + + // Read duplicate section + #[cfg(not(feature = "unchecked-decode"))] + { + if (input_ptr_end as usize) - (input_ptr as usize) < 2 { + return Err(DecompressError::ExpectedAnotherByte); + } + } + let offset = read_u16_ptr(&mut input_ptr) as usize; + // Obtain the initial match length. The match length is the length of the duplicate segment + // which will later be copied from data previously decompressed into the output buffer. The + // initial length is derived from the second part of the token (the lower nibble), we read + // earlier. Since having a match length of less than 4 would mean negative compression + // ratio, we start at 4 (MINMATCH). + + // The initial match length can maximally be 19 (MINMATCH + 15). As with the literal length, + // this indicates that there are more bytes to read. + let mut match_length = MINMATCH + (token & 0xF) as usize; + if match_length == MINMATCH + 15 { + // The match length took the maximal value, indicating that there is more bytes. We + // read the extra integer. + match_length += read_integer_ptr(&mut input_ptr, input_ptr_end)? as usize; + } + + // We now copy from the already decompressed buffer. This allows us for storing duplicates + // by simply referencing the other location. + let output_len = unsafe { output_ptr.offset_from(output_base) as usize }; + + // We'll do a bounds check except unchecked-decode is enabled. + #[cfg(not(feature = "unchecked-decode"))] + { + if offset > output_len + ext_dict.len() { + return Err(DecompressError::OffsetOutOfBounds); + } + if match_length > unsafe { output_end.offset_from(output_ptr) as usize } { + return Err(DecompressError::OutputTooSmall { + expected: output_len + match_length, + actual: output.capacity(), + }); + } + } + + if USE_DICT && offset > output_len { + let copied = unsafe { + copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length) + }; + if copied == match_length { + #[cfg(not(feature = "unchecked-decode"))] + { + if input_ptr >= input_ptr_end { + return Err(DecompressError::ExpectedAnotherByte); + } + } + + continue; + } + // match crosses ext_dict and output + match_length -= copied; + } + + // Calculate the start of this duplicate segment. At this point offset was already checked + // to be in bounds and the external dictionary copy, if any, was already copied and + // subtracted from match_length. 
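+ // Overlap illustration: offset 1 with match_length 5 replicates the last
+ // output byte five times (run-length style); `duplicate` defers such
+ // overlapping copies to `duplicate_overlapping`.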
+ let start_ptr = unsafe { output_ptr.sub(offset) };
+ debug_assert!(start_ptr >= output_base);
+ debug_assert!(start_ptr < output_end);
+ debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
+ unsafe {
+ duplicate(&mut output_ptr, output_end, start_ptr, match_length);
+ }
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if input_ptr >= input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ }
+ unsafe {
+ output.set_pos(output_ptr.offset_from(output_base) as usize);
+ Ok(output_ptr.offset_from(output_start_pos_ptr) as usize)
+ }
+}
+
+/// Decompress all bytes of `input` into `output`.
+/// `output` should be preallocated with the size of the uncompressed data.
+#[inline]
+pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result<usize, DecompressError> {
+ decompress_internal::<false, _>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub fn decompress_into_with_dict(
+ input: &[u8],
+ output: &mut [u8],
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ decompress_internal::<true, _>(input, &mut SliceSink::new(output, 0), ext_dict)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` must be equal to or larger than the uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress_with_dict(
+ input: &[u8],
+ min_uncompressed_size: usize,
+ ext_dict: &[u8],
+) -> Result<Vec<u8>, DecompressError> {
+ // Allocate a vector to contain the decompressed stream.
+ let mut vec = Vec::with_capacity(min_uncompressed_size);
+ let decomp_len =
+ decompress_internal::<true, _>(input, &mut PtrSink::from_vec(&mut vec, 0), ext_dict)?;
+ unsafe {
+ vec.set_len(decomp_len);
+ }
+ Ok(vec)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian. Can be used in conjunction with `compress_prepend_size`.
+#[inline]
+pub fn decompress_size_prepended(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
+ let (uncompressed_size, input) = super::uncompressed_size(input)?;
+ decompress(input, uncompressed_size)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` must be equal to or larger than the uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result<Vec<u8>, DecompressError> {
+ // Allocate a vector to contain the decompressed stream.
+ let mut vec = Vec::with_capacity(min_uncompressed_size);
+ let decomp_len =
+ decompress_internal::<false, _>(input, &mut PtrSink::from_vec(&mut vec, 0), b"")?;
+ unsafe {
+ vec.set_len(decomp_len);
+ }
+ Ok(vec)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian.
Can be used in conjunction with `compress_prepend_size_with_dict` +#[inline] +pub fn decompress_size_prepended_with_dict( + input: &[u8], + ext_dict: &[u8], +) -> Result, DecompressError> { + let (uncompressed_size, input) = super::uncompressed_size(input)?; + decompress_with_dict(input, uncompressed_size, ext_dict) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn all_literal() { + assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49"); + } + + // this error test is only valid with checked-decode. + #[cfg(not(feature = "unchecked-decode"))] + #[test] + fn offset_oob() { + decompress(&[0x10, b'a', 2, 0], 4).unwrap_err(); + decompress(&[0x40, b'a', 1, 0], 4).unwrap_err(); + } +} diff --git a/src/block/decompress_safe.rs b/src/block/decompress_safe.rs new file mode 100644 index 0000000..cebe3cb --- /dev/null +++ b/src/block/decompress_safe.rs @@ -0,0 +1,399 @@ +//! The block decompression algorithm. + +use core::convert::TryInto; + +use crate::block::DecompressError; +use crate::block::MINMATCH; +use crate::sink::Sink; +use crate::sink::SliceSink; +use alloc::vec; +use alloc::vec::Vec; + +/// Read an integer. +/// +/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In +/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add +/// this byte to our sum and terminate the loop. +/// +/// # Example +/// +/// ```notest +/// 255, 255, 255, 4, 2, 3, 4, 6, 7 +/// ``` +/// +/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 is ignored, because +/// 4 is the first non-0xFF byte. +#[inline] +fn read_integer(input: &[u8], input_pos: &mut usize) -> Result { + // We start at zero and count upwards. + let mut n: u32 = 0; + // If this byte takes value 255 (the maximum value it can take), another byte is read + // and added to the sum. This repeats until a byte lower than 255 is read. + loop { + // We add the next byte until we get a byte which we add to the counting variable. + let extra: u8 = *input + .get(*input_pos) + .ok_or(DecompressError::ExpectedAnotherByte)?; + *input_pos += 1; + n += extra as u32; + + // We continue if we got 255, break otherwise. + if extra != 0xFF { + break; + } + } + + // 255, 255, 255, 8 + // 111, 111, 111, 101 + + Ok(n) +} + +/// Read a little-endian 16-bit integer from the input stream. +#[inline] +fn read_u16(input: &[u8], input_pos: &mut usize) -> Result { + let dst = input + .get(*input_pos..*input_pos + 2) + .ok_or(DecompressError::ExpectedAnotherByte)?; + *input_pos += 2; + Ok(u16::from_le_bytes(dst.try_into().unwrap())) +} + +const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111; +const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000; + +#[test] +fn check_token() { + assert!(!does_token_fit(15)); + assert!(does_token_fit(14)); + assert!(does_token_fit(114)); + assert!(!does_token_fit(0b11110000)); + assert!(does_token_fit(0b10110000)); +} + +/// The token consists of two parts, the literal length (upper 4 bits) and match_length (lower 4 +/// bits) if the literal length and match_length are both below 15, we don't need to read additional +/// data, so the token does fit the metadata. +#[inline] +fn does_token_fit(token: u8) -> bool { + !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL + || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH) +} + +/// Decompress all bytes of `input` into `output`. +/// +/// Returns the number of bytes written (decompressed) into `output`. 
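+// Shape of the loop below: read a token; if the token and both positions are
+// within the precomputed safe bounds, take the branch-light fast path,
+// otherwise fall back to the fully bounds-checked slow path. Every iteration
+// ends by copying a match from earlier output (or from the dictionary).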
+#[inline(always)] // (always) necessary to get the best performance in non-LTO builds
+pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ output: &mut S,
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ let mut input_pos = 0;
+ let initial_output_pos = output.pos();
+
+ let safe_input_pos = input
+ .len()
+ .saturating_sub(16 /* literal copy */ + 2 /* u16 match offset */);
+ let mut safe_output_pos = output
+ .capacity()
+ .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
+
+ if USE_DICT {
+ // In the dictionary case the output pointer is moved by the match length in the dictionary.
+ // This may be up to 17 bytes without exiting the loop. So we need to ensure that we have
+ // at least 17 additional bytes of space left in the output buffer in the fast loop.
+ safe_output_pos = safe_output_pos.saturating_sub(17);
+ };
+
+ // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
+ // empty.
+ loop {
+ // Read the token. The token is the first byte in a block. It contains two 4-bit
+ // "fields", a higher and a lower, representing the literals' length and the back
+ // reference's length, respectively.
+ let token = *input
+ .get(input_pos)
+ .ok_or(DecompressError::ExpectedAnotherByte)?;
+ input_pos += 1;
+
+ // Check for the hot loop.
+ // In most cases the metadata fits in a single 1-byte token (statistically) and we are at
+ // a safe distance from the end. This enables some optimized handling.
+ //
+ // Ideally we want to check for a safe output pos like: output.pos() <= safe_output_pos;
+ // but that doesn't work when the safe_output_pos is 0 due to saturating_sub. So we use
+ // `<` instead of `<=`, which covers that case.
+ if does_token_fit(token) && input_pos <= safe_input_pos && output.pos() < safe_output_pos {
+ let literal_length = (token >> 4) as usize;
+
+ // casting to [u8; 16] doesn't seem to make a difference vs &[u8] (same assembly)
+ let input: &[u8; 16] = input[input_pos..input_pos + 16].try_into().unwrap();
+
+ // Copy the literal
+ // The literal is at max 14 bytes, and the is_safe_distance check assures
+ // that we are far enough away from the end so we can safely copy 16 bytes
+ output.extend_from_slice_wild(input, literal_length);
+ input_pos += literal_length;
+
+ // clone as we don't want to mutate
+ let offset = read_u16(input, &mut literal_length.clone())? as usize;
+ input_pos += 2;
+
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+
+ if USE_DICT && offset > output.pos() {
+ let copied = copy_from_dict(output, ext_dict, offset, match_length)?;
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output, offset is still correct as output pos
+ // increased
+ match_length -= copied;
+ }
+
+ // In this branch we know that match_length is at most 18 (14 + MINMATCH).
+ // But the blocks can overlap, so make sure they are at least 18 bytes apart
+ // to enable an optimized copy of 18 bytes.
+ let start = output.pos().saturating_sub(offset);
+ if offset >= match_length {
+ output.extend_from_within(start, 18, match_length);
+ } else {
+ output.extend_from_within_overlapping(start, match_length)
+ }
+
+ continue;
+ }
+
+ // Now, we read the literals section.
+ // Literal Section + // If the initial value is 15, it is indicated that another byte will be read and added to + // it + let mut literal_length = (token >> 4) as usize; + if literal_length != 0 { + if literal_length == 15 { + // The literal_length length took the maximal value, indicating that there is more + // than 15 literal_length bytes. We read the extra integer. + literal_length += read_integer(input, &mut input_pos)? as usize; + } + + if literal_length > input.len() - input_pos { + return Err(DecompressError::LiteralOutOfBounds); + } + #[cfg(not(feature = "unchecked-decode"))] + if literal_length > output.capacity() - output.pos() { + return Err(DecompressError::OutputTooSmall { + expected: output.pos() + literal_length, + actual: output.capacity(), + }); + } + output.extend_from_slice(&input[input_pos..input_pos + literal_length]); + input_pos += literal_length; + } + + // If the input stream is emptied, we break out of the loop. This is only the case + // in the end of the stream, since the block is intact otherwise. + if input_pos >= input.len() { + break; + } + + let offset = read_u16(input, &mut input_pos)? as usize; + // Obtain the initial match length. The match length is the length of the duplicate segment + // which will later be copied from data previously decompressed into the output buffer. The + // initial length is derived from the second part of the token (the lower nibble), we read + // earlier. Since having a match length of less than 4 would mean negative compression + // ratio, we start at 4 (MINMATCH). + + // The initial match length can maximally be 19. As with the literal length, this indicates + // that there are more bytes to read. + let mut match_length = MINMATCH + (token & 0xF) as usize; + if match_length == MINMATCH + 15 { + // The match length took the maximal value, indicating that there is more bytes. We + // read the extra integer. + match_length += read_integer(input, &mut input_pos)? as usize; + } + + #[cfg(not(feature = "unchecked-decode"))] + if output.pos() + match_length > output.capacity() { + return Err(DecompressError::OutputTooSmall { + expected: output.pos() + match_length, + actual: output.capacity(), + }); + } + if USE_DICT && offset > output.pos() { + let copied = copy_from_dict(output, ext_dict, offset, match_length)?; + if copied == match_length { + continue; + } + // match crosses ext_dict and output, offset is still correct as output_len was + // increased + match_length -= copied; + } + // We now copy from the already decompressed buffer. This allows us for storing duplicates + // by simply referencing the other location. 
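+ // Illustration: with output "abcabc" (pos 6), offset 3 and match_length 3
+ // append bytes 3..6 ("abc") once more.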
+ duplicate_slice(output, offset, match_length)?; + } + Ok(output.pos() - initial_output_pos) +} + +#[inline] +fn copy_from_dict( + output: &mut impl Sink, + ext_dict: &[u8], + offset: usize, + match_length: usize, +) -> Result { + // If we're here we know offset > output.pos + debug_assert!(offset > output.pos()); + let (dict_offset, did_overflow) = ext_dict.len().overflowing_sub(offset - output.pos()); + if did_overflow { + return Err(DecompressError::OffsetOutOfBounds); + } + // Can't copy past ext_dict len, the match may cross dict and output + let dict_match_length = match_length.min(ext_dict.len() - dict_offset); + let ext_match = &ext_dict[dict_offset..dict_offset + dict_match_length]; + output.extend_from_slice(ext_match); + Ok(dict_match_length) +} + +/// Extends output by self-referential copies +#[inline(always)] // (always) necessary otherwise compiler fails to inline it +fn duplicate_slice( + output: &mut impl Sink, + offset: usize, + match_length: usize, +) -> Result<(), DecompressError> { + // This function assumes output will fit match_length, it might panic otherwise. + if match_length > offset { + duplicate_overlapping_slice(output, offset, match_length)?; + } else { + let (start, did_overflow) = output.pos().overflowing_sub(offset); + if did_overflow { + return Err(DecompressError::OffsetOutOfBounds); + } + + match match_length { + 0..=32 if output.pos() + 32 <= output.capacity() => { + output.extend_from_within(start, 32, match_length) + } + 33..=64 if output.pos() + 64 <= output.capacity() => { + output.extend_from_within(start, 64, match_length) + } + _ => output.extend_from_within(start, match_length, match_length), + } + } + Ok(()) +} + +/// self-referential copy for the case data start (end of output - offset) + match_length overlaps +/// into output +#[inline] +fn duplicate_overlapping_slice( + sink: &mut impl Sink, + offset: usize, + match_length: usize, +) -> Result<(), DecompressError> { + // This function assumes output will fit match_length, it might panic otherwise. + let (start, did_overflow) = sink.pos().overflowing_sub(offset); + if did_overflow { + return Err(DecompressError::OffsetOutOfBounds); + } + if offset == 1 { + let val = sink.byte_at(start); + sink.extend_with_fill(val, match_length); + } else { + sink.extend_from_within_overlapping(start, match_length); + } + Ok(()) +} + +/// Decompress all bytes of `input` into `output`. +/// `output` should be preallocated with a size of of the uncompressed data. +#[inline] +pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result { + decompress_internal::(input, &mut SliceSink::new(output, 0), b"") +} + +/// Decompress all bytes of `input` into `output`. +/// +/// Returns the number of bytes written (decompressed) into `output`. +#[inline] +pub fn decompress_into_with_dict( + input: &[u8], + output: &mut [u8], + ext_dict: &[u8], +) -> Result { + decompress_internal::(input, &mut SliceSink::new(output, 0), ext_dict) +} + +/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in +/// litte endian. Can be used in conjunction with `compress_prepend_size` +#[inline] +pub fn decompress_size_prepended(input: &[u8]) -> Result, DecompressError> { + let (uncompressed_size, input) = super::uncompressed_size(input)?; + decompress(input, uncompressed_size) +} + +/// Decompress all bytes of `input` into a new vec. +/// The passed parameter `min_uncompressed_size` needs to be equal or larger than the uncompressed size. 
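+///
+/// # Example
+/// ```
+/// // 17 is the length of the input below; any value >= that also works
+/// let compressed = lz4_flex::block::compress(b"hello hello hello");
+/// let decompressed = lz4_flex::block::decompress(&compressed, 17).unwrap();
+/// assert_eq!(&decompressed[..], &b"hello hello hello"[..]);
+/// ```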
+/// +/// # Panics +/// May panic if the parameter `min_uncompressed_size` is smaller than the +/// uncompressed data. +#[inline] +pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result, DecompressError> { + let mut decompressed: Vec = vec![0; min_uncompressed_size]; + let decomp_len = + decompress_internal::(input, &mut SliceSink::new(&mut decompressed, 0), b"")?; + decompressed.truncate(decomp_len); + Ok(decompressed) +} + +/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in +/// little endian. Can be used in conjunction with `compress_prepend_size_with_dict` +#[inline] +pub fn decompress_size_prepended_with_dict( + input: &[u8], + ext_dict: &[u8], +) -> Result, DecompressError> { + let (uncompressed_size, input) = super::uncompressed_size(input)?; + decompress_with_dict(input, uncompressed_size, ext_dict) +} + +/// Decompress all bytes of `input` into a new vec. +/// The passed parameter `min_uncompressed_size` needs to be equal or larger than the uncompressed size. +/// +/// # Panics +/// May panic if the parameter `min_uncompressed_size` is smaller than the +/// uncompressed data. +#[inline] +pub fn decompress_with_dict( + input: &[u8], + min_uncompressed_size: usize, + ext_dict: &[u8], +) -> Result, DecompressError> { + let mut decompressed: Vec = vec![0; min_uncompressed_size]; + let decomp_len = + decompress_internal::(input, &mut SliceSink::new(&mut decompressed, 0), ext_dict)?; + decompressed.truncate(decomp_len); + Ok(decompressed) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn all_literal() { + assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49"); + } + + // this error test is only valid in safe-decode. + #[cfg(feature = "safe-decode")] + #[test] + fn offset_oob() { + decompress(&[0x10, b'a', 2, 0], 4).unwrap_err(); + decompress(&[0x40, b'a', 1, 0], 4).unwrap_err(); + } +} diff --git a/src/block/hashtable.rs b/src/block/hashtable.rs new file mode 100644 index 0000000..7c29db7 --- /dev/null +++ b/src/block/hashtable.rs @@ -0,0 +1,161 @@ +use alloc::boxed::Box; +use core::convert::TryInto; + +/// The Hashtable trait used by the compression to store hashed bytes to their position. +/// `val` can be maximum the size of the input in bytes. +/// +/// `pos` can have a maximum value of u16::MAX or 65535 +/// If the hashtable is smaller it needs to reduce the pos to its space, e.g. by right +/// shifting. +/// +/// Duplication dictionary size. +/// +/// Every four bytes is assigned an entry. When this number is lower, fewer entries exists, and +/// thus collisions are more likely, hurting the compression ratio. 
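+
+// Note on the constant used in `hash` below: 2654435761 is a prime close to
+// 2^32 divided by the golden ratio (Knuth-style multiplicative hashing);
+// multiplying by it and keeping only the high bits spreads similar 4-byte
+// sequences across the table.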
+ +/// hashes and right shifts to a maximum value of 16bit, 65535 +/// The right shift is done in order to not exceed, the hashtables capacity +#[inline] +fn hash(sequence: u32) -> u32 { + (sequence.wrapping_mul(2654435761_u32)) >> 16 +} + +/// hashes and right shifts to a maximum value of 16bit, 65535 +/// The right shift is done in order to not exceed, the hashtables capacity +#[cfg(target_pointer_width = "64")] +#[inline] +fn hash5(sequence: usize) -> u32 { + let primebytes = if cfg!(target_endian = "little") { + 889523592379_usize + } else { + 11400714785074694791_usize + }; + (((sequence << 24).wrapping_mul(primebytes)) >> 48) as u32 +} + +pub trait HashTable { + fn get_at(&self, pos: usize) -> usize; + fn put_at(&mut self, pos: usize, val: usize); + fn clear(&mut self); + #[inline] + #[cfg(target_pointer_width = "64")] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash5(super::compress::get_batch_arch(input, pos)) as usize + } + #[inline] + #[cfg(target_pointer_width = "32")] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash(super::compress::get_batch(input, pos)) as usize + } +} + +const HASHTABLE_SIZE_4K: usize = 4 * 1024; +const HASHTABLE_BIT_SHIFT_4K: usize = 4; + +#[derive(Debug)] +#[repr(align(64))] +pub struct HashTable4KU16 { + dict: Box<[u16; HASHTABLE_SIZE_4K]>, +} +impl HashTable4KU16 { + #[inline] + pub fn new() -> Self { + // This generates more efficient assembly in contrast to Box::new(slice), because of an + // optmized call alloc_zeroed, vs. alloc + memset + // try_into is optimized away + let dict = alloc::vec![0; HASHTABLE_SIZE_4K] + .into_boxed_slice() + .try_into() + .unwrap(); + Self { dict } + } +} +impl HashTable for HashTable4KU16 { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u16; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } + #[inline] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash(super::get_batch(input, pos)) as usize + } +} + +#[derive(Debug)] +pub struct HashTable4K { + dict: Box<[u32; HASHTABLE_SIZE_4K]>, +} +impl HashTable4K { + #[inline] + pub fn new() -> Self { + let dict = alloc::vec![0; HASHTABLE_SIZE_4K] + .into_boxed_slice() + .try_into() + .unwrap(); + Self { dict } + } + + #[cold] + #[allow(dead_code)] + pub fn reposition(&mut self, offset: u32) { + for i in self.dict.iter_mut() { + *i = i.saturating_sub(offset); + } + } +} +impl HashTable for HashTable4K { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u32; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } +} + +const HASHTABLE_SIZE_8K: usize = 8 * 1024; +const HASH_TABLE_BIT_SHIFT_8K: usize = 3; + +#[derive(Debug)] +pub struct HashTable8K { + dict: Box<[u32; HASHTABLE_SIZE_8K]>, +} +#[allow(dead_code)] +impl HashTable8K { + #[inline] + pub fn new() -> Self { + let dict = alloc::vec![0; HASHTABLE_SIZE_8K] + .into_boxed_slice() + .try_into() + .unwrap(); + + Self { dict } + } +} +impl HashTable for HashTable8K { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] = val as u32; + } + #[inline] + fn 
clear(&mut self) { + self.dict.fill(0); + } +} diff --git a/src/block/mod.rs b/src/block/mod.rs new file mode 100644 index 0000000..31c52f4 --- /dev/null +++ b/src/block/mod.rs @@ -0,0 +1,157 @@ +//! LZ4 Block Format +//! +//! As defined in +//! +//! Currently for no_std support only the block format is supported. +//! +//! # Example: block format roundtrip +//! ``` +//! use lz4_flex::block::{compress_prepend_size, decompress_size_prepended}; +//! let input: &[u8] = b"Hello people, what's up?"; +//! let compressed = compress_prepend_size(input); +//! let uncompressed = decompress_size_prepended(&compressed).unwrap(); +//! assert_eq!(input, uncompressed); +//! ``` +//! + +#[cfg_attr(feature = "safe-encode", forbid(unsafe_code))] +pub(crate) mod compress; +pub(crate) mod hashtable; + +#[cfg(feature = "safe-decode")] +#[cfg_attr(feature = "safe-decode", forbid(unsafe_code))] +pub(crate) mod decompress_safe; +#[cfg(feature = "safe-decode")] +pub(crate) use decompress_safe as decompress; + +#[cfg(not(feature = "safe-decode"))] +pub(crate) mod decompress; + +pub use compress::*; +pub use decompress::*; + +use core::convert::TryInto; +use core::fmt; + +pub(crate) const WINDOW_SIZE: usize = 64 * 1024; + +/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md#end-of-block-restrictions +/// The last match must start at least 12 bytes before the end of block. The last match is part of +/// the penultimate sequence. It is followed by the last sequence, which contains only literals. +/// +/// Note that, as a consequence, an independent block < 13 bytes cannot be compressed, because the +/// match must copy "something", so it needs at least one prior byte. +/// +/// When a block can reference data from another block, it can start immediately with a match and no +/// literal, so a block of 12 bytes can be compressed. +const MFLIMIT: usize = 12; + +/// The last 5 bytes of input are always literals. Therefore, the last sequence contains at least 5 +/// bytes. +const LAST_LITERALS: usize = 5; + +/// Due the way the compression loop is arrange we may read up to (register_size - 2) bytes from the +/// current position. So we must end the matches 6 bytes before the end, 1 more than required by the +/// spec. +const END_OFFSET: usize = LAST_LITERALS + 1; + +/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md#end-of-block-restrictions +/// Minimum length of a block +/// +/// MFLIMIT + 1 for the token. +const LZ4_MIN_LENGTH: usize = MFLIMIT + 1; + +const MAXD_LOG: usize = 16; +const MAX_DISTANCE: usize = (1 << MAXD_LOG) - 1; + +#[allow(dead_code)] +const MATCH_LENGTH_MASK: u32 = (1_u32 << 4) - 1; // 0b1111 / 15 + +/// The minimum length of a duplicate +const MINMATCH: usize = 4; + +#[allow(dead_code)] +const FASTLOOP_SAFE_DISTANCE: usize = 64; + +/// Switch for the hashtable size byU16 +#[allow(dead_code)] +static LZ4_64KLIMIT: usize = (64 * 1024) + (MFLIMIT - 1); + +/// An error representing invalid compressed data. +#[derive(Debug)] +#[non_exhaustive] +pub enum DecompressError { + /// The provided output is too small + OutputTooSmall { + /// Minimum expected output size + expected: usize, + /// Actual size of output + actual: usize, + }, + /// Literal is out of bounds of the input + LiteralOutOfBounds, + /// Expected another byte, but none found. + ExpectedAnotherByte, + /// Deduplication offset out of bounds (not in buffer). + OffsetOutOfBounds, +} + +#[derive(Debug)] +#[non_exhaustive] +/// Errors that can happen during compression. 
+pub enum CompressError { + /// The provided output is too small. + OutputTooSmall, +} + +impl fmt::Display for DecompressError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DecompressError::OutputTooSmall { expected, actual } => { + write!( + f, + "provided output is too small for the decompressed data, actual {actual}, expected \ + {expected}" + ) + } + DecompressError::LiteralOutOfBounds => { + f.write_str("literal is out of bounds of the input") + } + DecompressError::ExpectedAnotherByte => { + f.write_str("expected another byte, found none") + } + DecompressError::OffsetOutOfBounds => { + f.write_str("the offset to copy is not contained in the decompressed buffer") + } + } + } +} + +impl fmt::Display for CompressError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + CompressError::OutputTooSmall => f.write_str( + "output is too small for the compressed data, use get_maximum_output_size to \ + reserve enough space", + ), + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for DecompressError {} + +#[cfg(feature = "std")] +impl std::error::Error for CompressError {} + +/// This can be used in conjunction with `decompress_size_prepended`. +/// It will read the first 4 bytes as little-endian encoded length, and return +/// the rest of the bytes after the length encoding. +#[inline] +pub fn uncompressed_size(input: &[u8]) -> Result<(usize, &[u8]), DecompressError> { + let size = input.get(..4).ok_or(DecompressError::ExpectedAnotherByte)?; + let size: &[u8; 4] = size.try_into().unwrap(); + let uncompressed_size = u32::from_le_bytes(*size) as usize; + let rest = &input[4..]; + Ok((uncompressed_size, rest)) +} diff --git a/src/fastcpy.rs b/src/fastcpy.rs new file mode 100644 index 0000000..8bbd480 --- /dev/null +++ b/src/fastcpy.rs @@ -0,0 +1,145 @@ +//! # FastCpy +//! +//! The Rust Compiler calls `memcpy` for slices of unknown length. +//! This crate provides a faster implementation of `memcpy` for slices up to 32bytes (64bytes with `avx`). +//! If you know most of you copy operations are not too big you can use `fastcpy` to speed up your program. +//! +//! `fastcpy` is designed to contain not too much assembly, so the overhead is low. +//! +//! As fall back the standard `memcpy` is called +//! +//! ## Double Copy Trick +//! `fastcpy` employs a double copy trick to copy slices of length 4-32bytes (64bytes with `avx`). +//! E.g. Slice of length 6 can be copied with two uncoditional copy operations. +//! +//! /// [1, 2, 3, 4, 5, 6] +//! /// [1, 2, 3, 4] +//! /// [3, 4, 5, 6] +//! + +#[inline] +pub fn slice_copy(src: &[u8], dst: &mut [u8]) { + #[inline(never)] + #[cold] + #[track_caller] + fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! { + panic!( + "source slice length ({}) does not match destination slice length ({})", + src_len, dst_len, + ); + } + + if src.len() != dst.len() { + len_mismatch_fail(src.len(), dst.len()); + } + let len = src.len(); + + if src.is_empty() { + return; + } + + if len < 4 { + short_copy(src, dst); + return; + } + + if len < 8 { + double_copy_trick::<4>(src, dst); + return; + } + + if len <= 16 { + double_copy_trick::<8>(src, dst); + return; + } + + if len <= 32 { + double_copy_trick::<16>(src, dst); + return; + } + + /// The code will use the vmovdqu instruction to copy 32 bytes at a time. 
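+ // Double-copy illustration: for len = 6 and SIZE = 4, the two unconditional
+ // copies write src[0..4] and src[2..6]; bytes 2..4 are written twice, which
+ // is harmless for non-overlapping src/dst.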
+ #[cfg(target_feature = "avx")] + { + if len <= 64 { + double_copy_trick::<32>(src, dst); + return; + } + } + + // For larger sizes we use the default, which calls memcpy + // memcpy does some virtual memory tricks to copy large chunks of memory. + // + // The theory should be that the checks above don't cost much relative to the copy call for + // larger copies. + // The bounds checks in `copy_from_slice` are elided. + dst.copy_from_slice(src); +} + +#[inline(always)] +fn short_copy(src: &[u8], dst: &mut [u8]) { + let len = src.len(); + + // length 1-3 + dst[0] = src[0]; + if len >= 2 { + double_copy_trick::<2>(src, dst); + } +} + +#[inline(always)] +/// [1, 2, 3, 4, 5, 6] +/// [1, 2, 3, 4] +/// [3, 4, 5, 6] +fn double_copy_trick(src: &[u8], dst: &mut [u8]) { + dst[0..SIZE].copy_from_slice(&src[0..SIZE]); + dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]); +} + +#[cfg(test)] +mod tests { + use super::slice_copy; + use alloc::vec::Vec; + use proptest::prelude::*; + proptest! { + #[test] + fn test_fast_short_slice_copy(left: Vec) { + let mut right = vec![0u8; left.len()]; + slice_copy(&left, &mut right); + prop_assert_eq!(&left, &right); + } + } + + #[test] + fn test_fast_short_slice_copy_edge_cases() { + for len in 0..(512 * 2) { + let left = (0..len).map(|i| i as u8).collect::>(); + let mut right = vec![0u8; len]; + slice_copy(&left, &mut right); + assert_eq!(left, right); + } + } + + #[test] + fn test_fail2() { + let left = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(&left, &mut right); + assert_eq!(left, right); + } + + #[test] + fn test_fail() { + let left = vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(&left, &mut right); + assert_eq!(left, right); + } +} diff --git a/src/fastcpy_unsafe.rs b/src/fastcpy_unsafe.rs new file mode 100644 index 0000000..fb17280 --- /dev/null +++ b/src/fastcpy_unsafe.rs @@ -0,0 +1,165 @@ +//! # FastCpy +//! +//! The Rust Compiler calls `memcpy` for slices of unknown length. +//! This crate provides a faster implementation of `memcpy` for slices up to 32bytes (64bytes with `avx`). +//! If you know most of you copy operations are not too big you can use `fastcpy` to speed up your program. +//! +//! `fastcpy` is designed to contain not too much assembly, so the overhead is low. +//! +//! As fall back the standard `memcpy` is called +//! +//! ## Double Copy Trick +//! `fastcpy` employs a double copy trick to copy slices of length 4-32bytes (64bytes with `avx`). +//! E.g. Slice of length 6 can be copied with two uncoditional copy operations. +//! +//! /// [1, 2, 3, 4, 5, 6] +//! /// [1, 2, 3, 4] +//! /// [3, 4, 5, 6] +//! + +#[inline] +pub fn slice_copy(src: *const u8, dst: *mut u8, num_bytes: usize) { + if num_bytes < 4 { + short_copy(src, dst, num_bytes); + return; + } + + if num_bytes < 8 { + double_copy_trick::<4>(src, dst, num_bytes); + return; + } + + if num_bytes <= 16 { + double_copy_trick::<8>(src, dst, num_bytes); + return; + } + + //if num_bytes <= 32 { + //double_copy_trick::<16>(src, dst, num_bytes); + //return; + //} + + // /// The code will use the vmovdqu instruction to copy 32 bytes at a time. 
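+ // In this unsafe variant, lengths above 16 fall through to
+ // wild_copy_from_src::<16>: 16-byte steps over the body plus one final
+ // 16-byte copy aligned to the end, so no byte past num_bytes is written.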
+ //#[cfg(target_feature = "avx")] + //{ + //if num_bytes <= 64 { + //double_copy_trick::<32>(src, dst, num_bytes); + //return; + //} + //} + + // For larger sizes we use the default, which calls memcpy + // memcpy does some virtual memory tricks to copy large chunks of memory. + // + // The theory should be that the checks above don't cost much relative to the copy call for + // larger copies. + // The bounds checks in `copy_from_slice` are elided. + + //unsafe { core::ptr::copy_nonoverlapping(src, dst, num_bytes) } + wild_copy_from_src::<16>(src, dst, num_bytes) +} + +// Inline never because otherwise we get a call to memcpy -.- +#[inline] +fn wild_copy_from_src( + mut source: *const u8, + mut dst: *mut u8, + num_bytes: usize, +) { + // Note: if the compiler auto-vectorizes this it'll hurt performance! + // It's not the case for 16 bytes stepsize, but for 8 bytes. + let l_last = unsafe { source.add(num_bytes - SIZE) }; + let r_last = unsafe { dst.add(num_bytes - SIZE) }; + let num_bytes = (num_bytes / SIZE) * SIZE; + + unsafe { + let dst_ptr_end = dst.add(num_bytes); + loop { + core::ptr::copy_nonoverlapping(source, dst, SIZE); + source = source.add(SIZE); + dst = dst.add(SIZE); + if dst >= dst_ptr_end { + break; + } + } + } + + unsafe { + core::ptr::copy_nonoverlapping(l_last, r_last, SIZE); + } +} + +#[inline] +fn short_copy(src: *const u8, dst: *mut u8, len: usize) { + unsafe { + *dst = *src; + } + if len >= 2 { + double_copy_trick::<2>(src, dst, len); + } +} + +#[inline(always)] +/// [1, 2, 3, 4, 5, 6] +/// [1, 2, 3, 4] +/// [3, 4, 5, 6] +fn double_copy_trick(src: *const u8, dst: *mut u8, len: usize) { + let l_end = unsafe { src.add(len - SIZE) }; + let r_end = unsafe { dst.add(len - SIZE) }; + + unsafe { + core::ptr::copy_nonoverlapping(src, dst, SIZE); + core::ptr::copy_nonoverlapping(l_end, r_end, SIZE); + } +} + +#[cfg(test)] +mod tests { + use super::slice_copy; + use alloc::vec::Vec; + use proptest::prelude::*; + proptest! 
{ + #[test] + fn test_fast_short_slice_copy(left: Vec) { + if left.is_empty() { + return Ok(()); + } + let mut right = vec![0u8; left.len()]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + prop_assert_eq!(&left, &right); + } + } + + #[test] + fn test_fast_short_slice_copy_edge_cases() { + for len in 1..(512 * 2) { + let left = (0..len).map(|i| i as u8).collect::>(); + let mut right = vec![0u8; len]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + assert_eq!(left, right); + } + } + + #[test] + fn test_fail2() { + let left = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + assert_eq!(left, right); + } + + #[test] + fn test_fail() { + let left = vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + assert_eq!(left, right); + } +} diff --git a/src/frame/compress.rs b/src/frame/compress.rs new file mode 100644 index 0000000..cee32eb --- /dev/null +++ b/src/frame/compress.rs @@ -0,0 +1,471 @@ +use std::{ + fmt, + hash::Hasher, + io::{self, Write}, +}; +use twox_hash::XxHash32; + +use crate::{ + block::{ + compress::compress_internal, + hashtable::{HashTable, HashTable4K}, + }, + sink::vec_sink_for_compression, +}; + +use super::Error; +use super::{ + header::{BlockInfo, BlockMode, FrameInfo, BLOCK_INFO_SIZE, MAX_FRAME_INFO_SIZE}, + BlockSize, +}; +use crate::block::WINDOW_SIZE; + +/// A writer for compressing a LZ4 stream. +/// +/// This `FrameEncoder` wraps any other writer that implements `io::Write`. +/// Bytes written to this writer are compressed using the [LZ4 frame +/// format](https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md). +/// +/// Writes are buffered automatically, so there's no need to wrap the given +/// writer in a `std::io::BufWriter`. +/// +/// To ensure a well formed stream the encoder must be finalized by calling +/// either the [`finish()`], [`try_finish()`], or [`auto_finish()`] methods. +/// +/// [`finish()`]: Self::finish +/// [`try_finish()`]: Self::try_finish +/// [`auto_finish()`]: Self::auto_finish +/// +/// # Example 1 +/// Serializing json values into a compressed file. +/// +/// ```no_run +/// let compressed_file = std::fs::File::create("datafile").unwrap(); +/// let mut compressor = lz4_flex::frame::FrameEncoder::new(compressed_file); +/// serde_json::to_writer(&mut compressor, &serde_json::json!({ "an": "object" })).unwrap(); +/// compressor.finish().unwrap(); +/// ``` +/// +/// # Example 2 +/// Serializing multiple json values into a compressed file using linked blocks. +/// +/// ```no_run +/// let compressed_file = std::fs::File::create("datafile").unwrap(); +/// let mut frame_info = lz4_flex::frame::FrameInfo::new(); +/// frame_info.block_mode = lz4_flex::frame::BlockMode::Linked; +/// let mut compressor = lz4_flex::frame::FrameEncoder::with_frame_info(frame_info, compressed_file); +/// for i in 0..10u64 { +/// serde_json::to_writer(&mut compressor, &serde_json::json!({ "i": i })).unwrap(); +/// } +/// compressor.finish().unwrap(); +/// ``` +pub struct FrameEncoder { + /// Our buffer of uncompressed bytes. 
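+ /// In Linked block mode it is laid out roughly as
+ /// `[ ext_dict | prefix | not-yet-compressed input ]`, with the indices
+ /// below tracking each region (see `write_block`).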
+ src: Vec<u8>,
+ /// Index into src: starting point of bytes not yet compressed
+ src_start: usize,
+ /// Index into src: end point of bytes not yet compressed
+ src_end: usize,
+ /// Index into src: starting point of the external dictionary (applicable in Linked block mode)
+ ext_dict_offset: usize,
+ /// Length of the external dictionary
+ ext_dict_len: usize,
+ /// Counter of bytes already compressed into the compression_table.
+ /// _Not_ the same as `content_len`, as this is reset roughly every 2 GB.
+ src_stream_offset: usize,
+ /// Encoder table
+ compression_table: HashTable4K,
+ /// The underlying writer.
+ w: W,
+ /// Xxhash32 used when content checksum is enabled.
+ content_hasher: XxHash32,
+ /// Number of bytes compressed
+ content_len: u64,
+ /// The compressed bytes buffer. Bytes are compressed from src (usually)
+ /// to dst before being written to w.
+ dst: Vec<u8>,
+ /// Whether we have an open frame in the output.
+ is_frame_open: bool,
+ /// Whether a closed frame has already been written to the output.
+ data_to_frame_written: bool,
+ /// The frame information to be used in this encoder.
+ frame_info: FrameInfo,
+}
+
+impl<W: io::Write> FrameEncoder<W> {
+ fn init(&mut self) {
+ let max_block_size = self.frame_info.block_size.get_size();
+ let src_size = if self.frame_info.block_mode == BlockMode::Linked {
+ // In linked mode we consume the input (bumping src_start) but leave the
+ // beginning of src to be used as a prefix in subsequent blocks.
+ // That is at least until we have at least `max_block_size + WINDOW_SIZE`
+ // bytes in src, then we set up an ext_dict with the last WINDOW_SIZE bytes
+ // and the input goes to the beginning of src again.
+ // Since we always want to be able to write a full block (up to max_block_size)
+ // we need a buffer with at least `max_block_size * 2 + WINDOW_SIZE` bytes.
+ max_block_size * 2 + WINDOW_SIZE
+ } else {
+ max_block_size
+ };
+ // Since this method is called potentially multiple times, don't reserve _additional_
+ // capacity if not required.
+ self.src
+ .reserve(src_size.saturating_sub(self.src.capacity()));
+ self.dst.reserve(
+ crate::block::compress::get_maximum_output_size(max_block_size)
+ .saturating_sub(self.dst.capacity()),
+ );
+ }
+
+ /// Returns a wrapper around `self` that will finish the stream on drop.
+ ///
+ /// # Note
+ /// Errors on drop get silently ignored. If you want to handle errors then use [`finish()`] or
+ /// [`try_finish()`] instead.
+ ///
+ /// [`finish()`]: Self::finish
+ /// [`try_finish()`]: Self::try_finish
+ pub fn auto_finish(self) -> AutoFinishEncoder<W> {
+ AutoFinishEncoder {
+ encoder: Some(self),
+ }
+ }
+
+ /// Creates a new Encoder with the specified FrameInfo.
+ pub fn with_frame_info(frame_info: FrameInfo, wtr: W) -> Self {
+ FrameEncoder {
+ src: Vec::new(),
+ w: wtr,
+ // 16 KB hash table for matches, same as the reference implementation.
+ compression_table: HashTable4K::new(),
+ content_hasher: XxHash32::with_seed(0),
+ content_len: 0,
+ dst: Vec::new(),
+ is_frame_open: false,
+ data_to_frame_written: false,
+ frame_info,
+ src_start: 0,
+ src_end: 0,
+ ext_dict_offset: 0,
+ ext_dict_len: 0,
+ src_stream_offset: 0,
+ }
+ }
+
+ /// Creates a new Encoder with the default settings.
+ pub fn new(wtr: W) -> Self {
+ Self::with_frame_info(Default::default(), wtr)
+ }
+
+ /// The frame information used by this Encoder.
+ pub fn frame_info(&mut self) -> &FrameInfo {
+ &self.frame_info
+ }
+
+ /// Consumes this encoder, flushing internal buffer and writing stream terminator.
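+ ///
+ /// # Example
+ /// ```no_run
+ /// use std::io::Write;
+ /// // writing into a Vec<u8> for illustration; any io::Write works
+ /// let mut enc = lz4_flex::frame::FrameEncoder::new(Vec::new());
+ /// enc.write_all(b"some data").unwrap();
+ /// let compressed: Vec<u8> = enc.finish().unwrap();
+ /// ```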
+ pub fn finish(mut self) -> Result { + self.try_finish()?; + Ok(self.w) + } + + /// Attempt to finish this output stream, flushing internal buffer and writing stream + /// terminator. + pub fn try_finish(&mut self) -> Result<(), Error> { + match self.flush() { + Ok(()) => { + // Empty input special case + // https://github.com/ouch-org/ouch/pull/163#discussion_r1108965151 + if !self.is_frame_open && !self.data_to_frame_written { + self.begin_frame(0)?; + } + self.end_frame()?; + self.data_to_frame_written = true; + Ok(()) + } + Err(err) => Err(err.into()), + } + } + + /// Returns the underlying writer _without_ flushing the stream. + /// This may leave the output in an unfinished state. + pub fn into_inner(self) -> W { + self.w + } + + /// Gets a reference to the underlying writer in this encoder. + pub fn get_ref(&self) -> &W { + &self.w + } + + /// Gets a reference to the underlying writer in this encoder. + /// + /// Note that mutating the output/input state of the stream may corrupt + /// this encoder, so care must be taken when using this method. + pub fn get_mut(&mut self) -> &mut W { + &mut self.w + } + + /// Closes the frame by writing the end marker. + fn end_frame(&mut self) -> Result<(), Error> { + debug_assert!(self.is_frame_open); + self.is_frame_open = false; + if let Some(expected) = self.frame_info.content_size { + if expected != self.content_len { + return Err(Error::ContentLengthError { + expected, + actual: self.content_len, + }); + } + } + + let mut block_info_buffer = [0u8; BLOCK_INFO_SIZE]; + BlockInfo::EndMark.write(&mut block_info_buffer[..])?; + self.w.write_all(&block_info_buffer[..])?; + if self.frame_info.content_checksum { + let content_checksum = self.content_hasher.finish() as u32; + self.w.write_all(&content_checksum.to_le_bytes())?; + } + + Ok(()) + } + + /// Begin the frame by writing the frame header. + /// It'll also setup the encoder for compressing blocks for the the new frame. + fn begin_frame(&mut self, buf_len: usize) -> io::Result<()> { + self.is_frame_open = true; + if self.frame_info.block_size == BlockSize::Auto { + self.frame_info.block_size = BlockSize::from_buf_length(buf_len); + } + self.init(); + let mut frame_info_buffer = [0u8; MAX_FRAME_INFO_SIZE]; + let size = self.frame_info.write(&mut frame_info_buffer)?; + self.w.write_all(&frame_info_buffer[..size])?; + + if self.content_len != 0 { + // This is the second or later frame for this Encoder, + // reset compressor state for the new frame. + self.content_len = 0; + self.src_stream_offset = 0; + self.src.clear(); + self.src_start = 0; + self.src_end = 0; + self.ext_dict_len = 0; + self.content_hasher = XxHash32::with_seed(0); + self.compression_table.clear(); + } + Ok(()) + } + + /// Consumes the src contents between src_start and src_end, + /// which shouldn't exceed the max block size. 
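+ ///
+ /// Steps, in order: compress the pending bytes (or store them uncompressed
+ /// if that is smaller), write the block header, payload and optional
+ /// checksums, then rotate the buffer bookkeeping for Linked vs Independent
+ /// block mode.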
+    /// Consumes the src contents between src_start and src_end,
+    /// which shouldn't exceed the max block size.
+    fn write_block(&mut self) -> io::Result<()> {
+        debug_assert!(self.is_frame_open);
+        let max_block_size = self.frame_info.block_size.get_size();
+        debug_assert!(self.src_end - self.src_start <= max_block_size);
+
+        // Reposition the compression table if we're anywhere near an overflow hazard
+        if self.src_stream_offset + max_block_size + WINDOW_SIZE >= u32::MAX as usize / 2 {
+            self.compression_table
+                .reposition((self.src_stream_offset - self.ext_dict_len) as _);
+            self.src_stream_offset = self.ext_dict_len;
+        }
+
+        // Input to the compressor, which may include a prefix when blocks are linked
+        let input = &self.src[..self.src_end];
+        // The contents of the block are between src_start and src_end
+        let src = &input[self.src_start..];
+
+        let dst_required_size = crate::block::compress::get_maximum_output_size(src.len());
+
+        let compress_result = if self.ext_dict_len != 0 {
+            debug_assert_eq!(self.frame_info.block_mode, BlockMode::Linked);
+            compress_internal::<_, true, _>(
+                input,
+                self.src_start,
+                &mut vec_sink_for_compression(&mut self.dst, 0, 0, dst_required_size),
+                &mut self.compression_table,
+                &self.src[self.ext_dict_offset..self.ext_dict_offset + self.ext_dict_len],
+                self.src_stream_offset,
+            )
+        } else {
+            compress_internal::<_, false, _>(
+                input,
+                self.src_start,
+                &mut vec_sink_for_compression(&mut self.dst, 0, 0, dst_required_size),
+                &mut self.compression_table,
+                b"",
+                self.src_stream_offset,
+            )
+        };
+
+        let (block_info, block_data) = match compress_result.map_err(Error::CompressionError)? {
+            comp_len if comp_len < src.len() => {
+                (BlockInfo::Compressed(comp_len as _), &self.dst[..comp_len])
+            }
+            _ => (BlockInfo::Uncompressed(src.len() as _), src),
+        };
+
+        // Write the (un)compressed block to the writer and the block checksum (if applicable).
+        let mut block_info_buffer = [0u8; BLOCK_INFO_SIZE];
+        block_info.write(&mut block_info_buffer[..])?;
+        self.w.write_all(&block_info_buffer[..])?;
+        self.w.write_all(block_data)?;
+        if self.frame_info.block_checksums {
+            let mut block_hasher = XxHash32::with_seed(0);
+            block_hasher.write(block_data);
+            let block_checksum = block_hasher.finish() as u32;
+            self.w.write_all(&block_checksum.to_le_bytes())?;
+        }
+
+        // Content checksum, if applicable
+        if self.frame_info.content_checksum {
+            self.content_hasher.write(src);
+        }
+
+        // Buffer and offsets maintenance
+        self.content_len += src.len() as u64;
+        self.src_start += src.len();
+        debug_assert_eq!(self.src_start, self.src_end);
+        if self.frame_info.block_mode == BlockMode::Linked {
+            // In linked mode we consume the input (bumping src_start) but leave the
+            // beginning of src to be used as a prefix in subsequent blocks.
+            // That is, until we have at least `max_block_size + WINDOW_SIZE`
+            // bytes in src, then we set up an ext_dict with the last WINDOW_SIZE bytes
+            // and the input goes to the beginning of src again.
+            debug_assert_eq!(self.src.capacity(), max_block_size * 2 + WINDOW_SIZE);
+            if self.src_start >= max_block_size + WINDOW_SIZE {
+                // The ext_dict will become the last WINDOW_SIZE bytes
+                self.ext_dict_offset = self.src_end - WINDOW_SIZE;
+                self.ext_dict_len = WINDOW_SIZE;
+                // Input goes in the beginning of the buffer again.
+                self.src_stream_offset += self.src_end;
+                self.src_start = 0;
+                self.src_end = 0;
+            } else if self.src_start + self.ext_dict_len > WINDOW_SIZE {
+                // There are more than WINDOW_SIZE bytes of lookback between the prefix
+                // and ext_dict combined.
+                // Since we have a limited buffer we must shrink ext_dict in favor of the prefix,
+                // so that we can fit up to max_block_size bytes between src_start and the start
+                // of ext_dict.
+                let delta = self
+                    .ext_dict_len
+                    .min(self.src_start + self.ext_dict_len - WINDOW_SIZE);
+                self.ext_dict_offset += delta;
+                self.ext_dict_len -= delta;
+                debug_assert!(self.src_start + self.ext_dict_len >= WINDOW_SIZE)
+            }
+            debug_assert!(
+                self.ext_dict_len == 0 || self.src_start + max_block_size <= self.ext_dict_offset
+            );
+        } else {
+            // In independent block mode we consume the entire src buffer
+            // which is sized equal to the frame max_block_size.
+            debug_assert_eq!(self.ext_dict_len, 0);
+            debug_assert_eq!(self.src.capacity(), max_block_size);
+            self.src_start = 0;
+            self.src_end = 0;
+            // Advance stream offset so we don't have to reset the match dict
+            // for the next block.
+            self.src_stream_offset += src.len();
+        }
+        debug_assert!(self.src_start <= self.src_end);
+        debug_assert!(self.src_start + max_block_size <= self.src.capacity());
+        Ok(())
+    }
+}
+
+impl<W: io::Write> io::Write for FrameEncoder<W> {
+    fn write(&mut self, mut buf: &[u8]) -> io::Result<usize> {
+        if !self.is_frame_open && !buf.is_empty() {
+            self.begin_frame(buf.len())?;
+        }
+        let buf_len = buf.len();
+        while !buf.is_empty() {
+            let src_filled = self.src_end - self.src_start;
+            let max_fill_len = self.frame_info.block_size.get_size() - src_filled;
+            if max_fill_len == 0 {
+                // Make space by writing the next block
+                self.write_block()?;
+                debug_assert_eq!(self.src_end, self.src_start);
+                continue;
+            }
+
+            let fill_len = max_fill_len.min(buf.len());
+            vec_copy_overwriting(&mut self.src, self.src_end, &buf[..fill_len]);
+            buf = &buf[fill_len..];
+            self.src_end += fill_len;
+        }
+        Ok(buf_len)
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        if self.src_start != self.src_end {
+            self.write_block()?;
+        }
+        Ok(())
+    }
+}
+
+/// A wrapper around a [`FrameEncoder`] that finishes the stream on drop.
+///
+/// This can be created by the [`auto_finish()`] method on the [`FrameEncoder`].
+///
+/// # Note
+/// Errors on drop get silently ignored. If you want to handle errors then use [`finish()`] or
+/// [`try_finish()`] instead.
+///
+/// [`finish()`]: FrameEncoder::finish
+/// [`try_finish()`]: FrameEncoder::try_finish
+/// [`auto_finish()`]: FrameEncoder::auto_finish
+pub struct AutoFinishEncoder<W: io::Write> {
+    // We wrap this in an option to take it during drop.
+    encoder: Option<FrameEncoder<W>>,
+}
+
+impl<W: io::Write> Drop for AutoFinishEncoder<W> {
+    fn drop(&mut self) {
+        if let Some(mut encoder) = self.encoder.take() {
+            let _ = encoder.try_finish();
+        }
+    }
+}
+
+impl<W: io::Write> Write for AutoFinishEncoder<W> {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        self.encoder.as_mut().unwrap().write(buf)
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        self.encoder.as_mut().unwrap().flush()
+    }
+}
+
+impl<W: io::Write + fmt::Debug> fmt::Debug for FrameEncoder<W> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("FrameEncoder")
+            .field("w", &self.w)
+            .field("frame_info", &self.frame_info)
+            .field("is_frame_open", &self.is_frame_open)
+            .field("content_hasher", &self.content_hasher)
+            .field("content_len", &self.content_len)
+            .field("dst", &"[...]")
+            .field("src", &"[...]")
+            .field("src_start", &self.src_start)
+            .field("src_end", &self.src_end)
+            .field("ext_dict_offset", &self.ext_dict_offset)
+            .field("ext_dict_len", &self.ext_dict_len)
+            .field("src_stream_offset", &self.src_stream_offset)
+            .finish()
+    }
+}
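The helper defined just below fills the ring buffer by first overwriting already-initialized bytes and then extending into spare capacity. A standalone sketch of that trick (a hypothetical copy of the helper, not part of the patch):

```rust
// Mirrors vec_copy_overwriting: overwrite initialized bytes, extend the rest.
fn copy_overwriting(target: &mut Vec<u8>, target_start: usize, src: &[u8]) {
    assert!(target_start + src.len() <= target.capacity());
    let overwrite_len = (target.len() - target_start).min(src.len());
    target[target_start..target_start + overwrite_len].copy_from_slice(&src[..overwrite_len]);
    target.extend_from_slice(&src[overwrite_len..]);
}

fn main() {
    let mut buf = Vec::with_capacity(8);
    buf.extend_from_slice(b"abcd");
    copy_overwriting(&mut buf, 2, b"XYZ"); // overwrites "cd", appends "Z"
    assert_eq!(buf, *b"abXYZ");
}
```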
+/// Copies `src` into `target` starting from the `target_start` index,
+/// overwriting existing data if any.
+#[inline]
+fn vec_copy_overwriting(target: &mut Vec<u8>, target_start: usize, src: &[u8]) {
+    debug_assert!(target_start + src.len() <= target.capacity());
+
+    // By combining overwriting (copy_from_slice) and extending (extend_from_slice)
+    // we can fill the ring buffer without initializing it (eg. filling with 0).
+    let overwrite_len = (target.len() - target_start).min(src.len());
+    target[target_start..target_start + overwrite_len].copy_from_slice(&src[..overwrite_len]);
+    target.extend_from_slice(&src[overwrite_len..]);
+}
diff --git a/src/frame/decompress.rs b/src/frame/decompress.rs
new file mode 100644
index 0000000..2b495e2
--- /dev/null
+++ b/src/frame/decompress.rs
@@ -0,0 +1,449 @@
+use std::{
+    convert::TryInto,
+    fmt,
+    hash::Hasher,
+    io::{self, BufRead, ErrorKind},
+    mem::size_of,
+};
+use twox_hash::XxHash32;
+
+use super::header::{
+    BlockInfo, BlockMode, FrameInfo, LZ4F_LEGACY_MAGIC_NUMBER, MAGIC_NUMBER_SIZE,
+    MAX_FRAME_INFO_SIZE, MIN_FRAME_INFO_SIZE,
+};
+use super::Error;
+use crate::{
+    block::WINDOW_SIZE,
+    sink::{vec_sink_for_decompression, SliceSink},
+};
+
+/// A reader for decompressing the LZ4 frame format
+///
+/// This Decoder wraps any other reader that implements `io::Read`.
+/// Bytes read will be decompressed according to the [LZ4 frame format](
+/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md).
+///
+/// # Example 1
+/// Deserializing json values out of a compressed file.
+///
+/// ```no_run
+/// let compressed_input = std::fs::File::open("datafile").unwrap();
+/// let mut decompressed_input = lz4_flex::frame::FrameDecoder::new(compressed_input);
+/// let json: serde_json::Value = serde_json::from_reader(decompressed_input).unwrap();
+/// ```
+///
+/// # Example 2
+/// Deserializing multiple json values out of a compressed file.
+///
+/// ```no_run
+/// let compressed_input = std::fs::File::open("datafile").unwrap();
+/// let mut decompressed_input = lz4_flex::frame::FrameDecoder::new(compressed_input);
+/// loop {
+///     match serde_json::from_reader::<_, serde_json::Value>(&mut decompressed_input) {
+///         Ok(json) => { println!("json {:?}", json); }
+///         Err(e) if e.is_eof() => break,
+///         Err(e) => panic!("{}", e),
+///     }
+/// }
+/// ```
+pub struct FrameDecoder<R: io::Read> {
+    /// The underlying reader.
+    r: R,
+    /// The FrameInfo of the frame currently being decoded.
+    /// It starts as `None` and is filled in once the FrameInfo is read from the input.
+    /// It's reset to `None` once the frame EndMarker is read from the input.
+    current_frame_info: Option<FrameInfo>,
+    /// Xxhash32 used when content checksum is enabled.
+    content_hasher: XxHash32,
+    /// Total length of decompressed output for the current frame.
+    content_len: u64,
+    /// The compressed bytes buffer, taken from the underlying reader.
+    src: Vec<u8>,
+    /// The decompressed bytes buffer. Bytes are decompressed from src to dst
+    /// before being passed back to the caller.
+    dst: Vec<u8>,
+    /// Index into dst and length: starting point of bytes previously output
+    /// that are still part of the decompressor window.
+    ext_dict_offset: usize,
+    ext_dict_len: usize,
+    /// Index into dst: starting point of bytes not yet read by caller.
+    dst_start: usize,
+    /// Index into dst: ending point of bytes not yet read by caller.
+    dst_end: usize,
+}
+impl<R: io::Read> FrameDecoder<R> {
+    /// Creates a new Decoder for the specified reader.
+    pub fn new(rdr: R) -> FrameDecoder<R> {
+        FrameDecoder {
+            r: rdr,
+            src: Default::default(),
+            dst: Default::default(),
+            ext_dict_offset: 0,
+            ext_dict_len: 0,
+            dst_start: 0,
+            dst_end: 0,
+            current_frame_info: None,
+            content_hasher: XxHash32::with_seed(0),
+            content_len: 0,
+        }
+    }
+
+    /// Gets a reference to the underlying reader in this decoder.
+    pub fn get_ref(&self) -> &R {
+        &self.r
+    }
+
+    /// Gets a mutable reference to the underlying reader in this decoder.
+    ///
+    /// Note that mutating the stream may produce surprising results if
+    /// this decoder continues to be used.
+    pub fn get_mut(&mut self) -> &mut R {
+        &mut self.r
+    }
+
+    /// Consumes the FrameDecoder and returns the underlying reader.
+    pub fn into_inner(self) -> R {
+        self.r
+    }
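As a quick usage sketch (illustrative only; `decompress_all` is a hypothetical helper), the decoder plugs straight into `io::Read`:

```rust
use std::io::Read;

fn decompress_all(compressed: &[u8]) -> std::io::Result<Vec<u8>> {
    // &[u8] implements io::Read, so it can back a FrameDecoder directly.
    let mut rdr = lz4_flex::frame::FrameDecoder::new(compressed);
    let mut out = Vec::new();
    rdr.read_to_end(&mut out)?;
    Ok(out)
}
```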
+    fn read_frame_info(&mut self) -> Result<usize, io::Error> {
+        let mut buffer = [0u8; MAX_FRAME_INFO_SIZE];
+
+        match self.r.read(&mut buffer[..MAGIC_NUMBER_SIZE])? {
+            0 => return Ok(0),
+            MAGIC_NUMBER_SIZE => (),
+            read => self.r.read_exact(&mut buffer[read..MAGIC_NUMBER_SIZE])?,
+        }
+
+        if u32::from_le_bytes(buffer[0..MAGIC_NUMBER_SIZE].try_into().unwrap())
+            != LZ4F_LEGACY_MAGIC_NUMBER
+        {
+            match self
+                .r
+                .read(&mut buffer[MAGIC_NUMBER_SIZE..MIN_FRAME_INFO_SIZE])?
+            {
+                0 => return Ok(0),
+                MIN_FRAME_INFO_SIZE => (),
+                read => self
+                    .r
+                    .read_exact(&mut buffer[MAGIC_NUMBER_SIZE + read..MIN_FRAME_INFO_SIZE])?,
+            }
+        }
+        let required = FrameInfo::read_size(&buffer[..MIN_FRAME_INFO_SIZE])?;
+        if required != MIN_FRAME_INFO_SIZE && required != MAGIC_NUMBER_SIZE {
+            self.r
+                .read_exact(&mut buffer[MIN_FRAME_INFO_SIZE..required])?;
+        }
+
+        let frame_info = FrameInfo::read(&buffer[..required])?;
+        if frame_info.dict_id.is_some() {
+            // Unsupported right now so it must be None
+            return Err(Error::DictionaryNotSupported.into());
+        }
+
+        let max_block_size = frame_info.block_size.get_size();
+        let dst_size = if frame_info.block_mode == BlockMode::Linked {
+            // In linked mode we consume the output (bumping dst_start) but leave the
+            // beginning of dst to be used as a prefix in subsequent blocks.
+            // That is, until we have at least `max_block_size + WINDOW_SIZE`
+            // bytes in dst, then we set up an ext_dict with the last WINDOW_SIZE bytes
+            // and the output goes to the beginning of dst again.
+            // Since we always want to be able to write a full block (up to max_block_size)
+            // we need a buffer with at least `max_block_size * 2 + WINDOW_SIZE` bytes.
+            max_block_size * 2 + WINDOW_SIZE
+        } else {
+            max_block_size
+        };
+        self.src.clear();
+        self.dst.clear();
+        self.src.reserve_exact(max_block_size);
+        self.dst.reserve_exact(dst_size);
+        self.current_frame_info = Some(frame_info);
+        self.content_hasher = XxHash32::with_seed(0);
+        self.content_len = 0;
+        self.ext_dict_len = 0;
+        self.dst_start = 0;
+        self.dst_end = 0;
+        Ok(required)
+    }
+
+    #[inline]
+    fn read_checksum(r: &mut R) -> Result<u32, io::Error> {
+        let mut checksum_buffer = [0u8; size_of::<u32>()];
+        r.read_exact(&mut checksum_buffer[..])?;
+        let checksum = u32::from_le_bytes(checksum_buffer);
+        Ok(checksum)
+    }
+
+    #[inline]
+    fn check_block_checksum(data: &[u8], expected_checksum: u32) -> Result<(), io::Error> {
+        let mut block_hasher = XxHash32::with_seed(0);
+        block_hasher.write(data);
+        let calc_checksum = block_hasher.finish() as u32;
+        if calc_checksum != expected_checksum {
+            return Err(Error::BlockChecksumError.into());
+        }
+        Ok(())
+    }
+
+    fn read_block(&mut self) -> io::Result<usize> {
+        debug_assert_eq!(self.dst_start, self.dst_end);
+        let frame_info = self.current_frame_info.as_ref().unwrap();
+
+        // Adjust dst buffer offsets to decompress the next block
+        let max_block_size = frame_info.block_size.get_size();
+        if frame_info.block_mode == BlockMode::Linked {
+            // In linked mode we consume the output (bumping dst_start) but leave the
+            // beginning of dst to be used as a prefix in subsequent blocks.
+            // That is, until we have at least `max_block_size + WINDOW_SIZE`
+            // bytes in dst, then we set up an ext_dict with the last WINDOW_SIZE bytes
+            // and the output goes to the beginning of dst again.
+            debug_assert_eq!(self.dst.capacity(), max_block_size * 2 + WINDOW_SIZE);
+            if self.dst_start + max_block_size > self.dst.capacity() {
+                // Output might not fit in the buffer.
+                // The ext_dict will become the last WINDOW_SIZE bytes
+                debug_assert!(self.dst_start >= max_block_size + WINDOW_SIZE);
+                self.ext_dict_offset = self.dst_start - WINDOW_SIZE;
+                self.ext_dict_len = WINDOW_SIZE;
+                // Output goes in the beginning of the buffer again.
+                self.dst_start = 0;
+                self.dst_end = 0;
+            } else if self.dst_start + self.ext_dict_len > WINDOW_SIZE {
+                // There are more than WINDOW_SIZE bytes of lookback between the prefix
+                // and ext_dict combined.
+                // Since we have a limited buffer we must shrink ext_dict in favor of the
+                // prefix, so that we can fit up to max_block_size bytes between dst_start
+                // and ext_dict start.
+                let delta = self
+                    .ext_dict_len
+                    .min(self.dst_start + self.ext_dict_len - WINDOW_SIZE);
+                self.ext_dict_offset += delta;
+                self.ext_dict_len -= delta;
+                debug_assert!(self.dst_start + self.ext_dict_len >= WINDOW_SIZE)
+            }
+        } else {
+            debug_assert_eq!(self.ext_dict_len, 0);
+            debug_assert_eq!(self.dst.capacity(), max_block_size);
+            self.dst_start = 0;
+            self.dst_end = 0;
+        }
+
+        // Read and decompress block
+        let block_info = {
+            let mut buffer = [0u8; 4];
+            if let Err(err) = self.r.read_exact(&mut buffer) {
+                if err.kind() == ErrorKind::UnexpectedEof {
+                    return Ok(0);
+                } else {
+                    return Err(err);
+                }
+            }
+            BlockInfo::read(&buffer)?
+        };
+        match block_info {
+            BlockInfo::Uncompressed(len) => {
+                let len = len as usize;
+                if len > max_block_size {
+                    return Err(Error::BlockTooBig.into());
+                }
+                // TODO: Attempt to avoid initialization of read buffer when
+                // https://github.com/rust-lang/rust/issues/42788 stabilizes
+                self.r.read_exact(vec_resize_and_get_mut(
+                    &mut self.dst,
+                    self.dst_start,
+                    self.dst_start + len,
+                ))?;
+                if frame_info.block_checksums {
+                    let expected_checksum = Self::read_checksum(&mut self.r)?;
+                    Self::check_block_checksum(
+                        &self.dst[self.dst_start..self.dst_start + len],
+                        expected_checksum,
+                    )?;
+                }
+
+                self.dst_end += len;
+                self.content_len += len as u64;
+            }
+            BlockInfo::Compressed(len) => {
+                let len = len as usize;
+                if len > max_block_size {
+                    return Err(Error::BlockTooBig.into());
+                }
+                // TODO: Attempt to avoid initialization of read buffer when
+                // https://github.com/rust-lang/rust/issues/42788 stabilizes
+                self.r
+                    .read_exact(vec_resize_and_get_mut(&mut self.src, 0, len))?;
+                if frame_info.block_checksums {
+                    let expected_checksum = Self::read_checksum(&mut self.r)?;
+                    Self::check_block_checksum(&self.src[..len], expected_checksum)?;
+                }
+
+                let with_dict_mode =
+                    frame_info.block_mode == BlockMode::Linked && self.ext_dict_len != 0;
+                let decomp_size = if with_dict_mode {
+                    debug_assert!(self.dst_start + max_block_size <= self.ext_dict_offset);
+                    let (head, tail) = self.dst.split_at_mut(self.ext_dict_offset);
+                    let ext_dict = &tail[..self.ext_dict_len];
+
+                    debug_assert!(head.len() - self.dst_start >= max_block_size);
+                    crate::block::decompress::decompress_internal::<true, _>(
+                        &self.src[..len],
+                        &mut SliceSink::new(head, self.dst_start),
+                        ext_dict,
+                    )
+                } else {
+                    // Independent blocks OR linked blocks with only prefix data
+                    debug_assert!(self.dst.capacity() - self.dst_start >= max_block_size);
+                    crate::block::decompress::decompress_internal::<false, _>(
+                        &self.src[..len],
+                        &mut vec_sink_for_decompression(
+                            &mut self.dst,
+                            0,
+                            self.dst_start,
+                            self.dst_start + max_block_size,
+                        ),
+                        b"",
+                    )
+                }
+                .map_err(Error::DecompressionError)?;
+
+                self.dst_end += decomp_size;
+                self.content_len += decomp_size as u64;
+            }
+
+            BlockInfo::EndMark => {
+                if let Some(expected) = frame_info.content_size {
+                    if self.content_len != expected {
+                        return Err(Error::ContentLengthError {
+                            expected,
+                            actual: self.content_len,
+                        }
+                        .into());
+                    }
+                }
+                if frame_info.content_checksum {
+                    let expected_checksum = Self::read_checksum(&mut self.r)?;
+                    let calc_checksum = self.content_hasher.finish() as u32;
+                    if calc_checksum != expected_checksum {
+                        return Err(Error::ContentChecksumError.into());
+                    }
+                }
+                self.current_frame_info = None;
+                return Ok(0);
+            }
+        }
+
+        // Content checksum, if applicable
+        if frame_info.content_checksum {
+            self.content_hasher
+                .write(&self.dst[self.dst_start..self.dst_end]);
+        }
+
+        Ok(self.dst_end - self.dst_start)
+    }
+
+    fn read_more(&mut self) -> io::Result<usize> {
+        if self.current_frame_info.is_none() && self.read_frame_info()? == 0 {
+            return Ok(0);
+        }
+        self.read_block()
+    }
+}
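Both the block and content checksums above follow the same recipe: XxHash32 with seed 0, compared against a little-endian `u32`. A self-contained sketch of that check (hypothetical helper, using the `twox_hash` dependency from Cargo.lock):

```rust
use std::hash::Hasher;
use twox_hash::XxHash32;

// Returns true if `data` hashes to the stored little-endian checksum.
fn block_checksum_ok(data: &[u8], stored_le: [u8; 4]) -> bool {
    let mut hasher = XxHash32::with_seed(0);
    hasher.write(data);
    (hasher.finish() as u32) == u32::from_le_bytes(stored_le)
}
```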
+
+impl<R: io::Read> io::Read for FrameDecoder<R> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        loop {
+            // Fill read buffer if there's uncompressed data left
+            if self.dst_start < self.dst_end {
+                let read_len = std::cmp::min(self.dst_end - self.dst_start, buf.len());
+                let dst_read_end = self.dst_start + read_len;
+                buf[..read_len].copy_from_slice(&self.dst[self.dst_start..dst_read_end]);
+                self.dst_start = dst_read_end;
+                return Ok(read_len);
+            }
+            if self.read_more()? == 0 {
+                return Ok(0);
+            }
+        }
+    }
+
+    fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
+        let mut written = 0;
+        loop {
+            match self.fill_buf() {
+                Ok(b) if b.is_empty() => return Ok(written),
+                Ok(b) => {
+                    let s = std::str::from_utf8(b).map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidData,
+                            "stream did not contain valid UTF-8",
+                        )
+                    })?;
+                    buf.push_str(s);
+                    let len = s.len();
+                    self.consume(len);
+                    written += len;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                Err(e) => return Err(e),
+            }
+        }
+    }
+
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        let mut written = 0;
+        loop {
+            match self.fill_buf() {
+                Ok(b) if b.is_empty() => return Ok(written),
+                Ok(b) => {
+                    buf.extend_from_slice(b);
+                    let len = b.len();
+                    self.consume(len);
+                    written += len;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                Err(e) => return Err(e),
+            }
+        }
+    }
+}
+
+impl<R: io::Read> io::BufRead for FrameDecoder<R> {
+    fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        if self.dst_start == self.dst_end {
+            self.read_more()?;
+        }
+        Ok(&self.dst[self.dst_start..self.dst_end])
+    }
+
+    fn consume(&mut self, amt: usize) {
+        assert!(amt <= self.dst_end - self.dst_start);
+        self.dst_start += amt;
+    }
+}
+
+impl<R: io::Read + fmt::Debug> fmt::Debug for FrameDecoder<R> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("FrameDecoder")
+            .field("r", &self.r)
+            .field("content_hasher", &self.content_hasher)
+            .field("content_len", &self.content_len)
+            .field("src", &"[...]")
+            .field("dst", &"[...]")
+            .field("dst_start", &self.dst_start)
+            .field("dst_end", &self.dst_end)
+            .field("ext_dict_offset", &self.ext_dict_offset)
+            .field("ext_dict_len", &self.ext_dict_len)
+            .field("current_frame_info", &self.current_frame_info)
+            .finish()
+    }
+}
+
+/// Similar to `v.get_mut(start..end)` but will adjust the len if needed.
+#[inline]
+fn vec_resize_and_get_mut(v: &mut Vec<u8>, start: usize, end: usize) -> &mut [u8] {
+    if end > v.len() {
+        v.resize(end, 0)
+    }
+    &mut v[start..end]
+}
diff --git a/src/frame/header.rs b/src/frame/header.rs
new file mode 100644
index 0000000..1513c11
--- /dev/null
+++ b/src/frame/header.rs
@@ -0,0 +1,412 @@
+use twox_hash::XxHash32;
+
+use super::Error;
+use std::{
+    convert::TryInto,
+    fmt::Debug,
+    hash::Hasher,
+    io,
+    io::{Read, Write},
+};
+
+const FLG_RESERVED_MASK: u8 = 0b00000010;
+const FLG_VERSION_MASK: u8 = 0b11000000;
+const FLG_SUPPORTED_VERSION_BITS: u8 = 0b01000000;
+
+const FLG_INDEPENDENT_BLOCKS: u8 = 0b00100000;
+const FLG_BLOCK_CHECKSUMS: u8 = 0b00010000;
+const FLG_CONTENT_SIZE: u8 = 0b00001000;
+const FLG_CONTENT_CHECKSUM: u8 = 0b00000100;
+const FLG_DICTIONARY_ID: u8 = 0b00000001;
+
+const BD_RESERVED_MASK: u8 = !BD_BLOCK_SIZE_MASK;
+const BD_BLOCK_SIZE_MASK: u8 = 0b01110000;
+const BD_BLOCK_SIZE_MASK_RSHIFT: u8 = 4;
+
+const BLOCK_UNCOMPRESSED_SIZE_BIT: u32 = 0x80000000;
+
+const LZ4F_MAGIC_NUMBER: u32 = 0x184D2204;
+pub(crate) const LZ4F_LEGACY_MAGIC_NUMBER: u32 = 0x184C2102;
+const LZ4F_SKIPPABLE_MAGIC_RANGE: std::ops::RangeInclusive<u32> = 0x184D2A50..=0x184D2A5F;
+
+pub(crate) const MAGIC_NUMBER_SIZE: usize = 4;
+pub(crate) const MIN_FRAME_INFO_SIZE: usize = 7;
+pub(crate) const MAX_FRAME_INFO_SIZE: usize = 19;
+pub(crate) const BLOCK_INFO_SIZE: usize = 4;
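The FLG masks above pack the frame's feature flags into a single byte. A small sketch of how a decoder picks them apart (constants copied locally so it runs standalone):

```rust
fn main() {
    const FLG_VERSION_MASK: u8 = 0b1100_0000;
    const FLG_SUPPORTED_VERSION_BITS: u8 = 0b0100_0000;
    const FLG_INDEPENDENT_BLOCKS: u8 = 0b0010_0000;
    const FLG_CONTENT_CHECKSUM: u8 = 0b0000_0100;

    // Version 01, independent blocks, content checksum enabled.
    let flg: u8 = 0b0110_0100;
    assert_eq!(flg & FLG_VERSION_MASK, FLG_SUPPORTED_VERSION_BITS);
    assert_ne!(flg & FLG_INDEPENDENT_BLOCKS, 0);
    assert_ne!(flg & FLG_CONTENT_CHECKSUM, 0);
}
```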
+#[derive(Clone, Copy, PartialEq, Debug)]
+/// Different predefined block sizes to choose from when compressing data.
+#[derive(Default)]
+pub enum BlockSize {
+    /// Will detect optimal frame size based on the size of the first write call
+    #[default]
+    Auto = 0,
+    /// The default block size.
+    Max64KB = 4,
+    /// 256KB block size.
+    Max256KB = 5,
+    /// 1MB block size.
+    Max1MB = 6,
+    /// 4MB block size.
+    Max4MB = 7,
+    /// 8MB block size.
+    Max8MB = 8,
+}
+
+impl BlockSize {
+    /// Try to find optimal size based on passed buffer length.
+    pub(crate) fn from_buf_length(buf_len: usize) -> Self {
+        let mut blocksize = BlockSize::Max4MB;
+
+        for candidate in [BlockSize::Max256KB, BlockSize::Max64KB] {
+            if buf_len > candidate.get_size() {
+                return blocksize;
+            }
+            blocksize = candidate;
+        }
+        BlockSize::Max64KB
+    }
+
+    pub(crate) fn get_size(&self) -> usize {
+        match self {
+            BlockSize::Auto => unreachable!(),
+            BlockSize::Max64KB => 64 * 1024,
+            BlockSize::Max256KB => 256 * 1024,
+            BlockSize::Max1MB => 1024 * 1024,
+            BlockSize::Max4MB => 4 * 1024 * 1024,
+            BlockSize::Max8MB => 8 * 1024 * 1024,
+        }
+    }
+}
+
+#[derive(Clone, Copy, PartialEq, Debug)]
+/// The two block modes that can be set on [`FrameInfo`]
+#[derive(Default)]
+pub enum BlockMode {
+    /// Every block is compressed independently. The default.
+    #[default]
+    Independent,
+    /// Blocks can reference data from previous blocks.
+    ///
+    /// Effective when the stream contains small blocks.
+    Linked,
+}
+
+// From: https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md
+//
+// General Structure of LZ4 Frame format
+// -------------------------------------
+//
+// | MagicNb | F. Descriptor | Block | (...) | EndMark | C. Checksum |
+// |:-------:|:-------------:| ----- | ----- | ------- | ----------- |
+// | 4 bytes |  3-15 bytes   |       |       | 4 bytes |  0-4 bytes  |
+//
+// Frame Descriptor
+// ----------------
+//
+// | FLG     | BD      | (Content Size) | (Dictionary ID) | HC      |
+// | ------- | ------- |:--------------:|:---------------:| ------- |
+// | 1 byte  | 1 byte  |  0 - 8 bytes   |   0 - 4 bytes   | 1 byte  |
+//
+// __FLG byte__
+//
+// | BitNb   |  7-6  |   5   |    4     |  3   |    2     |    1     |  0   |
+// | ------- |-------|-------|----------|------|----------|----------|------|
+// |FieldName|Version|B.Indep|B.Checksum|C.Size|C.Checksum|*Reserved*|DictID|
+//
+// __BD byte__
+//
+// | BitNb   |    7     |     6-5-4     | 3-2-1-0  |
+// | ------- | -------- | ------------- | -------- |
+// |FieldName|*Reserved*| Block MaxSize |*Reserved*|
+//
+// Data Blocks
+// -----------
+//
+// | Block Size |  data  | (Block Checksum) |
+// |:----------:| ------ |:----------------:|
+// |  4 bytes   |        |   0 - 4 bytes    |
+//
+#[derive(Debug, Default, Clone)]
+/// The metadata for de/compressing with lz4 frame format.
+pub struct FrameInfo {
+    /// If set, includes the total uncompressed size of data in the frame.
+    pub content_size: Option<u64>,
+    /// The identifier for the dictionary that must be used to correctly decode data.
+    /// The compressor and the decompressor must use exactly the same dictionary.
+    ///
+    /// Note that this is currently unsupported and for this reason it's not pub.
+    pub(crate) dict_id: Option<u32>,
+    /// The maximum uncompressed size of each data block.
+    pub block_size: BlockSize,
+    /// The block mode.
+    pub block_mode: BlockMode,
+    /// If set, includes a checksum for each data block in the frame.
+    pub block_checksums: bool,
+    /// If set, includes a content checksum to verify that the full frame contents have been
+    /// decoded correctly.
+    pub content_checksum: bool,
+    /// If set, use the legacy frame format.
+    pub legacy_frame: bool,
+}
+
+impl FrameInfo {
+    /// Create a new `FrameInfo`.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Whether to include the total uncompressed size of data in the frame.
+    pub fn content_size(mut self, content_size: Option<u64>) -> Self {
+        self.content_size = content_size;
+        self
+    }
+
+    /// The maximum uncompressed size of each data block.
+    pub fn block_size(mut self, block_size: BlockSize) -> Self {
+        self.block_size = block_size;
+        self
+    }
+
+    /// The block mode.
+    pub fn block_mode(mut self, block_mode: BlockMode) -> Self {
+        self.block_mode = block_mode;
+        self
+    }
+
+    /// If set, includes a checksum for each data block in the frame.
+    pub fn block_checksums(mut self, block_checksums: bool) -> Self {
+        self.block_checksums = block_checksums;
+        self
+    }
+
+    /// If set, includes a content checksum to verify that the full frame contents have been
+    /// decoded correctly.
+    pub fn content_checksum(mut self, content_checksum: bool) -> Self {
+        self.content_checksum = content_checksum;
+        self
+    }
+
+    /// If set, use the legacy frame format.
+    pub fn legacy_frame(mut self, legacy_frame: bool) -> Self {
+        self.legacy_frame = legacy_frame;
+        self
+    }
+
+    pub(crate) fn read_size(input: &[u8]) -> Result<usize, Error> {
+        let mut required = MIN_FRAME_INFO_SIZE;
+        let magic_num = u32::from_le_bytes(input[0..4].try_into().unwrap());
+        if magic_num == LZ4F_LEGACY_MAGIC_NUMBER {
+            return Ok(MAGIC_NUMBER_SIZE);
+        }
+
+        if input.len() < required {
+            return Ok(required);
+        }
+
+        if LZ4F_SKIPPABLE_MAGIC_RANGE.contains(&magic_num) {
+            return Ok(8);
+        }
+        if magic_num != LZ4F_MAGIC_NUMBER {
+            return Err(Error::WrongMagicNumber);
+        }
+
+        if input[4] & FLG_CONTENT_SIZE != 0 {
+            required += 8;
+        }
+        if input[4] & FLG_DICTIONARY_ID != 0 {
+            required += 4;
+        }
+        Ok(required)
+    }
+
+    pub(crate) fn write_size(&self) -> usize {
+        let mut required = MIN_FRAME_INFO_SIZE;
+        if self.content_size.is_some() {
+            required += 8;
+        }
+        if self.dict_id.is_some() {
+            required += 4;
+        }
+        required
+    }
+
+    pub(crate) fn write(&self, output: &mut [u8]) -> Result<usize, Error> {
+        let write_size = self.write_size();
+        if output.len() < write_size {
+            return Err(Error::IoError(io::ErrorKind::UnexpectedEof.into()));
+        }
+        let mut buffer = [0u8; MAX_FRAME_INFO_SIZE];
+        assert!(write_size <= buffer.len());
+        buffer[0..4].copy_from_slice(&LZ4F_MAGIC_NUMBER.to_le_bytes());
+        buffer[4] = FLG_SUPPORTED_VERSION_BITS;
+        if self.block_checksums {
+            buffer[4] |= FLG_BLOCK_CHECKSUMS;
+        }
+        if self.content_checksum {
+            buffer[4] |= FLG_CONTENT_CHECKSUM;
+        }
+        if self.block_mode == BlockMode::Independent {
+            buffer[4] |= FLG_INDEPENDENT_BLOCKS;
+        }
+        buffer[5] = (self.block_size as u8) << BD_BLOCK_SIZE_MASK_RSHIFT;
+
+        // Optional section
+        let mut offset = 6;
+        if let Some(size) = self.content_size {
+            buffer[4] |= FLG_CONTENT_SIZE;
+            buffer[offset..offset + 8].copy_from_slice(&size.to_le_bytes());
+            offset += 8;
+        }
+        if let Some(dict_id) = self.dict_id {
+            buffer[4] |= FLG_DICTIONARY_ID;
+            buffer[offset..offset + 4].copy_from_slice(&dict_id.to_le_bytes());
+            offset += 4;
+        }
+
+        // Header checksum
+        let mut hasher = XxHash32::with_seed(0);
+        hasher.write(&buffer[4..offset]);
+        let header_checksum = (hasher.finish() >> 8) as u8;
+        buffer[offset] = header_checksum;
+        offset += 1;
+
+        debug_assert_eq!(offset, write_size);
+        output[..write_size].copy_from_slice(&buffer[..write_size]);
+        Ok(write_size)
+    }
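The builder methods above chain into a configured `FrameInfo` that can be handed to `FrameEncoder::with_frame_info`. A usage sketch (illustrative only, not part of the imported sources):

```rust
use std::io::Write;

fn main() -> std::io::Result<()> {
    let info = lz4_flex::frame::FrameInfo::new()
        .block_mode(lz4_flex::frame::BlockMode::Linked)
        .content_checksum(true);
    let mut enc = lz4_flex::frame::FrameEncoder::with_frame_info(info, Vec::new());
    enc.write_all(b"payload that benefits from linked blocks")?;
    // finish() returns the Vec; its Error converts into io::Error via `?`.
    let compressed = enc.finish()?;
    assert!(!compressed.is_empty());
    Ok(())
}
```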
+
+    pub(crate) fn read(mut input: &[u8]) -> Result<FrameInfo, Error> {
+        let original_input = input;
+        // 4 byte Magic
+        let magic_num = {
+            let mut buffer = [0u8; 4];
+            input.read_exact(&mut buffer)?;
+            u32::from_le_bytes(buffer)
+        };
+
+        if magic_num == LZ4F_LEGACY_MAGIC_NUMBER {
+            return Ok(FrameInfo {
+                block_size: BlockSize::Max8MB,
+                legacy_frame: true,
+                ..FrameInfo::default()
+            });
+        }
+        if LZ4F_SKIPPABLE_MAGIC_RANGE.contains(&magic_num) {
+            let mut buffer = [0u8; 4];
+            input.read_exact(&mut buffer)?;
+            let user_data_len = u32::from_le_bytes(buffer);
+            return Err(Error::SkippableFrame(user_data_len));
+        }
+        if magic_num != LZ4F_MAGIC_NUMBER {
+            return Err(Error::WrongMagicNumber);
+        }
+
+        // fixed size section
+        let [flg_byte, bd_byte] = {
+            let mut buffer = [0u8, 0];
+            input.read_exact(&mut buffer)?;
+            buffer
+        };
+
+        if flg_byte & FLG_VERSION_MASK != FLG_SUPPORTED_VERSION_BITS {
+            // version is always 01
+            return Err(Error::UnsupportedVersion(flg_byte & FLG_VERSION_MASK));
+        }
+
+        if flg_byte & FLG_RESERVED_MASK != 0 || bd_byte & BD_RESERVED_MASK != 0 {
+            return Err(Error::ReservedBitsSet);
+        }
+
+        let block_mode = if flg_byte & FLG_INDEPENDENT_BLOCKS != 0 {
+            BlockMode::Independent
+        } else {
+            BlockMode::Linked
+        };
+        let content_checksum = flg_byte & FLG_CONTENT_CHECKSUM != 0;
+        let block_checksums = flg_byte & FLG_BLOCK_CHECKSUMS != 0;
+
+        let block_size = match (bd_byte & BD_BLOCK_SIZE_MASK) >> BD_BLOCK_SIZE_MASK_RSHIFT {
+            i @ 0..=3 => return Err(Error::UnsupportedBlocksize(i)),
+            4 => BlockSize::Max64KB,
+            5 => BlockSize::Max256KB,
+            6 => BlockSize::Max1MB,
+            7 => BlockSize::Max4MB,
+            _ => unreachable!(),
+        };
+
+        // var len section
+        let mut content_size = None;
+        if flg_byte & FLG_CONTENT_SIZE != 0 {
+            let mut buffer = [0u8; 8];
+            input.read_exact(&mut buffer)?;
+            content_size = Some(u64::from_le_bytes(buffer));
+        }
+
+        let mut dict_id = None;
+        if flg_byte & FLG_DICTIONARY_ID != 0 {
+            let mut buffer = [0u8; 4];
+            input.read_exact(&mut buffer)?;
+            dict_id = Some(u32::from_le_bytes(buffer));
+        }
+
+        // 1 byte header checksum
+        let expected_checksum = {
+            let mut buffer = [0u8; 1];
+            input.read_exact(&mut buffer)?;
+            buffer[0]
+        };
+        let mut hasher = XxHash32::with_seed(0);
+        hasher.write(&original_input[4..original_input.len() - input.len() - 1]);
+        let header_hash = (hasher.finish() >> 8) as u8;
+        if header_hash != expected_checksum {
+            return Err(Error::HeaderChecksumError);
+        }
+
+        Ok(FrameInfo {
+            content_size,
+            dict_id,
+            block_size,
+            block_mode,
+            block_checksums,
+            content_checksum,
+            legacy_frame: false,
+        })
+    }
+}
+
+#[derive(Debug)]
+pub(crate) enum BlockInfo {
+    Compressed(u32),
+    Uncompressed(u32),
+    EndMark,
+}
+
+impl BlockInfo {
+    pub(crate) fn read(mut input: &[u8]) -> Result<Self, Error> {
+        let mut size_buffer = [0u8; 4];
+        input.read_exact(&mut size_buffer)?;
+        let size = u32::from_le_bytes(size_buffer);
+        if size == 0 {
+            Ok(BlockInfo::EndMark)
+        } else if size & BLOCK_UNCOMPRESSED_SIZE_BIT != 0 {
+            Ok(BlockInfo::Uncompressed(size & !BLOCK_UNCOMPRESSED_SIZE_BIT))
+        } else {
+            Ok(BlockInfo::Compressed(size))
+        }
+    }
+
+    pub(crate) fn write(&self, mut output: &mut [u8]) -> Result<usize, Error> {
+        let value = match self {
+            BlockInfo::Compressed(len) if *len == 0 => return Err(Error::InvalidBlockInfo),
+            BlockInfo::Compressed(len) | BlockInfo::Uncompressed(len)
+                if *len & BLOCK_UNCOMPRESSED_SIZE_BIT != 0 =>
+            {
+                return Err(Error::InvalidBlockInfo)
+            }
+            BlockInfo::Compressed(len) => *len,
+            BlockInfo::Uncompressed(len) => *len | BLOCK_UNCOMPRESSED_SIZE_BIT,
+            BlockInfo::EndMark => 0,
+        };
+        output.write_all(&value.to_le_bytes())?;
+        Ok(4)
+    }
+}
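The block-size word that `BlockInfo` models is a little-endian `u32` where zero marks the end of the frame and the high bit flags an uncompressed block. A standalone sketch of the same decoding rule (hypothetical function, constant copied locally):

```rust
#[derive(Debug, PartialEq)]
enum Block {
    Compressed(u32),
    Uncompressed(u32),
    EndMark,
}

fn decode_block_word(word: u32) -> Block {
    const UNCOMPRESSED_BIT: u32 = 0x8000_0000;
    if word == 0 {
        Block::EndMark
    } else if word & UNCOMPRESSED_BIT != 0 {
        Block::Uncompressed(word & !UNCOMPRESSED_BIT)
    } else {
        Block::Compressed(word)
    }
}

fn main() {
    assert_eq!(decode_block_word(0), Block::EndMark);
    assert_eq!(decode_block_word(100), Block::Compressed(100));
    assert_eq!(decode_block_word(100 | 0x8000_0000), Block::Uncompressed(100));
}
```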
diff --git a/src/frame/mod.rs b/src/frame/mod.rs
new file mode 100644
index 0000000..8acfad2
--- /dev/null
+++ b/src/frame/mod.rs
@@ -0,0 +1,111 @@
+//! LZ4 Frame Format
+//!
+//! As defined in <https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md>
+//!
+//! # Example: compress data on `stdin` with frame format
+//! This program reads data from `stdin`, compresses it and emits it to `stdout`.
+//! This example can be found in `examples/compress.rs`:
+//! ```no_run
+//! use std::io;
+//! let stdin = io::stdin();
+//! let stdout = io::stdout();
+//! let mut rdr = stdin.lock();
+//! // Wrap the stdout writer in a LZ4 Frame writer.
+//! let mut wtr = lz4_flex::frame::FrameEncoder::new(stdout.lock());
+//! io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
+//! wtr.finish().unwrap();
+//! ```
+//!
+
+use std::{fmt, io};
+
+#[cfg_attr(feature = "safe-encode", forbid(unsafe_code))]
+pub(crate) mod compress;
+#[cfg_attr(feature = "safe-decode", forbid(unsafe_code))]
+pub(crate) mod decompress;
+pub(crate) mod header;
+
+pub use compress::{AutoFinishEncoder, FrameEncoder};
+pub use decompress::FrameDecoder;
+pub use header::{BlockMode, BlockSize, FrameInfo};
+
+#[derive(Debug)]
+#[non_exhaustive]
+/// Errors that can occur when de/compressing lz4.
+pub enum Error {
+    /// Compression error.
+    CompressionError(crate::block::CompressError),
+    /// Decompression error.
+    DecompressionError(crate::block::DecompressError),
+    /// An io::Error was encountered.
+    IoError(io::Error),
+    /// Unsupported block size.
+    UnsupportedBlocksize(u8),
+    /// Unsupported frame version.
+    UnsupportedVersion(u8),
+    /// Wrong magic number for the LZ4 frame format.
+    WrongMagicNumber,
+    /// Reserved bits set.
+    ReservedBitsSet,
+    /// Block header is malformed.
+    InvalidBlockInfo,
+    /// Read a block larger than specified in the Frame header.
+    BlockTooBig,
+    /// The Frame header checksum doesn't match.
+    HeaderChecksumError,
+    /// The block checksum doesn't match.
+    BlockChecksumError,
+    /// The content checksum doesn't match.
+    ContentChecksumError,
+    /// Read a skippable frame.
+    /// The caller may read the specified amount of bytes from the underlying io::Read.
+    SkippableFrame(u32),
+    /// External dictionaries are not supported.
+    DictionaryNotSupported,
+    /// Content length differs.
+    ContentLengthError {
+        /// Expected content length.
+        expected: u64,
+        /// Actual content length.
+        actual: u64,
+    },
+}
+
+impl From<Error> for io::Error {
+    fn from(e: Error) -> Self {
+        match e {
+            Error::IoError(e) => e,
+            Error::CompressionError(_)
+            | Error::DecompressionError(_)
+            | Error::SkippableFrame(_)
+            | Error::DictionaryNotSupported => io::Error::new(io::ErrorKind::Other, e),
+            Error::WrongMagicNumber
+            | Error::UnsupportedBlocksize(..)
+            | Error::UnsupportedVersion(..)
+            | Error::ReservedBitsSet
+            | Error::InvalidBlockInfo
+            | Error::BlockTooBig
+            | Error::HeaderChecksumError
+            | Error::ContentChecksumError
+            | Error::BlockChecksumError
+            | Error::ContentLengthError { .. } => io::Error::new(io::ErrorKind::InvalidData, e),
+        }
+    }
+}
+
+impl From<io::Error> for Error {
+    fn from(e: io::Error) -> Self {
+        match e.get_ref().map(|e| e.downcast_ref::<Error>()) {
+            Some(_) => *e.into_inner().unwrap().downcast::<Error>().unwrap(),
+            None => Error::IoError(e),
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl std::error::Error for Error {}
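One subtlety worth illustrating: `Error::SkippableFrame(len)` is a recoverable condition that tells the caller to skip `len` bytes of input and continue. A handling sketch (illustrative; `skip_or_fail` is a hypothetical helper, and in practice the decoder surfaces this wrapped in an `io::Error`):

```rust
use lz4_flex::frame::Error;

// Decide how a caller might react to a frame-level error.
fn skip_or_fail(err: Error) -> Result<u64, Error> {
    match err {
        // The caller may consume `len` bytes from the underlying reader
        // and then resume decoding at the next frame.
        Error::SkippableFrame(len) => Ok(len as u64),
        other => Err(other),
    }
}
```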
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..103cd7e
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,109 @@
+//! Pure Rust, high performance implementation of LZ4 compression.
+//!
+//! A detailed explanation of the algorithm can be found [here](http://ticki.github.io/blog/how-lz4-works/).
+//!
+//! # Overview
+//!
+//! This crate provides two ways to use lz4. The first way is through the
+//! [`frame::FrameDecoder`](frame/struct.FrameDecoder.html)
+//! and
+//! [`frame::FrameEncoder`](frame/struct.FrameEncoder.html)
+//! types, which implement the `std::io::Read` and `std::io::Write` traits with the
+//! lz4 frame format. Unless you have a specific reason to the contrary, you
+//! should only use the lz4 frame format. Specifically, the lz4 frame format
+//! permits streaming compression or decompression.
+//!
+//! The second way is through the
+//! [`decompress_size_prepended`](fn.decompress_size_prepended.html)
+//! and
+//! [`compress_prepend_size`](fn.compress_prepend_size.html)
+//! functions. These functions provide access to the lz4 block format, and
+//! don't support a streaming interface directly. You should only use these types
+//! if you know you specifically need the lz4 block format.
+//!
+//! # Example: compress data on `stdin` with frame format
+//! This program reads data from `stdin`, compresses it and emits it to `stdout`.
+//! This example can be found in `examples/compress.rs`:
+//! ```no_run
+//! use std::io;
+//! let stdin = io::stdin();
+//! let stdout = io::stdout();
+//! let mut rdr = stdin.lock();
+//! // Wrap the stdout writer in a LZ4 Frame writer.
+//! let mut wtr = lz4_flex::frame::FrameEncoder::new(stdout.lock());
+//! io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
+//! wtr.finish().unwrap();
+//! ```
+//! # Example: decompress data on `stdin` with frame format
+//! This program reads data from `stdin`, decompresses it and emits it to `stdout`.
+//! This example can be found in `examples/decompress.rs`:
+//! ```no_run
+//! use std::io;
+//! let stdin = io::stdin();
+//! let stdout = io::stdout();
+//! // Wrap the stdin reader in a LZ4 FrameDecoder.
+//! let mut rdr = lz4_flex::frame::FrameDecoder::new(stdin.lock());
+//! let mut wtr = stdout.lock();
+//! io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
+//! ```
+//!
+//! # Example: block format roundtrip
+//! ```
+//! use lz4_flex::block::{compress_prepend_size, decompress_size_prepended};
+//! let input: &[u8] = b"Hello people, what's up?";
+//! let compressed = compress_prepend_size(input);
+//! let uncompressed = decompress_size_prepended(&compressed).unwrap();
+//! assert_eq!(input, uncompressed);
+//! ```
+//!
+//! ## Feature Flags
+//!
+//! - `safe-encode` uses only safe rust for encode. _enabled by default_
+//! - `safe-decode` uses only safe rust for decode. _enabled by default_
+//! - `frame` support for LZ4 frame format. _implies `std`, enabled by default_
+//! - `std` enables dependency on the standard library. _enabled by default_
+//!
+//! For maximum performance use `no-default-features`.
+//!
+//! For no_std support only the [`block format`](block/index.html) is supported.
+//!
+//!
+#![deny(warnings)]
+#![deny(missing_docs)]
+#![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![cfg_attr(nightly, feature(optimize_attribute))]
+
+#[cfg_attr(test, macro_use)]
+extern crate alloc;
+
+#[cfg(test)]
+#[macro_use]
+extern crate more_asserts;
+
+pub mod block;
+#[cfg(feature = "frame")]
+#[cfg_attr(docsrs, doc(cfg(feature = "frame")))]
+pub mod frame;
+
+#[allow(dead_code)]
+mod fastcpy;
+#[allow(dead_code)]
+mod fastcpy_unsafe;
+
+#[deprecated(
+    since = "0.11.0",
+    note = "This re-export is deprecated as it can be confused with the frame API and is not suitable for very large data, use block:: instead"
+)]
+pub use block::{compress, compress_into, compress_prepend_size};
+#[deprecated(
+    since = "0.11.0",
+    note = "This re-export is deprecated as it can be confused with the frame API and is not suitable for very large data, use block:: instead"
+)]
+pub use block::{decompress, decompress_into, decompress_size_prepended};
+
+#[cfg_attr(
+    all(feature = "safe-encode", feature = "safe-decode"),
+    forbid(unsafe_code)
+)]
+pub(crate) mod sink;
diff --git a/src/sink.rs b/src/sink.rs
new file mode 100644
index 0000000..a440a2d
--- /dev/null
+++ b/src/sink.rs
@@ -0,0 +1,330 @@
+#[allow(unused_imports)]
+use alloc::vec::Vec;
+
+use crate::fastcpy::slice_copy;
+
+/// Returns a Sink implementation appropriate for outputting up to `required_capacity`
+/// bytes at `vec[offset..offset+required_capacity]`.
+/// The vec is pre-filled with zeroes up to that capacity if necessary.
+/// The argument `pos` defines the initial output position in the Sink.
+#[inline]
+#[cfg(feature = "frame")]
+pub fn vec_sink_for_compression(
+    vec: &mut Vec<u8>,
+    offset: usize,
+    pos: usize,
+    required_capacity: usize,
+) -> SliceSink {
+    return {
+        vec.resize(offset + required_capacity, 0);
+        SliceSink::new(&mut vec[offset..], pos)
+    };
+}
+
+/// Returns a Sink implementation appropriate for outputting up to `required_capacity`
+/// bytes at `vec[offset..offset+required_capacity]`.
+/// The vec is pre-filled with zeroes up to that capacity if necessary.
+/// The argument `pos` defines the initial output position in the Sink.
+#[cfg(feature = "frame")]
+#[inline]
+pub fn vec_sink_for_decompression(
+    vec: &mut Vec<u8>,
+    offset: usize,
+    pos: usize,
+    required_capacity: usize,
+) -> SliceSink {
+    return {
+        vec.resize(offset + required_capacity, 0);
+        SliceSink::new(&mut vec[offset..], pos)
+    };
+}
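A standalone sketch of the pattern the `vec_sink_*` helpers above rely on: grow the Vec so a fixed-size window is valid, then write through a slice of it (illustrative only, plain std):

```rust
fn main() {
    let mut vec = vec![1u8, 2];
    let (offset, required) = (1usize, 4usize);
    vec.resize(offset + required, 0); // zero-fill up to the window end
    let window = &mut vec[offset..offset + required];
    window[0] = 9; // write at pos 0 of the window
    assert_eq!(vec, [1, 9, 0, 0, 0]);
}
```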
+
+pub trait Sink {
+    /// Returns a raw ptr to the first unfilled byte of the Sink. Analogous to `[pos..].as_ptr()`.
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn pos_mut_ptr(&mut self) -> *mut u8;
+
+    /// Reads the byte at position `pos`.
+    fn byte_at(&mut self, pos: usize) -> u8;
+
+    /// Pushes a byte to the end of the Sink.
+    #[cfg(feature = "safe-encode")]
+    fn push(&mut self, byte: u8);
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn base_mut_ptr(&mut self) -> *mut u8;
+
+    fn pos(&self) -> usize;
+
+    fn capacity(&self) -> usize;
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn set_pos(&mut self, new_pos: usize);
+
+    #[cfg(feature = "safe-decode")]
+    fn extend_with_fill(&mut self, byte: u8, len: usize);
+
+    /// Extends the Sink with `data`.
+    fn extend_from_slice(&mut self, data: &[u8]);
+
+    fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize);
+
+    /// Copies `len` bytes starting from `start` to the end of the Sink.
+    /// # Panics
+    /// Panics if `start` >= `pos`.
+    #[cfg(feature = "safe-decode")]
+    fn extend_from_within(&mut self, start: usize, wild_len: usize, copy_len: usize);
+
+    #[cfg(feature = "safe-decode")]
+    fn extend_from_within_overlapping(&mut self, start: usize, num_bytes: usize);
+}
+
+/// SliceSink is used as target to de/compress data into a preallocated and possibly
+/// uninitialized `&[u8]` space.
+///
+/// # Handling of Capacity
+/// Extend methods will panic if there's insufficient capacity left in the Sink.
+///
+/// # Invariants
+/// - Bytes `[..pos()]` are always initialized.
+pub struct SliceSink<'a> {
+    /// The working slice, which may contain uninitialized bytes
+    output: &'a mut [u8],
+    /// Number of bytes in start of `output` guaranteed to be initialized
+    pos: usize,
+}
+
+impl<'a> SliceSink<'a> {
+    /// Creates a `Sink` backed by the given byte slice.
+    /// `pos` defines the initial output position in the Sink.
+    /// # Panics
+    /// Panics if `pos` is out of bounds.
+    #[inline]
+    pub fn new(output: &'a mut [u8], pos: usize) -> Self {
+        // SAFETY: Caller guarantees that all elements of `output[..pos]` are initialized.
+        let _ = &mut output[..pos]; // bounds check pos
+        SliceSink { output, pos }
+    }
+}
+
+impl<'a> Sink for SliceSink<'a> {
+    /// Returns a raw ptr to the first unfilled byte of the Sink. Analogous to `[pos..].as_ptr()`.
+    #[inline]
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn pos_mut_ptr(&mut self) -> *mut u8 {
+        self.base_mut_ptr().add(self.pos()) as *mut u8
+    }
+
+    /// Reads the byte at position `pos`.
+    #[inline]
+    fn byte_at(&mut self, pos: usize) -> u8 {
+        self.output[pos]
+    }
+
+    /// Pushes a byte to the end of the Sink.
+    #[inline]
+    #[cfg(feature = "safe-encode")]
+    fn push(&mut self, byte: u8) {
+        self.output[self.pos] = byte;
+        self.pos += 1;
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn base_mut_ptr(&mut self) -> *mut u8 {
+        self.output.as_mut_ptr()
+    }
+
+    #[inline]
+    fn pos(&self) -> usize {
+        self.pos
+    }
+
+    #[inline]
+    fn capacity(&self) -> usize {
+        self.output.len()
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    #[inline]
+    unsafe fn set_pos(&mut self, new_pos: usize) {
+        debug_assert!(new_pos <= self.capacity());
+        self.pos = new_pos;
+    }
+
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    fn extend_with_fill(&mut self, byte: u8, len: usize) {
+        self.output[self.pos..self.pos + len].fill(byte);
+        self.pos += len;
+    }
+
+    /// Extends the Sink with `data`.
+    #[inline]
+    fn extend_from_slice(&mut self, data: &[u8]) {
+        self.extend_from_slice_wild(data, data.len())
+    }
+
+    #[inline]
+    fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
+        assert!(copy_len <= data.len());
+        slice_copy(data, &mut self.output[self.pos..(self.pos) + data.len()]);
+        self.pos += copy_len;
+    }
+
+    /// Copies `len` bytes starting from `start` to the end of the Sink.
+    /// # Panics
+    /// Panics if `start` >= `pos`.
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    fn extend_from_within(&mut self, start: usize, wild_len: usize, copy_len: usize) {
+        self.output.copy_within(start..start + wild_len, self.pos);
+        self.pos += copy_len;
+    }
+
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    #[cfg_attr(nightly, optimize(size))] // to avoid loop unrolling
+    fn extend_from_within_overlapping(&mut self, start: usize, num_bytes: usize) {
+        let offset = self.pos - start;
+        for i in start + offset..start + offset + num_bytes {
+            self.output[i] = self.output[i - offset];
+        }
+        self.pos += num_bytes;
+    }
+}
+
+/// PtrSink is used as target to de/compress data into a preallocated and possibly
+/// uninitialized `&[u8]` space.
+#[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+pub struct PtrSink {
+    /// The working pointer, which may point at uninitialized bytes
+    output: *mut u8,
+    /// Number of bytes in start of `output` guaranteed to be initialized
+    pos: usize,
+    /// Number of bytes available in output
+    cap: usize,
+}
+
+#[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+impl PtrSink {
+    /// Creates a `Sink` backed by the given byte vec's allocation.
+    /// `pos` defines the initial output position in the Sink.
+    #[inline]
+    pub fn from_vec(output: &mut Vec<u8>, pos: usize) -> Self {
+        // SAFETY: Bytes behind pointer may be uninitialized.
+        Self {
+            output: output.as_mut_ptr(),
+            pos,
+            cap: output.capacity(),
+        }
+    }
+}
+
+#[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+impl Sink for PtrSink {
+    /// Returns a raw ptr to the first unfilled byte of the Sink. Analogous to `[pos..].as_ptr()`.
+    #[inline]
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn pos_mut_ptr(&mut self) -> *mut u8 {
+        self.base_mut_ptr().add(self.pos()) as *mut u8
+    }
+
+    /// Reads the byte at position `pos`.
+    #[inline]
+    fn byte_at(&mut self, pos: usize) -> u8 {
+        unsafe { self.output.add(pos).read() }
+    }
+
+    /// Pushes a byte to the end of the Sink.
+    #[inline]
+    #[cfg(feature = "safe-encode")]
+    fn push(&mut self, byte: u8) {
+        unsafe {
+            self.pos_mut_ptr().write(byte);
+        }
+        self.pos += 1;
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn base_mut_ptr(&mut self) -> *mut u8 {
+        self.output
+    }
+
+    #[inline]
+    fn pos(&self) -> usize {
+        self.pos
+    }
+
+    #[inline]
+    fn capacity(&self) -> usize {
+        self.cap
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    #[inline]
+    unsafe fn set_pos(&mut self, new_pos: usize) {
+        debug_assert!(new_pos <= self.capacity());
+        self.pos = new_pos;
+    }
+
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    fn extend_with_fill(&mut self, _byte: u8, _len: usize) {
+        unreachable!();
+    }
+
+    /// Extends the Sink with `data`.
+    #[inline]
+    fn extend_from_slice(&mut self, data: &[u8]) {
+        self.extend_from_slice_wild(data, data.len())
+    }
+
+    #[inline]
+    fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
+        assert!(copy_len <= data.len());
+        unsafe {
+            core::ptr::copy_nonoverlapping(data.as_ptr(), self.pos_mut_ptr(), copy_len);
+        }
+        self.pos += copy_len;
+    }
+
+    /// Copies `len` bytes starting from `start` to the end of the Sink.
+    /// # Panics
+    /// Panics if `start` >= `pos`.
+ #[inline] + #[cfg(feature = "safe-decode")] + fn extend_from_within(&mut self, _start: usize, _wild_len: usize, _copy_len: usize) { + unreachable!(); + } + + #[inline] + #[cfg(feature = "safe-decode")] + fn extend_from_within_overlapping(&mut self, _start: usize, _num_bytes: usize) { + unreachable!(); + } +} + +#[cfg(test)] +mod tests { + + #[test] + #[cfg(any(feature = "safe-encode", feature = "safe-decode"))] + fn test_sink_slice() { + use crate::sink::Sink; + use crate::sink::SliceSink; + use alloc::vec::Vec; + let mut data = Vec::new(); + data.resize(5, 0); + let sink = SliceSink::new(&mut data, 1); + assert_eq!(sink.pos(), 1); + assert_eq!(sink.capacity(), 5); + } +} -- cgit v1.2.3