From 8965c6d7386e6a625e8a7f1d3dfc830b396a5d1b Mon Sep 17 00:00:00 2001 From: Frederick Mayle Date: Wed, 17 Jan 2024 16:24:54 -0800 Subject: Import 'lz4_flex' crate Request Document: go/android-rust-importing-crates For CL Reviewers: go/android3p#cl-review For Build Team: go/ab-third-party-imports Bug: 319233671 Test: n/a Change-Id: I9d65f7035d039557e5947767b401f576628e3d06 --- .cargo_vcs_info.json | 6 + Cargo.lock | 389 +++++++++++++++++ Cargo.toml | 100 +++++ Cargo.toml.orig | 87 ++++ LICENSE | 20 + METADATA | 19 + MODULE_LICENSE_MIT | 0 OWNERS | 3 + README.md | 129 ++++++ src/block/compress.rs | 969 +++++++++++++++++++++++++++++++++++++++++++ src/block/decompress.rs | 544 ++++++++++++++++++++++++ src/block/decompress_safe.rs | 399 ++++++++++++++++++ src/block/hashtable.rs | 161 +++++++ src/block/mod.rs | 157 +++++++ src/fastcpy.rs | 145 +++++++ src/fastcpy_unsafe.rs | 165 ++++++++ src/frame/compress.rs | 471 +++++++++++++++++++++ src/frame/decompress.rs | 449 ++++++++++++++++++++ src/frame/header.rs | 412 ++++++++++++++++++ src/frame/mod.rs | 111 +++++ src/lib.rs | 109 +++++ src/sink.rs | 330 +++++++++++++++ 22 files changed, 5175 insertions(+) create mode 100644 .cargo_vcs_info.json create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 100644 LICENSE create mode 100644 METADATA create mode 100644 MODULE_LICENSE_MIT create mode 100644 OWNERS create mode 100644 README.md create mode 100644 src/block/compress.rs create mode 100644 src/block/decompress.rs create mode 100644 src/block/decompress_safe.rs create mode 100644 src/block/hashtable.rs create mode 100644 src/block/mod.rs create mode 100644 src/fastcpy.rs create mode 100644 src/fastcpy_unsafe.rs create mode 100644 src/frame/compress.rs create mode 100644 src/frame/decompress.rs create mode 100644 src/frame/header.rs create mode 100644 src/frame/mod.rs create mode 100644 src/lib.rs create mode 100644 src/sink.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..7ae5865 --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "4e87de7f57ce8be339a73b385f3789a129b73fb0" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..0007166 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,389 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "byteorder" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "itoa" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b" + +[[package]] +name = "libm" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" + +[[package]] +name = "lz4-compress" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f966533a922a9bba9e95e594c1fdb3b9bf5fdcdb11e37e51ad84cd76e468b91" +dependencies = [ + "byteorder 0.5.3", + "quick-error 1.2.3", +] + +[[package]] +name = "lz4_flex" +version = "0.11.2" +dependencies = [ + "lz4-compress", + "lzzzz", + "more-asserts", + "proptest", + "serde_json", + "snap", + "twox-hash", +] + +[[package]] +name = "lzzzz" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8014d1362004776e6a91e4c15a3aa7830d1b6650a075b51a9969ebb6d6af13bc" +dependencies = [ + "cc", +] + +[[package]] +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proptest" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29f1b898011ce9595050a68e60f90bad083ff2987a695a42357134c8381fba70" +dependencies = [ + "bit-set", + "bitflags", + "byteorder 1.4.3", + "lazy_static", + "num-traits", + "quick-error 2.0.1", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error 1.2.3", + "tempfile", + "wait-timeout", +] + +[[package]] +name = "ryu" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" + +[[package]] +name = "serde" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1" + +[[package]] +name = "serde_json" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "snap" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..40fbadd --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,100 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +name = "lz4_flex" +version = "0.11.2" +authors = [ + "Pascal Seitz ", + "Arthur Silva ", + "ticki ", +] +include = [ + "src/*.rs", + "src/frame/**/*", + "src/block/**/*", + "README.md", + "LICENSE", +] +description = "Fastest LZ4 implementation in Rust, no unsafe by default." +homepage = "https://github.com/pseitz/lz4_flex" +readme = "README.md" +keywords = [ + "compression", + "lz4", + "compress", + "decompression", + "decompress", +] +license = "MIT" +repository = "https://github.com/pseitz/lz4_flex" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = [ + "--cfg", + "docsrs", +] + +[profile.bench] +opt-level = 3 +lto = true +codegen-units = 1 + +[profile.release] +opt-level = 3 +codegen-units = 1 +panic = "unwind" + +[[bench]] +name = "crit_bench" +path = "benches/crit_bench.rs" +harness = false + +[dependencies.twox-hash] +version = "1.6.3" +optional = true +default-features = false + +[dev-dependencies.lz4-compress] +version = "0.1.1" + +[dev-dependencies.lzzzz] +version = "1.0.4" + +[dev-dependencies.more-asserts] +version = "0.3.1" + +[dev-dependencies.proptest] +version = "1.0.0" + +[dev-dependencies.serde_json] +version = "1.0.91" + +[dev-dependencies.snap] +version = "1.1.0" + +[features] +default = [ + "std", + "safe-encode", + "safe-decode", + "frame", +] +frame = [ + "std", + "dep:twox-hash", +] +nightly = [] +safe-decode = [] +safe-encode = [] +std = [] diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..3eea255 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,87 @@ +[package] +authors = ["Pascal Seitz ", "Arthur Silva ", "ticki "] +description = "Fastest LZ4 implementation in Rust, no unsafe by default." 
+edition = "2021" +keywords = ["compression", "lz4", "compress", "decompression", "decompress"] +name = "lz4_flex" +homepage = "https://github.com/pseitz/lz4_flex" +repository = "https://github.com/pseitz/lz4_flex" +readme = "README.md" +license = "MIT" +version = "0.11.2" +include = ["src/*.rs", "src/frame/**/*", "src/block/**/*", "README.md", "LICENSE"] + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[[bench]] +harness = false +name = "crit_bench" +path = "benches/crit_bench.rs" + +[dev-dependencies] +criterion = { git = "https://github.com/PSeitz/criterion.rs/", rev = "cf60ffc"} +lzzzz = "1.0.4" +lz4-compress = "0.1.1" +more-asserts = "0.3.1" +snap = "1.1.0" +serde_json = "1.0.91" +proptest = "1.0.0" + +[dev-dependencies.lz-fear] +git = "https://github.com/main--/rust-lz-fear" + + #Uncomment to make lz4_flex master available as lz4_flex_master + #[dev-dependencies.lz4_flex_master] + #rev= "a122673" # v10 + #git = "https://github.com/PSeitz/lz4_flex" + #package = "lz4_flex" + #default-features=false + #features = ["std", "safe-encode", "safe-decode", "frame"] + +[features] +default = ["std", "safe-encode", "safe-decode", "frame"] +safe-decode = [] +safe-encode = [] +#unchecked-decode = [] # Removes some checks for additional performance. Only enable on trusted input! +frame = ["std", "dep:twox-hash"] +std = [] +# use nightly compiler features +nightly = [] + +[dependencies] +twox-hash = { version = "1.6.3", default-features = false, optional = true } + +[profile.bench] +codegen-units = 1 +lto = true +opt-level = 3 + +[profile.release] +codegen-units = 1 +#debug = true +opt-level = 3 +panic = "unwind" + +# [[bench]] +# harness = false +# name = "quickbench" +# path = "benches/quickbench.rs" + +# [[bench]] +# harness = false +# name = "bench" +# path = "benches/bench.rs" + +# [[bin]] +# name = "decompress_with_stats" +# path = "src/test_bins/decompress_with_stats.rs" + +# [[bin]] +# name = "profile_decomp" +# path = "src/test_bins/profile_decomp.rs" + +# [[bin]] +# name = "profile_comp" +# path = "src/test_bins/profile_comp.rs" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0037a72 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2020 Pascal Seitz + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..858b3d6 --- /dev/null +++ b/METADATA @@ -0,0 +1,19 @@ +name: "lz4_flex" +description: "Fastest LZ4 implementation in Rust, no unsafe by default." 
+third_party {
+  identifier {
+    type: "crates.io"
+    value: "https://crates.io/crates/lz4_flex"
+  }
+  identifier {
+    type: "Archive"
+    value: "https://static.crates.io/crates/lz4_flex/lz4_flex-0.11.2.crate"
+  }
+  version: "0.11.2"
+  license_type: NOTICE
+  last_upgrade_date {
+    year: 2024
+    month: 1
+    day: 17
+  }
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..41179fc
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,3 @@
+# Bug component: 688011
+include platform/prebuilts/rust:main:/OWNERS
+fmayle@google.com
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7b303a6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,129 @@
+![Rust](https://github.com/PSeitz/lz4_flex/workflows/Rust/badge.svg)
+[![Docs](https://docs.rs/lz4_flex/badge.svg)](https://docs.rs/crate/lz4_flex/)
+[![Crates.io](https://img.shields.io/crates/v/lz4_flex.svg)](https://crates.io/crates/lz4_flex)
+
+# lz4_flex
+
+![lz4_flex_logo](https://raw.githubusercontent.com/PSeitz/lz4_flex/master/logo.jpg)
+
+Fastest LZ4 implementation in Rust. Originally based on [redox-os' lz4 compression](https://crates.io/crates/lz4-compress), but now a complete rewrite.
+The results in the tables below are from a benchmark in this project (66 KB JSON, 10 MB dickens) with the block format.
+
+AMD Ryzen 7 5900HX, rustc 1.69.0 (84c898d65 2023-04-16), Manjaro, CPU Boost Disabled, CPU Governor: Performance
+
+66 KB JSON
+| Compressor           | Compression | Decompression | Ratio         |
+|----------------------|-------------|---------------|---------------|
+| lz4_flex unsafe w. unchecked_decode | 1615 MiB/s | 5973 MiB/s | 0.2284 |
+| lz4_flex unsafe | 1615 MiB/s | 5512 MiB/s | 0.2284 |
+| lz4_flex safe | 1272 MiB/s | 4540 MiB/s | 0.2284 |
+| lzzz (lz4 1.9.3) | 1469 MiB/s | 5313 MiB/s | 0.2283 |
+| lz4_fear | 662 MiB/s | 939 MiB/s | 0.2283 |
+| snap | 1452 MiB/s | 1649 MiB/s | 0.2242 |
+
+10 MB dickens
+| Compressor           | Compression | Decompression | Ratio         |
+|----------------------|-------------|---------------|---------------|
+| lz4_flex unsafe w. unchecked_decode | 347 MiB/s | 3168 MiB/s | 0.6372 |
+| lz4_flex unsafe | 347 MiB/s | 2734 MiB/s | 0.6372 |
+| lz4_flex safe | 259 MiB/s | 2338 MiB/s | 0.6372 |
+| lzzz (lz4 1.9.3) | 324 MiB/s | 2759 MiB/s | 0.6372 |
+| lz4_fear | 201 MiB/s | 370 MiB/s | 0.6372 |
+| snap | 286 MiB/s | 679 MiB/s | 0.6276 |
+
+## Features
+- Very good logo
+- LZ4 Block format
+- LZ4 Frame format (thanks @arthurprs)
+- High performance
+- 1.5s clean release build time
+- Feature flags to configure safe/unsafe code usage
+- no-std support with block format (thanks @coolreader18)
+- 32-bit support
+
+## Usage
+Compression and decompression use no unsafe code via the default feature flags "safe-encode" and "safe-decode". If you need more performance you can disable them (e.g. with no-default-features).
+
+Safe:
+```
+lz4_flex = { version = "0.11" }
+```
+
+Performance:
+```
+lz4_flex = { version = "0.11", default-features = false }
+```
+
+### Block Format
+The block format is only valid for smaller data chunks, as a block is de/compressed entirely in memory.
+For larger data use the frame format, which consists of multiple blocks.
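+
+For illustration, a frame round trip over `std::io` might look like this (a sketch
+against the crate's `frame` module, which is in the default feature set; errors are
+simply unwrapped, and this is an illustrative example rather than the crate's own
+documented snippet):
+
+```rust
+use std::io::{Read, Write};
+use lz4_flex::frame::{FrameDecoder, FrameEncoder};
+
+fn main() {
+    let input = b"lots of data, streamed in multiple blocks";
+
+    // Compress into an in-memory Vec via the io::Write impl.
+    let mut enc = FrameEncoder::new(Vec::new());
+    enc.write_all(input).unwrap();
+    let compressed = enc.finish().unwrap();
+
+    // Decompress by reading the frame back via the io::Read impl.
+    let mut dec = FrameDecoder::new(compressed.as_slice());
+    let mut decompressed = Vec::new();
+    dec.read_to_end(&mut decompressed).unwrap();
+    assert_eq!(&decompressed[..], &input[..]);
+}
+```
+
+The block format, shown next, instead de/compresses a single buffer in one call: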
+
+```rust
+use lz4_flex::block::{compress_prepend_size, decompress_size_prepended};
+
+fn main(){
+    let input: &[u8] = b"Hello people, what's up?";
+    let compressed = compress_prepend_size(input);
+    let uncompressed = decompress_size_prepended(&compressed).unwrap();
+    assert_eq!(input, uncompressed);
+}
+```
+
+
+## no_std support
+
+no_std support is currently only available for the block format, since the frame format uses `std::io::Write`, which is not available in core.
+
+## Benchmarks
+The benchmarks are run with criterion; the test files are in the benches folder.
+
+Currently 4 implementations are compared: this one, [lz-fear](https://github.com/main--/rust-lz-fear), the [C version via Rust bindings](https://crates.io/crates/lzzzz) and [snappy](https://github.com/burntsushi/rust-snappy).
+The lz4_flex version is tested with the feature flags safe-decode and safe-encode switched on and off.
+
+- lz4_cpp: https://crates.io/crates/lzzzz
+- lz-fear: https://github.com/main--/rust-lz-fear
+- snap: https://github.com/burntsushi/rust-snappy
+
+Tested on AMD Ryzen 7 5900HX, rustc 1.69.0 (84c898d65 2023-04-16), Manjaro, CPU Boost Disabled, CPU 3GHz
+
+### Results v0.11.0 02-06-2023 (safe-decode and safe-encode off)
+`cargo bench --no-default-features`
+
+![Compress](./compress_bench.svg)
+
+![Decompress](./decompress_bench.svg)
+
+### Results v0.11.0 02-06-2023 (safe-decode and safe-encode on)
+`cargo bench`
+
+![Compress](./compress_bench_safe.svg)
+
+![Decompress](./decompress_bench_safe.svg)
+
+## Miri
+
+[Miri](https://github.com/rust-lang/miri) can be used to find issues related to incorrect unsafe usage:
+
+`MIRIFLAGS="-Zmiri-disable-isolation -Zmiri-disable-stacked-borrows" cargo +nightly miri test --no-default-features --features frame`
+
+## Fuzzer
+This fuzz target generates corrupted data for the decompressor:
+`cargo +nightly fuzz run fuzz_decomp_corrupt_block` and `cargo +nightly fuzz run fuzz_decomp_corrupt_frame`
+
+This fuzz target asserts that a compression and decompression roundtrip returns the original input:
+`cargo +nightly fuzz run fuzz_roundtrip` and `cargo +nightly fuzz run fuzz_roundtrip_frame`
+
+This fuzz target asserts that compression with the C++ implementation and decompression with lz4_flex returns the original input:
+`cargo +nightly fuzz run fuzz_roundtrip_cpp_compress`
+
+## Bindings in other languages
+ - Node.js: [lz4-napi](https://github.com/antoniomuso/lz4-napi)
+ - Wasm: [lz4-wasm](https://github.com/PSeitz/lz4-wasm)
+
+## TODO
+- High compression
+
+## Migrate from v0.10 to v0.11.1
+To migrate, just remove the `checked-decode` feature flag if you used it.
+
+
diff --git a/src/block/compress.rs b/src/block/compress.rs
new file mode 100644
index 0000000..c18474a
--- /dev/null
+++ b/src/block/compress.rs
@@ -0,0 +1,969 @@
+//! The compression algorithm.
+//!
+//! We make use of hash tables to find duplicates. This gives a reasonable compression ratio with
+//! high performance. It has fixed memory usage, which, contrary to other approaches, makes it
+//! less memory hungry.
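+//!
+//! For intuition only, the core lookup can be sketched like this (a simplified,
+//! hypothetical helper; the real tables, hash, and bounds handling below differ):
+//!
+//! ```ignore
+//! fn find_candidate(input: &[u8], cur: usize, table: &mut [usize; 4096]) -> Option<usize> {
+//!     // Hash the 4-byte window at `cur` (LZ4-style multiplicative hash, top 12 bits).
+//!     let window: [u8; 4] = input.get(cur..cur + 4)?.try_into().ok()?;
+//!     let slot = (u32::from_ne_bytes(window).wrapping_mul(2654435761) >> 20) as usize;
+//!     let candidate = table[slot];
+//!     table[slot] = cur;
+//!     // A hit may be a hash collision; verify the bytes actually match.
+//!     (candidate < cur && input[candidate..candidate + 4] == window).then_some(candidate)
+//! }
+//! ```
+//!
+//! The real compressor additionally rejects candidates farther away than the 16-bit
+//! offset limit (`MAX_DISTANCE`).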
+
+use crate::block::hashtable::HashTable;
+use crate::block::END_OFFSET;
+use crate::block::LZ4_MIN_LENGTH;
+use crate::block::MAX_DISTANCE;
+use crate::block::MFLIMIT;
+use crate::block::MINMATCH;
+#[cfg(not(feature = "safe-encode"))]
+use crate::sink::PtrSink;
+use crate::sink::Sink;
+use crate::sink::SliceSink;
+#[allow(unused_imports)]
+use alloc::vec;
+use alloc::vec::Vec;
+
+#[cfg(feature = "safe-encode")]
+use core::convert::TryInto;
+
+use super::hashtable::HashTable4K;
+use super::hashtable::HashTable4KU16;
+use super::{CompressError, WINDOW_SIZE};
+
+/// Increase the step size after 1 << INCREASE_STEPSIZE_BITSHIFT non-matches.
+const INCREASE_STEPSIZE_BITSHIFT: usize = 5;
+
+/// Read a 4-byte "batch" from some position.
+///
+/// This will read a native-endian 4-byte integer from some position.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+pub(super) fn get_batch(input: &[u8], n: usize) -> u32 {
+    unsafe { read_u32_ptr(input.as_ptr().add(n)) }
+}
+
+#[inline]
+#[cfg(feature = "safe-encode")]
+pub(super) fn get_batch(input: &[u8], n: usize) -> u32 {
+    u32::from_ne_bytes(input[n..n + 4].try_into().unwrap())
+}
+
+/// Read a usize-sized "batch" from some position.
+///
+/// This will read a native-endian usize from some position.
+#[inline]
+#[allow(dead_code)]
+#[cfg(not(feature = "safe-encode"))]
+pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize {
+    unsafe { read_usize_ptr(input.as_ptr().add(n)) }
+}
+
+#[inline]
+#[allow(dead_code)]
+#[cfg(feature = "safe-encode")]
+pub(super) fn get_batch_arch(input: &[u8], n: usize) -> usize {
+    const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+    let arr: &[u8; USIZE_SIZE] = input[n..n + USIZE_SIZE].try_into().unwrap();
+    usize::from_ne_bytes(*arr)
+}
+
+#[inline]
+fn token_from_literal(lit_len: usize) -> u8 {
+    if lit_len < 0xF {
+        // Since we can fit the literals length into it, there is no need for saturation.
+        (lit_len as u8) << 4
+    } else {
+        // We were unable to fit the literals into it, so we saturate to 0xF. We will later
+        // write the extensional value.
+        0xF0
+    }
+}
+
+#[inline]
+fn token_from_literal_and_match_length(lit_len: usize, duplicate_length: usize) -> u8 {
+    let mut token = if lit_len < 0xF {
+        // Since we can fit the literals length into it, there is no need for saturation.
+        (lit_len as u8) << 4
+    } else {
+        // We were unable to fit the literals into it, so we saturate to 0xF. We will later
+        // write the extensional value.
+        0xF0
+    };
+
+    token |= if duplicate_length < 0xF {
+        // We could fit it in.
+        duplicate_length as u8
+    } else {
+        // We were unable to fit it in, so we default to 0xF, which will later be extended.
+        0xF
+    };
+
+    token
+}
+
+/// Counts the number of same bytes in two byte streams.
+/// `input` is the complete input.
+/// `cur` is the current position in the input; it will be incremented by the number of matched
+/// bytes.
+/// `source` is either the same as `input` or an external slice.
+/// `candidate` is the candidate position in `source`.
+///
+/// The function ignores the last END_OFFSET bytes in input as those should be literals.
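+///
+/// For intuition: both implementations below compare whole native-endian words and use
+/// `trailing_zeros` on the XOR of two words to count matching bytes. For example, for
+/// the 8-byte blocks `*b"abcdefgh"` and `*b"abcdXYZ!"`, the XOR of the two
+/// `u64::from_le_bytes` values has its lowest non-zero byte at index 4, so
+/// `trailing_zeros() / 8 == 4`, exactly the 4 matching leading bytes (`diff.to_le()`
+/// makes this hold on big-endian targets too).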
+#[inline]
+#[cfg(feature = "safe-encode")]
+fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize {
+    const USIZE_SIZE: usize = core::mem::size_of::<usize>();
+    let cur_slice = &input[*cur..input.len() - END_OFFSET];
+    let cand_slice = &source[candidate..];
+
+    let mut num = 0;
+    for (block1, block2) in cur_slice
+        .chunks_exact(USIZE_SIZE)
+        .zip(cand_slice.chunks_exact(USIZE_SIZE))
+    {
+        let input_block = usize::from_ne_bytes(block1.try_into().unwrap());
+        let match_block = usize::from_ne_bytes(block2.try_into().unwrap());
+
+        if input_block == match_block {
+            num += USIZE_SIZE;
+        } else {
+            let diff = input_block ^ match_block;
+            num += (diff.to_le().trailing_zeros() / 8) as usize;
+            *cur += num;
+            return num;
+        }
+    }
+
+    // If we're here we may have 1 to 7 bytes left to check close to the end of the input
+    // or source slices. Since this is a rare occurrence, we mark it cold to get
+    // ~5% better performance.
+    #[cold]
+    fn count_same_bytes_tail(a: &[u8], b: &[u8], offset: usize) -> usize {
+        a.iter()
+            .zip(b)
+            .skip(offset)
+            .take_while(|(a, b)| a == b)
+            .count()
+    }
+    num += count_same_bytes_tail(cur_slice, cand_slice, num);
+
+    *cur += num;
+    num
+}
+
+/// Counts the number of same bytes in two byte streams.
+/// `input` is the complete input.
+/// `cur` is the current position in the input; it will be incremented by the number of matched
+/// bytes.
+/// `source` is either the same as `input` OR an external slice.
+/// `candidate` is the candidate position in `source`.
+///
+/// The function ignores the last END_OFFSET bytes in input as those should be literals.
+#[inline]
+#[cfg(not(feature = "safe-encode"))]
+fn count_same_bytes(input: &[u8], cur: &mut usize, source: &[u8], candidate: usize) -> usize {
+    let max_input_match = input.len().saturating_sub(*cur + END_OFFSET);
+    let max_candidate_match = source.len() - candidate;
+    // Considering both limits, calculate how far we may match in input.
+    let input_end = *cur + max_input_match.min(max_candidate_match);
+
+    let start = *cur;
+    let mut source_ptr = unsafe { source.as_ptr().add(candidate) };
+
+    // compare 4/8 byte blocks depending on the arch
+    const STEP_SIZE: usize = core::mem::size_of::<usize>();
+    while *cur + STEP_SIZE <= input_end {
+        let diff = read_usize_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_usize_ptr(source_ptr);
+
+        if diff == 0 {
+            *cur += STEP_SIZE;
+            unsafe {
+                source_ptr = source_ptr.add(STEP_SIZE);
+            }
+        } else {
+            *cur += (diff.to_le().trailing_zeros() / 8) as usize;
+            return *cur - start;
+        }
+    }
+
+    // compare 4-byte block
+    #[cfg(target_pointer_width = "64")]
+    {
+        if input_end - *cur >= 4 {
+            let diff = read_u32_ptr(unsafe { input.as_ptr().add(*cur) }) ^ read_u32_ptr(source_ptr);
+
+            if diff == 0 {
+                *cur += 4;
+                unsafe {
+                    source_ptr = source_ptr.add(4);
+                }
+            } else {
+                *cur += (diff.to_le().trailing_zeros() / 8) as usize;
+                return *cur - start;
+            }
+        }
+    }
+
+    // compare 2-byte block
+    if input_end - *cur >= 2
+        && unsafe { read_u16_ptr(input.as_ptr().add(*cur)) == read_u16_ptr(source_ptr) }
+    {
+        *cur += 2;
+        unsafe {
+            source_ptr = source_ptr.add(2);
+        }
+    }
+
+    if *cur < input_end
+        && unsafe { input.as_ptr().add(*cur).read() } == unsafe { source_ptr.read() }
+    {
+        *cur += 1;
+    }
+
+    *cur - start
+}
+
+/// Write an integer to the output.
+///
+/// Each additional byte then represents a value from 0 to 255, which is added to the previous
+/// value to produce a total length. When the byte value is 255, another byte must be read and
+/// added, and so on.
There can be any number of bytes of value "255" following token +#[inline] +#[cfg(feature = "safe-encode")] +fn write_integer(output: &mut impl Sink, mut n: usize) { + // Note: Since `n` is usually < 0xFF and writing multiple bytes to the output + // requires 2 branches of bound check (due to the possibility of add overflows) + // the simple byte at a time implementation below is faster in most cases. + while n >= 0xFF { + n -= 0xFF; + push_byte(output, 0xFF); + } + push_byte(output, n as u8); +} + +/// Write an integer to the output. +/// +/// Each additional byte then represent a value from 0 to 255, which is added to the previous value +/// to produce a total length. When the byte value is 255, another byte must read and added, and so +/// on. There can be any number of bytes of value "255" following token +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn write_integer(output: &mut impl Sink, mut n: usize) { + // Write the 0xFF bytes as long as the integer is higher than said value. + if n >= 4 * 0xFF { + // In this unlikelly branch we use a fill instead of a loop, + // otherwise rustc may output a large unrolled/vectorized loop. + let bulk = n / (4 * 0xFF); + n %= 4 * 0xFF; + unsafe { + core::ptr::write_bytes(output.pos_mut_ptr(), 0xFF, 4 * bulk); + output.set_pos(output.pos() + 4 * bulk); + } + } + + // Handle last 1 to 4 bytes + push_u32(output, 0xFFFFFFFF); + // Updating output len for the remainder + unsafe { + output.set_pos(output.pos() - 4 + 1 + n / 255); + // Write the remaining byte. + *output.pos_mut_ptr().sub(1) = (n % 255) as u8; + } +} + +/// Handle the last bytes from the input as literals +#[cold] +fn handle_last_literals(output: &mut impl Sink, input: &[u8], start: usize) { + let lit_len = input.len() - start; + + let token = token_from_literal(lit_len); + push_byte(output, token); + if lit_len >= 0xF { + write_integer(output, lit_len - 0xF); + } + // Now, write the actual literals. + output.extend_from_slice(&input[start..]); +} + +/// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate +#[inline] +#[cfg(feature = "safe-encode")] +fn backtrack_match( + input: &[u8], + cur: &mut usize, + literal_start: usize, + source: &[u8], + candidate: &mut usize, +) { + // Note: Even if iterator version of this loop has less branches inside the loop it has more + // branches before the loop. That in practice seems to make it slower than the while version + // bellow. TODO: It should be possible remove all bounds checks, since we are walking + // backwards + while *candidate > 0 && *cur > literal_start && input[*cur - 1] == source[*candidate - 1] { + *cur -= 1; + *candidate -= 1; + } +} + +/// Moves the cursors back as long as the bytes match, to find additional bytes in a duplicate +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn backtrack_match( + input: &[u8], + cur: &mut usize, + literal_start: usize, + source: &[u8], + candidate: &mut usize, +) { + while unsafe { + *candidate > 0 + && *cur > literal_start + && input.get_unchecked(*cur - 1) == source.get_unchecked(*candidate - 1) + } { + *cur -= 1; + *candidate -= 1; + } +} + +/// Compress all bytes of `input[input_pos..]` into `output`. +/// +/// Bytes in `input[..input_pos]` are treated as a preamble and can be used for lookback. +/// This part is known as the compressor "prefix". +/// Bytes in `ext_dict` logically precede the bytes in `input` and can also be used for lookback. +/// +/// `input_stream_offset` is the logical position of the first byte of `input`. 
+/// This allows the same `dict` to be used for many calls to `compress_internal` as we can
+/// "readdress" the first byte of `input` to be something other than 0.
+///
+/// `dict` is the dictionary of previously encoded sequences.
+///
+/// This is used to find duplicates in the stream so they are not written multiple times.
+///
+/// Every four bytes are hashed, and in the resulting slot their position in the input buffer
+/// is placed in the dict. This way we can easily look up candidates for back references.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+///
+/// # Const parameters
+/// `USE_DICT`: Disables usage of ext_dict (it'll panic if a non-empty slice is used).
+/// In other words, this generates more optimized code when an external dictionary isn't used.
+///
+/// A similar const argument could be used to disable the Prefix mode (e.g. USE_PREFIX),
+/// which would impose `input_pos == 0 && input_stream_offset == 0`. Experiments didn't
+/// show significant improvement though.
+// Intentionally avoid inlining.
+// Empirical tests revealed it to be rarely better but often significantly detrimental.
+#[inline(never)]
+pub(crate) fn compress_internal<T: HashTable, const USE_DICT: bool, S: Sink>(
+    input: &[u8],
+    input_pos: usize,
+    output: &mut S,
+    dict: &mut T,
+    ext_dict: &[u8],
+    input_stream_offset: usize,
+) -> Result<usize, CompressError> {
+    assert!(input_pos <= input.len());
+    if USE_DICT {
+        assert!(ext_dict.len() <= super::WINDOW_SIZE);
+        assert!(ext_dict.len() <= input_stream_offset);
+        // Check for overflow hazard when using ext_dict
+        assert!(input_stream_offset
+            .checked_add(input.len())
+            .and_then(|i| i.checked_add(ext_dict.len()))
+            .map_or(false, |i| i <= isize::MAX as usize));
+    } else {
+        assert!(ext_dict.is_empty());
+    }
+    if output.capacity() - output.pos() < get_maximum_output_size(input.len() - input_pos) {
+        return Err(CompressError::OutputTooSmall);
+    }
+
+    let output_start_pos = output.pos();
+    if input.len() - input_pos < LZ4_MIN_LENGTH {
+        handle_last_literals(output, input, input_pos);
+        return Ok(output.pos() - output_start_pos);
+    }
+
+    let ext_dict_stream_offset = input_stream_offset - ext_dict.len();
+    let end_pos_check = input.len() - MFLIMIT;
+    let mut literal_start = input_pos;
+    let mut cur = input_pos;
+
+    if cur == 0 && input_stream_offset == 0 {
+        // According to the spec we can't start with a match,
+        // except when referencing another block.
+        let hash = T::get_hash_at(input, 0);
+        dict.put_at(hash, 0);
+        cur = 1;
+    }
+
+    loop {
+        // Read the next block into two sections, the literals and the duplicates.
+        let mut step_size;
+        let mut candidate;
+        let mut candidate_source;
+        let mut offset;
+        let mut non_match_count = 1 << INCREASE_STEPSIZE_BITSHIFT;
+        // The number of bytes before our cursor, where the duplicate starts.
+        let mut next_cur = cur;
+
+        // In this loop we search for duplicates via the hashtable. 4 or 8 bytes are hashed and
+        // compared.
+        loop {
+            step_size = non_match_count >> INCREASE_STEPSIZE_BITSHIFT;
+            non_match_count += 1;
+
+            cur = next_cur;
+            next_cur += step_size;
+
+            // Same as cur + MFLIMIT > input.len()
+            if cur > end_pos_check {
+                handle_last_literals(output, input, literal_start);
+                return Ok(output.pos() - output_start_pos);
+            }
+            // Find a candidate in the dictionary with the hash of the current four bytes.
+            // Unchecked is safe as long as the values from the hash function don't exceed the size
+            // of the table.
This is ensured by right shifting the hash values + // (`dict_bitshift`) to fit them in the table + + // [Bounds Check]: Can be elided due to `end_pos_check` above + let hash = T::get_hash_at(input, cur); + candidate = dict.get_at(hash); + dict.put_at(hash, cur + input_stream_offset); + + // Sanity check: Matches can't be ahead of `cur`. + debug_assert!(candidate <= input_stream_offset + cur); + + // Two requirements to the candidate exists: + // - We should not return a position which is merely a hash collision, so that the + // candidate actually matches what we search for. + // - We can address up to 16-bit offset, hence we are only able to address the candidate + // if its offset is less than or equals to 0xFFFF. + if input_stream_offset + cur - candidate > MAX_DISTANCE { + continue; + } + + if candidate >= input_stream_offset { + // match within input + offset = (input_stream_offset + cur - candidate) as u16; + candidate -= input_stream_offset; + candidate_source = input; + } else if USE_DICT { + // Sanity check, which may fail if we lost history beyond MAX_DISTANCE + debug_assert!( + candidate >= ext_dict_stream_offset, + "Lost history in ext dict mode" + ); + // match within ext dict + offset = (input_stream_offset + cur - candidate) as u16; + candidate -= ext_dict_stream_offset; + candidate_source = ext_dict; + } else { + // Match is not reachable anymore + // eg. compressing an independent block frame w/o clearing + // the matches tables, only increasing input_stream_offset. + // Sanity check + debug_assert!(input_pos == 0, "Lost history in prefix mode"); + continue; + } + // [Bounds Check]: Candidate is coming from the Hashmap. It can't be out of bounds, but + // impossible to prove for the compiler and remove the bounds checks. + let cand_bytes: u32 = get_batch(candidate_source, candidate); + // [Bounds Check]: Should be able to be elided due to `end_pos_check`. + let curr_bytes: u32 = get_batch(input, cur); + + if cand_bytes == curr_bytes { + break; + } + } + + // Extend the match backwards if we can + backtrack_match( + input, + &mut cur, + literal_start, + candidate_source, + &mut candidate, + ); + + // The length (in bytes) of the literals section. + let lit_len = cur - literal_start; + + // Generate the higher half of the token. + cur += MINMATCH; + candidate += MINMATCH; + let duplicate_length = count_same_bytes(input, &mut cur, candidate_source, candidate); + + // Note: The `- 2` offset was copied from the reference implementation, it could be + // arbitrary. + let hash = T::get_hash_at(input, cur - 2); + dict.put_at(hash, cur - 2 + input_stream_offset); + + let token = token_from_literal_and_match_length(lit_len, duplicate_length); + + // Push the token to the output stream. + push_byte(output, token); + // If we were unable to fit the literals length into the token, write the extensional + // part. + if lit_len >= 0xF { + write_integer(output, lit_len - 0xF); + } + + // Now, write the actual literals. + // + // The unsafe version copies blocks of 8bytes, and therefore may copy up to 7bytes more than + // needed. This is safe, because the last 12 bytes (MF_LIMIT) are handled in + // handle_last_literals. + copy_literals_wild(output, input, literal_start, lit_len); + // write the offset in little endian. + push_u16(output, offset); + + // If we were unable to fit the duplicates length into the token, write the + // extensional part. 
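+        // (e.g. duplicate_length == 30 is emitted as a 0xF nibble in the token plus
+        // a single extension byte of 30 - 0xF = 15)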
+ if duplicate_length >= 0xF { + write_integer(output, duplicate_length - 0xF); + } + literal_start = cur; + } +} + +#[inline] +#[cfg(feature = "safe-encode")] +fn push_byte(output: &mut impl Sink, el: u8) { + output.push(el); +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn push_byte(output: &mut impl Sink, el: u8) { + unsafe { + core::ptr::write(output.pos_mut_ptr(), el); + output.set_pos(output.pos() + 1); + } +} + +#[inline] +#[cfg(feature = "safe-encode")] +fn push_u16(output: &mut impl Sink, el: u16) { + output.extend_from_slice(&el.to_le_bytes()); +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn push_u16(output: &mut impl Sink, el: u16) { + unsafe { + core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 2); + output.set_pos(output.pos() + 2); + } +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn push_u32(output: &mut impl Sink, el: u32) { + unsafe { + core::ptr::copy_nonoverlapping(el.to_le_bytes().as_ptr(), output.pos_mut_ptr(), 4); + output.set_pos(output.pos() + 4); + } +} + +#[inline(always)] // (always) necessary otherwise compiler fails to inline it +#[cfg(feature = "safe-encode")] +fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) { + output.extend_from_slice_wild(&input[input_start..input_start + len], len) +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn copy_literals_wild(output: &mut impl Sink, input: &[u8], input_start: usize, len: usize) { + debug_assert!(input_start + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= input.len()); + debug_assert!(output.pos() + len / 8 * 8 + ((len % 8) != 0) as usize * 8 <= output.capacity()); + unsafe { + // Note: This used to be a wild copy loop of 8 bytes, but the compiler consistently + // transformed it into a call to memcopy, which hurts performance significantly for + // small copies, which are common. + let start_ptr = input.as_ptr().add(input_start); + match len { + 0..=8 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 8), + 9..=16 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 16), + 17..=24 => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), 24), + _ => core::ptr::copy_nonoverlapping(start_ptr, output.pos_mut_ptr(), len), + } + output.set_pos(output.pos() + len); + } +} + +/// Compress all bytes of `input` into `output`. +/// The method chooses an appropriate hashtable to lookup duplicates. +/// output should be preallocated with a size of +/// `get_maximum_output_size`. +/// +/// Returns the number of bytes written (compressed) into `output`. 
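+// (for example, a 1000-byte input needs at most 16 + 4 + (1000 as f64 * 1.1) as usize
+// = 1120 bytes of preallocated output; see `get_maximum_output_size` below)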
+
+#[inline]
+pub(crate) fn compress_into_sink_with_dict<const USE_DICT: bool>(
+    input: &[u8],
+    output: &mut impl Sink,
+    mut dict_data: &[u8],
+) -> Result<usize, CompressError> {
+    if dict_data.len() + input.len() < u16::MAX as usize {
+        let mut dict = HashTable4KU16::new();
+        init_dict(&mut dict, &mut dict_data);
+        compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len())
+    } else {
+        let mut dict = HashTable4K::new();
+        init_dict(&mut dict, &mut dict_data);
+        compress_internal::<_, USE_DICT, _>(input, 0, output, &mut dict, dict_data, dict_data.len())
+    }
+}
+
+#[inline]
+fn init_dict<T: HashTable>(dict: &mut T, dict_data: &mut &[u8]) {
+    if dict_data.len() > WINDOW_SIZE {
+        *dict_data = &dict_data[dict_data.len() - WINDOW_SIZE..];
+    }
+    let mut i = 0usize;
+    while i + core::mem::size_of::<u32>() <= dict_data.len() {
+        let hash = T::get_hash_at(dict_data, i);
+        dict.put_at(hash, i);
+        // Note: The 3 byte step was copied from the reference implementation, it could be
+        // arbitrary.
+        i += 3;
+    }
+}
+
+/// Returns the maximum output size of the compressed data.
+/// Can be used to preallocate capacity on the output vector.
+#[inline]
+pub fn get_maximum_output_size(input_len: usize) -> usize {
+    16 + 4 + (input_len as f64 * 1.1) as usize
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to lookup duplicates.
+/// output should be preallocated with a size of
+/// `get_maximum_output_size`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub fn compress_into(input: &[u8], output: &mut [u8]) -> Result<usize, CompressError> {
+    compress_into_sink_with_dict::<false>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Compress all bytes of `input` into `output`.
+/// The method chooses an appropriate hashtable to lookup duplicates.
+/// output should be preallocated with a size of
+/// `get_maximum_output_size`.
+///
+/// Returns the number of bytes written (compressed) into `output`.
+#[inline]
+pub fn compress_into_with_dict(
+    input: &[u8],
+    output: &mut [u8],
+    dict_data: &[u8],
+) -> Result<usize, CompressError> {
+    compress_into_sink_with_dict::<true>(input, &mut SliceSink::new(output, 0), dict_data)
+}
+
+#[inline]
+fn compress_into_vec_with_dict<const USE_DICT: bool>(
+    input: &[u8],
+    prepend_size: bool,
+    mut dict_data: &[u8],
+) -> Vec<u8> {
+    let prepend_size_num_bytes = if prepend_size { 4 } else { 0 };
+    let max_compressed_size = get_maximum_output_size(input.len()) + prepend_size_num_bytes;
+    if dict_data.len() <= 3 {
+        dict_data = b"";
+    }
+    #[cfg(feature = "safe-encode")]
+    let mut compressed = {
+        let mut compressed: Vec<u8> = vec![0u8; max_compressed_size];
+        let out = if prepend_size {
+            compressed[..4].copy_from_slice(&(input.len() as u32).to_le_bytes());
+            &mut compressed[4..]
+        } else {
+            &mut compressed
+        };
+        let compressed_len =
+            compress_into_sink_with_dict::<USE_DICT>(input, &mut SliceSink::new(out, 0), dict_data)
+                .unwrap();
+
+        compressed.truncate(prepend_size_num_bytes + compressed_len);
+        compressed
+    };
+    #[cfg(not(feature = "safe-encode"))]
+    let mut compressed = {
+        let mut vec = Vec::with_capacity(max_compressed_size);
+        let start_pos = if prepend_size {
+            vec.extend_from_slice(&(input.len() as u32).to_le_bytes());
+            4
+        } else {
+            0
+        };
+        let compressed_len = compress_into_sink_with_dict::<USE_DICT>(
+            input,
+            &mut PtrSink::from_vec(&mut vec, start_pos),
+            dict_data,
+        )
+        .unwrap();
+        unsafe {
+            vec.set_len(prepend_size_num_bytes + compressed_len);
+        }
+        vec
+    };
+
+    compressed.shrink_to_fit();
+    compressed
+}
+
+/// Compress all bytes of `input` into `output`.
The uncompressed size will be prepended as a little +/// endian u32. Can be used in conjunction with `decompress_size_prepended` +#[inline] +pub fn compress_prepend_size(input: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, true, b"") +} + +/// Compress all bytes of `input`. +#[inline] +pub fn compress(input: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, false, b"") +} + +/// Compress all bytes of `input` with an external dictionary. +#[inline] +pub fn compress_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, false, ext_dict) +} + +/// Compress all bytes of `input` into `output`. The uncompressed size will be prepended as a little +/// endian u32. Can be used in conjunction with `decompress_size_prepended_with_dict` +#[inline] +pub fn compress_prepend_size_with_dict(input: &[u8], ext_dict: &[u8]) -> Vec { + compress_into_vec_with_dict::(input, true, ext_dict) +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn read_u16_ptr(input: *const u8) -> u16 { + let mut num: u16 = 0; + unsafe { + core::ptr::copy_nonoverlapping(input, &mut num as *mut u16 as *mut u8, 2); + } + num +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn read_u32_ptr(input: *const u8) -> u32 { + let mut num: u32 = 0; + unsafe { + core::ptr::copy_nonoverlapping(input, &mut num as *mut u32 as *mut u8, 4); + } + num +} + +#[inline] +#[cfg(not(feature = "safe-encode"))] +fn read_usize_ptr(input: *const u8) -> usize { + let mut num: usize = 0; + unsafe { + core::ptr::copy_nonoverlapping( + input, + &mut num as *mut usize as *mut u8, + core::mem::size_of::(), + ); + } + num +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_count_same_bytes() { + // 8byte aligned block, zeros and ones are added because the end/offset + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 16); + + // 4byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 20); + + // 2byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22); + + // 1byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 23); + + // 1byte aligned block - last byte different + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 22); + + 
// 1byte aligned block + let first: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 9, 5, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ]; + let second: &[u8] = &[ + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 3, 4, 6, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + ]; + assert_eq!(count_same_bytes(first, &mut 0, second, 0), 21); + + for diff_idx in 8..100 { + let first: Vec = (0u8..255).cycle().take(100 + 12).collect(); + let mut second = first.clone(); + second[diff_idx] = 255; + for start in 0..=diff_idx { + let same_bytes = count_same_bytes(&first, &mut start.clone(), &second, start); + assert_eq!(same_bytes, diff_idx - start); + } + } + } + + #[test] + fn test_bug() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let _out = compress(input); + } + + #[test] + fn test_dict() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let dict = input; + let compressed = compress_with_dict(input, dict); + assert_lt!(compressed.len(), compress(input).len()); + + assert!(compressed.len() < compress(input).len()); + let mut uncompressed = vec![0u8; input.len()]; + let uncomp_size = crate::block::decompress::decompress_into_with_dict( + &compressed, + &mut uncompressed, + dict, + ) + .unwrap(); + uncompressed.truncate(uncomp_size); + assert_eq!(input, uncompressed); + } + + #[test] + fn test_dict_no_panic() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let dict = &[10, 12, 14]; + let _compressed = compress_with_dict(input, dict); + } + + #[test] + fn test_dict_match_crossing() { + let input: &[u8] = &[ + 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, + ]; + let dict = input; + let compressed = compress_with_dict(input, dict); + assert_lt!(compressed.len(), compress(input).len()); + + let mut uncompressed = vec![0u8; input.len() * 2]; + // copy first half of the input into output + let dict_cutoff = dict.len() / 2; + let output_start = dict.len() - dict_cutoff; + uncompressed[..output_start].copy_from_slice(&dict[dict_cutoff..]); + let uncomp_len = { + let mut sink = SliceSink::new(&mut uncompressed[..], output_start); + crate::block::decompress::decompress_internal::( + &compressed, + &mut sink, + &dict[..dict_cutoff], + ) + .unwrap() + }; + assert_eq!(input.len(), uncomp_len); + assert_eq!( + input, + &uncompressed[output_start..output_start + uncomp_len] + ); + } + + #[test] + fn test_conformant_last_block() { + // From the spec: + // The last match must start at least 12 bytes before the end of block. + // The last match is part of the penultimate sequence. It is followed by the last sequence, + // which contains only literals. Note that, as a consequence, an independent block < + // 13 bytes cannot be compressed, because the match must copy "something", + // so it needs at least one prior byte. + // When a block can reference data from another block, it can start immediately with a match + // and no literal, so a block of 12 bytes can be compressed. 
+ let aaas: &[u8] = b"aaaaaaaaaaaaaaa"; + + // uncompressible + let out = compress(&aaas[..12]); + assert_gt!(out.len(), 12); + // compressible + let out = compress(&aaas[..13]); + assert_le!(out.len(), 13); + let out = compress(&aaas[..14]); + assert_le!(out.len(), 14); + let out = compress(&aaas[..15]); + assert_le!(out.len(), 15); + + // dict uncompressible + let out = compress_with_dict(&aaas[..11], aaas); + assert_gt!(out.len(), 11); + // compressible + let out = compress_with_dict(&aaas[..12], aaas); + // According to the spec this _could_ compres, but it doesn't in this lib + // as it aborts compression for any input len < LZ4_MIN_LENGTH + assert_gt!(out.len(), 12); + let out = compress_with_dict(&aaas[..13], aaas); + assert_le!(out.len(), 13); + let out = compress_with_dict(&aaas[..14], aaas); + assert_le!(out.len(), 14); + let out = compress_with_dict(&aaas[..15], aaas); + assert_le!(out.len(), 15); + } + + #[test] + fn test_dict_size() { + let dict = vec![b'a'; 1024 * 1024]; + let input = &b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"[..]; + let compressed = compress_prepend_size_with_dict(input, &dict); + let decompressed = + crate::block::decompress_size_prepended_with_dict(&compressed, &dict).unwrap(); + assert_eq!(decompressed, input); + } +} diff --git a/src/block/decompress.rs b/src/block/decompress.rs new file mode 100644 index 0000000..62065a9 --- /dev/null +++ b/src/block/decompress.rs @@ -0,0 +1,544 @@ +//! The block decompression algorithm. +use crate::block::{DecompressError, MINMATCH}; +use crate::fastcpy_unsafe; +use crate::sink::SliceSink; +use crate::sink::{PtrSink, Sink}; +use alloc::vec::Vec; + +/// Copies data to output_ptr by self-referential copy from start and match_length +#[inline] +unsafe fn duplicate( + output_ptr: &mut *mut u8, + output_end: *mut u8, + start: *const u8, + match_length: usize, +) { + // We cannot simply use memcpy or `extend_from_slice`, because these do not allow + // self-referential copies: http://ticki.github.io/img/lz4_runs_encoding_diagram.svg + + // Considering that `wild_copy_match_16` can copy up to `16 - 1` extra bytes. + // Defer to `duplicate_overlapping` in case of an overlapping match + // OR the if the wild copy would copy beyond the end of the output. + if (output_ptr.offset_from(start) as usize) < match_length + 16 - 1 + || (output_end.offset_from(*output_ptr) as usize) < match_length + 16 - 1 + { + duplicate_overlapping(output_ptr, start, match_length); + } else { + debug_assert!( + output_ptr.add(match_length / 16 * 16 + ((match_length % 16) != 0) as usize * 16) + <= output_end + ); + wild_copy_from_src_16(start, *output_ptr, match_length); + *output_ptr = output_ptr.add(match_length); + } +} + +#[inline] +fn wild_copy_from_src_16(mut source: *const u8, mut dst_ptr: *mut u8, num_items: usize) { + // Note: if the compiler auto-vectorizes this it'll hurt performance! + // It's not the case for 16 bytes stepsize, but for 8 bytes. 
+ unsafe { + let dst_ptr_end = dst_ptr.add(num_items); + loop { + core::ptr::copy_nonoverlapping(source, dst_ptr, 16); + source = source.add(16); + dst_ptr = dst_ptr.add(16); + if dst_ptr >= dst_ptr_end { + break; + } + } + } +} + +/// Copy function, if the data start + match_length overlaps into output_ptr +#[inline] +#[cfg_attr(nightly, optimize(size))] // to avoid loop unrolling +unsafe fn duplicate_overlapping( + output_ptr: &mut *mut u8, + mut start: *const u8, + match_length: usize, +) { + // There is an edge case when output_ptr == start, which causes the decoder to potentially + // expose up to match_length bytes of uninitialized data in the decompression buffer. + // To prevent that we write a dummy zero to output, which will zero out output in such cases. + // This is the same strategy used by the reference C implementation https://github.com/lz4/lz4/pull/772 + output_ptr.write(0u8); + let dst_ptr_end = output_ptr.add(match_length); + + while output_ptr.add(1) < dst_ptr_end { + // Note that this loop unrolling is done, so that the compiler doesn't do it in a awful + // way. + // Without that the compiler will unroll/auto-vectorize the copy with a lot of branches. + // This is not what we want, as large overlapping copies are not that common. + core::ptr::copy(start, *output_ptr, 1); + start = start.add(1); + *output_ptr = output_ptr.add(1); + + core::ptr::copy(start, *output_ptr, 1); + start = start.add(1); + *output_ptr = output_ptr.add(1); + } + + if *output_ptr < dst_ptr_end { + core::ptr::copy(start, *output_ptr, 1); + *output_ptr = output_ptr.add(1); + } +} + +#[inline] +unsafe fn copy_from_dict( + output_base: *mut u8, + output_ptr: &mut *mut u8, + ext_dict: &[u8], + offset: usize, + match_length: usize, +) -> usize { + // If we're here we know offset > output pos, so we have at least 1 byte to copy from dict + debug_assert!(output_ptr.offset_from(output_base) >= 0); + debug_assert!(offset > output_ptr.offset_from(output_base) as usize); + // If unchecked-decode is not disabled we also know that the offset falls within ext_dict + debug_assert!(ext_dict.len() + output_ptr.offset_from(output_base) as usize >= offset); + + let dict_offset = ext_dict.len() + output_ptr.offset_from(output_base) as usize - offset; + // Can't copy past ext_dict len, the match may cross dict and output + let dict_match_length = match_length.min(ext_dict.len() - dict_offset); + // TODO test fastcpy_unsafe + core::ptr::copy_nonoverlapping( + ext_dict.as_ptr().add(dict_offset), + *output_ptr, + dict_match_length, + ); + *output_ptr = output_ptr.add(dict_match_length); + dict_match_length +} + +/// Read an integer. +/// +/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In +/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add +/// this byte to our sum and terminate the loop. +/// +/// # Example +/// +/// ```notest +/// 255, 255, 255, 4, 2, 3, 4, 6, 7 +/// ``` +/// +/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 is ignored, because +/// 4 is the first non-0xFF byte. +#[inline] +fn read_integer_ptr( + input_ptr: &mut *const u8, + _input_ptr_end: *const u8, +) -> Result { + // We start at zero and count upwards. + let mut n: u32 = 0; + // If this byte takes value 255 (the maximum value it can take), another byte is read + // and added to the sum. This repeats until a byte lower than 255 is read. + loop { + // We add the next byte until we get a byte which we add to the counting variable. 
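+ // Worked example: for the bytes [255, 255, 4] the loop accumulates
+ // 255 + 255 + 4 = 514 and stops at 4, the first byte below 0xFF.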
+
+
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if *input_ptr >= _input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ let extra = unsafe { input_ptr.read() };
+ *input_ptr = unsafe { input_ptr.add(1) };
+ n += extra as u32;
+
+ // We continue if we got 255, break otherwise.
+ if extra != 0xFF {
+ break;
+ }
+ }
+
+ // e.g. input 255, 255, 255, 8 yields n = 255 + 255 + 255 + 8 = 773.
+ Ok(n)
+}
+
+/// Read a little-endian 16-bit integer from the input stream.
+#[inline]
+fn read_u16_ptr(input_ptr: &mut *const u8) -> u16 {
+ let mut num: u16 = 0;
+ unsafe {
+ core::ptr::copy_nonoverlapping(*input_ptr, &mut num as *mut u16 as *mut u8, 2);
+ *input_ptr = input_ptr.add(2);
+ }
+
+ u16::from_le(num)
+}
+
+const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111;
+const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000;
+
+#[test]
+fn check_token() {
+ assert!(!does_token_fit(15));
+ assert!(does_token_fit(14));
+ assert!(does_token_fit(114));
+ assert!(!does_token_fit(0b11110000));
+ assert!(does_token_fit(0b10110000));
+}
+
+/// The token consists of two parts, the literal length (upper 4 bits) and the match length
+/// (lower 4 bits). If the literal length and match length are both below 15, we don't need to
+/// read additional data, so the token fits the metadata in a single u8.
+#[inline]
+fn does_token_fit(token: u8) -> bool {
+ !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL
+ || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH)
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ output: &mut S,
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ // Prevent segfault for empty input
+ if input.is_empty() {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+
+ let ext_dict = if USE_DICT {
+ ext_dict
+ } else {
+ // ensure the optimizer knows ext_dict length is 0 if !USE_DICT
+ debug_assert!(ext_dict.is_empty());
+ &[]
+ };
+ let output_base = unsafe { output.base_mut_ptr() };
+ let output_end = unsafe { output_base.add(output.capacity()) };
+ let output_start_pos_ptr = unsafe { output.base_mut_ptr().add(output.pos()) as *mut u8 };
+ let mut output_ptr = output_start_pos_ptr;
+
+ let mut input_ptr = input.as_ptr();
+ let input_ptr_end = unsafe { input.as_ptr().add(input.len()) };
+ let safe_distance_from_end = (16 /* literal copy */ + 2 /* u16 match offset */ + 1 /* the next token to read (we can skip the check) */).min(input.len());
+ let input_ptr_safe = unsafe { input_ptr_end.sub(safe_distance_from_end) };
+
+ let safe_output_ptr = unsafe {
+ let mut output_num_safe_bytes = output
+ .capacity()
+ .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
+ if USE_DICT {
+ // In the dictionary case the output pointer is moved by the match length in the
+ // dictionary. This may be up to 17 bytes without exiting the loop. So we need to
+ // ensure that we have at least 17 additional bytes of space left in the output
+ // buffer in the fast loop.
+ output_num_safe_bytes = output_num_safe_bytes.saturating_sub(17);
+ };
+
+ output_base.add(output_num_safe_bytes)
+ };
+
+ // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
+ // empty.
+ loop {
+ // Read the token. The token is the first byte in a block. It contains two 4-bit
+ // "fields", a higher and a lower, representing the literals' length and the back
+ // reference's length, respectively.
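+ // Example: token 0x34 encodes a literal length of 3 (high nibble) and a
+ // match length of 0x4 + MINMATCH = 8 (low nibble); 0xF in either nibble
+ // means extra length bytes follow.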
+ let token = unsafe { input_ptr.read() };
+ input_ptr = unsafe { input_ptr.add(1) };
+
+ // Check for the hot loop.
+ // In most cases the metadata fits in a single 1-byte token (statistically) and we are at
+ // a safe distance from the end. This enables some optimized handling.
+ //
+ // Ideally we want to check for a safe output pos like: output.pos() <= safe_output_pos;
+ // but that doesn't work when safe_output_ptr == output_ptr due to insufficient
+ // capacity. So we use `<` instead of `<=`, which covers that case.
+ if does_token_fit(token)
+ && (input_ptr as usize) <= input_ptr_safe as usize
+ && output_ptr < safe_output_ptr
+ {
+ let literal_length = (token >> 4) as usize;
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+
+ // output_ptr <= safe_output_ptr should guarantee we have enough space in output
+ debug_assert!(
+ unsafe { output_ptr.add(literal_length + match_length) } <= output_end,
+ "{literal_length} + {match_length} = {} won't fit",
+ literal_length + match_length
+ );
+
+ // Copy the literal
+ // The literal is at max 16 bytes, and the is_safe_distance check assures
+ // that we are far enough away from the end so we can safely copy 16 bytes
+ unsafe {
+ core::ptr::copy_nonoverlapping(input_ptr, output_ptr, 16);
+ input_ptr = input_ptr.add(literal_length);
+ output_ptr = output_ptr.add(literal_length);
+ }
+
+ // input_ptr <= input_ptr_safe should guarantee we have enough space in input
+ debug_assert!(input_ptr_end as usize - input_ptr as usize >= 2);
+ let offset = read_u16_ptr(&mut input_ptr) as usize;
+
+ let output_len = unsafe { output_ptr.offset_from(output_base) as usize };
+ let offset = offset.min(output_len + ext_dict.len());
+
+ // Check if part of the match is in the external dict
+ if USE_DICT && offset > output_len {
+ let copied = unsafe {
+ copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length)
+ };
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output
+ match_length -= copied;
+ }
+
+ // Calculate the start of this duplicate segment. At this point offset was already
+ // checked to be in bounds and the external dictionary copy, if any, was
+ // already copied and subtracted from match_length.
+ let start_ptr = unsafe { output_ptr.sub(offset) };
+ debug_assert!(start_ptr >= output_base);
+ debug_assert!(start_ptr < output_end);
+ debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
+
+ // In this branch we know that match_length is at most 18 (14 + MINMATCH).
+ // But the blocks can overlap, so make sure they are at least 18 bytes apart
+ // to enable an optimized copy of 18 bytes.
+ if offset >= match_length {
+ unsafe {
+ // _copy_, not copy_nonoverlapping, as the regions may overlap.
+ // Compiles to the same assembly on x86_64.
+ core::ptr::copy(start_ptr, output_ptr, 18);
+ output_ptr = output_ptr.add(match_length);
+ }
+ } else {
+ unsafe {
+ duplicate_overlapping(&mut output_ptr, start_ptr, match_length);
+ }
+ }
+
+ continue;
+ }
+
+ // Now, we read the literal section.
+ // If the initial value is 15, another byte will be read and added to it.
+ let mut literal_length = (token >> 4) as usize;
+ if literal_length != 0 {
+ if literal_length == 15 {
+ // The literal_length took the maximal value, indicating that there are more
+ // than 15 literal bytes. We read the extra integer.
+ literal_length += read_integer_ptr(&mut input_ptr, input_ptr_end)?
as usize; + } + + #[cfg(not(feature = "unchecked-decode"))] + { + // Check if literal is out of bounds for the input, and if there is enough space on + // the output + if literal_length > input_ptr_end as usize - input_ptr as usize { + return Err(DecompressError::LiteralOutOfBounds); + } + if literal_length > unsafe { output_end.offset_from(output_ptr) as usize } { + return Err(DecompressError::OutputTooSmall { + expected: unsafe { output_ptr.offset_from(output_base) as usize } + + literal_length, + actual: output.capacity(), + }); + } + } + unsafe { + fastcpy_unsafe::slice_copy(input_ptr, output_ptr, literal_length); + output_ptr = output_ptr.add(literal_length); + input_ptr = input_ptr.add(literal_length); + } + } + + // If the input stream is emptied, we break out of the loop. This is only the case + // in the end of the stream, since the block is intact otherwise. + if input_ptr >= input_ptr_end { + break; + } + + // Read duplicate section + #[cfg(not(feature = "unchecked-decode"))] + { + if (input_ptr_end as usize) - (input_ptr as usize) < 2 { + return Err(DecompressError::ExpectedAnotherByte); + } + } + let offset = read_u16_ptr(&mut input_ptr) as usize; + // Obtain the initial match length. The match length is the length of the duplicate segment + // which will later be copied from data previously decompressed into the output buffer. The + // initial length is derived from the second part of the token (the lower nibble), we read + // earlier. Since having a match length of less than 4 would mean negative compression + // ratio, we start at 4 (MINMATCH). + + // The initial match length can maximally be 19 (MINMATCH + 15). As with the literal length, + // this indicates that there are more bytes to read. + let mut match_length = MINMATCH + (token & 0xF) as usize; + if match_length == MINMATCH + 15 { + // The match length took the maximal value, indicating that there is more bytes. We + // read the extra integer. + match_length += read_integer_ptr(&mut input_ptr, input_ptr_end)? as usize; + } + + // We now copy from the already decompressed buffer. This allows us for storing duplicates + // by simply referencing the other location. + let output_len = unsafe { output_ptr.offset_from(output_base) as usize }; + + // We'll do a bounds check except unchecked-decode is enabled. + #[cfg(not(feature = "unchecked-decode"))] + { + if offset > output_len + ext_dict.len() { + return Err(DecompressError::OffsetOutOfBounds); + } + if match_length > unsafe { output_end.offset_from(output_ptr) as usize } { + return Err(DecompressError::OutputTooSmall { + expected: output_len + match_length, + actual: output.capacity(), + }); + } + } + + if USE_DICT && offset > output_len { + let copied = unsafe { + copy_from_dict(output_base, &mut output_ptr, ext_dict, offset, match_length) + }; + if copied == match_length { + #[cfg(not(feature = "unchecked-decode"))] + { + if input_ptr >= input_ptr_end { + return Err(DecompressError::ExpectedAnotherByte); + } + } + + continue; + } + // match crosses ext_dict and output + match_length -= copied; + } + + // Calculate the start of this duplicate segment. At this point offset was already checked + // to be in bounds and the external dictionary copy, if any, was already copied and + // subtracted from match_length. 
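+ // Overlap illustration: offset 1 with match_length 5 replicates the last
+ // output byte five times (run-length style); `duplicate` defers such
+ // overlapping copies to `duplicate_overlapping`.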
+ let start_ptr = unsafe { output_ptr.sub(offset) };
+ debug_assert!(start_ptr >= output_base);
+ debug_assert!(start_ptr < output_end);
+ debug_assert!(unsafe { output_end.offset_from(start_ptr) as usize } >= match_length);
+ unsafe {
+ duplicate(&mut output_ptr, output_end, start_ptr, match_length);
+ }
+ #[cfg(not(feature = "unchecked-decode"))]
+ {
+ if input_ptr >= input_ptr_end {
+ return Err(DecompressError::ExpectedAnotherByte);
+ }
+ }
+ }
+ unsafe {
+ output.set_pos(output_ptr.offset_from(output_base) as usize);
+ Ok(output_ptr.offset_from(output_start_pos_ptr) as usize)
+ }
+}
+
+/// Decompress all bytes of `input` into `output`.
+/// `output` should be preallocated with the size of the uncompressed data.
+#[inline]
+pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result<usize, DecompressError> {
+ decompress_internal::<false, _>(input, &mut SliceSink::new(output, 0), b"")
+}
+
+/// Decompress all bytes of `input` into `output`.
+///
+/// Returns the number of bytes written (decompressed) into `output`.
+#[inline]
+pub fn decompress_into_with_dict(
+ input: &[u8],
+ output: &mut [u8],
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ decompress_internal::<true, _>(input, &mut SliceSink::new(output, 0), ext_dict)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` must be equal to or larger than the uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress_with_dict(
+ input: &[u8],
+ min_uncompressed_size: usize,
+ ext_dict: &[u8],
+) -> Result<Vec<u8>, DecompressError> {
+ // Allocate a vector to contain the decompressed stream.
+ let mut vec = Vec::with_capacity(min_uncompressed_size);
+ let decomp_len =
+ decompress_internal::<true, _>(input, &mut PtrSink::from_vec(&mut vec, 0), ext_dict)?;
+ unsafe {
+ vec.set_len(decomp_len);
+ }
+ Ok(vec)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian. Can be used in conjunction with `compress_prepend_size`.
+#[inline]
+pub fn decompress_size_prepended(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
+ let (uncompressed_size, input) = super::uncompressed_size(input)?;
+ decompress(input, uncompressed_size)
+}
+
+/// Decompress all bytes of `input` into a new vec.
+/// The passed parameter `min_uncompressed_size` must be equal to or larger than the uncompressed size.
+///
+/// # Panics
+/// May panic if the parameter `min_uncompressed_size` is smaller than the
+/// uncompressed data.
+#[inline]
+pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result<Vec<u8>, DecompressError> {
+ // Allocate a vector to contain the decompressed stream.
+ let mut vec = Vec::with_capacity(min_uncompressed_size);
+ let decomp_len =
+ decompress_internal::<false, _>(input, &mut PtrSink::from_vec(&mut vec, 0), b"")?;
+ unsafe {
+ vec.set_len(decomp_len);
+ }
+ Ok(vec)
+}
+
+/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in
+/// little endian.
Can be used in conjunction with `compress_prepend_size_with_dict` +#[inline] +pub fn decompress_size_prepended_with_dict( + input: &[u8], + ext_dict: &[u8], +) -> Result, DecompressError> { + let (uncompressed_size, input) = super::uncompressed_size(input)?; + decompress_with_dict(input, uncompressed_size, ext_dict) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn all_literal() { + assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49"); + } + + // this error test is only valid with checked-decode. + #[cfg(not(feature = "unchecked-decode"))] + #[test] + fn offset_oob() { + decompress(&[0x10, b'a', 2, 0], 4).unwrap_err(); + decompress(&[0x40, b'a', 1, 0], 4).unwrap_err(); + } +} diff --git a/src/block/decompress_safe.rs b/src/block/decompress_safe.rs new file mode 100644 index 0000000..cebe3cb --- /dev/null +++ b/src/block/decompress_safe.rs @@ -0,0 +1,399 @@ +//! The block decompression algorithm. + +use core::convert::TryInto; + +use crate::block::DecompressError; +use crate::block::MINMATCH; +use crate::sink::Sink; +use crate::sink::SliceSink; +use alloc::vec; +use alloc::vec::Vec; + +/// Read an integer. +/// +/// In LZ4, we encode small integers in a way that we can have an arbitrary number of bytes. In +/// particular, we add the bytes repeatedly until we hit a non-0xFF byte. When we do, we add +/// this byte to our sum and terminate the loop. +/// +/// # Example +/// +/// ```notest +/// 255, 255, 255, 4, 2, 3, 4, 6, 7 +/// ``` +/// +/// is encoded to _255 + 255 + 255 + 4 = 769_. The bytes after the first 4 is ignored, because +/// 4 is the first non-0xFF byte. +#[inline] +fn read_integer(input: &[u8], input_pos: &mut usize) -> Result { + // We start at zero and count upwards. + let mut n: u32 = 0; + // If this byte takes value 255 (the maximum value it can take), another byte is read + // and added to the sum. This repeats until a byte lower than 255 is read. + loop { + // We add the next byte until we get a byte which we add to the counting variable. + let extra: u8 = *input + .get(*input_pos) + .ok_or(DecompressError::ExpectedAnotherByte)?; + *input_pos += 1; + n += extra as u32; + + // We continue if we got 255, break otherwise. + if extra != 0xFF { + break; + } + } + + // 255, 255, 255, 8 + // 111, 111, 111, 101 + + Ok(n) +} + +/// Read a little-endian 16-bit integer from the input stream. +#[inline] +fn read_u16(input: &[u8], input_pos: &mut usize) -> Result { + let dst = input + .get(*input_pos..*input_pos + 2) + .ok_or(DecompressError::ExpectedAnotherByte)?; + *input_pos += 2; + Ok(u16::from_le_bytes(dst.try_into().unwrap())) +} + +const FIT_TOKEN_MASK_LITERAL: u8 = 0b00001111; +const FIT_TOKEN_MASK_MATCH: u8 = 0b11110000; + +#[test] +fn check_token() { + assert!(!does_token_fit(15)); + assert!(does_token_fit(14)); + assert!(does_token_fit(114)); + assert!(!does_token_fit(0b11110000)); + assert!(does_token_fit(0b10110000)); +} + +/// The token consists of two parts, the literal length (upper 4 bits) and match_length (lower 4 +/// bits) if the literal length and match_length are both below 15, we don't need to read additional +/// data, so the token does fit the metadata. +#[inline] +fn does_token_fit(token: u8) -> bool { + !((token & FIT_TOKEN_MASK_LITERAL) == FIT_TOKEN_MASK_LITERAL + || (token & FIT_TOKEN_MASK_MATCH) == FIT_TOKEN_MASK_MATCH) +} + +/// Decompress all bytes of `input` into `output`. +/// +/// Returns the number of bytes written (decompressed) into `output`. 
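+// Shape of the loop below: read a token; if the token and both positions are
+// within the precomputed safe bounds, take the branch-light fast path,
+// otherwise fall back to the fully bounds-checked slow path. Every iteration
+// ends by copying a match from earlier output (or from the dictionary).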
+#[inline(always)] // (always) necessary to get the best performance in non-LTO builds
+pub(crate) fn decompress_internal<const USE_DICT: bool, S: Sink>(
+ input: &[u8],
+ output: &mut S,
+ ext_dict: &[u8],
+) -> Result<usize, DecompressError> {
+ let mut input_pos = 0;
+ let initial_output_pos = output.pos();
+
+ let safe_input_pos = input
+ .len()
+ .saturating_sub(16 /* literal copy */ + 2 /* u16 match offset */);
+ let mut safe_output_pos = output
+ .capacity()
+ .saturating_sub(16 /* literal copy */ + 18 /* match copy */);
+
+ if USE_DICT {
+ // In the dictionary case the output pointer is moved by the match length in the dictionary.
+ // This may be up to 17 bytes without exiting the loop. So we need to ensure that we have
+ // at least 17 additional bytes of space left in the output buffer in the fast loop.
+ safe_output_pos = safe_output_pos.saturating_sub(17);
+ };
+
+ // Exhaust the decoder by reading and decompressing all blocks until the remaining buffer is
+ // empty.
+ loop {
+ // Read the token. The token is the first byte in a block. It contains two 4-bit
+ // "fields", a higher and a lower, representing the literals' length and the back
+ // reference's length, respectively.
+ let token = *input
+ .get(input_pos)
+ .ok_or(DecompressError::ExpectedAnotherByte)?;
+ input_pos += 1;
+
+ // Check for the hot loop.
+ // In most cases the metadata fits in a single 1-byte token (statistically) and we are at
+ // a safe distance from the end. This enables some optimized handling.
+ //
+ // Ideally we want to check for a safe output pos like: output.pos() <= safe_output_pos;
+ // but that doesn't work when the safe_output_pos is 0 due to saturating_sub. So we use
+ // `<` instead of `<=`, which covers that case.
+ if does_token_fit(token) && input_pos <= safe_input_pos && output.pos() < safe_output_pos {
+ let literal_length = (token >> 4) as usize;
+
+ // casting to [u8; 16] doesn't seem to make a difference vs &[u8] (same assembly)
+ let input: &[u8; 16] = input[input_pos..input_pos + 16].try_into().unwrap();
+
+ // Copy the literal
+ // The literal is at max 14 bytes, and the is_safe_distance check assures
+ // that we are far enough away from the end so we can safely copy 16 bytes
+ output.extend_from_slice_wild(input, literal_length);
+ input_pos += literal_length;
+
+ // clone as we don't want to mutate
+ let offset = read_u16(input, &mut literal_length.clone())? as usize;
+ input_pos += 2;
+
+ let mut match_length = MINMATCH + (token & 0xF) as usize;
+
+ if USE_DICT && offset > output.pos() {
+ let copied = copy_from_dict(output, ext_dict, offset, match_length)?;
+ if copied == match_length {
+ continue;
+ }
+ // match crosses ext_dict and output, offset is still correct as output pos
+ // increased
+ match_length -= copied;
+ }
+
+ // In this branch we know that match_length is at most 18 (14 + MINMATCH).
+ // But the blocks can overlap, so make sure they are at least 18 bytes apart
+ // to enable an optimized copy of 18 bytes.
+ let start = output.pos().saturating_sub(offset);
+ if offset >= match_length {
+ output.extend_from_within(start, 18, match_length);
+ } else {
+ output.extend_from_within_overlapping(start, match_length)
+ }
+
+ continue;
+ }
+
+ // Now, we read the literals section.
+ // Literal Section + // If the initial value is 15, it is indicated that another byte will be read and added to + // it + let mut literal_length = (token >> 4) as usize; + if literal_length != 0 { + if literal_length == 15 { + // The literal_length length took the maximal value, indicating that there is more + // than 15 literal_length bytes. We read the extra integer. + literal_length += read_integer(input, &mut input_pos)? as usize; + } + + if literal_length > input.len() - input_pos { + return Err(DecompressError::LiteralOutOfBounds); + } + #[cfg(not(feature = "unchecked-decode"))] + if literal_length > output.capacity() - output.pos() { + return Err(DecompressError::OutputTooSmall { + expected: output.pos() + literal_length, + actual: output.capacity(), + }); + } + output.extend_from_slice(&input[input_pos..input_pos + literal_length]); + input_pos += literal_length; + } + + // If the input stream is emptied, we break out of the loop. This is only the case + // in the end of the stream, since the block is intact otherwise. + if input_pos >= input.len() { + break; + } + + let offset = read_u16(input, &mut input_pos)? as usize; + // Obtain the initial match length. The match length is the length of the duplicate segment + // which will later be copied from data previously decompressed into the output buffer. The + // initial length is derived from the second part of the token (the lower nibble), we read + // earlier. Since having a match length of less than 4 would mean negative compression + // ratio, we start at 4 (MINMATCH). + + // The initial match length can maximally be 19. As with the literal length, this indicates + // that there are more bytes to read. + let mut match_length = MINMATCH + (token & 0xF) as usize; + if match_length == MINMATCH + 15 { + // The match length took the maximal value, indicating that there is more bytes. We + // read the extra integer. + match_length += read_integer(input, &mut input_pos)? as usize; + } + + #[cfg(not(feature = "unchecked-decode"))] + if output.pos() + match_length > output.capacity() { + return Err(DecompressError::OutputTooSmall { + expected: output.pos() + match_length, + actual: output.capacity(), + }); + } + if USE_DICT && offset > output.pos() { + let copied = copy_from_dict(output, ext_dict, offset, match_length)?; + if copied == match_length { + continue; + } + // match crosses ext_dict and output, offset is still correct as output_len was + // increased + match_length -= copied; + } + // We now copy from the already decompressed buffer. This allows us for storing duplicates + // by simply referencing the other location. 
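+ // Illustration: with output "abcabc" (pos 6), offset 3 and match_length 3
+ // append bytes 3..6 ("abc") once more.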
+ duplicate_slice(output, offset, match_length)?; + } + Ok(output.pos() - initial_output_pos) +} + +#[inline] +fn copy_from_dict( + output: &mut impl Sink, + ext_dict: &[u8], + offset: usize, + match_length: usize, +) -> Result { + // If we're here we know offset > output.pos + debug_assert!(offset > output.pos()); + let (dict_offset, did_overflow) = ext_dict.len().overflowing_sub(offset - output.pos()); + if did_overflow { + return Err(DecompressError::OffsetOutOfBounds); + } + // Can't copy past ext_dict len, the match may cross dict and output + let dict_match_length = match_length.min(ext_dict.len() - dict_offset); + let ext_match = &ext_dict[dict_offset..dict_offset + dict_match_length]; + output.extend_from_slice(ext_match); + Ok(dict_match_length) +} + +/// Extends output by self-referential copies +#[inline(always)] // (always) necessary otherwise compiler fails to inline it +fn duplicate_slice( + output: &mut impl Sink, + offset: usize, + match_length: usize, +) -> Result<(), DecompressError> { + // This function assumes output will fit match_length, it might panic otherwise. + if match_length > offset { + duplicate_overlapping_slice(output, offset, match_length)?; + } else { + let (start, did_overflow) = output.pos().overflowing_sub(offset); + if did_overflow { + return Err(DecompressError::OffsetOutOfBounds); + } + + match match_length { + 0..=32 if output.pos() + 32 <= output.capacity() => { + output.extend_from_within(start, 32, match_length) + } + 33..=64 if output.pos() + 64 <= output.capacity() => { + output.extend_from_within(start, 64, match_length) + } + _ => output.extend_from_within(start, match_length, match_length), + } + } + Ok(()) +} + +/// self-referential copy for the case data start (end of output - offset) + match_length overlaps +/// into output +#[inline] +fn duplicate_overlapping_slice( + sink: &mut impl Sink, + offset: usize, + match_length: usize, +) -> Result<(), DecompressError> { + // This function assumes output will fit match_length, it might panic otherwise. + let (start, did_overflow) = sink.pos().overflowing_sub(offset); + if did_overflow { + return Err(DecompressError::OffsetOutOfBounds); + } + if offset == 1 { + let val = sink.byte_at(start); + sink.extend_with_fill(val, match_length); + } else { + sink.extend_from_within_overlapping(start, match_length); + } + Ok(()) +} + +/// Decompress all bytes of `input` into `output`. +/// `output` should be preallocated with a size of of the uncompressed data. +#[inline] +pub fn decompress_into(input: &[u8], output: &mut [u8]) -> Result { + decompress_internal::(input, &mut SliceSink::new(output, 0), b"") +} + +/// Decompress all bytes of `input` into `output`. +/// +/// Returns the number of bytes written (decompressed) into `output`. +#[inline] +pub fn decompress_into_with_dict( + input: &[u8], + output: &mut [u8], + ext_dict: &[u8], +) -> Result { + decompress_internal::(input, &mut SliceSink::new(output, 0), ext_dict) +} + +/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in +/// litte endian. Can be used in conjunction with `compress_prepend_size` +#[inline] +pub fn decompress_size_prepended(input: &[u8]) -> Result, DecompressError> { + let (uncompressed_size, input) = super::uncompressed_size(input)?; + decompress(input, uncompressed_size) +} + +/// Decompress all bytes of `input` into a new vec. +/// The passed parameter `min_uncompressed_size` needs to be equal or larger than the uncompressed size. 
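+///
+/// # Example
+/// ```
+/// // 17 is the length of the input below; any value >= that also works
+/// let compressed = lz4_flex::block::compress(b"hello hello hello");
+/// let decompressed = lz4_flex::block::decompress(&compressed, 17).unwrap();
+/// assert_eq!(&decompressed[..], &b"hello hello hello"[..]);
+/// ```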
+/// +/// # Panics +/// May panic if the parameter `min_uncompressed_size` is smaller than the +/// uncompressed data. +#[inline] +pub fn decompress(input: &[u8], min_uncompressed_size: usize) -> Result, DecompressError> { + let mut decompressed: Vec = vec![0; min_uncompressed_size]; + let decomp_len = + decompress_internal::(input, &mut SliceSink::new(&mut decompressed, 0), b"")?; + decompressed.truncate(decomp_len); + Ok(decompressed) +} + +/// Decompress all bytes of `input` into a new vec. The first 4 bytes are the uncompressed size in +/// little endian. Can be used in conjunction with `compress_prepend_size_with_dict` +#[inline] +pub fn decompress_size_prepended_with_dict( + input: &[u8], + ext_dict: &[u8], +) -> Result, DecompressError> { + let (uncompressed_size, input) = super::uncompressed_size(input)?; + decompress_with_dict(input, uncompressed_size, ext_dict) +} + +/// Decompress all bytes of `input` into a new vec. +/// The passed parameter `min_uncompressed_size` needs to be equal or larger than the uncompressed size. +/// +/// # Panics +/// May panic if the parameter `min_uncompressed_size` is smaller than the +/// uncompressed data. +#[inline] +pub fn decompress_with_dict( + input: &[u8], + min_uncompressed_size: usize, + ext_dict: &[u8], +) -> Result, DecompressError> { + let mut decompressed: Vec = vec![0; min_uncompressed_size]; + let decomp_len = + decompress_internal::(input, &mut SliceSink::new(&mut decompressed, 0), ext_dict)?; + decompressed.truncate(decomp_len); + Ok(decompressed) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn all_literal() { + assert_eq!(decompress(&[0x30, b'a', b'4', b'9'], 3).unwrap(), b"a49"); + } + + // this error test is only valid in safe-decode. + #[cfg(feature = "safe-decode")] + #[test] + fn offset_oob() { + decompress(&[0x10, b'a', 2, 0], 4).unwrap_err(); + decompress(&[0x40, b'a', 1, 0], 4).unwrap_err(); + } +} diff --git a/src/block/hashtable.rs b/src/block/hashtable.rs new file mode 100644 index 0000000..7c29db7 --- /dev/null +++ b/src/block/hashtable.rs @@ -0,0 +1,161 @@ +use alloc::boxed::Box; +use core::convert::TryInto; + +/// The Hashtable trait used by the compression to store hashed bytes to their position. +/// `val` can be maximum the size of the input in bytes. +/// +/// `pos` can have a maximum value of u16::MAX or 65535 +/// If the hashtable is smaller it needs to reduce the pos to its space, e.g. by right +/// shifting. +/// +/// Duplication dictionary size. +/// +/// Every four bytes is assigned an entry. When this number is lower, fewer entries exists, and +/// thus collisions are more likely, hurting the compression ratio. 
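+
+// Note on the constant used in `hash` below: 2654435761 is a prime close to
+// 2^32 divided by the golden ratio (Knuth-style multiplicative hashing);
+// multiplying by it and keeping only the high bits spreads similar 4-byte
+// sequences across the table.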
+ +/// hashes and right shifts to a maximum value of 16bit, 65535 +/// The right shift is done in order to not exceed, the hashtables capacity +#[inline] +fn hash(sequence: u32) -> u32 { + (sequence.wrapping_mul(2654435761_u32)) >> 16 +} + +/// hashes and right shifts to a maximum value of 16bit, 65535 +/// The right shift is done in order to not exceed, the hashtables capacity +#[cfg(target_pointer_width = "64")] +#[inline] +fn hash5(sequence: usize) -> u32 { + let primebytes = if cfg!(target_endian = "little") { + 889523592379_usize + } else { + 11400714785074694791_usize + }; + (((sequence << 24).wrapping_mul(primebytes)) >> 48) as u32 +} + +pub trait HashTable { + fn get_at(&self, pos: usize) -> usize; + fn put_at(&mut self, pos: usize, val: usize); + fn clear(&mut self); + #[inline] + #[cfg(target_pointer_width = "64")] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash5(super::compress::get_batch_arch(input, pos)) as usize + } + #[inline] + #[cfg(target_pointer_width = "32")] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash(super::compress::get_batch(input, pos)) as usize + } +} + +const HASHTABLE_SIZE_4K: usize = 4 * 1024; +const HASHTABLE_BIT_SHIFT_4K: usize = 4; + +#[derive(Debug)] +#[repr(align(64))] +pub struct HashTable4KU16 { + dict: Box<[u16; HASHTABLE_SIZE_4K]>, +} +impl HashTable4KU16 { + #[inline] + pub fn new() -> Self { + // This generates more efficient assembly in contrast to Box::new(slice), because of an + // optmized call alloc_zeroed, vs. alloc + memset + // try_into is optimized away + let dict = alloc::vec![0; HASHTABLE_SIZE_4K] + .into_boxed_slice() + .try_into() + .unwrap(); + Self { dict } + } +} +impl HashTable for HashTable4KU16 { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u16; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } + #[inline] + fn get_hash_at(input: &[u8], pos: usize) -> usize { + hash(super::get_batch(input, pos)) as usize + } +} + +#[derive(Debug)] +pub struct HashTable4K { + dict: Box<[u32; HASHTABLE_SIZE_4K]>, +} +impl HashTable4K { + #[inline] + pub fn new() -> Self { + let dict = alloc::vec![0; HASHTABLE_SIZE_4K] + .into_boxed_slice() + .try_into() + .unwrap(); + Self { dict } + } + + #[cold] + #[allow(dead_code)] + pub fn reposition(&mut self, offset: u32) { + for i in self.dict.iter_mut() { + *i = i.saturating_sub(offset); + } + } +} +impl HashTable for HashTable4K { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u32; + } + #[inline] + fn clear(&mut self) { + self.dict.fill(0); + } +} + +const HASHTABLE_SIZE_8K: usize = 8 * 1024; +const HASH_TABLE_BIT_SHIFT_8K: usize = 3; + +#[derive(Debug)] +pub struct HashTable8K { + dict: Box<[u32; HASHTABLE_SIZE_8K]>, +} +#[allow(dead_code)] +impl HashTable8K { + #[inline] + pub fn new() -> Self { + let dict = alloc::vec![0; HASHTABLE_SIZE_8K] + .into_boxed_slice() + .try_into() + .unwrap(); + + Self { dict } + } +} +impl HashTable for HashTable8K { + #[inline] + fn get_at(&self, hash: usize) -> usize { + self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] as usize + } + #[inline] + fn put_at(&mut self, hash: usize, val: usize) { + self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] = val as u32; + } + #[inline] + fn 
clear(&mut self) { + self.dict.fill(0); + } +} diff --git a/src/block/mod.rs b/src/block/mod.rs new file mode 100644 index 0000000..31c52f4 --- /dev/null +++ b/src/block/mod.rs @@ -0,0 +1,157 @@ +//! LZ4 Block Format +//! +//! As defined in +//! +//! Currently for no_std support only the block format is supported. +//! +//! # Example: block format roundtrip +//! ``` +//! use lz4_flex::block::{compress_prepend_size, decompress_size_prepended}; +//! let input: &[u8] = b"Hello people, what's up?"; +//! let compressed = compress_prepend_size(input); +//! let uncompressed = decompress_size_prepended(&compressed).unwrap(); +//! assert_eq!(input, uncompressed); +//! ``` +//! + +#[cfg_attr(feature = "safe-encode", forbid(unsafe_code))] +pub(crate) mod compress; +pub(crate) mod hashtable; + +#[cfg(feature = "safe-decode")] +#[cfg_attr(feature = "safe-decode", forbid(unsafe_code))] +pub(crate) mod decompress_safe; +#[cfg(feature = "safe-decode")] +pub(crate) use decompress_safe as decompress; + +#[cfg(not(feature = "safe-decode"))] +pub(crate) mod decompress; + +pub use compress::*; +pub use decompress::*; + +use core::convert::TryInto; +use core::fmt; + +pub(crate) const WINDOW_SIZE: usize = 64 * 1024; + +/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md#end-of-block-restrictions +/// The last match must start at least 12 bytes before the end of block. The last match is part of +/// the penultimate sequence. It is followed by the last sequence, which contains only literals. +/// +/// Note that, as a consequence, an independent block < 13 bytes cannot be compressed, because the +/// match must copy "something", so it needs at least one prior byte. +/// +/// When a block can reference data from another block, it can start immediately with a match and no +/// literal, so a block of 12 bytes can be compressed. +const MFLIMIT: usize = 12; + +/// The last 5 bytes of input are always literals. Therefore, the last sequence contains at least 5 +/// bytes. +const LAST_LITERALS: usize = 5; + +/// Due the way the compression loop is arrange we may read up to (register_size - 2) bytes from the +/// current position. So we must end the matches 6 bytes before the end, 1 more than required by the +/// spec. +const END_OFFSET: usize = LAST_LITERALS + 1; + +/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md#end-of-block-restrictions +/// Minimum length of a block +/// +/// MFLIMIT + 1 for the token. +const LZ4_MIN_LENGTH: usize = MFLIMIT + 1; + +const MAXD_LOG: usize = 16; +const MAX_DISTANCE: usize = (1 << MAXD_LOG) - 1; + +#[allow(dead_code)] +const MATCH_LENGTH_MASK: u32 = (1_u32 << 4) - 1; // 0b1111 / 15 + +/// The minimum length of a duplicate +const MINMATCH: usize = 4; + +#[allow(dead_code)] +const FASTLOOP_SAFE_DISTANCE: usize = 64; + +/// Switch for the hashtable size byU16 +#[allow(dead_code)] +static LZ4_64KLIMIT: usize = (64 * 1024) + (MFLIMIT - 1); + +/// An error representing invalid compressed data. +#[derive(Debug)] +#[non_exhaustive] +pub enum DecompressError { + /// The provided output is too small + OutputTooSmall { + /// Minimum expected output size + expected: usize, + /// Actual size of output + actual: usize, + }, + /// Literal is out of bounds of the input + LiteralOutOfBounds, + /// Expected another byte, but none found. + ExpectedAnotherByte, + /// Deduplication offset out of bounds (not in buffer). + OffsetOutOfBounds, +} + +#[derive(Debug)] +#[non_exhaustive] +/// Errors that can happen during compression. 
+pub enum CompressError { + /// The provided output is too small. + OutputTooSmall, +} + +impl fmt::Display for DecompressError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DecompressError::OutputTooSmall { expected, actual } => { + write!( + f, + "provided output is too small for the decompressed data, actual {actual}, expected \ + {expected}" + ) + } + DecompressError::LiteralOutOfBounds => { + f.write_str("literal is out of bounds of the input") + } + DecompressError::ExpectedAnotherByte => { + f.write_str("expected another byte, found none") + } + DecompressError::OffsetOutOfBounds => { + f.write_str("the offset to copy is not contained in the decompressed buffer") + } + } + } +} + +impl fmt::Display for CompressError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + CompressError::OutputTooSmall => f.write_str( + "output is too small for the compressed data, use get_maximum_output_size to \ + reserve enough space", + ), + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for DecompressError {} + +#[cfg(feature = "std")] +impl std::error::Error for CompressError {} + +/// This can be used in conjunction with `decompress_size_prepended`. +/// It will read the first 4 bytes as little-endian encoded length, and return +/// the rest of the bytes after the length encoding. +#[inline] +pub fn uncompressed_size(input: &[u8]) -> Result<(usize, &[u8]), DecompressError> { + let size = input.get(..4).ok_or(DecompressError::ExpectedAnotherByte)?; + let size: &[u8; 4] = size.try_into().unwrap(); + let uncompressed_size = u32::from_le_bytes(*size) as usize; + let rest = &input[4..]; + Ok((uncompressed_size, rest)) +} diff --git a/src/fastcpy.rs b/src/fastcpy.rs new file mode 100644 index 0000000..8bbd480 --- /dev/null +++ b/src/fastcpy.rs @@ -0,0 +1,145 @@ +//! # FastCpy +//! +//! The Rust Compiler calls `memcpy` for slices of unknown length. +//! This crate provides a faster implementation of `memcpy` for slices up to 32bytes (64bytes with `avx`). +//! If you know most of you copy operations are not too big you can use `fastcpy` to speed up your program. +//! +//! `fastcpy` is designed to contain not too much assembly, so the overhead is low. +//! +//! As fall back the standard `memcpy` is called +//! +//! ## Double Copy Trick +//! `fastcpy` employs a double copy trick to copy slices of length 4-32bytes (64bytes with `avx`). +//! E.g. Slice of length 6 can be copied with two uncoditional copy operations. +//! +//! /// [1, 2, 3, 4, 5, 6] +//! /// [1, 2, 3, 4] +//! /// [3, 4, 5, 6] +//! + +#[inline] +pub fn slice_copy(src: &[u8], dst: &mut [u8]) { + #[inline(never)] + #[cold] + #[track_caller] + fn len_mismatch_fail(dst_len: usize, src_len: usize) -> ! { + panic!( + "source slice length ({}) does not match destination slice length ({})", + src_len, dst_len, + ); + } + + if src.len() != dst.len() { + len_mismatch_fail(src.len(), dst.len()); + } + let len = src.len(); + + if src.is_empty() { + return; + } + + if len < 4 { + short_copy(src, dst); + return; + } + + if len < 8 { + double_copy_trick::<4>(src, dst); + return; + } + + if len <= 16 { + double_copy_trick::<8>(src, dst); + return; + } + + if len <= 32 { + double_copy_trick::<16>(src, dst); + return; + } + + /// The code will use the vmovdqu instruction to copy 32 bytes at a time. 
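+ // Double-copy illustration: for len = 6 and SIZE = 4, the two unconditional
+ // copies write src[0..4] and src[2..6]; bytes 2..4 are written twice, which
+ // is harmless for non-overlapping src/dst.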
+ #[cfg(target_feature = "avx")] + { + if len <= 64 { + double_copy_trick::<32>(src, dst); + return; + } + } + + // For larger sizes we use the default, which calls memcpy + // memcpy does some virtual memory tricks to copy large chunks of memory. + // + // The theory should be that the checks above don't cost much relative to the copy call for + // larger copies. + // The bounds checks in `copy_from_slice` are elided. + dst.copy_from_slice(src); +} + +#[inline(always)] +fn short_copy(src: &[u8], dst: &mut [u8]) { + let len = src.len(); + + // length 1-3 + dst[0] = src[0]; + if len >= 2 { + double_copy_trick::<2>(src, dst); + } +} + +#[inline(always)] +/// [1, 2, 3, 4, 5, 6] +/// [1, 2, 3, 4] +/// [3, 4, 5, 6] +fn double_copy_trick(src: &[u8], dst: &mut [u8]) { + dst[0..SIZE].copy_from_slice(&src[0..SIZE]); + dst[src.len() - SIZE..].copy_from_slice(&src[src.len() - SIZE..]); +} + +#[cfg(test)] +mod tests { + use super::slice_copy; + use alloc::vec::Vec; + use proptest::prelude::*; + proptest! { + #[test] + fn test_fast_short_slice_copy(left: Vec) { + let mut right = vec![0u8; left.len()]; + slice_copy(&left, &mut right); + prop_assert_eq!(&left, &right); + } + } + + #[test] + fn test_fast_short_slice_copy_edge_cases() { + for len in 0..(512 * 2) { + let left = (0..len).map(|i| i as u8).collect::>(); + let mut right = vec![0u8; len]; + slice_copy(&left, &mut right); + assert_eq!(left, right); + } + } + + #[test] + fn test_fail2() { + let left = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(&left, &mut right); + assert_eq!(left, right); + } + + #[test] + fn test_fail() { + let left = vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(&left, &mut right); + assert_eq!(left, right); + } +} diff --git a/src/fastcpy_unsafe.rs b/src/fastcpy_unsafe.rs new file mode 100644 index 0000000..fb17280 --- /dev/null +++ b/src/fastcpy_unsafe.rs @@ -0,0 +1,165 @@ +//! # FastCpy +//! +//! The Rust Compiler calls `memcpy` for slices of unknown length. +//! This crate provides a faster implementation of `memcpy` for slices up to 32bytes (64bytes with `avx`). +//! If you know most of you copy operations are not too big you can use `fastcpy` to speed up your program. +//! +//! `fastcpy` is designed to contain not too much assembly, so the overhead is low. +//! +//! As fall back the standard `memcpy` is called +//! +//! ## Double Copy Trick +//! `fastcpy` employs a double copy trick to copy slices of length 4-32bytes (64bytes with `avx`). +//! E.g. Slice of length 6 can be copied with two uncoditional copy operations. +//! +//! /// [1, 2, 3, 4, 5, 6] +//! /// [1, 2, 3, 4] +//! /// [3, 4, 5, 6] +//! + +#[inline] +pub fn slice_copy(src: *const u8, dst: *mut u8, num_bytes: usize) { + if num_bytes < 4 { + short_copy(src, dst, num_bytes); + return; + } + + if num_bytes < 8 { + double_copy_trick::<4>(src, dst, num_bytes); + return; + } + + if num_bytes <= 16 { + double_copy_trick::<8>(src, dst, num_bytes); + return; + } + + //if num_bytes <= 32 { + //double_copy_trick::<16>(src, dst, num_bytes); + //return; + //} + + // /// The code will use the vmovdqu instruction to copy 32 bytes at a time. 
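+ // In this unsafe variant, lengths above 16 fall through to
+ // wild_copy_from_src::<16>: 16-byte steps over the body plus one final
+ // 16-byte copy aligned to the end, so no byte past num_bytes is written.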
+ //#[cfg(target_feature = "avx")] + //{ + //if num_bytes <= 64 { + //double_copy_trick::<32>(src, dst, num_bytes); + //return; + //} + //} + + // For larger sizes we use the default, which calls memcpy + // memcpy does some virtual memory tricks to copy large chunks of memory. + // + // The theory should be that the checks above don't cost much relative to the copy call for + // larger copies. + // The bounds checks in `copy_from_slice` are elided. + + //unsafe { core::ptr::copy_nonoverlapping(src, dst, num_bytes) } + wild_copy_from_src::<16>(src, dst, num_bytes) +} + +// Inline never because otherwise we get a call to memcpy -.- +#[inline] +fn wild_copy_from_src( + mut source: *const u8, + mut dst: *mut u8, + num_bytes: usize, +) { + // Note: if the compiler auto-vectorizes this it'll hurt performance! + // It's not the case for 16 bytes stepsize, but for 8 bytes. + let l_last = unsafe { source.add(num_bytes - SIZE) }; + let r_last = unsafe { dst.add(num_bytes - SIZE) }; + let num_bytes = (num_bytes / SIZE) * SIZE; + + unsafe { + let dst_ptr_end = dst.add(num_bytes); + loop { + core::ptr::copy_nonoverlapping(source, dst, SIZE); + source = source.add(SIZE); + dst = dst.add(SIZE); + if dst >= dst_ptr_end { + break; + } + } + } + + unsafe { + core::ptr::copy_nonoverlapping(l_last, r_last, SIZE); + } +} + +#[inline] +fn short_copy(src: *const u8, dst: *mut u8, len: usize) { + unsafe { + *dst = *src; + } + if len >= 2 { + double_copy_trick::<2>(src, dst, len); + } +} + +#[inline(always)] +/// [1, 2, 3, 4, 5, 6] +/// [1, 2, 3, 4] +/// [3, 4, 5, 6] +fn double_copy_trick(src: *const u8, dst: *mut u8, len: usize) { + let l_end = unsafe { src.add(len - SIZE) }; + let r_end = unsafe { dst.add(len - SIZE) }; + + unsafe { + core::ptr::copy_nonoverlapping(src, dst, SIZE); + core::ptr::copy_nonoverlapping(l_end, r_end, SIZE); + } +} + +#[cfg(test)] +mod tests { + use super::slice_copy; + use alloc::vec::Vec; + use proptest::prelude::*; + proptest! 
{ + #[test] + fn test_fast_short_slice_copy(left: Vec) { + if left.is_empty() { + return Ok(()); + } + let mut right = vec![0u8; left.len()]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + prop_assert_eq!(&left, &right); + } + } + + #[test] + fn test_fast_short_slice_copy_edge_cases() { + for len in 1..(512 * 2) { + let left = (0..len).map(|i| i as u8).collect::>(); + let mut right = vec![0u8; len]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + assert_eq!(left, right); + } + } + + #[test] + fn test_fail2() { + let left = vec![ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + assert_eq!(left, right); + } + + #[test] + fn test_fail() { + let left = vec![ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; + let mut right = vec![0u8; left.len()]; + slice_copy(left.as_ptr(), right.as_mut_ptr(), left.len()); + assert_eq!(left, right); + } +} diff --git a/src/frame/compress.rs b/src/frame/compress.rs new file mode 100644 index 0000000..cee32eb --- /dev/null +++ b/src/frame/compress.rs @@ -0,0 +1,471 @@ +use std::{ + fmt, + hash::Hasher, + io::{self, Write}, +}; +use twox_hash::XxHash32; + +use crate::{ + block::{ + compress::compress_internal, + hashtable::{HashTable, HashTable4K}, + }, + sink::vec_sink_for_compression, +}; + +use super::Error; +use super::{ + header::{BlockInfo, BlockMode, FrameInfo, BLOCK_INFO_SIZE, MAX_FRAME_INFO_SIZE}, + BlockSize, +}; +use crate::block::WINDOW_SIZE; + +/// A writer for compressing a LZ4 stream. +/// +/// This `FrameEncoder` wraps any other writer that implements `io::Write`. +/// Bytes written to this writer are compressed using the [LZ4 frame +/// format](https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md). +/// +/// Writes are buffered automatically, so there's no need to wrap the given +/// writer in a `std::io::BufWriter`. +/// +/// To ensure a well formed stream the encoder must be finalized by calling +/// either the [`finish()`], [`try_finish()`], or [`auto_finish()`] methods. +/// +/// [`finish()`]: Self::finish +/// [`try_finish()`]: Self::try_finish +/// [`auto_finish()`]: Self::auto_finish +/// +/// # Example 1 +/// Serializing json values into a compressed file. +/// +/// ```no_run +/// let compressed_file = std::fs::File::create("datafile").unwrap(); +/// let mut compressor = lz4_flex::frame::FrameEncoder::new(compressed_file); +/// serde_json::to_writer(&mut compressor, &serde_json::json!({ "an": "object" })).unwrap(); +/// compressor.finish().unwrap(); +/// ``` +/// +/// # Example 2 +/// Serializing multiple json values into a compressed file using linked blocks. +/// +/// ```no_run +/// let compressed_file = std::fs::File::create("datafile").unwrap(); +/// let mut frame_info = lz4_flex::frame::FrameInfo::new(); +/// frame_info.block_mode = lz4_flex::frame::BlockMode::Linked; +/// let mut compressor = lz4_flex::frame::FrameEncoder::with_frame_info(frame_info, compressed_file); +/// for i in 0..10u64 { +/// serde_json::to_writer(&mut compressor, &serde_json::json!({ "i": i })).unwrap(); +/// } +/// compressor.finish().unwrap(); +/// ``` +pub struct FrameEncoder { + /// Our buffer of uncompressed bytes. 
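+ /// In Linked block mode it is laid out roughly as
+ /// `[ ext_dict | prefix | not-yet-compressed input ]`, with the indices
+ /// below tracking each region (see `write_block`).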
+ src: Vec<u8>,
+ /// Index into src: starting point of bytes not yet compressed
+ src_start: usize,
+ /// Index into src: end point of bytes not yet compressed
+ src_end: usize,
+ /// Index into src: starting point of the external dictionary (applicable in Linked block mode)
+ ext_dict_offset: usize,
+ /// Length of the external dictionary
+ ext_dict_len: usize,
+ /// Counter of bytes already compressed into the compression_table.
+ /// _Not_ the same as `content_len`, as this is reset roughly every 2 GB.
+ src_stream_offset: usize,
+ /// Encoder table
+ compression_table: HashTable4K,
+ /// The underlying writer.
+ w: W,
+ /// Xxhash32 used when content checksum is enabled.
+ content_hasher: XxHash32,
+ /// Number of bytes compressed
+ content_len: u64,
+ /// The compressed bytes buffer. Bytes are compressed from src (usually)
+ /// to dst before being written to w.
+ dst: Vec<u8>,
+ /// Whether we have an open frame in the output.
+ is_frame_open: bool,
+ /// Whether a closed frame has already been written to the output.
+ data_to_frame_written: bool,
+ /// The frame information to be used in this encoder.
+ frame_info: FrameInfo,
+}
+
+impl<W: io::Write> FrameEncoder<W> {
+ fn init(&mut self) {
+ let max_block_size = self.frame_info.block_size.get_size();
+ let src_size = if self.frame_info.block_mode == BlockMode::Linked {
+ // In linked mode we consume the input (bumping src_start) but leave the
+ // beginning of src to be used as a prefix in subsequent blocks.
+ // That is at least until we have at least `max_block_size + WINDOW_SIZE`
+ // bytes in src, then we set up an ext_dict with the last WINDOW_SIZE bytes
+ // and the input goes to the beginning of src again.
+ // Since we always want to be able to write a full block (up to max_block_size)
+ // we need a buffer with at least `max_block_size * 2 + WINDOW_SIZE` bytes.
+ max_block_size * 2 + WINDOW_SIZE
+ } else {
+ max_block_size
+ };
+ // Since this method is called potentially multiple times, don't reserve _additional_
+ // capacity if not required.
+ self.src
+ .reserve(src_size.saturating_sub(self.src.capacity()));
+ self.dst.reserve(
+ crate::block::compress::get_maximum_output_size(max_block_size)
+ .saturating_sub(self.dst.capacity()),
+ );
+ }
+
+ /// Returns a wrapper around `self` that will finish the stream on drop.
+ ///
+ /// # Note
+ /// Errors on drop get silently ignored. If you want to handle errors then use [`finish()`] or
+ /// [`try_finish()`] instead.
+ ///
+ /// [`finish()`]: Self::finish
+ /// [`try_finish()`]: Self::try_finish
+ pub fn auto_finish(self) -> AutoFinishEncoder<W> {
+ AutoFinishEncoder {
+ encoder: Some(self),
+ }
+ }
+
+ /// Creates a new Encoder with the specified FrameInfo.
+ pub fn with_frame_info(frame_info: FrameInfo, wtr: W) -> Self {
+ FrameEncoder {
+ src: Vec::new(),
+ w: wtr,
+ // 16 KB hash table for matches, same as the reference implementation.
+ compression_table: HashTable4K::new(),
+ content_hasher: XxHash32::with_seed(0),
+ content_len: 0,
+ dst: Vec::new(),
+ is_frame_open: false,
+ data_to_frame_written: false,
+ frame_info,
+ src_start: 0,
+ src_end: 0,
+ ext_dict_offset: 0,
+ ext_dict_len: 0,
+ src_stream_offset: 0,
+ }
+ }
+
+ /// Creates a new Encoder with the default settings.
+ pub fn new(wtr: W) -> Self {
+ Self::with_frame_info(Default::default(), wtr)
+ }
+
+ /// The frame information used by this Encoder.
+ pub fn frame_info(&mut self) -> &FrameInfo {
+ &self.frame_info
+ }
+
+ /// Consumes this encoder, flushing internal buffer and writing stream terminator.
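+ ///
+ /// # Example
+ /// ```no_run
+ /// use std::io::Write;
+ /// // writing into a Vec<u8> for illustration; any io::Write works
+ /// let mut enc = lz4_flex::frame::FrameEncoder::new(Vec::new());
+ /// enc.write_all(b"some data").unwrap();
+ /// let compressed: Vec<u8> = enc.finish().unwrap();
+ /// ```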
+ pub fn finish(mut self) -> Result { + self.try_finish()?; + Ok(self.w) + } + + /// Attempt to finish this output stream, flushing internal buffer and writing stream + /// terminator. + pub fn try_finish(&mut self) -> Result<(), Error> { + match self.flush() { + Ok(()) => { + // Empty input special case + // https://github.com/ouch-org/ouch/pull/163#discussion_r1108965151 + if !self.is_frame_open && !self.data_to_frame_written { + self.begin_frame(0)?; + } + self.end_frame()?; + self.data_to_frame_written = true; + Ok(()) + } + Err(err) => Err(err.into()), + } + } + + /// Returns the underlying writer _without_ flushing the stream. + /// This may leave the output in an unfinished state. + pub fn into_inner(self) -> W { + self.w + } + + /// Gets a reference to the underlying writer in this encoder. + pub fn get_ref(&self) -> &W { + &self.w + } + + /// Gets a reference to the underlying writer in this encoder. + /// + /// Note that mutating the output/input state of the stream may corrupt + /// this encoder, so care must be taken when using this method. + pub fn get_mut(&mut self) -> &mut W { + &mut self.w + } + + /// Closes the frame by writing the end marker. + fn end_frame(&mut self) -> Result<(), Error> { + debug_assert!(self.is_frame_open); + self.is_frame_open = false; + if let Some(expected) = self.frame_info.content_size { + if expected != self.content_len { + return Err(Error::ContentLengthError { + expected, + actual: self.content_len, + }); + } + } + + let mut block_info_buffer = [0u8; BLOCK_INFO_SIZE]; + BlockInfo::EndMark.write(&mut block_info_buffer[..])?; + self.w.write_all(&block_info_buffer[..])?; + if self.frame_info.content_checksum { + let content_checksum = self.content_hasher.finish() as u32; + self.w.write_all(&content_checksum.to_le_bytes())?; + } + + Ok(()) + } + + /// Begin the frame by writing the frame header. + /// It'll also setup the encoder for compressing blocks for the the new frame. + fn begin_frame(&mut self, buf_len: usize) -> io::Result<()> { + self.is_frame_open = true; + if self.frame_info.block_size == BlockSize::Auto { + self.frame_info.block_size = BlockSize::from_buf_length(buf_len); + } + self.init(); + let mut frame_info_buffer = [0u8; MAX_FRAME_INFO_SIZE]; + let size = self.frame_info.write(&mut frame_info_buffer)?; + self.w.write_all(&frame_info_buffer[..size])?; + + if self.content_len != 0 { + // This is the second or later frame for this Encoder, + // reset compressor state for the new frame. + self.content_len = 0; + self.src_stream_offset = 0; + self.src.clear(); + self.src_start = 0; + self.src_end = 0; + self.ext_dict_len = 0; + self.content_hasher = XxHash32::with_seed(0); + self.compression_table.clear(); + } + Ok(()) + } + + /// Consumes the src contents between src_start and src_end, + /// which shouldn't exceed the max block size. 
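+ ///
+ /// Steps, in order: compress the pending bytes (or store them uncompressed
+ /// if that is smaller), write the block header, payload and optional
+ /// checksums, then rotate the buffer bookkeeping for Linked vs Independent
+ /// block mode.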
+    /// Consumes the src contents between src_start and src_end,
+    /// which shouldn't exceed the max block size.
+    fn write_block(&mut self) -> io::Result<()> {
+        debug_assert!(self.is_frame_open);
+        let max_block_size = self.frame_info.block_size.get_size();
+        debug_assert!(self.src_end - self.src_start <= max_block_size);
+
+        // Reposition the compression table if we're anywhere near an overflow hazard
+        if self.src_stream_offset + max_block_size + WINDOW_SIZE >= u32::MAX as usize / 2 {
+            self.compression_table
+                .reposition((self.src_stream_offset - self.ext_dict_len) as _);
+            self.src_stream_offset = self.ext_dict_len;
+        }
+
+        // Input to the compressor, which may include a prefix when blocks are linked
+        let input = &self.src[..self.src_end];
+        // The contents of the block are between src_start and src_end
+        let src = &input[self.src_start..];
+
+        let dst_required_size = crate::block::compress::get_maximum_output_size(src.len());
+
+        let compress_result = if self.ext_dict_len != 0 {
+            debug_assert_eq!(self.frame_info.block_mode, BlockMode::Linked);
+            compress_internal::<_, true, _>(
+                input,
+                self.src_start,
+                &mut vec_sink_for_compression(&mut self.dst, 0, 0, dst_required_size),
+                &mut self.compression_table,
+                &self.src[self.ext_dict_offset..self.ext_dict_offset + self.ext_dict_len],
+                self.src_stream_offset,
+            )
+        } else {
+            compress_internal::<_, false, _>(
+                input,
+                self.src_start,
+                &mut vec_sink_for_compression(&mut self.dst, 0, 0, dst_required_size),
+                &mut self.compression_table,
+                b"",
+                self.src_stream_offset,
+            )
+        };
+
+        let (block_info, block_data) = match compress_result.map_err(Error::CompressionError)? {
+            comp_len if comp_len < src.len() => {
+                (BlockInfo::Compressed(comp_len as _), &self.dst[..comp_len])
+            }
+            _ => (BlockInfo::Uncompressed(src.len() as _), src),
+        };
+
+        // Write the (un)compressed block to the writer and the block checksum (if applicable).
+        let mut block_info_buffer = [0u8; BLOCK_INFO_SIZE];
+        block_info.write(&mut block_info_buffer[..])?;
+        self.w.write_all(&block_info_buffer[..])?;
+        self.w.write_all(block_data)?;
+        if self.frame_info.block_checksums {
+            let mut block_hasher = XxHash32::with_seed(0);
+            block_hasher.write(block_data);
+            let block_checksum = block_hasher.finish() as u32;
+            self.w.write_all(&block_checksum.to_le_bytes())?;
+        }
+
+        // Content checksum, if applicable
+        if self.frame_info.content_checksum {
+            self.content_hasher.write(src);
+        }
+
+        // Buffer and offsets maintenance
+        self.content_len += src.len() as u64;
+        self.src_start += src.len();
+        debug_assert_eq!(self.src_start, self.src_end);
+        if self.frame_info.block_mode == BlockMode::Linked {
+            // In linked mode we consume the input (bumping src_start) but leave the
+            // beginning of src to be used as a prefix in subsequent blocks.
+            // That is, until we have at least `max_block_size + WINDOW_SIZE`
+            // bytes in src, then we set up an ext_dict with the last WINDOW_SIZE bytes
+            // and the input goes to the beginning of src again.
+            debug_assert_eq!(self.src.capacity(), max_block_size * 2 + WINDOW_SIZE);
+            if self.src_start >= max_block_size + WINDOW_SIZE {
+                // The ext_dict will become the last WINDOW_SIZE bytes
+                self.ext_dict_offset = self.src_end - WINDOW_SIZE;
+                self.ext_dict_len = WINDOW_SIZE;
+                // Input goes in the beginning of the buffer again.
+                self.src_stream_offset += self.src_end;
+                self.src_start = 0;
+                self.src_end = 0;
+            } else if self.src_start + self.ext_dict_len > WINDOW_SIZE {
+                // There are more than WINDOW_SIZE bytes of lookback between the prefix
+                // and ext_dict combined.
+                // Since we have a limited buffer we must shrink ext_dict in favor of the prefix,
+                // so that we can fit up to max_block_size bytes between src_start and the start
+                // of ext_dict.
+                let delta = self
+                    .ext_dict_len
+                    .min(self.src_start + self.ext_dict_len - WINDOW_SIZE);
+                self.ext_dict_offset += delta;
+                self.ext_dict_len -= delta;
+                debug_assert!(self.src_start + self.ext_dict_len >= WINDOW_SIZE)
+            }
+            debug_assert!(
+                self.ext_dict_len == 0 || self.src_start + max_block_size <= self.ext_dict_offset
+            );
+        } else {
+            // In independent block mode we consume the entire src buffer
+            // which is sized equal to the frame max_block_size.
+            debug_assert_eq!(self.ext_dict_len, 0);
+            debug_assert_eq!(self.src.capacity(), max_block_size);
+            self.src_start = 0;
+            self.src_end = 0;
+            // Advance stream offset so we don't have to reset the match dict
+            // for the next block.
+            self.src_stream_offset += src.len();
+        }
+        debug_assert!(self.src_start <= self.src_end);
+        debug_assert!(self.src_start + max_block_size <= self.src.capacity());
+        Ok(())
+    }
+}
+
+impl<W: io::Write> io::Write for FrameEncoder<W> {
+    fn write(&mut self, mut buf: &[u8]) -> io::Result<usize> {
+        if !self.is_frame_open && !buf.is_empty() {
+            self.begin_frame(buf.len())?;
+        }
+        let buf_len = buf.len();
+        while !buf.is_empty() {
+            let src_filled = self.src_end - self.src_start;
+            let max_fill_len = self.frame_info.block_size.get_size() - src_filled;
+            if max_fill_len == 0 {
+                // Make space by writing the next block
+                self.write_block()?;
+                debug_assert_eq!(self.src_end, self.src_start);
+                continue;
+            }
+
+            let fill_len = max_fill_len.min(buf.len());
+            vec_copy_overwriting(&mut self.src, self.src_end, &buf[..fill_len]);
+            buf = &buf[fill_len..];
+            self.src_end += fill_len;
+        }
+        Ok(buf_len)
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        if self.src_start != self.src_end {
+            self.write_block()?;
+        }
+        Ok(())
+    }
+}
+
+/// A wrapper around a [`FrameEncoder`] that finishes the stream on drop.
+///
+/// This can be created by the [`auto_finish()`] method on the [`FrameEncoder`].
+///
+/// # Note
+/// Errors on drop get silently ignored. If you want to handle errors then use [`finish()`] or
+/// [`try_finish()`] instead.
+///
+/// [`finish()`]: FrameEncoder::finish
+/// [`try_finish()`]: FrameEncoder::try_finish
+/// [`auto_finish()`]: FrameEncoder::auto_finish
+pub struct AutoFinishEncoder<W: io::Write> {
+    // We wrap this in an option to take it during drop.
+    encoder: Option<FrameEncoder<W>>,
+}
+
+impl<W: io::Write> Drop for AutoFinishEncoder<W> {
+    fn drop(&mut self) {
+        if let Some(mut encoder) = self.encoder.take() {
+            let _ = encoder.try_finish();
+        }
+    }
+}
+
+impl<W: io::Write> Write for AutoFinishEncoder<W> {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        self.encoder.as_mut().unwrap().write(buf)
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        self.encoder.as_mut().unwrap().flush()
+    }
+}
+
+impl<W: io::Write + fmt::Debug> fmt::Debug for FrameEncoder<W> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("FrameEncoder")
+            .field("w", &self.w)
+            .field("frame_info", &self.frame_info)
+            .field("is_frame_open", &self.is_frame_open)
+            .field("content_hasher", &self.content_hasher)
+            .field("content_len", &self.content_len)
+            .field("dst", &"[...]")
+            .field("src", &"[...]")
+            .field("src_start", &self.src_start)
+            .field("src_end", &self.src_end)
+            .field("ext_dict_offset", &self.ext_dict_offset)
+            .field("ext_dict_len", &self.ext_dict_len)
+            .field("src_stream_offset", &self.src_stream_offset)
+            .finish()
+    }
+}
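The helper defined just below fills the ring buffer by first overwriting already-initialized bytes and then extending into spare capacity. A standalone sketch of that trick (a hypothetical copy of the helper, not part of the patch):

```rust
// Mirrors vec_copy_overwriting: overwrite initialized bytes, extend the rest.
fn copy_overwriting(target: &mut Vec<u8>, target_start: usize, src: &[u8]) {
    assert!(target_start + src.len() <= target.capacity());
    let overwrite_len = (target.len() - target_start).min(src.len());
    target[target_start..target_start + overwrite_len].copy_from_slice(&src[..overwrite_len]);
    target.extend_from_slice(&src[overwrite_len..]);
}

fn main() {
    let mut buf = Vec::with_capacity(8);
    buf.extend_from_slice(b"abcd");
    copy_overwriting(&mut buf, 2, b"XYZ"); // overwrites "cd", appends "Z"
    assert_eq!(buf, *b"abXYZ");
}
```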
+/// Copies `src` into `target` starting from the `target_start` index,
+/// overwriting existing data if any.
+#[inline]
+fn vec_copy_overwriting(target: &mut Vec<u8>, target_start: usize, src: &[u8]) {
+    debug_assert!(target_start + src.len() <= target.capacity());
+
+    // By combining overwriting (copy_from_slice) and extending (extend_from_slice)
+    // we can fill the ring buffer without initializing it (eg. filling with 0).
+    let overwrite_len = (target.len() - target_start).min(src.len());
+    target[target_start..target_start + overwrite_len].copy_from_slice(&src[..overwrite_len]);
+    target.extend_from_slice(&src[overwrite_len..]);
+}
diff --git a/src/frame/decompress.rs b/src/frame/decompress.rs
new file mode 100644
index 0000000..2b495e2
--- /dev/null
+++ b/src/frame/decompress.rs
@@ -0,0 +1,449 @@
+use std::{
+    convert::TryInto,
+    fmt,
+    hash::Hasher,
+    io::{self, BufRead, ErrorKind},
+    mem::size_of,
+};
+use twox_hash::XxHash32;
+
+use super::header::{
+    BlockInfo, BlockMode, FrameInfo, LZ4F_LEGACY_MAGIC_NUMBER, MAGIC_NUMBER_SIZE,
+    MAX_FRAME_INFO_SIZE, MIN_FRAME_INFO_SIZE,
+};
+use super::Error;
+use crate::{
+    block::WINDOW_SIZE,
+    sink::{vec_sink_for_decompression, SliceSink},
+};
+
+/// A reader for decompressing the LZ4 frame format
+///
+/// This Decoder wraps any other reader that implements `io::Read`.
+/// Bytes read will be decompressed according to the [LZ4 frame format](
+/// https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md).
+///
+/// # Example 1
+/// Deserializing json values out of a compressed file.
+///
+/// ```no_run
+/// let compressed_input = std::fs::File::open("datafile").unwrap();
+/// let mut decompressed_input = lz4_flex::frame::FrameDecoder::new(compressed_input);
+/// let json: serde_json::Value = serde_json::from_reader(decompressed_input).unwrap();
+/// ```
+///
+/// # Example 2
+/// Deserializing multiple json values out of a compressed file.
+///
+/// ```no_run
+/// let compressed_input = std::fs::File::open("datafile").unwrap();
+/// let mut decompressed_input = lz4_flex::frame::FrameDecoder::new(compressed_input);
+/// loop {
+///     match serde_json::from_reader::<_, serde_json::Value>(&mut decompressed_input) {
+///         Ok(json) => { println!("json {:?}", json); }
+///         Err(e) if e.is_eof() => break,
+///         Err(e) => panic!("{}", e),
+///     }
+/// }
+/// ```
+pub struct FrameDecoder<R: io::Read> {
+    /// The underlying reader.
+    r: R,
+    /// The FrameInfo of the frame currently being decoded.
+    /// It starts as `None` and is filled in once the FrameInfo is read from the input.
+    /// It's reset to `None` once the frame EndMarker is read from the input.
+    current_frame_info: Option<FrameInfo>,
+    /// Xxhash32 used when content checksum is enabled.
+    content_hasher: XxHash32,
+    /// Total length of decompressed output for the current frame.
+    content_len: u64,
+    /// The compressed bytes buffer, taken from the underlying reader.
+    src: Vec<u8>,
+    /// The decompressed bytes buffer. Bytes are decompressed from src to dst
+    /// before being passed back to the caller.
+    dst: Vec<u8>,
+    /// Index into dst and length: starting point of bytes previously output
+    /// that are still part of the decompressor window.
+    ext_dict_offset: usize,
+    ext_dict_len: usize,
+    /// Index into dst: starting point of bytes not yet read by caller.
+    dst_start: usize,
+    /// Index into dst: ending point of bytes not yet read by caller.
+    dst_end: usize,
+}
+impl<R: io::Read> FrameDecoder<R> {
+    /// Creates a new Decoder for the specified reader.
+    pub fn new(rdr: R) -> FrameDecoder<R> {
+        FrameDecoder {
+            r: rdr,
+            src: Default::default(),
+            dst: Default::default(),
+            ext_dict_offset: 0,
+            ext_dict_len: 0,
+            dst_start: 0,
+            dst_end: 0,
+            current_frame_info: None,
+            content_hasher: XxHash32::with_seed(0),
+            content_len: 0,
+        }
+    }
+
+    /// Gets a reference to the underlying reader in this decoder.
+    pub fn get_ref(&self) -> &R {
+        &self.r
+    }
+
+    /// Gets a mutable reference to the underlying reader in this decoder.
+    ///
+    /// Note that mutating the stream may produce surprising results if
+    /// this decoder continues to be used.
+    pub fn get_mut(&mut self) -> &mut R {
+        &mut self.r
+    }
+
+    /// Consumes the FrameDecoder and returns the underlying reader.
+    pub fn into_inner(self) -> R {
+        self.r
+    }
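As a quick usage sketch (illustrative only; `decompress_all` is a hypothetical helper), the decoder plugs straight into `io::Read`:

```rust
use std::io::Read;

fn decompress_all(compressed: &[u8]) -> std::io::Result<Vec<u8>> {
    // &[u8] implements io::Read, so it can back a FrameDecoder directly.
    let mut rdr = lz4_flex::frame::FrameDecoder::new(compressed);
    let mut out = Vec::new();
    rdr.read_to_end(&mut out)?;
    Ok(out)
}
```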
+    fn read_frame_info(&mut self) -> Result<usize, io::Error> {
+        let mut buffer = [0u8; MAX_FRAME_INFO_SIZE];
+
+        match self.r.read(&mut buffer[..MAGIC_NUMBER_SIZE])? {
+            0 => return Ok(0),
+            MAGIC_NUMBER_SIZE => (),
+            read => self.r.read_exact(&mut buffer[read..MAGIC_NUMBER_SIZE])?,
+        }
+
+        if u32::from_le_bytes(buffer[0..MAGIC_NUMBER_SIZE].try_into().unwrap())
+            != LZ4F_LEGACY_MAGIC_NUMBER
+        {
+            match self
+                .r
+                .read(&mut buffer[MAGIC_NUMBER_SIZE..MIN_FRAME_INFO_SIZE])?
+            {
+                0 => return Ok(0),
+                MIN_FRAME_INFO_SIZE => (),
+                read => self
+                    .r
+                    .read_exact(&mut buffer[MAGIC_NUMBER_SIZE + read..MIN_FRAME_INFO_SIZE])?,
+            }
+        }
+        let required = FrameInfo::read_size(&buffer[..MIN_FRAME_INFO_SIZE])?;
+        if required != MIN_FRAME_INFO_SIZE && required != MAGIC_NUMBER_SIZE {
+            self.r
+                .read_exact(&mut buffer[MIN_FRAME_INFO_SIZE..required])?;
+        }
+
+        let frame_info = FrameInfo::read(&buffer[..required])?;
+        if frame_info.dict_id.is_some() {
+            // Unsupported right now so it must be None
+            return Err(Error::DictionaryNotSupported.into());
+        }
+
+        let max_block_size = frame_info.block_size.get_size();
+        let dst_size = if frame_info.block_mode == BlockMode::Linked {
+            // In linked mode we consume the output (bumping dst_start) but leave the
+            // beginning of dst to be used as a prefix in subsequent blocks.
+            // That is, until we have at least `max_block_size + WINDOW_SIZE`
+            // bytes in dst, then we set up an ext_dict with the last WINDOW_SIZE bytes
+            // and the output goes to the beginning of dst again.
+            // Since we always want to be able to write a full block (up to max_block_size)
+            // we need a buffer with at least `max_block_size * 2 + WINDOW_SIZE` bytes.
+            max_block_size * 2 + WINDOW_SIZE
+        } else {
+            max_block_size
+        };
+        self.src.clear();
+        self.dst.clear();
+        self.src.reserve_exact(max_block_size);
+        self.dst.reserve_exact(dst_size);
+        self.current_frame_info = Some(frame_info);
+        self.content_hasher = XxHash32::with_seed(0);
+        self.content_len = 0;
+        self.ext_dict_len = 0;
+        self.dst_start = 0;
+        self.dst_end = 0;
+        Ok(required)
+    }
+
+    #[inline]
+    fn read_checksum(r: &mut R) -> Result<u32, io::Error> {
+        let mut checksum_buffer = [0u8; size_of::<u32>()];
+        r.read_exact(&mut checksum_buffer[..])?;
+        let checksum = u32::from_le_bytes(checksum_buffer);
+        Ok(checksum)
+    }
+
+    #[inline]
+    fn check_block_checksum(data: &[u8], expected_checksum: u32) -> Result<(), io::Error> {
+        let mut block_hasher = XxHash32::with_seed(0);
+        block_hasher.write(data);
+        let calc_checksum = block_hasher.finish() as u32;
+        if calc_checksum != expected_checksum {
+            return Err(Error::BlockChecksumError.into());
+        }
+        Ok(())
+    }
+
+    fn read_block(&mut self) -> io::Result<usize> {
+        debug_assert_eq!(self.dst_start, self.dst_end);
+        let frame_info = self.current_frame_info.as_ref().unwrap();
+
+        // Adjust dst buffer offsets to decompress the next block
+        let max_block_size = frame_info.block_size.get_size();
+        if frame_info.block_mode == BlockMode::Linked {
+            // In linked mode we consume the output (bumping dst_start) but leave the
+            // beginning of dst to be used as a prefix in subsequent blocks.
+            // That is, until we have at least `max_block_size + WINDOW_SIZE`
+            // bytes in dst, then we set up an ext_dict with the last WINDOW_SIZE bytes
+            // and the output goes to the beginning of dst again.
+            debug_assert_eq!(self.dst.capacity(), max_block_size * 2 + WINDOW_SIZE);
+            if self.dst_start + max_block_size > self.dst.capacity() {
+                // Output might not fit in the buffer.
+                // The ext_dict will become the last WINDOW_SIZE bytes
+                debug_assert!(self.dst_start >= max_block_size + WINDOW_SIZE);
+                self.ext_dict_offset = self.dst_start - WINDOW_SIZE;
+                self.ext_dict_len = WINDOW_SIZE;
+                // Output goes in the beginning of the buffer again.
+                self.dst_start = 0;
+                self.dst_end = 0;
+            } else if self.dst_start + self.ext_dict_len > WINDOW_SIZE {
+                // There are more than WINDOW_SIZE bytes of lookback between the prefix
+                // and ext_dict combined.
+                // Since we have a limited buffer we must shrink ext_dict in favor of the
+                // prefix, so that we can fit up to max_block_size bytes between dst_start
+                // and ext_dict start.
+                let delta = self
+                    .ext_dict_len
+                    .min(self.dst_start + self.ext_dict_len - WINDOW_SIZE);
+                self.ext_dict_offset += delta;
+                self.ext_dict_len -= delta;
+                debug_assert!(self.dst_start + self.ext_dict_len >= WINDOW_SIZE)
+            }
+        } else {
+            debug_assert_eq!(self.ext_dict_len, 0);
+            debug_assert_eq!(self.dst.capacity(), max_block_size);
+            self.dst_start = 0;
+            self.dst_end = 0;
+        }
+
+        // Read and decompress block
+        let block_info = {
+            let mut buffer = [0u8; 4];
+            if let Err(err) = self.r.read_exact(&mut buffer) {
+                if err.kind() == ErrorKind::UnexpectedEof {
+                    return Ok(0);
+                } else {
+                    return Err(err);
+                }
+            }
+            BlockInfo::read(&buffer)?
+        };
+        match block_info {
+            BlockInfo::Uncompressed(len) => {
+                let len = len as usize;
+                if len > max_block_size {
+                    return Err(Error::BlockTooBig.into());
+                }
+                // TODO: Attempt to avoid initialization of read buffer when
+                // https://github.com/rust-lang/rust/issues/42788 stabilizes
+                self.r.read_exact(vec_resize_and_get_mut(
+                    &mut self.dst,
+                    self.dst_start,
+                    self.dst_start + len,
+                ))?;
+                if frame_info.block_checksums {
+                    let expected_checksum = Self::read_checksum(&mut self.r)?;
+                    Self::check_block_checksum(
+                        &self.dst[self.dst_start..self.dst_start + len],
+                        expected_checksum,
+                    )?;
+                }
+
+                self.dst_end += len;
+                self.content_len += len as u64;
+            }
+            BlockInfo::Compressed(len) => {
+                let len = len as usize;
+                if len > max_block_size {
+                    return Err(Error::BlockTooBig.into());
+                }
+                // TODO: Attempt to avoid initialization of read buffer when
+                // https://github.com/rust-lang/rust/issues/42788 stabilizes
+                self.r
+                    .read_exact(vec_resize_and_get_mut(&mut self.src, 0, len))?;
+                if frame_info.block_checksums {
+                    let expected_checksum = Self::read_checksum(&mut self.r)?;
+                    Self::check_block_checksum(&self.src[..len], expected_checksum)?;
+                }
+
+                let with_dict_mode =
+                    frame_info.block_mode == BlockMode::Linked && self.ext_dict_len != 0;
+                let decomp_size = if with_dict_mode {
+                    debug_assert!(self.dst_start + max_block_size <= self.ext_dict_offset);
+                    let (head, tail) = self.dst.split_at_mut(self.ext_dict_offset);
+                    let ext_dict = &tail[..self.ext_dict_len];
+
+                    debug_assert!(head.len() - self.dst_start >= max_block_size);
+                    crate::block::decompress::decompress_internal::<true, _>(
+                        &self.src[..len],
+                        &mut SliceSink::new(head, self.dst_start),
+                        ext_dict,
+                    )
+                } else {
+                    // Independent blocks OR linked blocks with only prefix data
+                    debug_assert!(self.dst.capacity() - self.dst_start >= max_block_size);
+                    crate::block::decompress::decompress_internal::<false, _>(
+                        &self.src[..len],
+                        &mut vec_sink_for_decompression(
+                            &mut self.dst,
+                            0,
+                            self.dst_start,
+                            self.dst_start + max_block_size,
+                        ),
+                        b"",
+                    )
+                }
+                .map_err(Error::DecompressionError)?;
+
+                self.dst_end += decomp_size;
+                self.content_len += decomp_size as u64;
+            }
+
+            BlockInfo::EndMark => {
+                if let Some(expected) = frame_info.content_size {
+                    if self.content_len != expected {
+                        return Err(Error::ContentLengthError {
+                            expected,
+                            actual: self.content_len,
+                        }
+                        .into());
+                    }
+                }
+                if frame_info.content_checksum {
+                    let expected_checksum = Self::read_checksum(&mut self.r)?;
+                    let calc_checksum = self.content_hasher.finish() as u32;
+                    if calc_checksum != expected_checksum {
+                        return Err(Error::ContentChecksumError.into());
+                    }
+                }
+                self.current_frame_info = None;
+                return Ok(0);
+            }
+        }
+
+        // Content checksum, if applicable
+        if frame_info.content_checksum {
+            self.content_hasher
+                .write(&self.dst[self.dst_start..self.dst_end]);
+        }
+
+        Ok(self.dst_end - self.dst_start)
+    }
+
+    fn read_more(&mut self) -> io::Result<usize> {
+        if self.current_frame_info.is_none() && self.read_frame_info()? == 0 {
+            return Ok(0);
+        }
+        self.read_block()
+    }
+}
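Both the block and content checksums above follow the same recipe: XxHash32 with seed 0, compared against a little-endian `u32`. A self-contained sketch of that check (hypothetical helper, using the `twox_hash` dependency from Cargo.lock):

```rust
use std::hash::Hasher;
use twox_hash::XxHash32;

// Returns true if `data` hashes to the stored little-endian checksum.
fn block_checksum_ok(data: &[u8], stored_le: [u8; 4]) -> bool {
    let mut hasher = XxHash32::with_seed(0);
    hasher.write(data);
    (hasher.finish() as u32) == u32::from_le_bytes(stored_le)
}
```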
+
+impl<R: io::Read> io::Read for FrameDecoder<R> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        loop {
+            // Fill read buffer if there's uncompressed data left
+            if self.dst_start < self.dst_end {
+                let read_len = std::cmp::min(self.dst_end - self.dst_start, buf.len());
+                let dst_read_end = self.dst_start + read_len;
+                buf[..read_len].copy_from_slice(&self.dst[self.dst_start..dst_read_end]);
+                self.dst_start = dst_read_end;
+                return Ok(read_len);
+            }
+            if self.read_more()? == 0 {
+                return Ok(0);
+            }
+        }
+    }
+
+    fn read_to_string(&mut self, buf: &mut String) -> io::Result<usize> {
+        let mut written = 0;
+        loop {
+            match self.fill_buf() {
+                Ok(b) if b.is_empty() => return Ok(written),
+                Ok(b) => {
+                    let s = std::str::from_utf8(b).map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidData,
+                            "stream did not contain valid UTF-8",
+                        )
+                    })?;
+                    buf.push_str(s);
+                    let len = s.len();
+                    self.consume(len);
+                    written += len;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                Err(e) => return Err(e),
+            }
+        }
+    }
+
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        let mut written = 0;
+        loop {
+            match self.fill_buf() {
+                Ok(b) if b.is_empty() => return Ok(written),
+                Ok(b) => {
+                    buf.extend_from_slice(b);
+                    let len = b.len();
+                    self.consume(len);
+                    written += len;
+                }
+                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                Err(e) => return Err(e),
+            }
+        }
+    }
+}
+
+impl<R: io::Read> io::BufRead for FrameDecoder<R> {
+    fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        if self.dst_start == self.dst_end {
+            self.read_more()?;
+        }
+        Ok(&self.dst[self.dst_start..self.dst_end])
+    }
+
+    fn consume(&mut self, amt: usize) {
+        assert!(amt <= self.dst_end - self.dst_start);
+        self.dst_start += amt;
+    }
+}
+
+impl<R: io::Read + fmt::Debug> fmt::Debug for FrameDecoder<R> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("FrameDecoder")
+            .field("r", &self.r)
+            .field("content_hasher", &self.content_hasher)
+            .field("content_len", &self.content_len)
+            .field("src", &"[...]")
+            .field("dst", &"[...]")
+            .field("dst_start", &self.dst_start)
+            .field("dst_end", &self.dst_end)
+            .field("ext_dict_offset", &self.ext_dict_offset)
+            .field("ext_dict_len", &self.ext_dict_len)
+            .field("current_frame_info", &self.current_frame_info)
+            .finish()
+    }
+}
+
+/// Similar to `v.get_mut(start..end)` but will adjust the len if needed.
+#[inline]
+fn vec_resize_and_get_mut(v: &mut Vec<u8>, start: usize, end: usize) -> &mut [u8] {
+    if end > v.len() {
+        v.resize(end, 0)
+    }
+    &mut v[start..end]
+}
diff --git a/src/frame/header.rs b/src/frame/header.rs
new file mode 100644
index 0000000..1513c11
--- /dev/null
+++ b/src/frame/header.rs
@@ -0,0 +1,412 @@
+use twox_hash::XxHash32;
+
+use super::Error;
+use std::{
+    convert::TryInto,
+    fmt::Debug,
+    hash::Hasher,
+    io,
+    io::{Read, Write},
+};
+
+const FLG_RESERVED_MASK: u8 = 0b00000010;
+const FLG_VERSION_MASK: u8 = 0b11000000;
+const FLG_SUPPORTED_VERSION_BITS: u8 = 0b01000000;
+
+const FLG_INDEPENDENT_BLOCKS: u8 = 0b00100000;
+const FLG_BLOCK_CHECKSUMS: u8 = 0b00010000;
+const FLG_CONTENT_SIZE: u8 = 0b00001000;
+const FLG_CONTENT_CHECKSUM: u8 = 0b00000100;
+const FLG_DICTIONARY_ID: u8 = 0b00000001;
+
+const BD_RESERVED_MASK: u8 = !BD_BLOCK_SIZE_MASK;
+const BD_BLOCK_SIZE_MASK: u8 = 0b01110000;
+const BD_BLOCK_SIZE_MASK_RSHIFT: u8 = 4;
+
+const BLOCK_UNCOMPRESSED_SIZE_BIT: u32 = 0x80000000;
+
+const LZ4F_MAGIC_NUMBER: u32 = 0x184D2204;
+pub(crate) const LZ4F_LEGACY_MAGIC_NUMBER: u32 = 0x184C2102;
+const LZ4F_SKIPPABLE_MAGIC_RANGE: std::ops::RangeInclusive<u32> = 0x184D2A50..=0x184D2A5F;
+
+pub(crate) const MAGIC_NUMBER_SIZE: usize = 4;
+pub(crate) const MIN_FRAME_INFO_SIZE: usize = 7;
+pub(crate) const MAX_FRAME_INFO_SIZE: usize = 19;
+pub(crate) const BLOCK_INFO_SIZE: usize = 4;
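The FLG masks above pack the frame's feature flags into a single byte. A small sketch of how a decoder picks them apart (constants copied locally so it runs standalone):

```rust
fn main() {
    const FLG_VERSION_MASK: u8 = 0b1100_0000;
    const FLG_SUPPORTED_VERSION_BITS: u8 = 0b0100_0000;
    const FLG_INDEPENDENT_BLOCKS: u8 = 0b0010_0000;
    const FLG_CONTENT_CHECKSUM: u8 = 0b0000_0100;

    // Version 01, independent blocks, content checksum enabled.
    let flg: u8 = 0b0110_0100;
    assert_eq!(flg & FLG_VERSION_MASK, FLG_SUPPORTED_VERSION_BITS);
    assert_ne!(flg & FLG_INDEPENDENT_BLOCKS, 0);
    assert_ne!(flg & FLG_CONTENT_CHECKSUM, 0);
}
```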
+#[derive(Clone, Copy, PartialEq, Debug)]
+/// Different predefined block sizes to choose from when compressing data.
+#[derive(Default)]
+pub enum BlockSize {
+    /// Will detect optimal frame size based on the size of the first write call
+    #[default]
+    Auto = 0,
+    /// The default block size.
+    Max64KB = 4,
+    /// 256KB block size.
+    Max256KB = 5,
+    /// 1MB block size.
+    Max1MB = 6,
+    /// 4MB block size.
+    Max4MB = 7,
+    /// 8MB block size.
+    Max8MB = 8,
+}
+
+impl BlockSize {
+    /// Try to find optimal size based on passed buffer length.
+    pub(crate) fn from_buf_length(buf_len: usize) -> Self {
+        let mut blocksize = BlockSize::Max4MB;
+
+        for candidate in [BlockSize::Max256KB, BlockSize::Max64KB] {
+            if buf_len > candidate.get_size() {
+                return blocksize;
+            }
+            blocksize = candidate;
+        }
+        BlockSize::Max64KB
+    }
+
+    pub(crate) fn get_size(&self) -> usize {
+        match self {
+            BlockSize::Auto => unreachable!(),
+            BlockSize::Max64KB => 64 * 1024,
+            BlockSize::Max256KB => 256 * 1024,
+            BlockSize::Max1MB => 1024 * 1024,
+            BlockSize::Max4MB => 4 * 1024 * 1024,
+            BlockSize::Max8MB => 8 * 1024 * 1024,
+        }
+    }
+}
+
+#[derive(Clone, Copy, PartialEq, Debug)]
+/// The two block modes that can be set on [`FrameInfo`]
+#[derive(Default)]
+pub enum BlockMode {
+    /// Every block is compressed independently. The default.
+    #[default]
+    Independent,
+    /// Blocks can reference data from previous blocks.
+    ///
+    /// Effective when the stream contains small blocks.
+    Linked,
+}
+
+// From: https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md
+//
+// General Structure of LZ4 Frame format
+// -------------------------------------
+//
+// | MagicNb | F. Descriptor | Block | (...) | EndMark | C. Checksum |
+// |:-------:|:-------------:| ----- | ----- | ------- | ----------- |
+// | 4 bytes |  3-15 bytes   |       |       | 4 bytes |  0-4 bytes  |
+//
+// Frame Descriptor
+// ----------------
+//
+// | FLG     | BD      | (Content Size) | (Dictionary ID) | HC      |
+// | ------- | ------- |:--------------:|:---------------:| ------- |
+// | 1 byte  | 1 byte  |  0 - 8 bytes   |   0 - 4 bytes   | 1 byte  |
+//
+// __FLG byte__
+//
+// | BitNb   |  7-6  |   5   |    4     |  3   |    2     |    1     |  0   |
+// | ------- |-------|-------|----------|------|----------|----------|------|
+// |FieldName|Version|B.Indep|B.Checksum|C.Size|C.Checksum|*Reserved*|DictID|
+//
+// __BD byte__
+//
+// | BitNb   |    7     |     6-5-4     | 3-2-1-0  |
+// | ------- | -------- | ------------- | -------- |
+// |FieldName|*Reserved*| Block MaxSize |*Reserved*|
+//
+// Data Blocks
+// -----------
+//
+// | Block Size |  data  | (Block Checksum) |
+// |:----------:| ------ |:----------------:|
+// |  4 bytes   |        |   0 - 4 bytes    |
+//
+#[derive(Debug, Default, Clone)]
+/// The metadata for de/compressing with lz4 frame format.
+pub struct FrameInfo {
+    /// If set, includes the total uncompressed size of data in the frame.
+    pub content_size: Option<u64>,
+    /// The identifier for the dictionary that must be used to correctly decode data.
+    /// The compressor and the decompressor must use exactly the same dictionary.
+    ///
+    /// Note that this is currently unsupported and for this reason it's not pub.
+    pub(crate) dict_id: Option<u32>,
+    /// The maximum uncompressed size of each data block.
+    pub block_size: BlockSize,
+    /// The block mode.
+    pub block_mode: BlockMode,
+    /// If set, includes a checksum for each data block in the frame.
+    pub block_checksums: bool,
+    /// If set, includes a content checksum to verify that the full frame contents have been
+    /// decoded correctly.
+    pub content_checksum: bool,
+    /// If set, use the legacy frame format.
+    pub legacy_frame: bool,
+}
+
+impl FrameInfo {
+    /// Create a new `FrameInfo`.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Whether to include the total uncompressed size of data in the frame.
+    pub fn content_size(mut self, content_size: Option<u64>) -> Self {
+        self.content_size = content_size;
+        self
+    }
+
+    /// The maximum uncompressed size of each data block.
+    pub fn block_size(mut self, block_size: BlockSize) -> Self {
+        self.block_size = block_size;
+        self
+    }
+
+    /// The block mode.
+    pub fn block_mode(mut self, block_mode: BlockMode) -> Self {
+        self.block_mode = block_mode;
+        self
+    }
+
+    /// If set, includes a checksum for each data block in the frame.
+    pub fn block_checksums(mut self, block_checksums: bool) -> Self {
+        self.block_checksums = block_checksums;
+        self
+    }
+
+    /// If set, includes a content checksum to verify that the full frame contents have been
+    /// decoded correctly.
+    pub fn content_checksum(mut self, content_checksum: bool) -> Self {
+        self.content_checksum = content_checksum;
+        self
+    }
+
+    /// If set, use the legacy frame format.
+    pub fn legacy_frame(mut self, legacy_frame: bool) -> Self {
+        self.legacy_frame = legacy_frame;
+        self
+    }
+
+    pub(crate) fn read_size(input: &[u8]) -> Result<usize, Error> {
+        let mut required = MIN_FRAME_INFO_SIZE;
+        let magic_num = u32::from_le_bytes(input[0..4].try_into().unwrap());
+        if magic_num == LZ4F_LEGACY_MAGIC_NUMBER {
+            return Ok(MAGIC_NUMBER_SIZE);
+        }
+
+        if input.len() < required {
+            return Ok(required);
+        }
+
+        if LZ4F_SKIPPABLE_MAGIC_RANGE.contains(&magic_num) {
+            return Ok(8);
+        }
+        if magic_num != LZ4F_MAGIC_NUMBER {
+            return Err(Error::WrongMagicNumber);
+        }
+
+        if input[4] & FLG_CONTENT_SIZE != 0 {
+            required += 8;
+        }
+        if input[4] & FLG_DICTIONARY_ID != 0 {
+            required += 4;
+        }
+        Ok(required)
+    }
+
+    pub(crate) fn write_size(&self) -> usize {
+        let mut required = MIN_FRAME_INFO_SIZE;
+        if self.content_size.is_some() {
+            required += 8;
+        }
+        if self.dict_id.is_some() {
+            required += 4;
+        }
+        required
+    }
+
+    pub(crate) fn write(&self, output: &mut [u8]) -> Result<usize, Error> {
+        let write_size = self.write_size();
+        if output.len() < write_size {
+            return Err(Error::IoError(io::ErrorKind::UnexpectedEof.into()));
+        }
+        let mut buffer = [0u8; MAX_FRAME_INFO_SIZE];
+        assert!(write_size <= buffer.len());
+        buffer[0..4].copy_from_slice(&LZ4F_MAGIC_NUMBER.to_le_bytes());
+        buffer[4] = FLG_SUPPORTED_VERSION_BITS;
+        if self.block_checksums {
+            buffer[4] |= FLG_BLOCK_CHECKSUMS;
+        }
+        if self.content_checksum {
+            buffer[4] |= FLG_CONTENT_CHECKSUM;
+        }
+        if self.block_mode == BlockMode::Independent {
+            buffer[4] |= FLG_INDEPENDENT_BLOCKS;
+        }
+        buffer[5] = (self.block_size as u8) << BD_BLOCK_SIZE_MASK_RSHIFT;
+
+        // Optional section
+        let mut offset = 6;
+        if let Some(size) = self.content_size {
+            buffer[4] |= FLG_CONTENT_SIZE;
+            buffer[offset..offset + 8].copy_from_slice(&size.to_le_bytes());
+            offset += 8;
+        }
+        if let Some(dict_id) = self.dict_id {
+            buffer[4] |= FLG_DICTIONARY_ID;
+            buffer[offset..offset + 4].copy_from_slice(&dict_id.to_le_bytes());
+            offset += 4;
+        }
+
+        // Header checksum
+        let mut hasher = XxHash32::with_seed(0);
+        hasher.write(&buffer[4..offset]);
+        let header_checksum = (hasher.finish() >> 8) as u8;
+        buffer[offset] = header_checksum;
+        offset += 1;
+
+        debug_assert_eq!(offset, write_size);
+        output[..write_size].copy_from_slice(&buffer[..write_size]);
+        Ok(write_size)
+    }
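The builder methods above chain into a configured `FrameInfo` that can be handed to `FrameEncoder::with_frame_info`. A usage sketch (illustrative only, not part of the imported sources):

```rust
use std::io::Write;

fn main() -> std::io::Result<()> {
    let info = lz4_flex::frame::FrameInfo::new()
        .block_mode(lz4_flex::frame::BlockMode::Linked)
        .content_checksum(true);
    let mut enc = lz4_flex::frame::FrameEncoder::with_frame_info(info, Vec::new());
    enc.write_all(b"payload that benefits from linked blocks")?;
    // finish() returns the Vec; its Error converts into io::Error via `?`.
    let compressed = enc.finish()?;
    assert!(!compressed.is_empty());
    Ok(())
}
```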
+
+    pub(crate) fn read(mut input: &[u8]) -> Result<FrameInfo, Error> {
+        let original_input = input;
+        // 4 byte Magic
+        let magic_num = {
+            let mut buffer = [0u8; 4];
+            input.read_exact(&mut buffer)?;
+            u32::from_le_bytes(buffer)
+        };
+
+        if magic_num == LZ4F_LEGACY_MAGIC_NUMBER {
+            return Ok(FrameInfo {
+                block_size: BlockSize::Max8MB,
+                legacy_frame: true,
+                ..FrameInfo::default()
+            });
+        }
+        if LZ4F_SKIPPABLE_MAGIC_RANGE.contains(&magic_num) {
+            let mut buffer = [0u8; 4];
+            input.read_exact(&mut buffer)?;
+            let user_data_len = u32::from_le_bytes(buffer);
+            return Err(Error::SkippableFrame(user_data_len));
+        }
+        if magic_num != LZ4F_MAGIC_NUMBER {
+            return Err(Error::WrongMagicNumber);
+        }
+
+        // fixed size section
+        let [flg_byte, bd_byte] = {
+            let mut buffer = [0u8, 0];
+            input.read_exact(&mut buffer)?;
+            buffer
+        };
+
+        if flg_byte & FLG_VERSION_MASK != FLG_SUPPORTED_VERSION_BITS {
+            // version is always 01
+            return Err(Error::UnsupportedVersion(flg_byte & FLG_VERSION_MASK));
+        }
+
+        if flg_byte & FLG_RESERVED_MASK != 0 || bd_byte & BD_RESERVED_MASK != 0 {
+            return Err(Error::ReservedBitsSet);
+        }
+
+        let block_mode = if flg_byte & FLG_INDEPENDENT_BLOCKS != 0 {
+            BlockMode::Independent
+        } else {
+            BlockMode::Linked
+        };
+        let content_checksum = flg_byte & FLG_CONTENT_CHECKSUM != 0;
+        let block_checksums = flg_byte & FLG_BLOCK_CHECKSUMS != 0;
+
+        let block_size = match (bd_byte & BD_BLOCK_SIZE_MASK) >> BD_BLOCK_SIZE_MASK_RSHIFT {
+            i @ 0..=3 => return Err(Error::UnsupportedBlocksize(i)),
+            4 => BlockSize::Max64KB,
+            5 => BlockSize::Max256KB,
+            6 => BlockSize::Max1MB,
+            7 => BlockSize::Max4MB,
+            _ => unreachable!(),
+        };
+
+        // var len section
+        let mut content_size = None;
+        if flg_byte & FLG_CONTENT_SIZE != 0 {
+            let mut buffer = [0u8; 8];
+            input.read_exact(&mut buffer)?;
+            content_size = Some(u64::from_le_bytes(buffer));
+        }
+
+        let mut dict_id = None;
+        if flg_byte & FLG_DICTIONARY_ID != 0 {
+            let mut buffer = [0u8; 4];
+            input.read_exact(&mut buffer)?;
+            dict_id = Some(u32::from_le_bytes(buffer));
+        }
+
+        // 1 byte header checksum
+        let expected_checksum = {
+            let mut buffer = [0u8; 1];
+            input.read_exact(&mut buffer)?;
+            buffer[0]
+        };
+        let mut hasher = XxHash32::with_seed(0);
+        hasher.write(&original_input[4..original_input.len() - input.len() - 1]);
+        let header_hash = (hasher.finish() >> 8) as u8;
+        if header_hash != expected_checksum {
+            return Err(Error::HeaderChecksumError);
+        }
+
+        Ok(FrameInfo {
+            content_size,
+            dict_id,
+            block_size,
+            block_mode,
+            block_checksums,
+            content_checksum,
+            legacy_frame: false,
+        })
+    }
+}
+
+#[derive(Debug)]
+pub(crate) enum BlockInfo {
+    Compressed(u32),
+    Uncompressed(u32),
+    EndMark,
+}
+
+impl BlockInfo {
+    pub(crate) fn read(mut input: &[u8]) -> Result<Self, Error> {
+        let mut size_buffer = [0u8; 4];
+        input.read_exact(&mut size_buffer)?;
+        let size = u32::from_le_bytes(size_buffer);
+        if size == 0 {
+            Ok(BlockInfo::EndMark)
+        } else if size & BLOCK_UNCOMPRESSED_SIZE_BIT != 0 {
+            Ok(BlockInfo::Uncompressed(size & !BLOCK_UNCOMPRESSED_SIZE_BIT))
+        } else {
+            Ok(BlockInfo::Compressed(size))
+        }
+    }
+
+    pub(crate) fn write(&self, mut output: &mut [u8]) -> Result<usize, Error> {
+        let value = match self {
+            BlockInfo::Compressed(len) if *len == 0 => return Err(Error::InvalidBlockInfo),
+            BlockInfo::Compressed(len) | BlockInfo::Uncompressed(len)
+                if *len & BLOCK_UNCOMPRESSED_SIZE_BIT != 0 =>
+            {
+                return Err(Error::InvalidBlockInfo)
+            }
+            BlockInfo::Compressed(len) => *len,
+            BlockInfo::Uncompressed(len) => *len | BLOCK_UNCOMPRESSED_SIZE_BIT,
+            BlockInfo::EndMark => 0,
+        };
+        output.write_all(&value.to_le_bytes())?;
+        Ok(4)
+    }
+}
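The block-size word that `BlockInfo` models is a little-endian `u32` where zero marks the end of the frame and the high bit flags an uncompressed block. A standalone sketch of the same decoding rule (hypothetical function, constant copied locally):

```rust
#[derive(Debug, PartialEq)]
enum Block {
    Compressed(u32),
    Uncompressed(u32),
    EndMark,
}

fn decode_block_word(word: u32) -> Block {
    const UNCOMPRESSED_BIT: u32 = 0x8000_0000;
    if word == 0 {
        Block::EndMark
    } else if word & UNCOMPRESSED_BIT != 0 {
        Block::Uncompressed(word & !UNCOMPRESSED_BIT)
    } else {
        Block::Compressed(word)
    }
}

fn main() {
    assert_eq!(decode_block_word(0), Block::EndMark);
    assert_eq!(decode_block_word(100), Block::Compressed(100));
    assert_eq!(decode_block_word(100 | 0x8000_0000), Block::Uncompressed(100));
}
```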
diff --git a/src/frame/mod.rs b/src/frame/mod.rs
new file mode 100644
index 0000000..8acfad2
--- /dev/null
+++ b/src/frame/mod.rs
@@ -0,0 +1,111 @@
+//! LZ4 Frame Format
+//!
+//! As defined in <https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md>
+//!
+//! # Example: compress data on `stdin` with frame format
+//! This program reads data from `stdin`, compresses it and emits it to `stdout`.
+//! This example can be found in `examples/compress.rs`:
+//! ```no_run
+//! use std::io;
+//! let stdin = io::stdin();
+//! let stdout = io::stdout();
+//! let mut rdr = stdin.lock();
+//! // Wrap the stdout writer in a LZ4 Frame writer.
+//! let mut wtr = lz4_flex::frame::FrameEncoder::new(stdout.lock());
+//! io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
+//! wtr.finish().unwrap();
+//! ```
+//!
+
+use std::{fmt, io};
+
+#[cfg_attr(feature = "safe-encode", forbid(unsafe_code))]
+pub(crate) mod compress;
+#[cfg_attr(feature = "safe-decode", forbid(unsafe_code))]
+pub(crate) mod decompress;
+pub(crate) mod header;
+
+pub use compress::{AutoFinishEncoder, FrameEncoder};
+pub use decompress::FrameDecoder;
+pub use header::{BlockMode, BlockSize, FrameInfo};
+
+#[derive(Debug)]
+#[non_exhaustive]
+/// Errors that can occur when de/compressing lz4.
+pub enum Error {
+    /// Compression error.
+    CompressionError(crate::block::CompressError),
+    /// Decompression error.
+    DecompressionError(crate::block::DecompressError),
+    /// An io::Error was encountered.
+    IoError(io::Error),
+    /// Unsupported block size.
+    UnsupportedBlocksize(u8),
+    /// Unsupported frame version.
+    UnsupportedVersion(u8),
+    /// Wrong magic number for the LZ4 frame format.
+    WrongMagicNumber,
+    /// Reserved bits set.
+    ReservedBitsSet,
+    /// Block header is malformed.
+    InvalidBlockInfo,
+    /// Read a block larger than specified in the Frame header.
+    BlockTooBig,
+    /// The Frame header checksum doesn't match.
+    HeaderChecksumError,
+    /// The block checksum doesn't match.
+    BlockChecksumError,
+    /// The content checksum doesn't match.
+    ContentChecksumError,
+    /// Read a skippable frame.
+    /// The caller may read the specified amount of bytes from the underlying io::Read.
+    SkippableFrame(u32),
+    /// External dictionaries are not supported.
+    DictionaryNotSupported,
+    /// Content length differs.
+    ContentLengthError {
+        /// Expected content length.
+        expected: u64,
+        /// Actual content length.
+        actual: u64,
+    },
+}
+
+impl From<Error> for io::Error {
+    fn from(e: Error) -> Self {
+        match e {
+            Error::IoError(e) => e,
+            Error::CompressionError(_)
+            | Error::DecompressionError(_)
+            | Error::SkippableFrame(_)
+            | Error::DictionaryNotSupported => io::Error::new(io::ErrorKind::Other, e),
+            Error::WrongMagicNumber
+            | Error::UnsupportedBlocksize(..)
+            | Error::UnsupportedVersion(..)
+            | Error::ReservedBitsSet
+            | Error::InvalidBlockInfo
+            | Error::BlockTooBig
+            | Error::HeaderChecksumError
+            | Error::ContentChecksumError
+            | Error::BlockChecksumError
+            | Error::ContentLengthError { .. } => io::Error::new(io::ErrorKind::InvalidData, e),
+        }
+    }
+}
+
+impl From<io::Error> for Error {
+    fn from(e: io::Error) -> Self {
+        match e.get_ref().map(|e| e.downcast_ref::<Error>()) {
+            Some(_) => *e.into_inner().unwrap().downcast::<Error>().unwrap(),
+            None => Error::IoError(e),
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl std::error::Error for Error {}
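One subtlety worth illustrating: `Error::SkippableFrame(len)` is a recoverable condition that tells the caller to skip `len` bytes of input and continue. A handling sketch (illustrative; `skip_or_fail` is a hypothetical helper, and in practice the decoder surfaces this wrapped in an `io::Error`):

```rust
use lz4_flex::frame::Error;

// Decide how a caller might react to a frame-level error.
fn skip_or_fail(err: Error) -> Result<u64, Error> {
    match err {
        // The caller may consume `len` bytes from the underlying reader
        // and then resume decoding at the next frame.
        Error::SkippableFrame(len) => Ok(len as u64),
        other => Err(other),
    }
}
```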
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..103cd7e
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,109 @@
+//! Pure Rust, high performance implementation of LZ4 compression.
+//!
+//! A detailed explanation of the algorithm can be found [here](http://ticki.github.io/blog/how-lz4-works/).
+//!
+//! # Overview
+//!
+//! This crate provides two ways to use lz4. The first way is through the
+//! [`frame::FrameDecoder`](frame/struct.FrameDecoder.html)
+//! and
+//! [`frame::FrameEncoder`](frame/struct.FrameEncoder.html)
+//! types, which implement the `std::io::Read` and `std::io::Write` traits with the
+//! lz4 frame format. Unless you have a specific reason to the contrary, you
+//! should only use the lz4 frame format. Specifically, the lz4 frame format
+//! permits streaming compression or decompression.
+//!
+//! The second way is through the
+//! [`decompress_size_prepended`](fn.decompress_size_prepended.html)
+//! and
+//! [`compress_prepend_size`](fn.compress_prepend_size.html)
+//! functions. These functions provide access to the lz4 block format, and
+//! don't support a streaming interface directly. You should only use these types
+//! if you know you specifically need the lz4 block format.
+//!
+//! # Example: compress data on `stdin` with frame format
+//! This program reads data from `stdin`, compresses it and emits it to `stdout`.
+//! This example can be found in `examples/compress.rs`:
+//! ```no_run
+//! use std::io;
+//! let stdin = io::stdin();
+//! let stdout = io::stdout();
+//! let mut rdr = stdin.lock();
+//! // Wrap the stdout writer in a LZ4 Frame writer.
+//! let mut wtr = lz4_flex::frame::FrameEncoder::new(stdout.lock());
+//! io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
+//! wtr.finish().unwrap();
+//! ```
+//! # Example: decompress data on `stdin` with frame format
+//! This program reads data from `stdin`, decompresses it and emits it to `stdout`.
+//! This example can be found in `examples/decompress.rs`:
+//! ```no_run
+//! use std::io;
+//! let stdin = io::stdin();
+//! let stdout = io::stdout();
+//! // Wrap the stdin reader in a LZ4 FrameDecoder.
+//! let mut rdr = lz4_flex::frame::FrameDecoder::new(stdin.lock());
+//! let mut wtr = stdout.lock();
+//! io::copy(&mut rdr, &mut wtr).expect("I/O operation failed");
+//! ```
+//!
+//! # Example: block format roundtrip
+//! ```
+//! use lz4_flex::block::{compress_prepend_size, decompress_size_prepended};
+//! let input: &[u8] = b"Hello people, what's up?";
+//! let compressed = compress_prepend_size(input);
+//! let uncompressed = decompress_size_prepended(&compressed).unwrap();
+//! assert_eq!(input, uncompressed);
+//! ```
+//!
+//! ## Feature Flags
+//!
+//! - `safe-encode` uses only safe rust for encode. _enabled by default_
+//! - `safe-decode` uses only safe rust for decode. _enabled by default_
+//! - `frame` support for LZ4 frame format. _implies `std`, enabled by default_
+//! - `std` enables dependency on the standard library. _enabled by default_
+//!
+//! For maximum performance use `no-default-features`.
+//!
+//! For no_std support only the [`block format`](block/index.html) is supported.
+//!
+//!
+#![deny(warnings)]
+#![deny(missing_docs)]
+#![cfg_attr(not(feature = "std"), no_std)]
+#![cfg_attr(docsrs, feature(doc_cfg))]
+#![cfg_attr(nightly, feature(optimize_attribute))]
+
+#[cfg_attr(test, macro_use)]
+extern crate alloc;
+
+#[cfg(test)]
+#[macro_use]
+extern crate more_asserts;
+
+pub mod block;
+#[cfg(feature = "frame")]
+#[cfg_attr(docsrs, doc(cfg(feature = "frame")))]
+pub mod frame;
+
+#[allow(dead_code)]
+mod fastcpy;
+#[allow(dead_code)]
+mod fastcpy_unsafe;
+
+#[deprecated(
+    since = "0.11.0",
+    note = "This re-export is deprecated as it can be confused with the frame API and is not suitable for very large data, use block:: instead"
+)]
+pub use block::{compress, compress_into, compress_prepend_size};
+#[deprecated(
+    since = "0.11.0",
+    note = "This re-export is deprecated as it can be confused with the frame API and is not suitable for very large data, use block:: instead"
+)]
+pub use block::{decompress, decompress_into, decompress_size_prepended};
+
+#[cfg_attr(
+    all(feature = "safe-encode", feature = "safe-decode"),
+    forbid(unsafe_code)
+)]
+pub(crate) mod sink;
diff --git a/src/sink.rs b/src/sink.rs
new file mode 100644
index 0000000..a440a2d
--- /dev/null
+++ b/src/sink.rs
@@ -0,0 +1,330 @@
+#[allow(unused_imports)]
+use alloc::vec::Vec;
+
+use crate::fastcpy::slice_copy;
+
+/// Returns a Sink implementation appropriate for outputting up to `required_capacity`
+/// bytes at `vec[offset..offset+required_capacity]`.
+/// The vec is pre-filled with zeroes up to that capacity if necessary.
+/// The argument `pos` defines the initial output position in the Sink.
+#[inline]
+#[cfg(feature = "frame")]
+pub fn vec_sink_for_compression(
+    vec: &mut Vec<u8>,
+    offset: usize,
+    pos: usize,
+    required_capacity: usize,
+) -> SliceSink {
+    return {
+        vec.resize(offset + required_capacity, 0);
+        SliceSink::new(&mut vec[offset..], pos)
+    };
+}
+
+/// Returns a Sink implementation appropriate for outputting up to `required_capacity`
+/// bytes at `vec[offset..offset+required_capacity]`.
+/// The vec is pre-filled with zeroes up to that capacity if necessary.
+/// The argument `pos` defines the initial output position in the Sink.
+#[cfg(feature = "frame")]
+#[inline]
+pub fn vec_sink_for_decompression(
+    vec: &mut Vec<u8>,
+    offset: usize,
+    pos: usize,
+    required_capacity: usize,
+) -> SliceSink {
+    return {
+        vec.resize(offset + required_capacity, 0);
+        SliceSink::new(&mut vec[offset..], pos)
+    };
+}
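A standalone sketch of the pattern the `vec_sink_*` helpers above rely on: grow the Vec so a fixed-size window is valid, then write through a slice of it (illustrative only, plain std):

```rust
fn main() {
    let mut vec = vec![1u8, 2];
    let (offset, required) = (1usize, 4usize);
    vec.resize(offset + required, 0); // zero-fill up to the window end
    let window = &mut vec[offset..offset + required];
    window[0] = 9; // write at pos 0 of the window
    assert_eq!(vec, [1, 9, 0, 0, 0]);
}
```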
+
+pub trait Sink {
+    /// Returns a raw ptr to the first unfilled byte of the Sink. Analogous to `[pos..].as_ptr()`.
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn pos_mut_ptr(&mut self) -> *mut u8;
+
+    /// Reads the byte at position `pos`.
+    fn byte_at(&mut self, pos: usize) -> u8;
+
+    /// Pushes a byte to the end of the Sink.
+    #[cfg(feature = "safe-encode")]
+    fn push(&mut self, byte: u8);
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn base_mut_ptr(&mut self) -> *mut u8;
+
+    fn pos(&self) -> usize;
+
+    fn capacity(&self) -> usize;
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn set_pos(&mut self, new_pos: usize);
+
+    #[cfg(feature = "safe-decode")]
+    fn extend_with_fill(&mut self, byte: u8, len: usize);
+
+    /// Extends the Sink with `data`.
+    fn extend_from_slice(&mut self, data: &[u8]);
+
+    fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize);
+
+    /// Copies `len` bytes starting from `start` to the end of the Sink.
+    /// # Panics
+    /// Panics if `start` >= `pos`.
+    #[cfg(feature = "safe-decode")]
+    fn extend_from_within(&mut self, start: usize, wild_len: usize, copy_len: usize);
+
+    #[cfg(feature = "safe-decode")]
+    fn extend_from_within_overlapping(&mut self, start: usize, num_bytes: usize);
+}
+
+/// SliceSink is used as target to de/compress data into a preallocated and possibly
+/// uninitialized `&[u8]` space.
+///
+/// # Handling of Capacity
+/// Extend methods will panic if there's insufficient capacity left in the Sink.
+///
+/// # Invariants
+/// - Bytes `[..pos()]` are always initialized.
+pub struct SliceSink<'a> {
+    /// The working slice, which may contain uninitialized bytes
+    output: &'a mut [u8],
+    /// Number of bytes in start of `output` guaranteed to be initialized
+    pos: usize,
+}
+
+impl<'a> SliceSink<'a> {
+    /// Creates a `Sink` backed by the given byte slice.
+    /// `pos` defines the initial output position in the Sink.
+    /// # Panics
+    /// Panics if `pos` is out of bounds.
+    #[inline]
+    pub fn new(output: &'a mut [u8], pos: usize) -> Self {
+        // SAFETY: Caller guarantees that all elements of `output[..pos]` are initialized.
+        let _ = &mut output[..pos]; // bounds check pos
+        SliceSink { output, pos }
+    }
+}
+
+impl<'a> Sink for SliceSink<'a> {
+    /// Returns a raw ptr to the first unfilled byte of the Sink. Analogous to `[pos..].as_ptr()`.
+    #[inline]
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn pos_mut_ptr(&mut self) -> *mut u8 {
+        self.base_mut_ptr().add(self.pos()) as *mut u8
+    }
+
+    /// Reads the byte at position `pos`.
+    #[inline]
+    fn byte_at(&mut self, pos: usize) -> u8 {
+        self.output[pos]
+    }
+
+    /// Pushes a byte to the end of the Sink.
+    #[inline]
+    #[cfg(feature = "safe-encode")]
+    fn push(&mut self, byte: u8) {
+        self.output[self.pos] = byte;
+        self.pos += 1;
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn base_mut_ptr(&mut self) -> *mut u8 {
+        self.output.as_mut_ptr()
+    }
+
+    #[inline]
+    fn pos(&self) -> usize {
+        self.pos
+    }
+
+    #[inline]
+    fn capacity(&self) -> usize {
+        self.output.len()
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    #[inline]
+    unsafe fn set_pos(&mut self, new_pos: usize) {
+        debug_assert!(new_pos <= self.capacity());
+        self.pos = new_pos;
+    }
+
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    fn extend_with_fill(&mut self, byte: u8, len: usize) {
+        self.output[self.pos..self.pos + len].fill(byte);
+        self.pos += len;
+    }
+
+    /// Extends the Sink with `data`.
+    #[inline]
+    fn extend_from_slice(&mut self, data: &[u8]) {
+        self.extend_from_slice_wild(data, data.len())
+    }
+
+    #[inline]
+    fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
+        assert!(copy_len <= data.len());
+        slice_copy(data, &mut self.output[self.pos..(self.pos) + data.len()]);
+        self.pos += copy_len;
+    }
+
+    /// Copies `len` bytes starting from `start` to the end of the Sink.
+    /// # Panics
+    /// Panics if `start` >= `pos`.
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    fn extend_from_within(&mut self, start: usize, wild_len: usize, copy_len: usize) {
+        self.output.copy_within(start..start + wild_len, self.pos);
+        self.pos += copy_len;
+    }
+
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    #[cfg_attr(nightly, optimize(size))] // to avoid loop unrolling
+    fn extend_from_within_overlapping(&mut self, start: usize, num_bytes: usize) {
+        let offset = self.pos - start;
+        for i in start + offset..start + offset + num_bytes {
+            self.output[i] = self.output[i - offset];
+        }
+        self.pos += num_bytes;
+    }
+}
+
+/// PtrSink is used as target to de/compress data into a preallocated and possibly
+/// uninitialized `&[u8]` space.
+#[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+pub struct PtrSink {
+    /// The working pointer, which may point at uninitialized bytes
+    output: *mut u8,
+    /// Number of bytes in start of `output` guaranteed to be initialized
+    pos: usize,
+    /// Number of bytes available in output
+    cap: usize,
+}
+
+#[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+impl PtrSink {
+    /// Creates a `Sink` backed by the given byte vec's allocation.
+    /// `pos` defines the initial output position in the Sink.
+    #[inline]
+    pub fn from_vec(output: &mut Vec<u8>, pos: usize) -> Self {
+        // SAFETY: Bytes behind pointer may be uninitialized.
+        Self {
+            output: output.as_mut_ptr(),
+            pos,
+            cap: output.capacity(),
+        }
+    }
+}
+
+#[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+impl Sink for PtrSink {
+    /// Returns a raw ptr to the first unfilled byte of the Sink. Analogous to `[pos..].as_ptr()`.
+    #[inline]
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn pos_mut_ptr(&mut self) -> *mut u8 {
+        self.base_mut_ptr().add(self.pos()) as *mut u8
+    }
+
+    /// Reads the byte at position `pos`.
+    #[inline]
+    fn byte_at(&mut self, pos: usize) -> u8 {
+        unsafe { self.output.add(pos).read() }
+    }
+
+    /// Pushes a byte to the end of the Sink.
+    #[inline]
+    #[cfg(feature = "safe-encode")]
+    fn push(&mut self, byte: u8) {
+        unsafe {
+            self.pos_mut_ptr().write(byte);
+        }
+        self.pos += 1;
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    unsafe fn base_mut_ptr(&mut self) -> *mut u8 {
+        self.output
+    }
+
+    #[inline]
+    fn pos(&self) -> usize {
+        self.pos
+    }
+
+    #[inline]
+    fn capacity(&self) -> usize {
+        self.cap
+    }
+
+    #[cfg(not(all(feature = "safe-encode", feature = "safe-decode")))]
+    #[inline]
+    unsafe fn set_pos(&mut self, new_pos: usize) {
+        debug_assert!(new_pos <= self.capacity());
+        self.pos = new_pos;
+    }
+
+    #[inline]
+    #[cfg(feature = "safe-decode")]
+    fn extend_with_fill(&mut self, _byte: u8, _len: usize) {
+        unreachable!();
+    }
+
+    /// Extends the Sink with `data`.
+    #[inline]
+    fn extend_from_slice(&mut self, data: &[u8]) {
+        self.extend_from_slice_wild(data, data.len())
+    }
+
+    #[inline]
+    fn extend_from_slice_wild(&mut self, data: &[u8], copy_len: usize) {
+        assert!(copy_len <= data.len());
+        unsafe {
+            core::ptr::copy_nonoverlapping(data.as_ptr(), self.pos_mut_ptr(), copy_len);
+        }
+        self.pos += copy_len;
+    }
+
+    /// Copies `len` bytes starting from `start` to the end of the Sink.
+    /// # Panics
+    /// Panics if `start` >= `pos`.
+ #[inline] + #[cfg(feature = "safe-decode")] + fn extend_from_within(&mut self, _start: usize, _wild_len: usize, _copy_len: usize) { + unreachable!(); + } + + #[inline] + #[cfg(feature = "safe-decode")] + fn extend_from_within_overlapping(&mut self, _start: usize, _num_bytes: usize) { + unreachable!(); + } +} + +#[cfg(test)] +mod tests { + + #[test] + #[cfg(any(feature = "safe-encode", feature = "safe-decode"))] + fn test_sink_slice() { + use crate::sink::Sink; + use crate::sink::SliceSink; + use alloc::vec::Vec; + let mut data = Vec::new(); + data.resize(5, 0); + let sink = SliceSink::new(&mut data, 1); + assert_eq!(sink.pos(), 1); + assert_eq!(sink.capacity(), 5); + } +} -- cgit v1.2.3