diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 05:14:37 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 05:14:37 +0000 |
commit | a849759c3ef6d780e7b1157bd532b0fe3942a518 (patch) | |
tree | 425ab8ac9963150604207ffcd751ad6ae7b90056 | |
parent | 1faff9be927c85d1dfb151bc7975d02f697854df (diff) | |
parent | b078732699d802725219e0baf76cd6c3cc742513 (diff) | |
download | bstr-android14-mainline-uwb-release.tar.gz |
Snap for 10453563 from b078732699d802725219e0baf76cd6c3cc742513 to mainline-uwb-release aml_uwb_341513070 aml_uwb_341511050 aml_uwb_341310300 aml_uwb_341310030 aml_uwb_341111010 aml_uwb_341011000 android14-mainline-uwb-release
Change-Id: Id5a8a74ed260e3abd22f2acd837293fe5d12e34f
45 files changed, 1259 insertions, 690 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index ef4fb69..ff424fa 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,6 @@ { "git": { - "sha1": "e38e7a7ca986f9499b30202f49d79e531d14d192" - } -} + "sha1": "f72910a192f37e85932211bef957fbadaecefbaf" + }, + "path_in_vcs": "" +}
\ No newline at end of file @@ -44,19 +44,24 @@ rust_library { host_supported: true, crate_name: "bstr", cargo_env_compat: true, - cargo_pkg_version: "0.2.17", + cargo_pkg_version: "1.3.0", srcs: ["src/lib.rs"], - edition: "2018", + edition: "2021", features: [ + "alloc", "default", - "lazy_static", - "regex-automata", "std", "unicode", ], rustlibs: [ - "liblazy_static", "libmemchr", + "libonce_cell", "libregex_automata", ], + apex_available: [ + "//apex_available:platform", + "//apex_available:anyapex", + ], + product_available: true, + vendor_available: true, } @@ -1,8 +1,8 @@ This project is licensed under either of * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or - http://www.apache.org/licenses/LICENSE-2.0) + https://www.apache.org/licenses/LICENSE-2.0) * MIT license ([LICENSE-MIT](LICENSE-MIT) or - http://opensource.org/licenses/MIT) + https://opensource.org/licenses/MIT) at your option. @@ -10,32 +10,77 @@ # See Cargo.toml.orig for the original contents. [package] -edition = "2018" +edition = "2021" +rust-version = "1.60" name = "bstr" -version = "0.2.17" +version = "1.3.0" authors = ["Andrew Gallant <jamslam@gmail.com>"] exclude = ["/.github"] description = "A string type that is not required to be valid UTF-8." 
homepage = "https://github.com/BurntSushi/bstr" documentation = "https://docs.rs/bstr" readme = "README.md" -keywords = ["string", "str", "byte", "bytes", "text"] -categories = ["text-processing", "encoding"] +keywords = [ + "string", + "str", + "byte", + "bytes", + "text", +] +categories = [ + "text-processing", + "encoding", +] license = "MIT OR Apache-2.0" repository = "https://github.com/BurntSushi/bstr" +resolver = "2" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = [ + "--cfg", + "docsrs", +] + [profile.release] debug = true [lib] bench = false -[dependencies.lazy_static] -version = "1.2.0" -optional = true + +[[example]] +name = "graphemes" +required-features = [ + "std", + "unicode", +] + +[[example]] +name = "lines" +required-features = ["std"] + +[[example]] +name = "uppercase" +required-features = [ + "std", + "unicode", +] + +[[example]] +name = "words" +required-features = [ + "std", + "unicode", +] [dependencies.memchr] version = "2.4.0" default-features = false +[dependencies.once_cell] +version = "1.14.0" +optional = true + [dependencies.regex-automata] version = "0.1.5" optional = true @@ -45,6 +90,7 @@ default-features = false version = "1.0.85" optional = true default-features = false + [dev-dependencies.quickcheck] version = "1" default-features = false @@ -56,8 +102,18 @@ version = "0.1.3" version = "1.2.1" [features] -default = ["std", "unicode"] -serde1 = ["std", "serde1-nostd", "serde/std"] -serde1-nostd = ["serde"] -std = ["memchr/std"] -unicode = ["lazy_static", "regex-automata"] +alloc = ["serde?/alloc"] +default = [ + "std", + "unicode", +] +serde = ["dep:serde"] +std = [ + "alloc", + "memchr/std", + "serde?/std", +] +unicode = [ + "dep:once_cell", + "dep:regex-automata", +] diff --git a/Cargo.toml.orig b/Cargo.toml.orig index cbb6283..ef559d7 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "bstr" -version = "0.2.17" #:version +version = "1.3.0" #:version authors = ["Andrew Gallant 
<jamslam@gmail.com>"] description = "A string type that is not required to be valid UTF-8." documentation = "https://docs.rs/bstr" @@ -11,7 +11,9 @@ keywords = ["string", "str", "byte", "bytes", "text"] license = "MIT OR Apache-2.0" categories = ["text-processing", "encoding"] exclude = ["/.github"] -edition = "2018" +edition = "2021" +rust-version = "1.60" +resolver = "2" [workspace] members = ["bench"] @@ -21,14 +23,14 @@ bench = false [features] default = ["std", "unicode"] -std = ["memchr/std"] -unicode = ["lazy_static", "regex-automata"] -serde1 = ["std", "serde1-nostd", "serde/std"] -serde1-nostd = ["serde"] +std = ["alloc", "memchr/std", "serde?/std"] +alloc = ["serde?/alloc"] +unicode = ["dep:once_cell", "dep:regex-automata"] +serde = ["dep:serde"] [dependencies] memchr = { version = "2.4.0", default-features = false } -lazy_static = { version = "1.2.0", optional = true } +once_cell = { version = "1.14.0", optional = true } regex-automata = { version = "0.1.5", default-features = false, optional = true } serde = { version = "1.0.85", default-features = false, optional = true } @@ -37,5 +39,34 @@ quickcheck = { version = "1", default-features = false } ucd-parse = "0.1.3" unicode-segmentation = "1.2.1" +[package.metadata.docs.rs] +# We want to document all features. +all-features = true +# Since this crate's feature setup is pretty complicated, it is worth opting +# into a nightly unstable option to show the features that need to be enabled +# for public API items. To do that, we set 'docsrs', and when that's enabled, +# we enable the 'doc_auto_cfg' feature. 
+# +# To test this locally, run: +# +# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features +rustdoc-args = ["--cfg", "docsrs"] + [profile.release] debug = true + +[[example]] +name = "graphemes" +required-features = ["std", "unicode"] + +[[example]] +name = "lines" +required-features = ["std"] + +[[example]] +name = "uppercase" +required-features = ["std", "unicode"] + +[[example]] +name = "words" +required-features = ["std", "unicode"] @@ -1,3 +1,7 @@ +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update rust/crates/bstr +# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md + name: "bstr" description: "A string type that is not required to be valid UTF-8." third_party { @@ -7,13 +11,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/bstr/bstr-0.2.17.crate" + value: "https://static.crates.io/crates/bstr/bstr-1.3.0.crate" } - version: "0.2.17" + version: "1.3.0" license_type: NOTICE last_upgrade_date { - year: 2021 - month: 9 - day: 22 + year: 2023 + month: 3 + day: 2 } } @@ -6,7 +6,7 @@ differs from the standard library's `String` and `str` types in that they are not required to be valid UTF-8, but may be fully or partially valid UTF-8. [![Build status](https://github.com/BurntSushi/bstr/workflows/ci/badge.svg)](https://github.com/BurntSushi/bstr/actions) -[![](https://meritbadge.herokuapp.com/bstr)](https://crates.io/crates/bstr) +[![crates.io](https://img.shields.io/crates/v/bstr.svg)](https://crates.io/crates/bstr) ### Documentation @@ -17,7 +17,7 @@ https://docs.rs/bstr ### When should I use byte strings? See this part of the documentation for more details: -https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings. +<https://docs.rs/bstr/1.*/bstr/#when-should-i-use-byte-strings>. The short story is that byte strings are useful when it is inconvenient or incorrect to require valid UTF-8. 
@@ -29,7 +29,7 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -bstr = "0.2" +bstr = "1" ``` @@ -38,13 +38,11 @@ bstr = "0.2" The following two examples exhibit both the API features of byte strings and the I/O convenience functions provided for reading line-by-line quickly. -This first example simply shows how to efficiently iterate over lines in -stdin, and print out lines containing a particular substring: +This first example simply shows how to efficiently iterate over lines in stdin, +and print out lines containing a particular substring: ```rust -use std::error::Error; -use std::io::{self, Write}; - +use std::{error::Error, io::{self, Write}}; use bstr::{ByteSlice, io::BufReadExt}; fn main() -> Result<(), Box<dyn Error>> { @@ -65,9 +63,7 @@ This example shows how to count all of the words (Unicode-aware) in stdin, line-by-line: ```rust -use std::error::Error; -use std::io; - +use std::{error::Error, io}; use bstr::{ByteSlice, io::BufReadExt}; fn main() -> Result<(), Box<dyn Error>> { @@ -88,9 +84,7 @@ text, this is quite a bit faster than what you can (easily) do with standard library APIs. (N.B. Any invalid UTF-8 bytes are passed through unchanged.) ```rust -use std::error::Error; -use std::io::{self, Write}; - +use std::{error::Error, io::{self, Write}}; use bstr::{ByteSlice, io::BufReadExt}; fn main() -> Result<(), Box<dyn Error>> { @@ -113,9 +107,7 @@ clusters) from each line, where invalid UTF-8 sequences are generally treated as a single character and are passed through correctly: ```rust -use std::error::Error; -use std::io::{self, Write}; - +use std::{error::Error, io::{self, Write}}; use bstr::{ByteSlice, io::BufReadExt}; fn main() -> Result<(), Box<dyn Error>> { @@ -140,25 +132,27 @@ fn main() -> Result<(), Box<dyn Error>> { ### Cargo features -This crates comes with a few features that control standard library, serde -and Unicode support. +This crates comes with a few features that control standard library, serde and +Unicode support. 
* `std` - **Enabled** by default. This provides APIs that require the standard - library, such as `Vec<u8>`. + library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables + the `alloc` feature. +* `alloc` - **Enabled** by default. This provides APIs that require allocations + via the `alloc` crate, such as `Vec<u8>`. * `unicode` - **Enabled** by default. This provides APIs that require sizable Unicode data compiled into the binary. This includes, but is not limited to, grapheme/word/sentence segmenters. When this is disabled, basic support such - as UTF-8 decoding is still included. -* `serde1` - **Disabled** by default. Enables implementations of serde traits - for the `BStr` and `BString` types. -* `serde1-nostd` - **Disabled** by default. Enables implementations of serde - traits for the `BStr` type only, intended for use without the standard - library. Generally, you either want `serde1` or `serde1-nostd`, not both. + as UTF-8 decoding is still included. Note that currently, enabling this + feature also requires enabling the `std` feature. It is expected that this + limitation will be lifted at some point. +* `serde` - Enables implementations of serde traits for `BStr`, and also + `BString` when `alloc` is enabled. ### Minimum Rust version policy -This crate's minimum supported `rustc` version (MSRV) is `1.41.1`. +This crate's minimum supported `rustc` version (MSRV) is `1.60.0`. In general, this crate will be conservative with respect to the minimum supported version of Rust. MSRV may be bumped in minor version releases. @@ -166,27 +160,27 @@ supported version of Rust. MSRV may be bumped in minor version releases. ### Future work -Since this is meant to be a core crate, getting a `1.0` release is a priority. -My hope is to move to `1.0` within the next year and commit to its API so that -`bstr` can be used as a public dependency. 
+Since it is plausible that some of the types in this crate might end up in your +public API (e.g., `BStr` and `BString`), we will commit to being very +conservative with respect to new major version releases. It's difficult to say +precisely how conservative, but unless there is a major issue with the `1.0` +release, I wouldn't expect a `2.0` release to come out any sooner than some +period of years. A large part of the API surface area was taken from the standard library, so from an API design perspective, a good portion of this crate should be on solid -ground already. The main differences from the standard library are in how the -various substring search routines work. The standard library provides generic +ground. The main differences from the standard library are in how the various +substring search routines work. The standard library provides generic infrastructure for supporting different types of searches with a single method, where as this library prefers to define new methods for each type of search and drop the generic infrastructure. Some _probable_ future considerations for APIs include, but are not limited to: -* A convenience layer on top of the `aho-corasick` crate. * Unicode normalization. * More sophisticated support for dealing with Unicode case, perhaps by combining the use cases supported by [`caseless`](https://docs.rs/caseless) and [`unicase`](https://docs.rs/unicase). -* Add facilities for dealing with OS strings and file paths, probably via - simple conversion routines. Here are some examples that are _probably_ out of scope for this crate: @@ -208,16 +202,16 @@ achieved with the standard library `Vec<u8>`/`&[u8]` APIs and the ecosystem of library crates. For example: * The standard library's - [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html) - can be used for incremental lossy decoding of `&[u8]`. 
+ [`Utf8Error`](https://doc.rust-lang.org/std/str/struct.Utf8Error.html) can be + used for incremental lossy decoding of `&[u8]`. * The [`unicode-segmentation`](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html) crate can be used for iterating over graphemes (or words), but is only implemented for `&str` types. One could use `Utf8Error` above to implement grapheme iteration with the same semantics as what `bstr` provides (automatic Unicode replacement codepoint substitution). -* The [`twoway`](https://docs.rs/twoway) crate can be used for - fast substring searching on `&[u8]`. +* The [`twoway`](https://docs.rs/twoway) crate can be used for fast substring + searching on `&[u8]`. So why create `bstr`? Part of the point of the `bstr` crate is to provide a uniform API of coupled components instead of relying on users to piece together diff --git a/src/ascii.rs b/src/ascii.rs index bb2b679..259d41f 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -23,18 +23,18 @@ use core::mem; // means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to // _mm_movemask_epi8. -#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] const USIZE_BYTES: usize = mem::size_of::<usize>(); -#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES; // This is a mask where the most significant bit of each byte in the usize // is set. We test this bit to determine whether a character is ASCII or not. // Namely, a single byte is regarded as an ASCII codepoint if and only if it's // most significant bit is not set. 
-#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] const ASCII_MASK_U64: u64 = 0x8080808080808080; -#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] const ASCII_MASK: usize = ASCII_MASK_U64 as usize; /// Returns the index of the first non ASCII byte in the given slice. @@ -42,18 +42,18 @@ const ASCII_MASK: usize = ASCII_MASK_U64 as usize; /// If slice only contains ASCII bytes, then the length of the slice is /// returned. pub fn first_non_ascii_byte(slice: &[u8]) -> usize { - #[cfg(not(target_arch = "x86_64"))] + #[cfg(any(miri, not(target_arch = "x86_64")))] { first_non_ascii_byte_fallback(slice) } - #[cfg(target_arch = "x86_64")] + #[cfg(all(not(miri), target_arch = "x86_64"))] { first_non_ascii_byte_sse2(slice) } } -#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { let align = USIZE_BYTES - 1; let start_ptr = slice.as_ptr(); @@ -115,7 +115,7 @@ fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize { } } -#[cfg(target_arch = "x86_64")] +#[cfg(all(not(miri), target_arch = "x86_64"))] fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize { use core::arch::x86_64::*; @@ -221,7 +221,7 @@ unsafe fn first_non_ascii_byte_slow( /// bytes is not an ASCII byte. /// /// The position returned is always in the inclusive range [0, 7]. 
-#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] fn first_non_ascii_byte_mask(mask: usize) -> usize { #[cfg(target_endian = "little")] { @@ -245,7 +245,7 @@ unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { ptr.offset((amt as isize).wrapping_neg()) } -#[cfg(any(test, not(target_arch = "x86_64")))] +#[cfg(any(test, miri, not(target_arch = "x86_64")))] unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { use core::ptr; @@ -286,6 +286,7 @@ mod tests { #[test] #[cfg(target_arch = "x86_64")] + #[cfg(not(miri))] fn positive_sse2_forward() { for i in 0..517 { let b = "a".repeat(i).into_bytes(); @@ -294,6 +295,7 @@ mod tests { } #[test] + #[cfg(not(miri))] fn negative_fallback_forward() { for i in 0..517 { for align in 0..65 { @@ -315,6 +317,7 @@ mod tests { #[test] #[cfg(target_arch = "x86_64")] + #[cfg(not(miri))] fn negative_sse2_forward() { for i in 0..517 { for align in 0..65 { diff --git a/src/bstr.rs b/src/bstr.rs index 1e3c91b..5036f06 100644 --- a/src/bstr.rs +++ b/src/bstr.rs @@ -1,5 +1,8 @@ use core::mem; +#[cfg(feature = "alloc")] +use alloc::boxed::Box; + /// A wrapper for `&[u8]` that provides convenient string oriented trait impls. /// /// If you need ownership or a growable byte string buffer, then use @@ -33,8 +36,31 @@ pub struct BStr { } impl BStr { + /// Directly creates a `BStr` slice from anything that can be converted + /// to a byte slice. + /// + /// This is very similar to the [`B`](crate::B) function, except this + /// returns a `&BStr` instead of a `&[u8]`. + /// + /// This is a cost-free conversion. 
+ /// + /// # Example + /// + /// You can create `BStr`'s from byte arrays, byte slices or even string + /// slices: + /// + /// ``` + /// use bstr::BStr; + /// + /// let a = BStr::new(b"abc"); + /// let b = BStr::new(&b"abc"[..]); + /// let c = BStr::new("abc"); + /// + /// assert_eq!(a, b); + /// assert_eq!(a, c); + /// ``` #[inline] - pub(crate) fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &BStr { + pub fn new<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a BStr { BStr::from_bytes(bytes.as_ref()) } @@ -56,13 +82,13 @@ impl BStr { } #[inline] - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] pub(crate) fn from_boxed_bytes(slice: Box<[u8]>) -> Box<BStr> { unsafe { Box::from_raw(Box::into_raw(slice) as _) } } #[inline] - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] pub(crate) fn into_boxed_bytes(slice: Box<BStr>) -> Box<[u8]> { unsafe { Box::from_raw(Box::into_raw(slice) as _) } } diff --git a/src/bstring.rs b/src/bstring.rs index 30093ba..d144b1d 100644 --- a/src/bstring.rs +++ b/src/bstring.rs @@ -1,3 +1,5 @@ +use alloc::vec::Vec; + use crate::bstr::BStr; /// A wrapper for `Vec<u8>` that provides convenient string oriented trait @@ -38,16 +40,43 @@ use crate::bstr::BStr; /// region of memory containing the bytes, a length and a capacity. #[derive(Clone, Hash)] pub struct BString { - pub(crate) bytes: Vec<u8>, + bytes: Vec<u8>, } impl BString { + /// Constructs a new `BString` from the given [`Vec`]. 
+ /// + /// # Examples + /// + /// ``` + /// use bstr::BString; + /// + /// let mut b = BString::new(Vec::with_capacity(10)); + /// ``` + /// + /// This function is `const`: + /// + /// ``` + /// use bstr::BString; + /// + /// const B: BString = BString::new(vec![]); + /// ``` + #[inline] + pub const fn new(bytes: Vec<u8>) -> BString { + BString { bytes } + } + #[inline] pub(crate) fn as_bytes(&self) -> &[u8] { &self.bytes } #[inline] + pub(crate) fn as_bytes_mut(&mut self) -> &mut [u8] { + &mut self.bytes + } + + #[inline] pub(crate) fn as_bstr(&self) -> &BStr { BStr::new(&self.bytes) } @@ -56,4 +85,19 @@ impl BString { pub(crate) fn as_mut_bstr(&mut self) -> &mut BStr { BStr::new_mut(&mut self.bytes) } + + #[inline] + pub(crate) fn as_vec(&self) -> &Vec<u8> { + &self.bytes + } + + #[inline] + pub(crate) fn as_vec_mut(&mut self) -> &mut Vec<u8> { + &mut self.bytes + } + + #[inline] + pub(crate) fn into_vec(self) -> Vec<u8> { + self.bytes + } } diff --git a/src/byteset/mod.rs b/src/byteset/mod.rs index 043d309..c6c697c 100644 --- a/src/byteset/mod.rs +++ b/src/byteset/mod.rs @@ -1,4 +1,5 @@ use memchr::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; + mod scalar; #[inline] @@ -79,7 +80,7 @@ pub(crate) fn rfind_not(haystack: &[u8], byteset: &[u8]) -> Option<usize> { } } -#[cfg(test)] +#[cfg(all(test, feature = "std", not(miri)))] mod tests { quickcheck::quickcheck! { fn qc_byteset_forward_matches_naive( diff --git a/src/byteset/scalar.rs b/src/byteset/scalar.rs index 9bd34a8..28bff67 100644 --- a/src/byteset/scalar.rs +++ b/src/byteset/scalar.rs @@ -1,9 +1,8 @@ // This is adapted from `fallback.rs` from rust-memchr. It's modified to return -// the 'inverse' query of memchr, e.g. finding the first byte not in the provided -// set. This is simple for the 1-byte case. +// the 'inverse' query of memchr, e.g. finding the first byte not in the +// provided set. This is simple for the 1-byte case. 
-use core::cmp; -use core::usize; +use core::{cmp, usize}; #[cfg(target_pointer_width = "32")] const USIZE_BYTES: usize = 4; @@ -29,10 +28,11 @@ pub fn inv_memchr(n1: u8, haystack: &[u8]) -> Option<usize> { let loop_size = cmp::min(LOOP_SIZE, haystack.len()); let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); - let end_ptr = haystack[haystack.len()..].as_ptr(); - let mut ptr = start_ptr; unsafe { + let end_ptr = haystack.as_ptr().add(haystack.len()); + let mut ptr = start_ptr; + if haystack.len() < USIZE_BYTES { return forward_search(start_ptr, end_ptr, ptr, confirm); } @@ -68,10 +68,11 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { let loop_size = cmp::min(LOOP_SIZE, haystack.len()); let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); - let end_ptr = haystack[haystack.len()..].as_ptr(); - let mut ptr = end_ptr; unsafe { + let end_ptr = haystack.as_ptr().add(haystack.len()); + let mut ptr = end_ptr; + if haystack.len() < USIZE_BYTES { return reverse_search(start_ptr, end_ptr, ptr, confirm); } @@ -81,7 +82,7 @@ pub fn inv_memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { return reverse_search(start_ptr, end_ptr, ptr, confirm); } - ptr = (end_ptr as usize & !align) as *const u8; + ptr = ptr.sub(end_ptr as usize & align); debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); @@ -174,9 +175,10 @@ pub(crate) fn reverse_search_bytes<F: Fn(u8) -> bool>( } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { use super::{inv_memchr, inv_memrchr}; + // search string, search byte, inv_memchr result, inv_memrchr result. 
// these are expanded into a much larger set of tests in build_tests const TESTS: &[(&[u8], u8, usize, usize)] = &[ @@ -192,10 +194,15 @@ mod tests { type TestCase = (Vec<u8>, u8, Option<(usize, usize)>); fn build_tests() -> Vec<TestCase> { + #[cfg(not(miri))] + const MAX_PER: usize = 515; + #[cfg(miri)] + const MAX_PER: usize = 10; + let mut result = vec![]; for &(search, byte, fwd_pos, rev_pos) in TESTS { result.push((search.to_vec(), byte, Some((fwd_pos, rev_pos)))); - for i in 1..515 { + for i in 1..MAX_PER { // add a bunch of copies of the search byte to the end. let mut suffixed: Vec<u8> = search.into(); suffixed.extend(std::iter::repeat(byte).take(i)); @@ -225,7 +232,7 @@ mod tests { } // build non-matching tests for several sizes - for i in 0..515 { + for i in 0..MAX_PER { result.push(( std::iter::repeat(b'\0').take(i).collect(), b'\0', @@ -239,6 +246,12 @@ mod tests { #[test] fn test_inv_memchr() { use crate::{ByteSlice, B}; + + #[cfg(not(miri))] + const MAX_OFFSET: usize = 130; + #[cfg(miri)] + const MAX_OFFSET: usize = 13; + for (search, byte, matching) in build_tests() { assert_eq!( inv_memchr(byte, &search), @@ -256,13 +269,14 @@ mod tests { // better printing B(&search).as_bstr(), ); - // Test a rather large number off offsets for potential alignment issues - for offset in 1..130 { + // Test a rather large number off offsets for potential alignment + // issues. + for offset in 1..MAX_OFFSET { if offset >= search.len() { break; } - // If this would cause us to shift the results off the end, skip - // it so that we don't have to recompute them. + // If this would cause us to shift the results off the end, + // skip it so that we don't have to recompute them. 
if let Some((f, r)) = matching { if offset > f || offset > r { break; diff --git a/src/ext_slice.rs b/src/ext_slice.rs index 0cc73af..91af450 100644 --- a/src/ext_slice.rs +++ b/src/ext_slice.rs @@ -1,17 +1,16 @@ +use core::{iter, slice, str}; + +#[cfg(all(feature = "alloc", feature = "unicode"))] +use alloc::vec; +#[cfg(feature = "alloc")] +use alloc::{borrow::Cow, string::String, vec::Vec}; + #[cfg(feature = "std")] -use std::borrow::Cow; -#[cfg(feature = "std")] -use std::ffi::OsStr; -#[cfg(feature = "std")] -use std::path::Path; +use std::{ffi::OsStr, path::Path}; -use core::{iter, ops, ptr, slice, str}; use memchr::{memchr, memmem, memrchr}; -use crate::ascii; -use crate::bstr::BStr; -use crate::byteset; -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] use crate::ext_vec::ByteVec; #[cfg(feature = "unicode")] use crate::unicode::{ @@ -19,7 +18,12 @@ use crate::unicode::{ SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks, }; -use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error}; +use crate::{ + ascii, + bstr::BStr, + byteset, + utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error}, +}; /// A short-hand constructor for building a `&[u8]`. /// @@ -83,13 +87,30 @@ impl ByteSlice for [u8] { } } +impl<const N: usize> ByteSlice for [u8; N] { + #[inline] + fn as_bytes(&self) -> &[u8] { + self + } + + #[inline] + fn as_bytes_mut(&mut self) -> &mut [u8] { + self + } +} + /// Ensure that callers cannot implement `ByteSlice` by making an /// umplementable trait its super trait. -pub trait Sealed {} -impl Sealed for [u8] {} +mod private { + pub trait Sealed {} +} +impl private::Sealed for [u8] {} +impl<const N: usize> private::Sealed for [u8; N] {} /// A trait that extends `&[u8]` with string oriented methods. -pub trait ByteSlice: Sealed { +/// +/// This trait is sealed and cannot be implemented outside of `bstr`. +pub trait ByteSlice: private::Sealed { /// A method for accessing the raw bytes of this type. 
This is always a /// no-op and callers shouldn't care about it. This only exists for making /// the extension trait work. @@ -149,11 +170,12 @@ pub trait ByteSlice: Sealed { /// Create an immutable byte string from an OS string slice. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns `None` if the given OS string is not valid UTF-8. (For - /// example, on Windows, file paths are allowed to be a sequence of - /// arbitrary 16-bit integers. Not all such sequences can be transcoded to - /// valid UTF-8.) + /// When the underlying bytes of OS strings are accessible, then this + /// always succeeds and is zero cost. Otherwise, this returns `None` if the + /// given OS string is not valid UTF-8. (For example, when the underlying + /// bytes are inaccessible on Windows, file paths are allowed to be a + /// sequence of arbitrary 16-bit integers. Not all such sequences can be + /// transcoded to valid UTF-8.) /// /// # Examples /// @@ -190,10 +212,12 @@ pub trait ByteSlice: Sealed { /// Create an immutable byte string from a file path. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns `None` if the given path is not valid UTF-8. (For example, - /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit - /// integers. Not all such sequences can be transcoded to valid UTF-8.) + /// When the underlying bytes of paths are accessible, then this always + /// succeeds and is zero cost. Otherwise, this returns `None` if the given + /// path is not valid UTF-8. (For example, when the underlying bytes are + /// inaccessible on Windows, file paths are allowed to be a sequence of + /// arbitrary 16-bit integers. Not all such sequences can be transcoded to + /// valid UTF-8.) 
/// /// # Examples /// @@ -230,6 +254,7 @@ pub trait ByteSlice: Sealed { /// Basic usage: /// /// ``` + /// # #[cfg(feature = "alloc")] { /// use bstr::{B, ByteSlice, ByteVec}; /// /// # fn example() -> Result<(), bstr::Utf8Error> { @@ -241,6 +266,7 @@ pub trait ByteSlice: Sealed { /// let err = bstring.to_str().unwrap_err(); /// assert_eq!(8, err.valid_up_to()); /// # Ok(()) }; example().unwrap() + /// # } /// ``` #[inline] fn to_str(&self) -> Result<&str, Utf8Error> { @@ -301,7 +327,7 @@ pub trait ByteSlice: Sealed { /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/). /// For a more precise description of the maximal subpart strategy, see /// the Unicode Standard, Chapter 3, Section 9. See also - /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html). + /// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html). /// /// N.B. Rust's standard library also appears to use the same strategy, /// but it does not appear to be an API guarantee. @@ -341,7 +367,7 @@ pub trait ByteSlice: Sealed { /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62"); /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy()); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn to_str_lossy(&self) -> Cow<'_, str> { match utf8::validate(self.as_bytes()) { @@ -398,7 +424,7 @@ pub trait ByteSlice: Sealed { /// bstring.to_str_lossy_into(&mut dest); /// assert_eq!("☃βツ\u{FFFD}", dest); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn to_str_lossy_into(&self, dest: &mut String) { let mut bytes = self.as_bytes(); @@ -428,12 +454,15 @@ pub trait ByteSlice: Sealed { /// Create an OS string slice from this byte string. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns a UTF-8 decoding error if this byte string is not valid - /// UTF-8. (For example, on Windows, file paths are allowed to be a - /// sequence of arbitrary 16-bit integers. 
There is no obvious mapping from - /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of - /// 16-bit integers.) + /// When OS strings can be constructed from arbitrary byte sequences, this + /// always succeeds and is zero cost. Otherwise, this returns a UTF-8 + /// decoding error if this byte string is not valid UTF-8. (For example, + /// assuming the representation of `OsStr` is opaque on Windows, file paths + /// are allowed to be a sequence of arbitrary 16-bit integers. There is + /// no obvious mapping from an arbitrary sequence of 8-bit integers to an + /// arbitrary sequence of 16-bit integers. If the representation of `OsStr` + /// is even opened up, then this will convert any sequence of bytes to an + /// `OsStr` without cost.) /// /// # Examples /// @@ -467,13 +496,13 @@ pub trait ByteSlice: Sealed { /// Lossily create an OS string slice from this byte string. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. + /// When OS strings can be constructed from arbitrary byte sequences, this + /// is zero cost and always returns a slice. Otherwise, this will perform a + /// UTF-8 check and lossily convert this byte string into valid UTF-8 using + /// the Unicode replacement codepoint. /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. + /// Note that this can prevent the correct roundtripping of file paths when + /// the representation of `OsStr` is opaque. /// /// # Examples /// @@ -512,12 +541,15 @@ pub trait ByteSlice: Sealed { /// Create a path slice from this byte string. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns a UTF-8 decoding error if this byte string is not valid - /// UTF-8. 
(For example, on Windows, file paths are allowed to be a - /// sequence of arbitrary 16-bit integers. There is no obvious mapping from - /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of - /// 16-bit integers.) + /// When paths can be constructed from arbitrary byte sequences, this + /// always succeeds and is zero cost. Otherwise, this returns a UTF-8 + /// decoding error if this byte string is not valid UTF-8. (For example, + /// assuming the representation of `Path` is opaque on Windows, file paths + /// are allowed to be a sequence of arbitrary 16-bit integers. There is + /// no obvious mapping from an arbitrary sequence of 8-bit integers to an + /// arbitrary sequence of 16-bit integers. If the representation of `Path` + /// is even opened up, then this will convert any sequence of bytes to an + /// `Path` without cost.) /// /// # Examples /// @@ -537,13 +569,13 @@ pub trait ByteSlice: Sealed { /// Lossily create a path slice from this byte string. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. + /// When paths can be constructed from arbitrary byte sequences, this is + /// zero cost and always returns a slice. Otherwise, this will perform a + /// UTF-8 check and lossily convert this byte string into valid UTF-8 using + /// the Unicode replacement codepoint. /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. + /// Note that this can prevent the correct roundtripping of file paths when + /// the representation of `Path` is opaque. 
/// /// # Examples /// @@ -584,15 +616,10 @@ pub trait ByteSlice: Sealed { /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo")); /// assert_eq!(b"foo".repeatn(0), B("")); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn repeatn(&self, n: usize) -> Vec<u8> { - let bs = self.as_bytes(); - let mut dst = vec![0; bs.len() * n]; - for i in 0..n { - dst[i * bs.len()..(i + 1) * bs.len()].copy_from_slice(bs); - } - dst + self.as_bytes().repeat(n) } /// Returns true if and only if this byte string contains the given needle. @@ -759,10 +786,10 @@ pub trait ByteSlice: Sealed { /// assert_eq!(matches, vec![0]); /// ``` #[inline] - fn find_iter<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - needle: &'a B, - ) -> Find<'a> { + fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>( + &'h self, + needle: &'n B, + ) -> Find<'h, 'n> { Find::new(self.as_bytes(), needle.as_ref()) } @@ -804,10 +831,10 @@ pub trait ByteSlice: Sealed { /// assert_eq!(matches, vec![0]); /// ``` #[inline] - fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - needle: &'a B, - ) -> FindReverse<'a> { + fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>( + &'h self, + needle: &'n B, + ) -> FindReverse<'h, 'n> { FindReverse::new(self.as_bytes(), needle.as_ref()) } @@ -926,14 +953,17 @@ pub trait ByteSlice: Sealed { /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6)); /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4)); /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n")); + /// // The empty byteset never matches. + /// assert_eq!(None, b"abc".find_byteset(b"")); + /// assert_eq!(None, b"".find_byteset(b"")); /// ``` #[inline] fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { byteset::find(self.as_bytes(), byteset.as_ref()) } - /// Returns the index of the first occurrence of a byte that is not a member - /// of the provided set. + /// Returns the index of the first occurrence of a byte that is not a + /// member of the provided set. 
/// /// The `byteset` may be any type that can be cheaply converted into a /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but @@ -963,6 +993,10 @@ pub trait ByteSlice: Sealed { /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4)); /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2)); /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0)); + /// // The negation of the empty byteset matches everything. + /// assert_eq!(Some(0), b"abc".find_not_byteset(b"")); + /// // But an empty string never contains anything. + /// assert_eq!(None, b"".find_not_byteset(b"")); /// ``` #[inline] fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { @@ -1043,8 +1077,9 @@ pub trait ByteSlice: Sealed { byteset::rfind_not(self.as_bytes(), byteset.as_ref()) } - /// Returns an iterator over the fields in a byte string, separated by - /// contiguous whitespace. + /// Returns an iterator over the fields in a byte string, separated + /// by contiguous whitespace (according to the Unicode property + /// `White_Space`). /// /// # Example /// @@ -1065,6 +1100,7 @@ pub trait ByteSlice: Sealed { /// /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count()); /// ``` + #[cfg(feature = "unicode")] #[inline] fn fields(&self) -> Fields<'_> { Fields::new(self.as_bytes()) @@ -1191,10 +1227,10 @@ pub trait ByteSlice: Sealed { /// It does *not* give you `["a", "b", "c"]`. For that behavior, use /// [`fields`](#method.fields) instead. #[inline] - fn split_str<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - splitter: &'a B, - ) -> Split<'a> { + fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>( + &'h self, + splitter: &'s B, + ) -> Split<'h, 's> { Split::new(self.as_bytes(), splitter.as_ref()) } @@ -1285,13 +1321,101 @@ pub trait ByteSlice: Sealed { /// /// It does *not* give you `["a", "b", "c"]`. 
#[inline] - fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, - splitter: &'a B, - ) -> SplitReverse<'a> { + fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>( + &'h self, + splitter: &'s B, + ) -> SplitReverse<'h, 's> { SplitReverse::new(self.as_bytes(), splitter.as_ref()) } + /// Split this byte string at the first occurrence of `splitter`. + /// + /// If the `splitter` is found in the byte string, returns a tuple + /// containing the parts of the string before and after the first occurrence + /// of `splitter` respectively. Otherwise, if there are no occurrences of + /// `splitter` in the byte string, returns `None`. + /// + /// The splitter may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// If you need to split on the *last* instance of a delimiter instead, see + /// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method . + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!( + /// B("foo,bar").split_once_str(","), + /// Some((B("foo"), B("bar"))), + /// ); + /// assert_eq!( + /// B("foo,bar,baz").split_once_str(","), + /// Some((B("foo"), B("bar,baz"))), + /// ); + /// assert_eq!(B("foo").split_once_str(","), None); + /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B("")))); + /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo")))); + /// ``` + #[inline] + fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + splitter: &B, + ) -> Option<(&'a [u8], &'a [u8])> { + let bytes = self.as_bytes(); + let splitter = splitter.as_ref(); + let start = Finder::new(splitter).find(bytes)?; + let end = start + splitter.len(); + Some((&bytes[..start], &bytes[end..])) + } + + /// Split this byte string at the last occurrence of `splitter`. 
+ /// + /// If the `splitter` is found in the byte string, returns a tuple + /// containing the parts of the string before and after the last occurrence + /// of `splitter`, respectively. Otherwise, if there are no occurrences of + /// `splitter` in the byte string, returns `None`. + /// + /// The splitter may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// If you need to split on the *first* instance of a delimiter instead, see + /// the [`ByteSlice::split_once_str`](#method.split_once_str) method. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// assert_eq!( + /// B("foo,bar").rsplit_once_str(","), + /// Some((B("foo"), B("bar"))), + /// ); + /// assert_eq!( + /// B("foo,bar,baz").rsplit_once_str(","), + /// Some((B("foo,bar"), B("baz"))), + /// ); + /// assert_eq!(B("foo").rsplit_once_str(","), None); + /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B("")))); + /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo")))); + /// ``` + #[inline] + fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>( + &'a self, + splitter: &B, + ) -> Option<(&'a [u8], &'a [u8])> { + let bytes = self.as_bytes(); + let splitter = splitter.as_ref(); + let start = FinderReverse::new(splitter).rfind(bytes)?; + let end = start + splitter.len(); + Some((&bytes[..start], &bytes[end..])) + } + /// Returns an iterator of at most `limit` substrings of this byte string, /// separated by the given byte string. If `limit` substrings are yielded, /// then the last substring will contain the remainder of this byte string. 
@@ -1328,11 +1452,11 @@ pub trait ByteSlice: Sealed { /// assert!(x.is_empty()); /// ``` #[inline] - fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, + fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>( + &'h self, limit: usize, - splitter: &'a B, - ) -> SplitN<'a> { + splitter: &'s B, + ) -> SplitN<'h, 's> { SplitN::new(self.as_bytes(), splitter.as_ref(), limit) } @@ -1374,11 +1498,11 @@ pub trait ByteSlice: Sealed { /// assert!(x.is_empty()); /// ``` #[inline] - fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>( - &'a self, + fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>( + &'h self, limit: usize, - splitter: &'a B, - ) -> SplitNReverse<'a> { + splitter: &'s B, + ) -> SplitNReverse<'h, 's> { SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit) } @@ -1416,7 +1540,7 @@ pub trait ByteSlice: Sealed { /// let s = b"foo".replace("", "Z"); /// assert_eq!(s, "ZfZoZoZ".as_bytes()); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, @@ -1462,7 +1586,7 @@ pub trait ByteSlice: Sealed { /// let s = b"foo".replacen("", "Z", 2); /// assert_eq!(s, "ZfZoo".as_bytes()); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, @@ -1520,7 +1644,7 @@ pub trait ByteSlice: Sealed { /// s.replace_into("", "Z", &mut dest); /// assert_eq!(dest, "ZfZoZoZ".as_bytes()); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, @@ -1584,7 +1708,7 @@ pub trait ByteSlice: Sealed { /// s.replacen_into("", "Z", 2, &mut dest); /// assert_eq!(dest, "ZfZoo".as_bytes()); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, @@ -1795,11 +1919,12 @@ pub trait ByteSlice: Sealed { /// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes); /// ``` /// - /// This example shows what happens when invalid UTF-8 is 
enountered. Note + /// This example shows what happens when invalid UTF-8 is encountered. Note /// that the offsets are valid indices into the original string, and do /// not necessarily correspond to the length of the `&str` returned! /// /// ``` + /// # #[cfg(all(feature = "alloc"))] { /// use bstr::{ByteSlice, ByteVec}; /// /// let mut bytes = vec![]; @@ -1813,6 +1938,7 @@ pub trait ByteSlice: Sealed { /// graphemes, /// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")] /// ); + /// # } /// ``` #[cfg(feature = "unicode")] #[inline] @@ -2277,7 +2403,7 @@ pub trait ByteSlice: Sealed { /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes()); /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] + #[cfg(all(feature = "alloc", feature = "unicode"))] #[inline] fn to_lowercase(&self) -> Vec<u8> { let mut buf = vec![]; @@ -2339,7 +2465,7 @@ pub trait ByteSlice: Sealed { /// s.to_lowercase_into(&mut buf); /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes()); /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] + #[cfg(all(feature = "alloc", feature = "unicode"))] #[inline] fn to_lowercase_into(&self, buf: &mut Vec<u8>) { // TODO: This is the best we can do given what std exposes I think. 
@@ -2394,7 +2520,7 @@ pub trait ByteSlice: Sealed { /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz")); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn to_ascii_lowercase(&self) -> Vec<u8> { self.as_bytes().to_ascii_lowercase() @@ -2424,11 +2550,13 @@ pub trait ByteSlice: Sealed { /// Invalid UTF-8 remains as is: /// /// ``` + /// # #[cfg(feature = "alloc")] { /// use bstr::{B, ByteSlice, ByteVec}; /// /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ"); /// s.make_ascii_lowercase(); /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz")); + /// # } /// ``` #[inline] fn make_ascii_lowercase(&mut self) { @@ -2480,7 +2608,7 @@ pub trait ByteSlice: Sealed { /// let s = B(b"foo\xFFbar\xE2\x98baz"); /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ")); /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] + #[cfg(all(feature = "alloc", feature = "unicode"))] #[inline] fn to_uppercase(&self) -> Vec<u8> { let mut buf = vec![]; @@ -2542,7 +2670,7 @@ pub trait ByteSlice: Sealed { /// s.to_uppercase_into(&mut buf); /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ")); /// ``` - #[cfg(all(feature = "std", feature = "unicode"))] + #[cfg(all(feature = "alloc", feature = "unicode"))] #[inline] fn to_uppercase_into(&self, buf: &mut Vec<u8>) { // TODO: This is the best we can do given what std exposes I think. 
@@ -2594,7 +2722,7 @@ pub trait ByteSlice: Sealed { /// let s = B(b"foo\xFFbar\xE2\x98baz"); /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ")); /// ``` - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] #[inline] fn to_ascii_uppercase(&self) -> Vec<u8> { self.as_bytes().to_ascii_uppercase() @@ -2624,11 +2752,13 @@ pub trait ByteSlice: Sealed { /// Invalid UTF-8 remains as is: /// /// ``` + /// # #[cfg(feature = "alloc")] { /// use bstr::{B, ByteSlice, ByteVec}; /// /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz"); /// s.make_ascii_uppercase(); /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ")); + /// # } /// ``` #[inline] fn make_ascii_uppercase(&mut self) { @@ -2900,72 +3030,6 @@ pub trait ByteSlice: Sealed { Some(index) } } - - /// Copies elements from one part of the slice to another part of itself, - /// where the parts may be overlapping. - /// - /// `src` is the range within this byte string to copy from, while `dest` - /// is the starting index of the range within this byte string to copy to. - /// The length indicated by `src` must be less than or equal to the number - /// of bytes from `dest` to the end of the byte string. - /// - /// # Panics - /// - /// Panics if either range is out of bounds, or if `src` is too big to fit - /// into `dest`, or if the end of `src` is before the start. - /// - /// # Examples - /// - /// Copying four bytes within a byte string: - /// - /// ``` - /// use bstr::{B, ByteSlice}; - /// - /// let mut buf = *b"Hello, World!"; - /// let s = &mut buf; - /// s.copy_within_str(1..5, 8); - /// assert_eq!(s, B("Hello, Wello!")); - /// ``` - #[inline] - fn copy_within_str<R>(&mut self, src: R, dest: usize) - where - R: ops::RangeBounds<usize>, - { - // TODO: Deprecate this once slice::copy_within stabilizes. 
- let src_start = match src.start_bound() { - ops::Bound::Included(&n) => n, - ops::Bound::Excluded(&n) => { - n.checked_add(1).expect("attempted to index slice beyond max") - } - ops::Bound::Unbounded => 0, - }; - let src_end = match src.end_bound() { - ops::Bound::Included(&n) => { - n.checked_add(1).expect("attempted to index slice beyond max") - } - ops::Bound::Excluded(&n) => n, - ops::Bound::Unbounded => self.as_bytes().len(), - }; - assert!(src_start <= src_end, "src end is before src start"); - assert!(src_end <= self.as_bytes().len(), "src is out of bounds"); - let count = src_end - src_start; - assert!( - dest <= self.as_bytes().len() - count, - "dest is out of bounds", - ); - - // SAFETY: This is safe because we use ptr::copy to handle overlapping - // copies, and is also safe because we've checked all the bounds above. - // Finally, we are only dealing with u8 data, which is Copy, which - // means we can copy without worrying about ownership/destructors. - unsafe { - ptr::copy( - self.as_bytes().get_unchecked(src_start), - self.as_bytes_mut().get_unchecked_mut(dest), - count, - ); - } - } } /// A single substring searcher fixed to a particular needle. @@ -3138,22 +3202,22 @@ impl<'a> FinderReverse<'a> { /// /// Matches are reported by the byte offset at which they begin. /// -/// `'a` is the shorter of two lifetimes: the byte string being searched or the -/// byte string being looked for. +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. 
#[derive(Debug)] -pub struct Find<'a> { - it: memmem::FindIter<'a, 'a>, - haystack: &'a [u8], - needle: &'a [u8], +pub struct Find<'h, 'n> { + it: memmem::FindIter<'h, 'n>, + haystack: &'h [u8], + needle: &'n [u8], } -impl<'a> Find<'a> { - fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> { +impl<'h, 'n> Find<'h, 'n> { + fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> { Find { it: memmem::find_iter(haystack, needle), haystack, needle } } } -impl<'a> Iterator for Find<'a> { +impl<'h, 'n> Iterator for Find<'h, 'n> { type Item = usize; #[inline] @@ -3166,17 +3230,17 @@ impl<'a> Iterator for Find<'a> { /// /// Matches are reported by the byte offset at which they begin. /// -/// `'a` is the shorter of two lifetimes: the byte string being searched or the -/// byte string being looked for. +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. #[derive(Debug)] -pub struct FindReverse<'a> { - it: memmem::FindRevIter<'a, 'a>, - haystack: &'a [u8], - needle: &'a [u8], +pub struct FindReverse<'h, 'n> { + it: memmem::FindRevIter<'h, 'n>, + haystack: &'h [u8], + needle: &'n [u8], } -impl<'a> FindReverse<'a> { - fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> { +impl<'h, 'n> FindReverse<'h, 'n> { + fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> { FindReverse { it: memmem::rfind_iter(haystack, needle), haystack, @@ -3184,16 +3248,16 @@ impl<'a> FindReverse<'a> { } } - fn haystack(&self) -> &'a [u8] { + fn haystack(&self) -> &'h [u8] { self.haystack } - fn needle(&self) -> &[u8] { + fn needle(&self) -> &'n [u8] { self.needle } } -impl<'a> Iterator for FindReverse<'a> { +impl<'h, 'n> Iterator for FindReverse<'h, 'n> { type Item = usize; #[inline] @@ -3215,7 +3279,7 @@ impl<'a> Bytes<'a> { /// This has the same lifetime as the original slice, /// and so the iterator can continue to be used while this exists. 
#[inline] - pub fn as_slice(&self) -> &'a [u8] { + pub fn as_bytes(&self) -> &'a [u8] { self.it.as_slice() } } @@ -3252,21 +3316,27 @@ impl<'a> iter::FusedIterator for Bytes<'a> {} /// An iterator over the fields in a byte string, separated by whitespace. /// +/// Whitespace for this iterator is defined by the Unicode property +/// `White_Space`. +/// /// This iterator splits on contiguous runs of whitespace, such that the fields /// in `foo\t\t\n \nbar` are `foo` and `bar`. /// /// `'a` is the lifetime of the byte string being split. +#[cfg(feature = "unicode")] #[derive(Debug)] pub struct Fields<'a> { it: FieldsWith<'a, fn(char) -> bool>, } +#[cfg(feature = "unicode")] impl<'a> Fields<'a> { fn new(bytes: &'a [u8]) -> Fields<'a> { Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) } } } +#[cfg(feature = "unicode")] impl<'a> Iterator for Fields<'a> { type Item = &'a [u8]; @@ -3328,10 +3398,11 @@ impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> { /// An iterator over substrings in a byte string, split by a separator. /// -/// `'a` is the lifetime of the byte string being split. +/// `'h` is the lifetime of the byte string being split (the haystack), while +/// `'s` is the lifetime of the byte string doing the splitting. #[derive(Debug)] -pub struct Split<'a> { - finder: Find<'a>, +pub struct Split<'h, 's> { + finder: Find<'h, 's>, /// The end position of the previous match of our splitter. The element /// we yield corresponds to the substring starting at `last` up to the /// beginning of the next match of the splitter. 
@@ -3342,18 +3413,18 @@ pub struct Split<'a> { done: bool, } -impl<'a> Split<'a> { - fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> { +impl<'h, 's> Split<'h, 's> { + fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> { let finder = haystack.find_iter(splitter); Split { finder, last: 0, done: false } } } -impl<'a> Iterator for Split<'a> { - type Item = &'a [u8]; +impl<'h, 's> Iterator for Split<'h, 's> { + type Item = &'h [u8]; #[inline] - fn next(&mut self) -> Option<&'a [u8]> { + fn next(&mut self) -> Option<&'h [u8]> { let haystack = self.finder.haystack; match self.finder.next() { Some(start) => { @@ -3383,11 +3454,11 @@ impl<'a> Iterator for Split<'a> { /// An iterator over substrings in a byte string, split by a separator, in /// reverse. /// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. +/// `'h` is the lifetime of the byte string being split (the haystack), while +/// `'s` is the lifetime of the byte string doing the splitting. #[derive(Debug)] -pub struct SplitReverse<'a> { - finder: FindReverse<'a>, +pub struct SplitReverse<'h, 's> { + finder: FindReverse<'h, 's>, /// The end position of the previous match of our splitter. The element /// we yield corresponds to the substring starting at `last` up to the /// beginning of the next match of the splitter. 
@@ -3398,18 +3469,18 @@ pub struct SplitReverse<'a> { done: bool, } -impl<'a> SplitReverse<'a> { - fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> { +impl<'h, 's> SplitReverse<'h, 's> { + fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> { let finder = haystack.rfind_iter(splitter); SplitReverse { finder, last: haystack.len(), done: false } } } -impl<'a> Iterator for SplitReverse<'a> { - type Item = &'a [u8]; +impl<'h, 's> Iterator for SplitReverse<'h, 's> { + type Item = &'h [u8]; #[inline] - fn next(&mut self) -> Option<&'a [u8]> { + fn next(&mut self) -> Option<&'h [u8]> { let haystack = self.finder.haystack(); match self.finder.next() { Some(start) => { @@ -3440,31 +3511,31 @@ impl<'a> Iterator for SplitReverse<'a> { /// An iterator over at most `n` substrings in a byte string, split by a /// separator. /// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. +/// `'h` is the lifetime of the byte string being split (the haystack), while +/// `'s` is the lifetime of the byte string doing the splitting. 
#[derive(Debug)] -pub struct SplitN<'a> { - split: Split<'a>, +pub struct SplitN<'h, 's> { + split: Split<'h, 's>, limit: usize, count: usize, } -impl<'a> SplitN<'a> { +impl<'h, 's> SplitN<'h, 's> { fn new( - haystack: &'a [u8], - splitter: &'a [u8], + haystack: &'h [u8], + splitter: &'s [u8], limit: usize, - ) -> SplitN<'a> { + ) -> SplitN<'h, 's> { let split = haystack.split_str(splitter); SplitN { split, limit, count: 0 } } } -impl<'a> Iterator for SplitN<'a> { - type Item = &'a [u8]; +impl<'h, 's> Iterator for SplitN<'h, 's> { + type Item = &'h [u8]; #[inline] - fn next(&mut self) -> Option<&'a [u8]> { + fn next(&mut self) -> Option<&'h [u8]> { self.count += 1; if self.count > self.limit || self.split.done { None @@ -3479,31 +3550,31 @@ impl<'a> Iterator for SplitN<'a> { /// An iterator over at most `n` substrings in a byte string, split by a /// separator, in reverse. /// -/// `'a` is the lifetime of the byte string being split, while `F` is the type -/// of the predicate, i.e., `FnMut(char) -> bool`. +/// `'h` is the lifetime of the byte string being split (the haystack), while +/// `'s` is the lifetime of the byte string doing the splitting. 
#[derive(Debug)] -pub struct SplitNReverse<'a> { - split: SplitReverse<'a>, +pub struct SplitNReverse<'h, 's> { + split: SplitReverse<'h, 's>, limit: usize, count: usize, } -impl<'a> SplitNReverse<'a> { +impl<'h, 's> SplitNReverse<'h, 's> { fn new( - haystack: &'a [u8], - splitter: &'a [u8], + haystack: &'h [u8], + splitter: &'s [u8], limit: usize, - ) -> SplitNReverse<'a> { + ) -> SplitNReverse<'h, 's> { let split = haystack.rsplit_str(splitter); SplitNReverse { split, limit, count: 0 } } } -impl<'a> Iterator for SplitNReverse<'a> { - type Item = &'a [u8]; +impl<'h, 's> Iterator for SplitNReverse<'h, 's> { + type Item = &'h [u8]; #[inline] - fn next(&mut self) -> Option<&'a [u8]> { + fn next(&mut self) -> Option<&'h [u8]> { self.count += 1; if self.count > self.limit || self.split.done { None @@ -3521,6 +3592,7 @@ impl<'a> Iterator for SplitNReverse<'a> { /// `\n`. /// /// `'a` is the lifetime of the byte string being iterated over. +#[derive(Clone, Debug)] pub struct Lines<'a> { it: LinesWithTerminator<'a>, } @@ -3529,6 +3601,28 @@ impl<'a> Lines<'a> { fn new(bytes: &'a [u8]) -> Lines<'a> { Lines { it: LinesWithTerminator::new(bytes) } } + + /// Return a copy of the rest of the underlying bytes without affecting the + /// iterator itself. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let s = b"\ + /// foo + /// bar\r + /// baz"; + /// let mut lines = s.lines(); + /// assert_eq!(lines.next(), Some(B("foo"))); + /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz")); + /// ``` + pub fn as_bytes(&self) -> &'a [u8] { + self.it.bytes + } } impl<'a> Iterator for Lines<'a> { @@ -3536,17 +3630,19 @@ impl<'a> Iterator for Lines<'a> { #[inline] fn next(&mut self) -> Option<&'a [u8]> { - let mut line = self.it.next()?; - if line.last_byte() == Some(b'\n') { - line = &line[..line.len() - 1]; - if line.last_byte() == Some(b'\r') { - line = &line[..line.len() - 1]; - } - } - Some(line) + Some(trim_last_terminator(self.it.next()?)) } } +impl<'a> DoubleEndedIterator for Lines<'a> { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + Some(trim_last_terminator(self.it.next_back()?)) + } +} + +impl<'a> iter::FusedIterator for Lines<'a> {} + /// An iterator over all lines in a byte string, including their terminators. /// /// For this iterator, the only line terminator recognized is `\n`. (Since @@ -3560,6 +3656,7 @@ impl<'a> Iterator for Lines<'a> { /// the original byte string. /// /// `'a` is the lifetime of the byte string being iterated over. +#[derive(Clone, Debug)] pub struct LinesWithTerminator<'a> { bytes: &'a [u8], } @@ -3568,6 +3665,28 @@ impl<'a> LinesWithTerminator<'a> { fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> { LinesWithTerminator { bytes } } + + /// Return a copy of the rest of the underlying bytes without affecting the + /// iterator itself. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use bstr::{B, ByteSlice}; + /// + /// let s = b"\ + /// foo + /// bar\r + /// baz"; + /// let mut lines = s.lines_with_terminator(); + /// assert_eq!(lines.next(), Some(B("foo\n"))); + /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz")); + /// ``` + pub fn as_bytes(&self) -> &'a [u8] { + self.bytes + } } impl<'a> Iterator for LinesWithTerminator<'a> { @@ -3591,10 +3710,43 @@ impl<'a> Iterator for LinesWithTerminator<'a> { } } -#[cfg(test)] +impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> { + #[inline] + fn next_back(&mut self) -> Option<Self::Item> { + let end = self.bytes.len().checked_sub(1)?; + match self.bytes[..end].rfind_byte(b'\n') { + None => { + let line = self.bytes; + self.bytes = b""; + Some(line) + } + Some(end) => { + let line = &self.bytes[end + 1..]; + self.bytes = &self.bytes[..end + 1]; + Some(line) + } + } + } +} + +impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {} + +fn trim_last_terminator(mut s: &[u8]) -> &[u8] { + if s.last_byte() == Some(b'\n') { + s = &s[..s.len() - 1]; + if s.last_byte() == Some(b'\r') { + s = &s[..s.len() - 1]; + } + } + s +} + +#[cfg(all(test, feature = "std"))] mod tests { - use crate::ext_slice::{ByteSlice, B}; - use crate::tests::LOSSY_TESTS; + use crate::{ + ext_slice::{ByteSlice, Lines, LinesWithTerminator, B}, + tests::LOSSY_TESTS, + }; #[test] fn to_str_lossy() { @@ -3622,34 +3774,55 @@ mod tests { } #[test] - #[should_panic] - fn copy_within_fail1() { - let mut buf = *b"foobar"; - let s = &mut buf; - s.copy_within_str(0..2, 5); - } + fn lines_iteration() { + macro_rules! 
t { + ($it:expr, $forward:expr) => { + let mut res: Vec<&[u8]> = Vec::from($forward); + assert_eq!($it.collect::<Vec<_>>(), res); + res.reverse(); + assert_eq!($it.rev().collect::<Vec<_>>(), res); + }; + } - #[test] - #[should_panic] - fn copy_within_fail2() { - let mut buf = *b"foobar"; - let s = &mut buf; - s.copy_within_str(3..2, 0); - } + t!(Lines::new(b""), []); + t!(LinesWithTerminator::new(b""), []); - #[test] - #[should_panic] - fn copy_within_fail3() { - let mut buf = *b"foobar"; - let s = &mut buf; - s.copy_within_str(5..7, 0); - } + t!(Lines::new(b"\n"), [B("")]); + t!(Lines::new(b"\r\n"), [B("")]); + t!(LinesWithTerminator::new(b"\n"), [B("\n")]); - #[test] - #[should_panic] - fn copy_within_fail4() { - let mut buf = *b"foobar"; - let s = &mut buf; - s.copy_within_str(0..1, 6); + t!(Lines::new(b"a"), [B("a")]); + t!(LinesWithTerminator::new(b"a"), [B("a")]); + + t!(Lines::new(b"abc"), [B("abc")]); + t!(LinesWithTerminator::new(b"abc"), [B("abc")]); + + t!(Lines::new(b"abc\n"), [B("abc")]); + t!(Lines::new(b"abc\r\n"), [B("abc")]); + t!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]); + + t!(Lines::new(b"abc\n\n"), [B("abc"), B("")]); + t!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]); + + t!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]); + t!( + LinesWithTerminator::new(b"abc\n\ndef"), + [B("abc\n"), B("\n"), B("def")] + ); + + t!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]); + t!( + LinesWithTerminator::new(b"abc\n\ndef\n"), + [B("abc\n"), B("\n"), B("def\n")] + ); + + t!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]); + t!( + LinesWithTerminator::new(b"\na\nb\n"), + [B("\n"), B("a\n"), B("b\n")] + ); + + t!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]); + t!(LinesWithTerminator::new(b"\n\n\n"), [B("\n"), B("\n"), B("\n")]); } } diff --git a/src/ext_vec.rs b/src/ext_vec.rs index 5beb0e1..b8e2be2 100644 --- a/src/ext_vec.rs +++ b/src/ext_vec.rs @@ -1,16 +1,21 @@ -use std::borrow::Cow; -use std::error; -use 
std::ffi::{OsStr, OsString}; -use std::fmt; -use std::iter; -use std::ops; -use std::path::{Path, PathBuf}; -use std::ptr; -use std::str; -use std::vec; - -use crate::ext_slice::ByteSlice; -use crate::utf8::{self, Utf8Error}; +use core::fmt; +use core::iter; +use core::ops; +use core::ptr; + +use alloc::{borrow::Cow, string::String, vec, vec::Vec}; + +#[cfg(feature = "std")] +use std::{ + error, + ffi::{OsStr, OsString}, + path::{Path, PathBuf}, +}; + +use crate::{ + ext_slice::ByteSlice, + utf8::{self, Utf8Error}, +}; /// Concatenate the elements given by the iterator together into a single /// `Vec<u8>`. @@ -99,8 +104,10 @@ impl ByteVec for Vec<u8> { /// Ensure that callers cannot implement `ByteSlice` by making an /// umplementable trait its super trait. -pub trait Sealed {} -impl Sealed for Vec<u8> {} +mod private { + pub trait Sealed {} +} +impl private::Sealed for Vec<u8> {} /// A trait that extends `Vec<u8>` with string oriented methods. /// @@ -114,7 +121,9 @@ impl Sealed for Vec<u8> {} /// let s = Vec::from_slice(b"abc"); // NOT ByteVec::from_slice("...") /// assert_eq!(s, B("abc")); /// ``` -pub trait ByteVec: Sealed { +/// +/// This trait is sealed and cannot be implemented outside of `bstr`. +pub trait ByteVec: private::Sealed { /// A method for accessing the raw vector bytes of this type. This is /// always a no-op and callers shouldn't care about it. This only exists /// for making the extension trait work. @@ -154,8 +163,9 @@ pub trait ByteVec: Sealed { /// Create a new byte string from an owned OS string. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original OS string if it is not valid UTF-8. + /// When the underlying bytes of OS strings are accessible, then this + /// always succeeds and is zero cost. Otherwise, this returns the given + /// `OsString` if it is not valid UTF-8. 
/// /// # Examples /// @@ -171,6 +181,7 @@ pub trait ByteVec: Sealed { /// assert_eq!(bs, B("foo")); /// ``` #[inline] + #[cfg(feature = "std")] fn from_os_string(os_str: OsString) -> Result<Vec<u8>, OsString> { #[cfg(unix)] #[inline] @@ -191,10 +202,11 @@ pub trait ByteVec: Sealed { /// Lossily create a new byte string from an OS string slice. /// - /// On Unix, this always succeeds, is zero cost and always returns a slice. - /// On non-Unix systems, this does a UTF-8 check. If the given OS string - /// slice is not valid UTF-8, then it is lossily decoded into valid UTF-8 - /// (with invalid bytes replaced by the Unicode replacement codepoint). + /// When the underlying bytes of OS strings are accessible, then this is + /// zero cost and always returns a slice. Otherwise, a UTF-8 check is + /// performed and if the given OS string is not valid UTF-8, then it is + /// lossily decoded into valid UTF-8 (with invalid bytes replaced by the + /// Unicode replacement codepoint). /// /// # Examples /// @@ -210,6 +222,7 @@ pub trait ByteVec: Sealed { /// assert_eq!(bs, B("foo")); /// ``` #[inline] + #[cfg(feature = "std")] fn from_os_str_lossy<'a>(os_str: &'a OsStr) -> Cow<'a, [u8]> { #[cfg(unix)] #[inline] @@ -233,8 +246,9 @@ pub trait ByteVec: Sealed { /// Create a new byte string from an owned file path. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original path if it is not valid UTF-8. + /// When the underlying bytes of paths are accessible, then this always + /// succeeds and is zero cost. Otherwise, this returns the given `PathBuf` + /// if it is not valid UTF-8. /// /// # Examples /// @@ -250,16 +264,18 @@ pub trait ByteVec: Sealed { /// assert_eq!(bs, B("foo")); /// ``` #[inline] + #[cfg(feature = "std")] fn from_path_buf(path: PathBuf) -> Result<Vec<u8>, PathBuf> { Vec::from_os_string(path.into_os_string()).map_err(PathBuf::from) } /// Lossily create a new byte string from a file path. 
/// - /// On Unix, this always succeeds, is zero cost and always returns a slice. - /// On non-Unix systems, this does a UTF-8 check. If the given path is not - /// valid UTF-8, then it is lossily decoded into valid UTF-8 (with invalid - /// bytes replaced by the Unicode replacement codepoint). + /// When the underlying bytes of paths are accessible, then this is + /// zero cost and always returns a slice. Otherwise, a UTF-8 check is + /// performed and if the given path is not valid UTF-8, then it is lossily + /// decoded into valid UTF-8 (with invalid bytes replaced by the Unicode + /// replacement codepoint). /// /// # Examples /// @@ -275,6 +291,7 @@ pub trait ByteVec: Sealed { /// assert_eq!(bs, B("foo")); /// ``` #[inline] + #[cfg(feature = "std")] fn from_path_lossy<'a>(path: &'a Path) -> Cow<'a, [u8]> { Vec::from_os_str_lossy(path.as_os_str()) } @@ -363,12 +380,10 @@ pub trait ByteVec: Sealed { /// ``` /// use bstr::ByteVec; /// - /// # fn example() -> Result<(), Box<dyn std::error::Error>> { /// let bytes = Vec::from("hello"); - /// let string = bytes.into_string()?; + /// let string = bytes.into_string().unwrap(); /// /// assert_eq!("hello", string); - /// # Ok(()) }; example().unwrap() /// ``` /// /// If this byte string is not valid UTF-8, then an error will be returned. @@ -469,8 +484,9 @@ pub trait ByteVec: Sealed { /// Converts this byte string into an OS string, in place. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original byte string if it is not valid UTF-8. + /// When OS strings can be constructed from arbitrary byte sequences, this + /// always succeeds and is zero cost. Otherwise, if this byte string is not + /// valid UTF-8, then an error (with the original byte string) is returned. 
/// /// # Examples /// @@ -485,14 +501,15 @@ pub trait ByteVec: Sealed { /// let os_str = bs.into_os_string().expect("should be valid UTF-8"); /// assert_eq!(os_str, OsStr::new("foo")); /// ``` + #[cfg(feature = "std")] #[inline] - fn into_os_string(self) -> Result<OsString, Vec<u8>> + fn into_os_string(self) -> Result<OsString, FromUtf8Error> where Self: Sized, { #[cfg(unix)] #[inline] - fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> { + fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> { use std::os::unix::ffi::OsStringExt; Ok(OsString::from_vec(v)) @@ -500,11 +517,8 @@ pub trait ByteVec: Sealed { #[cfg(not(unix))] #[inline] - fn imp(v: Vec<u8>) -> Result<OsString, Vec<u8>> { - match v.into_string() { - Ok(s) => Ok(OsString::from(s)), - Err(err) => Err(err.into_vec()), - } + fn imp(v: Vec<u8>) -> Result<OsString, FromUtf8Error> { + v.into_string().map(OsString::from) } imp(self.into_vec()) @@ -512,13 +526,13 @@ pub trait ByteVec: Sealed { /// Lossily converts this byte string into an OS string, in place. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. + /// When OS strings can be constructed from arbitrary byte sequences, this + /// is zero cost and always returns a slice. Otherwise, this will perform a + /// UTF-8 check and lossily convert this byte string into valid UTF-8 using + /// the Unicode replacement codepoint. /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. + /// Note that this can prevent the correct roundtripping of file paths when + /// the representation of `OsString` is opaque. 
/// /// # Examples /// @@ -532,6 +546,7 @@ pub trait ByteVec: Sealed { /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar"); /// ``` #[inline] + #[cfg(feature = "std")] fn into_os_string_lossy(self) -> OsString where Self: Sized, @@ -555,8 +570,9 @@ pub trait ByteVec: Sealed { /// Converts this byte string into an owned file path, in place. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this returns the original byte string if it is not valid UTF-8. + /// When paths can be constructed from arbitrary byte sequences, this + /// always succeeds and is zero cost. Otherwise, if this byte string is not + /// valid UTF-8, then an error (with the original byte string) is returned. /// /// # Examples /// @@ -569,8 +585,9 @@ pub trait ByteVec: Sealed { /// let path = bs.into_path_buf().expect("should be valid UTF-8"); /// assert_eq!(path.as_os_str(), "foo"); /// ``` + #[cfg(feature = "std")] #[inline] - fn into_path_buf(self) -> Result<PathBuf, Vec<u8>> + fn into_path_buf(self) -> Result<PathBuf, FromUtf8Error> where Self: Sized, { @@ -579,13 +596,13 @@ pub trait ByteVec: Sealed { /// Lossily converts this byte string into an owned file path, in place. /// - /// On Unix, this always succeeds and is zero cost. On non-Unix systems, - /// this will perform a UTF-8 check and lossily convert this byte string - /// into valid UTF-8 using the Unicode replacement codepoint. + /// When paths can be constructed from arbitrary byte sequences, this is + /// zero cost and always returns a slice. Otherwise, this will perform a + /// UTF-8 check and lossily convert this byte string into valid UTF-8 using + /// the Unicode replacement codepoint. /// - /// Note that this can prevent the correct roundtripping of file paths on - /// non-Unix systems such as Windows, where file paths are an arbitrary - /// sequence of 16-bit integers. 
+ /// Note that this can prevent the correct roundtripping of file paths when + /// the representation of `PathBuf` is opaque. /// /// # Examples /// @@ -599,6 +616,7 @@ pub trait ByteVec: Sealed { /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar"); /// ``` #[inline] + #[cfg(feature = "std")] fn into_path_buf_lossy(self) -> PathBuf where Self: Sized, @@ -1029,6 +1047,7 @@ impl FromUtf8Error { } } +#[cfg(feature = "std")] impl error::Error for FromUtf8Error { #[inline] fn description(&self) -> &str { @@ -1043,7 +1062,7 @@ impl fmt::Display for FromUtf8Error { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { use crate::ext_vec::ByteVec; diff --git a/src/impls.rs b/src/impls.rs index 85a27ba..c063cb6 100644 --- a/src/impls.rs +++ b/src/impls.rs @@ -18,7 +18,7 @@ macro_rules! impl_partial_eq { }; } -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] macro_rules! impl_partial_eq_cow { ($lhs:ty, $rhs:ty) => { impl<'a, 'b> PartialEq<$rhs> for $lhs { @@ -59,17 +59,22 @@ macro_rules! 
impl_partial_ord { }; } -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] mod bstring { - use std::borrow::{Borrow, Cow, ToOwned}; - use std::cmp::Ordering; - use std::fmt; - use std::iter::FromIterator; - use std::ops; + use core::{ + cmp::Ordering, convert::TryFrom, fmt, iter::FromIterator, ops, + }; - use crate::bstr::BStr; - use crate::bstring::BString; - use crate::ext_vec::ByteVec; + use alloc::{ + borrow::{Borrow, Cow, ToOwned}, + string::String, + vec, + vec::Vec, + }; + + use crate::{ + bstr::BStr, bstring::BString, ext_slice::ByteSlice, ext_vec::ByteVec, + }; impl fmt::Display for BString { #[inline] @@ -90,21 +95,21 @@ mod bstring { #[inline] fn deref(&self) -> &Vec<u8> { - &self.bytes + self.as_vec() } } impl ops::DerefMut for BString { #[inline] fn deref_mut(&mut self) -> &mut Vec<u8> { - &mut self.bytes + self.as_vec_mut() } } impl AsRef<[u8]> for BString { #[inline] fn as_ref(&self) -> &[u8] { - &self.bytes + self.as_bytes() } } @@ -118,7 +123,7 @@ mod bstring { impl AsMut<[u8]> for BString { #[inline] fn as_mut(&mut self) -> &mut [u8] { - &mut self.bytes + self.as_bytes_mut() } } @@ -161,14 +166,14 @@ mod bstring { impl From<Vec<u8>> for BString { #[inline] fn from(s: Vec<u8>) -> BString { - BString { bytes: s } + BString::new(s) } } impl From<BString> for Vec<u8> { #[inline] fn from(s: BString) -> Vec<u8> { - s.bytes + s.into_vec() } } @@ -200,6 +205,24 @@ mod bstring { } } + impl TryFrom<BString> for String { + type Error = crate::FromUtf8Error; + + #[inline] + fn try_from(s: BString) -> Result<String, crate::FromUtf8Error> { + s.into_vec().into_string() + } + } + + impl<'a> TryFrom<&'a BString> for &'a str { + type Error = crate::Utf8Error; + + #[inline] + fn try_from(s: &'a BString) -> Result<&'a str, crate::Utf8Error> { + s.as_bytes().to_str() + } + } + impl FromIterator<char> for BString { #[inline] fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> BString { @@ -279,7 +302,7 @@ mod bstring { impl PartialOrd for BString { #[inline] fn 
partial_cmp(&self, other: &BString) -> Option<Ordering> { - PartialOrd::partial_cmp(&self.bytes, &other.bytes) + PartialOrd::partial_cmp(self.as_bytes(), other.as_bytes()) } } @@ -301,15 +324,12 @@ mod bstring { } mod bstr { - #[cfg(feature = "std")] - use std::borrow::Cow; + use core::{cmp::Ordering, convert::TryFrom, fmt, ops}; - use core::cmp::Ordering; - use core::fmt; - use core::ops; + #[cfg(feature = "alloc")] + use alloc::{borrow::Cow, boxed::Box, string::String, vec::Vec}; - use crate::bstr::BStr; - use crate::ext_slice::ByteSlice; + use crate::{bstr::BStr, ext_slice::ByteSlice}; impl fmt::Display for BStr { #[inline] @@ -543,6 +563,13 @@ mod bstr { } } + impl AsRef<BStr> for BStr { + #[inline] + fn as_ref(&self) -> &BStr { + self + } + } + impl AsRef<BStr> for [u8] { #[inline] fn as_ref(&self) -> &BStr { @@ -590,6 +617,13 @@ mod bstr { } } + impl<'a> From<&'a BStr> for &'a [u8] { + #[inline] + fn from(s: &'a BStr) -> &'a [u8] { + BStr::as_bytes(s) + } + } + impl<'a> From<&'a str> for &'a BStr { #[inline] fn from(s: &'a str) -> &'a BStr { @@ -597,7 +631,7 @@ mod bstr { } } - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl<'a> From<&'a BStr> for Cow<'a, BStr> { #[inline] fn from(s: &'a BStr) -> Cow<'a, BStr> { @@ -605,7 +639,7 @@ mod bstr { } } - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl From<Box<[u8]>> for Box<BStr> { #[inline] fn from(s: Box<[u8]>) -> Box<BStr> { @@ -613,7 +647,7 @@ mod bstr { } } - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl From<Box<BStr>> for Box<[u8]> { #[inline] fn from(s: Box<BStr>) -> Box<[u8]> { @@ -621,6 +655,33 @@ mod bstr { } } + impl<'a> TryFrom<&'a BStr> for &'a str { + type Error = crate::Utf8Error; + + #[inline] + fn try_from(s: &'a BStr) -> Result<&'a str, crate::Utf8Error> { + s.as_bytes().to_str() + } + } + + #[cfg(feature = "alloc")] + impl<'a> TryFrom<&'a BStr> for String { + type Error = crate::Utf8Error; + + #[inline] + fn try_from(s: &'a BStr) -> Result<String, crate::Utf8Error> 
{ + Ok(s.as_bytes().to_str()?.into()) + } + } + + #[cfg(feature = "alloc")] + impl Clone for Box<BStr> { + #[inline] + fn clone(&self) -> Self { + BStr::from_boxed_bytes(self.as_bytes().into()) + } + } + impl Eq for BStr {} impl PartialEq<BStr> for BStr { @@ -635,19 +696,19 @@ mod bstr { impl_partial_eq!(BStr, str); impl_partial_eq!(BStr, &'a str); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq!(BStr, Vec<u8>); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq!(&'a BStr, Vec<u8>); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq!(BStr, String); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq!(&'a BStr, String); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq_cow!(&'a BStr, Cow<'a, BStr>); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq_cow!(&'a BStr, Cow<'a, str>); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_eq_cow!(&'a BStr, Cow<'a, [u8]>); impl PartialOrd for BStr { @@ -669,17 +730,17 @@ mod bstr { impl_partial_ord!(BStr, str); impl_partial_ord!(BStr, &'a str); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_ord!(BStr, Vec<u8>); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_ord!(&'a BStr, Vec<u8>); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_ord!(BStr, String); - #[cfg(feature = "std")] + #[cfg(feature = "alloc")] impl_partial_ord!(&'a BStr, String); } -#[cfg(feature = "serde1-nostd")] +#[cfg(feature = "serde")] mod bstr_serde { use core::fmt; @@ -737,17 +798,18 @@ mod bstr_serde { } } -#[cfg(feature = "serde1")] +#[cfg(all(feature = "serde", feature = "alloc"))] mod bstring_serde { - use std::cmp; - use std::fmt; + use core::{cmp, fmt}; + + use alloc::{boxed::Box, string::String, vec::Vec}; use serde::{ de::Error, de::SeqAccess, de::Visitor, Deserialize, Deserializer, Serialize, Serializer, }; - use crate::bstring::BString; + use crate::{bstr::BStr, 
bstring::BString}; impl Serialize for BString { #[inline] @@ -823,10 +885,82 @@ mod bstring_serde { deserializer.deserialize_byte_buf(BStringVisitor) } } + + impl<'de> Deserialize<'de> for Box<BStr> { + #[inline] + fn deserialize<D>(deserializer: D) -> Result<Box<BStr>, D::Error> + where + D: Deserializer<'de>, + { + struct BoxedBStrVisitor; + + impl<'de> Visitor<'de> for BoxedBStrVisitor { + type Value = Box<BStr>; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("a boxed byte string") + } + + #[inline] + fn visit_seq<V: SeqAccess<'de>>( + self, + mut visitor: V, + ) -> Result<Box<BStr>, V::Error> { + let len = cmp::min(visitor.size_hint().unwrap_or(0), 256); + let mut bytes = Vec::with_capacity(len); + while let Some(v) = visitor.next_element()? { + bytes.push(v); + } + Ok(BStr::from_boxed_bytes(bytes.into_boxed_slice())) + } + + #[inline] + fn visit_bytes<E: Error>( + self, + value: &[u8], + ) -> Result<Box<BStr>, E> { + Ok(BStr::from_boxed_bytes( + value.to_vec().into_boxed_slice(), + )) + } + + #[inline] + fn visit_byte_buf<E: Error>( + self, + value: Vec<u8>, + ) -> Result<Box<BStr>, E> { + Ok(BStr::from_boxed_bytes(value.into_boxed_slice())) + } + + #[inline] + fn visit_str<E: Error>( + self, + value: &str, + ) -> Result<Box<BStr>, E> { + Ok(BStr::from_boxed_bytes( + value.as_bytes().to_vec().into_boxed_slice(), + )) + } + + #[inline] + fn visit_string<E: Error>( + self, + value: String, + ) -> Result<Box<BStr>, E> { + Ok(BStr::from_boxed_bytes( + value.into_bytes().into_boxed_slice(), + )) + } + } + + deserializer.deserialize_byte_buf(BoxedBStrVisitor) + } + } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod display { + #[cfg(not(miri))] use crate::bstring::BString; use crate::ByteSlice; @@ -926,6 +1060,7 @@ mod display { ); } + #[cfg(not(miri))] quickcheck::quickcheck! 
{ fn total_length(bstr: BString) -> bool { let size = bstr.chars().count(); @@ -934,7 +1069,7 @@ mod display { } } -#[cfg(test)] +#[cfg(all(test, feature = "alloc"))] mod bstring_arbitrary { use crate::bstring::BString; @@ -946,12 +1081,13 @@ mod bstring_arbitrary { } fn shrink(&self) -> Box<dyn Iterator<Item = BString>> { - Box::new(self.bytes.shrink().map(BString::from)) + Box::new(self.as_vec().shrink().map(BString::from)) } } } #[test] +#[cfg(feature = "std")] fn test_debug() { use crate::{ByteSlice, B}; @@ -973,10 +1109,12 @@ fn test_debug() { // See: https://github.com/BurntSushi/bstr/issues/82 #[test] +#[cfg(feature = "std")] fn test_cows_regression() { - use crate::ByteSlice; use std::borrow::Cow; + use crate::ByteSlice; + let c1 = Cow::from(b"hello bstr".as_bstr()); let c2 = b"goodbye bstr".as_bstr(); assert_ne!(c1, c2); @@ -7,12 +7,13 @@ facilities for conveniently and efficiently working with lines as byte strings. More APIs may be added in the future. */ +use alloc::{vec, vec::Vec}; + use std::io; -use crate::ext_slice::ByteSlice; -use crate::ext_vec::ByteVec; +use crate::{ext_slice::ByteSlice, ext_vec::ByteVec}; -/// An extention trait for +/// An extension trait for /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html) /// which provides convenience APIs for dealing with byte strings. 
pub trait BufReadExt: io::BufRead { @@ -36,7 +37,7 @@ pub trait BufReadExt: io::BufRead { /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { - /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); + /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); /// /// let mut lines = vec![]; /// for result in cursor.byte_lines() { @@ -79,7 +80,7 @@ pub trait BufReadExt: io::BufRead { /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { - /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); + /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); /// /// let mut records = vec![]; /// for result in cursor.byte_records(b'\x00') { @@ -122,7 +123,7 @@ pub trait BufReadExt: io::BufRead { /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { - /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); + /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); /// /// let mut lines = vec![]; /// cursor.for_byte_line(|line| { @@ -135,7 +136,7 @@ pub trait BufReadExt: io::BufRead { /// assert_eq!(lines[2], "dolor".as_bytes()); /// # Ok(()) }; example().unwrap() /// ``` - fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()> + fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>, @@ -169,7 +170,7 @@ pub trait BufReadExt: io::BufRead { /// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { - /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); + /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); /// /// let mut records = vec![]; /// cursor.for_byte_record(b'\x00', |record| { @@ -183,7 +184,7 @@ pub trait BufReadExt: io::BufRead { /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_record<F>( - self, + &mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> @@ -223,7 +224,7 @@ pub trait BufReadExt: io::BufRead { 
/// use bstr::io::BufReadExt; /// /// # fn example() -> Result<(), io::Error> { - /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); + /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor"); /// /// let mut lines = vec![]; /// cursor.for_byte_line_with_terminator(|line| { @@ -237,7 +238,7 @@ pub trait BufReadExt: io::BufRead { /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_line_with_terminator<F>( - self, + &mut self, for_each_line: F, ) -> io::Result<()> where @@ -269,11 +270,10 @@ pub trait BufReadExt: io::BufRead { /// ``` /// use std::io; /// - /// use bstr::B; - /// use bstr::io::BufReadExt; + /// use bstr::{io::BufReadExt, B}; /// /// # fn example() -> Result<(), io::Error> { - /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); + /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor"); /// /// let mut records = vec![]; /// cursor.for_byte_record_with_terminator(b'\x00', |record| { @@ -287,7 +287,7 @@ pub trait BufReadExt: io::BufRead { /// # Ok(()) }; example().unwrap() /// ``` fn for_byte_record_with_terminator<F>( - mut self, + &mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> @@ -438,11 +438,12 @@ fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] { record } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { - use super::BufReadExt; use crate::bstring::BString; + use super::BufReadExt; + fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> { let mut lines = vec![]; slice @@ -34,7 +34,7 @@ additional string oriented methods. Operations such as iterating over graphemes, searching for substrings, replacing substrings, trimming and case conversion are examples of things not provided on the standard library `&[u8]` APIs but are provided by this crate. 
For example, this code iterates over all -of occurrences of a subtring: +of occurrences of a substring: ``` use bstr::ByteSlice; @@ -52,23 +52,27 @@ Here's another example showing how to do a search and replace (and also showing use of the `B` function): ``` +# #[cfg(feature = "alloc")] { use bstr::{B, ByteSlice}; let old = B("foo ☃☃☃ foo foo quux foo"); let new = old.replace("foo", "hello"); assert_eq!(new, B("hello ☃☃☃ hello hello quux hello")); +# } ``` And here's an example that shows case conversion, even in the presence of invalid UTF-8: ``` +# #[cfg(all(feature = "alloc", feature = "unicode"))] { use bstr::{ByteSlice, ByteVec}; let mut lower = Vec::from("hello β"); lower[0] = b'\xFF'; // lowercase β is uppercased to Β assert_eq!(lower.to_uppercase(), b"\xFFELLO \xCE\x92"); +# } ``` # Convenient debug representation @@ -98,10 +102,8 @@ method converts any `&[u8]` to a `&BStr`. # When should I use byte strings? -This library reflects my hypothesis that UTF-8 by convention is a better trade -off in some circumstances than guaranteed UTF-8. It's possible, perhaps even -likely, that this is a niche concern for folks working closely with core text -primitives. +This library reflects my belief that UTF-8 by convention is a better trade +off in some circumstances than guaranteed UTF-8. The first time this idea hit me was in the implementation of Rust's regex engine. In particular, very little of the internal implementation cares at all @@ -134,24 +136,26 @@ incremental way by only parsing chunks at a time, but this is often complex to do or impractical. For example, many regex engines only accept one contiguous sequence of bytes at a time with no way to perform incremental matching. -In summary, conventional UTF-8 byte strings provided by this library are -definitely useful in some limited circumstances, but how useful they are more -broadly isn't clear yet. 
- # `bstr` in public APIs -Since this library is not yet `1.0`, you should not use it in the public API of -your crates until it hits `1.0` (unless you're OK with with tracking breaking -releases of `bstr`). It is expected that `bstr 1.0` will be released before -2022. +This library is past version `1` and is expected to remain at version `1` for +the foreseeable future. Therefore, it is encouraged to put types from `bstr` +(like `BStr` and `BString`) in your public API if that makes sense for your +crate. + +With that said, in general, it should be possible to avoid putting anything +in this crate into your public APIs. Namely, you should never need to use the +`ByteSlice` or `ByteVec` traits as bounds on public APIs, since their only +purpose is to extend the methods on the concrete types `[u8]` and `Vec<u8>`, +respectively. Similarly, it should not be necessary to put either the `BStr` or +`BString` types into public APIs. If you want to use them internally, then they +can be converted to/from `[u8]`/`Vec<u8>` as needed. The conversions are free. + +So while it shouldn't ever be 100% necessary to make `bstr` a public +dependency, there may be cases where it is convenient to do so. This is an +explicitly supported use case of `bstr`, and as such, major version releases +should be exceptionally rare. -In general, it should be possible to avoid putting anything in this crate into -your public APIs. Namely, you should never need to use the `ByteSlice` or -`ByteVec` traits as bounds on public APIs, since their only purpose is to -extend the methods on the concrete types `[u8]` and `Vec<u8>`, respectively. -Similarly, it should not be necessary to put either the `BStr` or `BString` -types into public APIs. If you want to use them internally, then they can -be converted to/from `[u8]`/`Vec<u8>` as needed. # Differences with standard strings @@ -318,7 +322,8 @@ they can do: by accessing their underlying 16-bit integer representation. 
Unfortunately, this isn't zero cost (it introduces a second WTF-8 decoding step) and it's not clear this is a good thing to do, since WTF-8 should ideally remain an - internal implementation detail. + internal implementation detail. This is roughly the approach taken by the + [`os_str_bytes`](https://crates.io/crates/os_str_bytes) crate. 2. One could instead declare that they will not handle paths on Windows that are not valid UTF-16, and return an error when one is encountered. 3. Like (2), but instead of returning an error, lossily decode the file path @@ -365,19 +370,57 @@ UTF-8, and thus contain latent bugs on Unix where paths with invalid UTF-8 are not terribly uncommon. If you instead use byte strings, then you're guaranteed to write correct code for Unix, at the cost of getting a corner case wrong on Windows. + +# Cargo features + +This crates comes with a few features that control standard library, serde +and Unicode support. + +* `std` - **Enabled** by default. This provides APIs that require the standard + library, such as `Vec<u8>` and `PathBuf`. Enabling this feature also enables + the `alloc` feature and any other relevant `std` features for dependencies. +* `alloc` - **Enabled** by default. This provides APIs that require allocations + via the `alloc` crate, such as `Vec<u8>`. +* `unicode` - **Enabled** by default. This provides APIs that require sizable + Unicode data compiled into the binary. This includes, but is not limited to, + grapheme/word/sentence segmenters. When this is disabled, basic support such + as UTF-8 decoding is still included. Note that currently, enabling this + feature also requires enabling the `std` feature. It is expected that this + limitation will be lifted at some point. +* `serde` - Enables implementations of serde traits for `BStr`, and also + `BString` when `alloc` is enabled. 
*/ -#![cfg_attr(not(feature = "std"), no_std)] +#![cfg_attr(not(any(feature = "std", test)), no_std)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +// Why do we do this? Well, in order for us to use once_cell's 'Lazy' type to +// load DFAs, it requires enabling its 'std' feature. Yet, there is really +// nothing about our 'unicode' feature that requires 'std'. We could declare +// that 'unicode = [std, ...]', which would be fine, but once regex-automata +// 0.3 is a thing, I believe we can drop once_cell altogether and thus drop +// the need for 'std' to be enabled when 'unicode' is enabled. But if we make +// 'unicode' also enable 'std', then it would be a breaking change to remove +// 'std' from that list. +// +// So, for right now, we force folks to explicitly say they want 'std' if they +// want 'unicode'. In the future, we should be able to relax this. +#[cfg(all(feature = "unicode", not(feature = "std")))] +compile_error!("enabling 'unicode' requires enabling 'std'"); + +#[cfg(feature = "alloc")] +extern crate alloc; pub use crate::bstr::BStr; -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] pub use crate::bstring::BString; +#[cfg(feature = "unicode")] +pub use crate::ext_slice::Fields; pub use crate::ext_slice::{ - ByteSlice, Bytes, Fields, FieldsWith, Find, FindReverse, Finder, - FinderReverse, Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, - SplitReverse, B, + ByteSlice, Bytes, FieldsWith, Find, FindReverse, Finder, FinderReverse, + Lines, LinesWithTerminator, Split, SplitN, SplitNReverse, SplitReverse, B, }; -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] pub use crate::ext_vec::{concat, join, ByteVec, DrainBytes, FromUtf8Error}; #[cfg(feature = "unicode")] pub use crate::unicode::{ @@ -391,26 +434,28 @@ pub use crate::utf8::{ mod ascii; mod bstr; -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] mod bstring; mod byteset; mod ext_slice; -#[cfg(feature = "std")] +#[cfg(feature = "alloc")] mod ext_vec; mod impls; #[cfg(feature = 
"std")] pub mod io; -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests; #[cfg(feature = "unicode")] mod unicode; mod utf8; -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod apitests { - use crate::bstr::BStr; - use crate::bstring::BString; - use crate::ext_slice::{Finder, FinderReverse}; + use crate::{ + bstr::BStr, + bstring::BString, + ext_slice::{Finder, FinderReverse}, + }; #[test] fn oibits() { diff --git a/src/tests.rs b/src/tests.rs index f4179fd..03a4461 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -6,7 +6,7 @@ /// /// The first element in each tuple is the expected result of lossy decoding, /// while the second element is the input given. -pub const LOSSY_TESTS: &[(&str, &[u8])] = &[ +pub(crate) const LOSSY_TESTS: &[(&str, &[u8])] = &[ ("a", b"a"), ("\u{FFFD}", b"\xFF"), ("\u{FFFD}\u{FFFD}", b"\xFF\xFF"), diff --git a/src/unicode/data/GraphemeBreakTest.txt b/src/unicode/data/GraphemeBreakTest.txt index fb4fec9..eff2fd3 100644 --- a/src/unicode/data/GraphemeBreakTest.txt +++ b/src/unicode/data/GraphemeBreakTest.txt @@ -1,6 +1,6 @@ -# GraphemeBreakTest-12.1.0.txt -# Date: 2019-03-10, 10:53:12 GMT -# © 2019 Unicode®, Inc. +# GraphemeBreakTest-14.0.0.txt +# Date: 2021-03-08, 06:22:32 GMT +# © 2021 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see http://www.unicode.org/terms_of_use.html # diff --git a/src/unicode/data/SentenceBreakTest.txt b/src/unicode/data/SentenceBreakTest.txt index 7c1c34a..61ea42c 100644 --- a/src/unicode/data/SentenceBreakTest.txt +++ b/src/unicode/data/SentenceBreakTest.txt @@ -1,6 +1,6 @@ -# SentenceBreakTest-12.1.0.txt -# Date: 2019-03-10, 10:53:28 GMT -# © 2019 Unicode®, Inc. +# SentenceBreakTest-14.0.0.txt +# Date: 2021-03-08, 06:22:40 GMT +# © 2021 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. 
# For terms of use, see http://www.unicode.org/terms_of_use.html # diff --git a/src/unicode/data/WordBreakTest.txt b/src/unicode/data/WordBreakTest.txt index facd892..1d1435b 100644 --- a/src/unicode/data/WordBreakTest.txt +++ b/src/unicode/data/WordBreakTest.txt @@ -1,6 +1,6 @@ -# WordBreakTest-12.1.0.txt -# Date: 2019-03-10, 10:53:29 GMT -# © 2019 Unicode®, Inc. +# WordBreakTest-14.0.0.txt +# Date: 2021-03-08, 06:22:40 GMT +# © 2021 Unicode®, Inc. # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. # For terms of use, see http://www.unicode.org/terms_of_use.html # diff --git a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa Binary files differindex 0efaaf2..31f99c1 100644 --- a/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa +++ b/src/unicode/fsm/grapheme_break_fwd.bigendian.dfa diff --git a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa Binary files differindex eb24025..3a51728 100644 --- a/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa +++ b/src/unicode/fsm/grapheme_break_fwd.littleendian.dfa diff --git a/src/unicode/fsm/grapheme_break_fwd.rs b/src/unicode/fsm/grapheme_break_fwd.rs index b53b1d7..dea4a7e 100644 --- a/src/unicode/fsm/grapheme_break_fwd.rs +++ b/src/unicode/fsm/grapheme_break_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name GRAPHEME_BREAK_FWD --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! 
{ - pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref GRAPHEME_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa Binary files differindex d42cd36..742d2a6 100644 --- a/src/unicode/fsm/grapheme_break_rev.bigendian.dfa +++ b/src/unicode/fsm/grapheme_break_rev.bigendian.dfa diff --git a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa Binary files differindex c75ea5f..d1937f2 100644 --- a/src/unicode/fsm/grapheme_break_rev.littleendian.dfa +++ b/src/unicode/fsm/grapheme_break_rev.littleendian.dfa diff --git a/src/unicode/fsm/grapheme_break_rev.rs b/src/unicode/fsm/grapheme_break_rev.rs index 93e888c..2d2cd54 100644 --- a/src/unicode/fsm/grapheme_break_rev.rs +++ b/src/unicode/fsm/grapheme_break_rev.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name 
GRAPHEME_BREAK_REV --reverse --longest --sparse --minimize --anchored --state-size 2 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_rev.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref GRAPHEME_BREAK_REV: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static GRAPHEME_BREAK_REV: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("grapheme_break_rev.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/regional_indicator_rev.rs b/src/unicode/fsm/regional_indicator_rev.rs index 2bf7e4c..db7a40f 100644 --- a/src/unicode/fsm/regional_indicator_rev.rs +++ b/src/unicode/fsm/regional_indicator_rev.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse --classes --minimize --anchored --premultiply --state-size 1 src/unicode/fsm/ \p{gcb=Regional_Indicator} // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. 
#[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("regional_indicator_rev.bigendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref REGIONAL_INDICATOR_REV: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static REGIONAL_INDICATOR_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("regional_indicator_rev.littleendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa Binary files differindex a1813d7..1abdae8 100644 --- a/src/unicode/fsm/sentence_break_fwd.bigendian.dfa +++ b/src/unicode/fsm/sentence_break_fwd.bigendian.dfa diff --git a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa Binary files differindex 2763583..2f8aadd 100644 --- a/src/unicode/fsm/sentence_break_fwd.littleendian.dfa +++ b/src/unicode/fsm/sentence_break_fwd.littleendian.dfa diff --git a/src/unicode/fsm/sentence_break_fwd.rs b/src/unicode/fsm/sentence_break_fwd.rs index cc937a4..97dd658 100644 --- a/src/unicode/fsm/sentence_break_fwd.rs +++ b/src/unicode/fsm/sentence_break_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name SENTENCE_BREAK_FWD --minimize --sparse --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("sentence_break_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! 
{ - pub static ref SENTENCE_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static SENTENCE_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("sentence_break_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/simple_word_fwd.bigendian.dfa b/src/unicode/fsm/simple_word_fwd.bigendian.dfa Binary files differindex adc64c1..888e465 100644 --- a/src/unicode/fsm/simple_word_fwd.bigendian.dfa +++ b/src/unicode/fsm/simple_word_fwd.bigendian.dfa diff --git a/src/unicode/fsm/simple_word_fwd.littleendian.dfa b/src/unicode/fsm/simple_word_fwd.littleendian.dfa Binary files differindex dd48386..a1d527c 100644 --- a/src/unicode/fsm/simple_word_fwd.littleendian.dfa +++ b/src/unicode/fsm/simple_word_fwd.littleendian.dfa diff --git a/src/unicode/fsm/simple_word_fwd.rs b/src/unicode/fsm/simple_word_fwd.rs index f1f3da5..32b69b6 100644 --- a/src/unicode/fsm/simple_word_fwd.rs +++ b/src/unicode/fsm/simple_word_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name SIMPLE_WORD_FWD --sparse --minimize --state-size 2 src/unicode/fsm/ \w // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("simple_word_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref SIMPLE_WORD_FWD: ::regex_automata::SparseDFA<&'static [u8], u16> = { +pub static SIMPLE_WORD_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("simple_word_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/whitespace_anchored_fwd.rs b/src/unicode/fsm/whitespace_anchored_fwd.rs index 419b5d4..0780412 100644 --- a/src/unicode/fsm/whitespace_anchored_fwd.rs +++ b/src/unicode/fsm/whitespace_anchored_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name WHITESPACE_ANCHORED_FWD --anchored --classes --premultiply --minimize --state-size 1 src/unicode/fsm/ \s+ // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("whitespace_anchored_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref WHITESPACE_ANCHORED_FWD: ::regex_automata::DenseDFA<&'static [u8], u8> = { +pub static WHITESPACE_ANCHORED_FWD: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u8], u8>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("whitespace_anchored_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/whitespace_anchored_rev.rs b/src/unicode/fsm/whitespace_anchored_rev.rs index 301b03c..3d0d7a6 100644 --- a/src/unicode/fsm/whitespace_anchored_rev.rs +++ b/src/unicode/fsm/whitespace_anchored_rev.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse --anchored --classes --premultiply --minimize --state-size 2 src/unicode/fsm/ \s+ // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! { - pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = { +pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u16], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u16; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! 
{ bytes: *include_bytes!("whitespace_anchored_rev.bigendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref WHITESPACE_ANCHORED_REV: ::regex_automata::DenseDFA<&'static [u16], u16> = { +pub static WHITESPACE_ANCHORED_REV: ::once_cell::sync::Lazy< + ::regex_automata::DenseDFA<&'static [u16], u16>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u16; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("whitespace_anchored_rev.littleendian.dfa"), }; - unsafe { - ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::DenseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/fsm/word_break_fwd.bigendian.dfa b/src/unicode/fsm/word_break_fwd.bigendian.dfa Binary files differindex 1e75db6..efb9c81 100644 --- a/src/unicode/fsm/word_break_fwd.bigendian.dfa +++ b/src/unicode/fsm/word_break_fwd.bigendian.dfa diff --git a/src/unicode/fsm/word_break_fwd.littleendian.dfa b/src/unicode/fsm/word_break_fwd.littleendian.dfa Binary files differindex e3093a3..9a716d0 100644 --- a/src/unicode/fsm/word_break_fwd.littleendian.dfa +++ b/src/unicode/fsm/word_break_fwd.littleendian.dfa diff --git a/src/unicode/fsm/word_break_fwd.rs b/src/unicode/fsm/word_break_fwd.rs index fb041b7..dcb5f6b 100644 --- a/src/unicode/fsm/word_break_fwd.rs +++ b/src/unicode/fsm/word_break_fwd.rs @@ -2,11 +2,12 @@ // // ucd-generate dfa --name WORD_BREAK_FWD --sparse --minimize --anchored --state-size 4 src/unicode/fsm/ [snip (arg too long)] // -// ucd-generate 0.2.9 is available on crates.io. +// ucd-generate 0.2.12 is available on crates.io. #[cfg(target_endian = "big")] -lazy_static::lazy_static! 
{ - pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -18,15 +19,13 @@ lazy_static::lazy_static! { bytes: *include_bytes!("word_break_fwd.bigendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); #[cfg(target_endian = "little")] -lazy_static::lazy_static! { - pub static ref WORD_BREAK_FWD: ::regex_automata::SparseDFA<&'static [u8], u32> = { +pub static WORD_BREAK_FWD: ::once_cell::sync::Lazy< + ::regex_automata::SparseDFA<&'static [u8], u32>, +> = ::once_cell::sync::Lazy::new(|| { #[repr(C)] struct Aligned<B: ?Sized> { _align: [u8; 0], @@ -38,8 +37,5 @@ lazy_static::lazy_static! { bytes: *include_bytes!("word_break_fwd.littleendian.dfa"), }; - unsafe { - ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) - } - }; -} + unsafe { ::regex_automata::SparseDFA::from_bytes(&ALIGNED.bytes) } +}); diff --git a/src/unicode/grapheme.rs b/src/unicode/grapheme.rs index ad31cf1..13b730c 100644 --- a/src/unicode/grapheme.rs +++ b/src/unicode/grapheme.rs @@ -1,10 +1,14 @@ use regex_automata::DFA; -use crate::ext_slice::ByteSlice; -use crate::unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD; -use crate::unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV; -use crate::unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV; -use crate::utf8; +use crate::{ + ext_slice::ByteSlice, + unicode::fsm::{ + grapheme_break_fwd::GRAPHEME_BREAK_FWD, + grapheme_break_rev::GRAPHEME_BREAK_REV, + regional_indicator_rev::REGIONAL_INDICATOR_REV, + }, + utf8, +}; /// An iterator over grapheme clusters in a byte string. 
/// @@ -125,7 +129,7 @@ pub struct GraphemeIndices<'a> { impl<'a> GraphemeIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> { - GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() } + GraphemeIndices { bs, forward_index: 0, reverse_index: bs.len() } } /// View the underlying data as a subslice of the original data. @@ -191,6 +195,22 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) { if bs.is_empty() { ("", 0) + } else if bs.len() >= 2 + && bs[0].is_ascii() + && bs[1].is_ascii() + && !bs[0].is_ascii_whitespace() + { + // FIXME: It is somewhat sad that we have to special case this, but it + // leads to a significant speed up in predominantly ASCII text. The + // issue here is that the DFA has a bit of overhead, and running it for + // every byte in mostly ASCII text results in a bit slowdown. We should + // re-litigate this once regex-automata 0.3 is out, but it might be + // hard to avoid the special case. A DFA is always going to at least + // require some memory access. + + // Safe because all ASCII bytes are valid UTF-8. + let grapheme = unsafe { bs[..1].to_str_unchecked() }; + (grapheme, 1) } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) { // Safe because a match can only occur for valid UTF-8. 
let grapheme = unsafe { bs[..end].to_str_unchecked() }; @@ -257,15 +277,17 @@ fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { + #[cfg(not(miri))] use ucd_parse::GraphemeClusterBreakTest; + use crate::{ext_slice::ByteSlice, tests::LOSSY_TESTS}; + use super::*; - use crate::ext_slice::ByteSlice; - use crate::tests::LOSSY_TESTS; #[test] + #[cfg(not(miri))] fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.grapheme_clusters.concat(); @@ -288,6 +310,7 @@ mod tests { } #[test] + #[cfg(not(miri))] fn reverse_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.grapheme_clusters.concat(); @@ -329,15 +352,18 @@ mod tests { } } + #[cfg(not(miri))] fn uniescape(s: &str) -> String { s.chars().flat_map(|c| c.escape_unicode()).collect::<String>() } + #[cfg(not(miri))] fn uniescape_vec(strs: &[String]) -> Vec<String> { strs.iter().map(|s| uniescape(s)).collect() } /// Return all of the UCD for grapheme breaks. 
+ #[cfg(not(miri))] fn ucdtests() -> Vec<GraphemeClusterBreakTest> { const TESTDATA: &'static str = include_str!("data/GraphemeBreakTest.txt"); diff --git a/src/unicode/mod.rs b/src/unicode/mod.rs index 60318f4..80638e8 100644 --- a/src/unicode/mod.rs +++ b/src/unicode/mod.rs @@ -1,8 +1,8 @@ -pub use self::grapheme::{decode_grapheme, GraphemeIndices, Graphemes}; -pub use self::sentence::{SentenceIndices, Sentences}; -pub use self::whitespace::{whitespace_len_fwd, whitespace_len_rev}; -pub use self::word::{ - WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks, +pub use self::{ + grapheme::{decode_grapheme, GraphemeIndices, Graphemes}, + sentence::{SentenceIndices, Sentences}, + whitespace::{whitespace_len_fwd, whitespace_len_rev}, + word::{WordIndices, Words, WordsWithBreakIndices, WordsWithBreaks}, }; mod fsm; diff --git a/src/unicode/sentence.rs b/src/unicode/sentence.rs index 063f342..ff29c7e 100644 --- a/src/unicode/sentence.rs +++ b/src/unicode/sentence.rs @@ -1,8 +1,9 @@ use regex_automata::DFA; -use crate::ext_slice::ByteSlice; -use crate::unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD; -use crate::utf8; +use crate::{ + ext_slice::ByteSlice, + unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8, +}; /// An iterator over sentences in a byte string. /// @@ -97,7 +98,7 @@ pub struct SentenceIndices<'a> { impl<'a> SentenceIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { - SentenceIndices { bs: bs, forward_index: 0 } + SentenceIndices { bs, forward_index: 0 } } /// View the underlying data as a subslice of the original data. 
@@ -156,13 +157,15 @@ fn decode_sentence(bs: &[u8]) -> (&str, usize) { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { + #[cfg(not(miri))] use ucd_parse::SentenceBreakTest; use crate::ext_slice::ByteSlice; #[test] + #[cfg(not(miri))] fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.sentences.concat(); @@ -198,11 +201,13 @@ mod tests { bytes.sentences().collect() } + #[cfg(not(miri))] fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { strs.iter().map(|s| s.as_ref().as_bytes()).collect() } /// Return all of the UCD for sentence breaks. + #[cfg(not(miri))] fn ucdtests() -> Vec<SentenceBreakTest> { const TESTDATA: &'static str = include_str!("data/SentenceBreakTest.txt"); diff --git a/src/unicode/whitespace.rs b/src/unicode/whitespace.rs index 949a83f..b5eff30 100644 --- a/src/unicode/whitespace.rs +++ b/src/unicode/whitespace.rs @@ -1,7 +1,9 @@ use regex_automata::DFA; -use crate::unicode::fsm::whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD; -use crate::unicode::fsm::whitespace_anchored_rev::WHITESPACE_ANCHORED_REV; +use crate::unicode::fsm::{ + whitespace_anchored_fwd::WHITESPACE_ANCHORED_FWD, + whitespace_anchored_rev::WHITESPACE_ANCHORED_REV, +}; /// Return the first position of a non-whitespace character. pub fn whitespace_len_fwd(slice: &[u8]) -> usize { diff --git a/src/unicode/word.rs b/src/unicode/word.rs index e0a5701..849f0c8 100644 --- a/src/unicode/word.rs +++ b/src/unicode/word.rs @@ -1,9 +1,12 @@ use regex_automata::DFA; -use crate::ext_slice::ByteSlice; -use crate::unicode::fsm::simple_word_fwd::SIMPLE_WORD_FWD; -use crate::unicode::fsm::word_break_fwd::WORD_BREAK_FWD; -use crate::utf8; +use crate::{ + ext_slice::ByteSlice, + unicode::fsm::{ + simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD, + }, + utf8, +}; /// An iterator over words in a byte string. 
/// @@ -254,7 +257,7 @@ pub struct WordsWithBreakIndices<'a> { impl<'a> WordsWithBreakIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> { - WordsWithBreakIndices { bs: bs, forward_index: 0 } + WordsWithBreakIndices { bs, forward_index: 0 } } /// View the underlying data as a subslice of the original data. @@ -316,13 +319,15 @@ fn decode_word(bs: &[u8]) -> (&str, usize) { } } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { + #[cfg(not(miri))] use ucd_parse::WordBreakTest; use crate::ext_slice::ByteSlice; #[test] + #[cfg(not(miri))] fn forward_ucd() { for (i, test) in ucdtests().into_iter().enumerate() { let given = test.words.concat(); @@ -379,17 +384,26 @@ mod tests { assert_eq!(vec!["1XY"], words(b"1XY")); assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes())); + + // Tests that Vithkuqi works, which was introduced in Unicode 14. + // This test fails prior to Unicode 14. + assert_eq!( + vec!["\u{10570}\u{10597}"], + words("\u{10570}\u{10597}".as_bytes()) + ); } fn words(bytes: &[u8]) -> Vec<&str> { bytes.words_with_breaks().collect() } + #[cfg(not(miri))] fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { strs.iter().map(|s| s.as_ref().as_bytes()).collect() } /// Return all of the UCD for word breaks. 
+ #[cfg(not(miri))] fn ucdtests() -> Vec<WordBreakTest> { const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt"); diff --git a/src/utf8.rs b/src/utf8.rs index 5c7de36..4b5bc20 100644 --- a/src/utf8.rs +++ b/src/utf8.rs @@ -1,13 +1,9 @@ -use core::char; -use core::cmp; -use core::fmt; -use core::str; +use core::{char, cmp, fmt, str}; + #[cfg(feature = "std")] use std::error; -use crate::ascii; -use crate::bstr::BStr; -use crate::ext_slice::ByteSlice; +use crate::{ascii, bstr::BStr, ext_slice::ByteSlice}; // The UTF-8 decoder provided here is based on the one presented here: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ @@ -75,7 +71,7 @@ const STATES_FORWARD: &'static [u8] = &[ /// /// When invalid UTF-8 byte sequences are found, they are substituted with the /// Unicode replacement codepoint (`U+FFFD`) using the -/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html). +/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). /// /// This iterator is created by the /// [`chars`](trait.ByteSlice.html#method.chars) method provided by the @@ -146,7 +142,7 @@ impl<'a> DoubleEndedIterator for Chars<'a> { /// /// When invalid UTF-8 byte sequences are found, they are substituted with the /// Unicode replacement codepoint (`U+FFFD`) using the -/// ["maximal subpart" strategy](http://www.unicode.org/review/pr-121.html). +/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html). /// /// Note that this is slightly different from the `CharIndices` iterator /// provided by the standard library. Aside from working on possibly invalid @@ -168,7 +164,7 @@ pub struct CharIndices<'a> { impl<'a> CharIndices<'a> { pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> { - CharIndices { bs: bs, forward_index: 0, reverse_index: bs.len() } + CharIndices { bs, forward_index: 0, reverse_index: bs.len() } } /// View the underlying data as a subslice of the original data. 
@@ -392,7 +388,7 @@ impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {} /// assert_eq!(err.error_len(), Some(3)); /// /// // In contrast to the above which contains a single invalid prefix, -/// // consider the case of multiple individal bytes that are never valid +/// // consider the case of multiple individual bytes that are never valid /// // prefixes. Note how the value of error_len changes! /// let s = b"foobar\xFF\xFFquux"; /// let err = s.to_str().unwrap_err(); @@ -406,7 +402,7 @@ impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {} /// assert_eq!(err.valid_up_to(), 6); /// assert_eq!(err.error_len(), Some(1)); /// ``` -#[derive(Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq)] pub struct Utf8Error { valid_up_to: usize, error_len: Option<usize>, @@ -854,13 +850,15 @@ fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { (b & 0b1100_0000) != 0b1000_0000 } -#[cfg(test)] +#[cfg(all(test, feature = "std"))] mod tests { use std::char; - use crate::ext_slice::{ByteSlice, B}; - use crate::tests::LOSSY_TESTS; - use crate::utf8::{self, Utf8Error}; + use crate::{ + ext_slice::{ByteSlice, B}, + tests::LOSSY_TESTS, + utf8::{self, Utf8Error}, + }; fn utf8e(valid_up_to: usize) -> Utf8Error { Utf8Error { valid_up_to, error_len: None } @@ -871,6 +869,7 @@ mod tests { } #[test] + #[cfg(not(miri))] fn validate_all_codepoints() { for i in 0..(0x10FFFF + 1) { let cp = match char::from_u32(i) { |